<a href="https://colab.research.google.com/github/varshachawan/PySparkApplication/blob/master/InferredAndExplicitSchemas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Inferred and explicit schemas

In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://mirror.olnevhost.net/pub/apache/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz

In [2]:
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"]="/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"]="/content/spark-2.4.5-bin-hadoop2.7"
!echo $JAVA_HOME
import findspark
findspark.init()

/usr/lib/jvm/java-8-openjdk-amd64


In [0]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Inferred and explicit schemas") \
    .getOrCreate()

In [0]:
from pyspark.sql.types import Row
sc = spark.sparkContext

#### Inferring schema

---



---



---



In [0]:
lines = sc.textFile("./students.txt")

In [7]:
lines

./students.txt MapPartitionsRDD[3] at textFile at NativeMethodAccessorImpl.java:0

In [8]:
parts = lines.map(lambda l: l.split(","))

parts.collect()

[['Emily', '44', '55', '78'],
 ['Andy', '47', '34', '89'],
 ['Rick', '55', '78', '55'],
 ['Aaron', '66', '34', '98']]

In [0]:
students = parts.map(lambda p: Row(name=p[0], math=int(p[1]), english=int(p[2]), science=int(p[3])))

In [10]:
students.collect()

[Row(english=55, math=44, name='Emily', science=78),
 Row(english=34, math=47, name='Andy', science=89),
 Row(english=78, math=55, name='Rick', science=55),
 Row(english=34, math=66, name='Aaron', science=98)]

In [0]:
schemaStudents = spark.createDataFrame(students)

schemaStudents.createOrReplaceTempView("students")

In [12]:
schemaStudents.columns

['english', 'math', 'name', 'science']

In [13]:
schemaStudents.schema

StructType(List(StructField(english,LongType,true),StructField(math,LongType,true),StructField(name,StringType,true),StructField(science,LongType,true)))

In [14]:
spark.sql("SELECT * FROM students").show()

+-------+----+-----+-------+
|english|math| name|science|
+-------+----+-----+-------+
|     55|  44|Emily|     78|
|     34|  47| Andy|     89|
|     78|  55| Rick|     55|
|     34|  66|Aaron|     98|
+-------+----+-----+-------+



#### Explicit schema

In [15]:
parts.collect()

[['Emily', '44', '55', '78'],
 ['Andy', '47', '34', '89'],
 ['Rick', '55', '78', '55'],
 ['Aaron', '66', '34', '98']]

In [0]:
schemaString = "name math english science"

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, LongType

fields = [StructField('name', StringType(), True),
          StructField('math', LongType(), True),
          StructField('english', LongType(), True),
          StructField('science', LongType(), True),
]

In [0]:
schema = StructType(fields)

In [0]:
schemaStudents = spark.createDataFrame(parts, schema)

In [0]:
schemaStudents.columns

['name', 'math', 'english', 'science']

In [0]:
schemaStudents.schema

StructType(List(StructField(name,StringType,true),StructField(math,LongType,true),StructField(english,LongType,true),StructField(science,LongType,true)))

In [0]:
spark.sql("SELECT * FROM students").show()

+-------+----+-----+-------+
|english|math| name|science|
+-------+----+-----+-------+
|     55|  44|Emily|     78|
|     34|  47| Andy|     89|
|     78|  55| Rick|     55|
|     34|  66|Aaron|     98|
+-------+----+-----+-------+

