In [None]:
!pip install pyspark py4j



In [None]:
from pyspark.sql import SparkSession

In [None]:
# Get the session that every later cell refers to as `spark`
# (getOrCreate returns an existing session if there is one).
spark = (
    SparkSession.builder
    .appName("pyspark_basics")
    .getOrCreate()
)

# Create a DataFrame

In [None]:
%%writefile user_simple.json
{"name":"Bob"}
{"name":"Jim", "age":40}
{"name":"Mary", "age": 24}

Writing user_simple.json


In [None]:
# Load the newline-delimited JSON file written above; no schema is supplied,
# so Spark infers the column types from the records.
df = spark.read.json("user_simple.json")
# Evaluating the bare name displays only the column names and types, not the rows.
df

DataFrame[age: bigint, name: string]

# Show DataFrame

In [None]:
# Print the rows as a text table. Bob's record has no "age" key, so that cell shows NULL.
df.show()

+----+----+
| age|name|
+----+----+
|NULL| Bob|
|  40| Jim|
|  24|Mary|
+----+----+



In [None]:
# Print the inferred schema as a tree: column name, type and nullability.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [None]:
# Column names as a plain Python list of strings.
df.columns

['age', 'name']

In [None]:
# describe() builds another DataFrame holding summary statistics; the cell output is
# only that DataFrame's repr, and the numbers appear once .show() is called on it.
df.describe()

DataFrame[summary: string, age: string, name: string]

In [None]:
# Display the summary statistics. The count for `age` is 2 rather than 3 because
# Bob's missing age is not counted.
df.describe().show()

+-------+------------------+----+
|summary|               age|name|
+-------+------------------+----+
|  count|                 2|   3|
|   mean|              32.0|NULL|
| stddev|11.313708498984761|NULL|
|    min|                24| Bob|
|    max|                40|Mary|
+-------+------------------+----+



# Specifying Schema Structure

In [None]:
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Declare the schema up front instead of relying on inference.
# Each field is (column name, data type, nullable).
data_schema = [
    StructField("age", IntegerType(), True),
    StructField("name", StringType(), True),
]
final_struc = StructType(fields=data_schema)

# Re-read the same file with the explicit schema and confirm the resulting column types.
df = spark.read.json("user_simple.json", schema=final_struc)
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



In [None]:
# Same rows as before, now read with the explicit schema.
df.show()

+----+----+
| age|name|
+----+----+
|NULL| Bob|
|  40| Jim|
|  24|Mary|
+----+----+



# Create new Columns and Replace Columns

In [None]:
# withColumn returns a new DataFrame with an extra "newAge" column copied from "age";
# the result is only displayed here, not assigned back to `df`.
df.withColumn("newAge", df['age']).show()

+----+----+------+
| age|name|newAge|
+----+----+------+
|NULL| Bob|  NULL|
|  40| Jim|    40|
|  24|Mary|    24|
+----+----+------+



In [None]:
# `df` itself is unchanged: the "newAge" column exists only on the DataFrame returned by withColumn.
df.show()

+----+----+
| age|name|
+----+----+
|NULL| Bob|
|  40| Jim|
|  24|Mary|
+----+----+



In [None]:
# withColumnRenamed returns a new DataFrame in which "name" is called "firstName".
df.withColumnRenamed("name","firstName").show()

+----+---------+
| age|firstName|
+----+---------+
|NULL|      Bob|
|  40|      Jim|
|  24|     Mary|
+----+---------+



In [None]:
# `df` still has the original "name" column because the renamed DataFrame was not assigned.
df.show()

+----+----+
| age|name|
+----+----+
|NULL| Bob|
|  40| Jim|
|  24|Mary|
+----+----+



# Closing the session

In [None]:
# Shut down the Spark session now that the walkthrough is finished.
spark.stop()