In [1]:
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

Spark Context is used to work with RDD

Spark Session is used to work with Dataframes

In [2]:
#setting Spark Configuration
conf = SparkConf().setAppName("first").setMaster("local")

#creating Spark Context
sc = SparkContext(conf = conf)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/22 03:42:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
#creating Spark Session
spark = SparkSession.builder.getOrCreate()

## Dataframe

In [4]:
# working with purchases.csv file to create a dataframe during Read
#Dataframe

df = spark.read.format("csv").option("inferSchema",True).option("header",True).load("/home/labuser/Downloads/purchases.csv")

                                                                                

In [5]:
#printing the dataframe
df.show()

+------+------+-------+
|   _c0|apples|oranges|
+------+------+-------+
|  June|     3|      0|
|Robert|     2|      3|
|  Lily|     0|      7|
| David|     1|      2|
+------+------+-------+



23/09/22 03:43:21 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , apples, oranges
 Schema: _c0, apples, oranges
Expected: _c0 but found: 
CSV file: file:///home/labuser/Downloads/purchases.csv


In [6]:
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- apples: integer (nullable = true)
 |-- oranges: integer (nullable = true)



## Row RDD in Spark to create Dataframes

In [17]:
from pyspark.sql.types import StructType,IntegerType,StructField,StringType,DoubleType

udfSchema = StructType([StructField("Rankejriohwerhwjkbe",IntegerType(),True),
                        StructField("Title",StringType(), True),
                        StructField("Genre",StringType(), True),
                        StructField("Description",StringType(), True),
                        StructField("Director",StringType(), True),
                        StructField("Actors",StringType(), True),
                        StructField("Year",StringType(), True),
                        StructField("Runtime (Minutes)",StringType(), True),
                        StructField("Rating",StringType(), True),
                        StructField("Revenue (Millions)",DoubleType(), True),
                        StructField("Metascore",DoubleType(), True)
                        ])

In [25]:
"""
movie_file = sc.textFile("/home/labuser/Downloads/IMDB-Movie-Data.csv")

mapsplit = movie_file.map(lambda x: x.split(","))

rowrdd = mapsplit.map(lambda x: Row(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9), x(10)))

movie_df = spark.createDataFrame(rowrdd, udfSchema)
movie_df.show()
"""

'\nmovie_file = sc.textFile("/home/labuser/Downloads/IMDB-Movie-Data.csv")\n\nmapsplit = movie_file.map(lambda x: x.split(","))\n\nrowrdd = mapsplit.map(lambda x: Row(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9), x(10)))\n\nmovie_df = spark.createDataFrame(rowrdd, udfSchema)\nmovie_df.show()\n'

In [8]:
#instead of using the load method, .csv is directly used which makes it clear that the file format is csv
#hence, .format need not be used here
#in the next example, .format is needed without which execution fails

movie = spark.read.schema(udfSchema).option("header",True).csv("/home/labuser/Downloads/IMDB-Movie-Data.csv")

In [9]:
movie.show()

23/09/22 03:43:44 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 12, schema size: 11
CSV file: file:///home/labuser/Downloads/IMDB-Movie-Data.csv


+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+------+------------------+---------+
|Rankejriohwerhwjkbe|               Title|               Genre|         Description|            Director|              Actors|                Year|Runtime (Minutes)|Rating|Revenue (Millions)|Metascore|
+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------------+------+------------------+---------+
|                  1|Guardians of the ...|Action,Adventure,...|A group of interg...|          James Gunn|Chris Pratt, Vin ...|                2014|              121|   8.1|          757074.0|   333.13|
|                  2|          Prometheus|Adventure,Mystery...|Following clues t...|        Ridley Scott|Noomi Rapace, Log...|                2012|              124|     7|          485820.0| 

[Stage 3:>                                                          (0 + 1) / 1]                                                                                

In [11]:
from pyspark.sql.types import *


purchase_schema = StructType([StructField("Name", StringType(), True),
                       StructField("Apples", IntegerType(), True),
                       StructField("Oranges", IntegerType(), True)])

purchase = spark.read.format("csv").schema(purchase_schema).option("header",True).load('/home/labuser/Downloads/purchases.csv')

In [12]:
purchase.show()

+------+------+-------+
|  Name|Apples|Oranges|
+------+------+-------+
|  June|     3|      0|
|Robert|     2|      3|
|  Lily|     0|      7|
| David|     1|      2|
+------+------+-------+



23/09/22 03:44:16 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , apples, oranges
 Schema: Name, Apples, Oranges
Expected: Name but found: 
CSV file: file:///home/labuser/Downloads/purchases.csv


## Transformation Chaining

### withColumn

In [13]:
#withColumn

purchase = purchase.withColumnRenamed("Name", "FirstName")
purchase.show()

+---------+------+-------+
|FirstName|Apples|Oranges|
+---------+------+-------+
|     June|     3|      0|
|   Robert|     2|      3|
|     Lily|     0|      7|
|    David|     1|      2|
+---------+------+-------+



23/09/22 03:44:21 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , apples, oranges
 Schema: Name, Apples, Oranges
Expected: Name but found: 
CSV file: file:///home/labuser/Downloads/purchases.csv


### withColumnRenamed

In [15]:
from pyspark.sql.functions import *
purchase = purchase.withColumn("NewApple", col("Apples")+5)
purchase.show()

+---------+------+-------+--------+
|FirstName|Apples|Oranges|NewApple|
+---------+------+-------+--------+
|     June|     3|      0|       8|
|   Robert|     2|      3|       7|
|     Lily|     0|      7|       5|
|    David|     1|      2|       6|
+---------+------+-------+--------+



23/09/22 03:44:54 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , apples, oranges
 Schema: Name, Apples, Oranges
Expected: Name but found: 
CSV file: file:///home/labuser/Downloads/purchases.csv


### lit and withColumn

In [16]:
purchase = purchase.withColumn("LitApple", lit(0))
purchase.show()

+---------+------+-------+--------+--------+
|FirstName|Apples|Oranges|NewApple|LitApple|
+---------+------+-------+--------+--------+
|     June|     3|      0|       8|       0|
|   Robert|     2|      3|       7|       0|
|     Lily|     0|      7|       5|       0|
|    David|     1|      2|       6|       0|
+---------+------+-------+--------+--------+



23/09/22 03:44:59 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , apples, oranges
 Schema: Name, Apples, Oranges
Expected: Name but found: 
CSV file: file:///home/labuser/Downloads/purchases.csv
