In [27]:
#importing pyspark
import pyspark
import os
import sys
# Set both PySpark interpreter environment variables to the interpreter running this kernel.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

In [28]:
#importing sparksession
from pyspark.sql import SparkSession 

In [29]:
# Build the SparkSession used by the rest of the notebook (or reuse one that already
# exists), registered under the application name "dataframe1".
spark = (
    SparkSession.builder
    .appName("dataframe1")
    .getOrCreate()
)

<h1> Ways of Creating a DataFrame in PySpark </h1>
<h1> Create Dataframe from Tuples </h1>

In [30]:
# Column labels, in the same order as the fields of each record tuple below.
columns = ["ID", "Cricketer Name", "Gender", "Age", "Century"]

# One tuple per record: (ID, Cricketer Name, Gender, Age, Century).
data = [
    (1, "Dhoni", "Male", 38, 10),
    (2, "Virat", "Male", 30, 5),
    (3, "Smirit", "Female", 25, 9),
    (4, "Sachin", "Male", 45, 50),
    (5, "Mitali", "Female", 35, 7),
]

In [31]:
# Build a Spark DataFrame from the list of tuples, using `columns` as the column names.
df = spark.createDataFrame(data, schema=columns)

In [32]:
# Display the DataFrame object itself; the recorded output is its column-name/type
# summary, not its rows.
display(df)

DataFrame[ID: bigint, Cricketer Name: string, Gender: string, Age: bigint, Century: bigint]

In [33]:
# Print the DataFrame's rows as a text table.
df.show()

+---+--------------+------+---+-------+
| ID|Cricketer Name|Gender|Age|Century|
+---+--------------+------+---+-------+
|  1|         Dhoni|  Male| 38|     10|
|  2|         Virat|  Male| 30|      5|
|  3|        Smirit|Female| 25|      9|
|  4|        Sachin|  Male| 45|     50|
|  5|        Mitali|Female| 35|      7|
+---+--------------+------+---+-------+



In [63]:
# NOTE(review): this cell carries execution count 63, i.e. it was executed LAST, yet it
# sits in the middle of the notebook. On a top-to-bottom run (Restart Kernel -> Run All),
# stopping the session here would make every later cell that uses `spark`
# (`spark.createDataFrame(...)`, `spark.read...`) fail against a stopped session.
# The shutdown is therefore intentionally not performed at this point: call
# `spark.stop()` exactly once, after the final cell of the notebook has run.

<h1> Create DataFrame from a List of Dictionaries </h1>

In [42]:
# One dictionary per flower record; every record has the keys Flower, price and quantity.
data = [
    dict(Flower="Rose", price=100, quantity=10),
    dict(Flower="Tulip", price=500, quantity=10),
    dict(Flower="Sunflower", price=400, quantity=6),
    dict(Flower="Jasmine", price=100, quantity=12),
]

In [43]:
# Create a Spark DataFrame from the list of dictionaries. No column list is passed;
# the resulting columns carry the dictionary keys as their names.
df1 = spark.createDataFrame(data)

In [44]:
# Check the Python type of the object returned by createDataFrame.
type(df1)

pyspark.sql.dataframe.DataFrame

In [45]:
# Print the rows of the dictionary-based DataFrame as a text table.
df1.show()

+---------+-----+--------+
|   Flower|price|quantity|
+---------+-----+--------+
|     Rose|  100|      10|
|    Tulip|  500|      10|
|Sunflower|  400|       6|
|  Jasmine|  100|      12|
+---------+-----+--------+



In [46]:
#importing pandas library
import pandas as pd

<h1> Create Pandas Dataframe </h1>

In [48]:
# Load the CSV file into a pandas DataFrame; the path is relative, so the file is
# looked up from the notebook's current working directory.
pandasDF = pd.read_csv("airlines1.csv")

In [49]:
# (number of rows, number of columns) of the loaded pandas DataFrame.
pandasDF.shape

(50001, 110)

In [50]:
# Show the pandas DataFrame; the recorded rendering is truncated to the first and last
# rows and columns.
pandasDF

Unnamed: 0.1,Unnamed: 0,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,DOT_ID_Reporting_Airline,IATA_CODE_Reporting_Airline,...,Div4WheelsOff,Div4TailNum,Div5Airport,Div5AirportID,Div5AirportSeqID,Div5WheelsOn,Div5TotalGTime,Div5LongestGTime,Div5WheelsOff,Div5TailNum
0,0,1998,1,1,2,5,1998-01-02,NW,19386,NW,...,,,,,,,,,,
1,1,2009,2,5,28,4,2009-05-28,FL,20437,FL,...,,,,,,,,,,
2,2,2013,2,6,29,6,2013-06-29,MQ,20398,MQ,...,,,,,,,,,,
3,3,2010,3,8,31,2,2010-08-31,DL,19790,DL,...,,,,,,,,,,
4,4,2006,1,1,15,7,2006-01-15,US,20355,US,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49996,49996,1999,4,10,12,2,1999-10-12,WN,19393,WN,...,,,,,,,,,,
49997,49997,2010,4,11,1,1,2010-11-01,OH,20417,OH,...,,,,,,,,,,
49998,49998,2006,2,4,30,7,2006-04-30,UA,19977,UA,...,,,,,,,,,,
49999,49999,2009,3,7,8,3,2009-07-08,F9,20436,F9,...,,,,,,,,,,


<h1> Create spark Dataframe from Pandas Dataframe </h1>

<h1> We need to cast all columns of the pandas DataFrame to string type to avoid datatype issues when converting it to a Spark DataFrame </h1>

In [51]:
# Convert every pandas column to str before handing the frame to Spark, so that all
# Spark columns come out as strings.
# NOTE(review): astype(str) presumably also turns missing values into literal text
# (e.g. "nan") rather than nulls — confirm this is acceptable downstream.
sparkDF = spark.createDataFrame(pandasDF.astype(str))

In [52]:
# Print the Spark schema: one line per column with its type and nullability.
sparkDF.printSchema()

root
 |-- Unnamed: 0: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Quarter: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- DayofMonth: string (nullable = true)
 |-- DayOfWeek: string (nullable = true)
 |-- FlightDate: string (nullable = true)
 |-- Reporting_Airline: string (nullable = true)
 |-- DOT_ID_Reporting_Airline: string (nullable = true)
 |-- IATA_CODE_Reporting_Airline: string (nullable = true)
 |-- Tail_Number: string (nullable = true)
 |-- Flight_Number_Reporting_Airline: string (nullable = true)
 |-- OriginAirportID: string (nullable = true)
 |-- OriginAirportSeqID: string (nullable = true)
 |-- OriginCityMarketID: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- OriginCityName: string (nullable = true)
 |-- OriginState: string (nullable = true)
 |-- OriginStateFips: string (nullable = true)
 |-- OriginStateName: string (nullable = true)
 |-- OriginWac: string (nullable = true)
 |-- DestAirportID: string (nullable 

In [54]:
# Display the Spark DataFrame object; the recorded output is its column-name/type
# summary, not its rows.
display(sparkDF)

DataFrame[Unnamed: 0: string, Year: string, Quarter: string, Month: string, DayofMonth: string, DayOfWeek: string, FlightDate: string, Reporting_Airline: string, DOT_ID_Reporting_Airline: string, IATA_CODE_Reporting_Airline: string, Tail_Number: string, Flight_Number_Reporting_Airline: string, OriginAirportID: string, OriginAirportSeqID: string, OriginCityMarketID: string, Origin: string, OriginCityName: string, OriginState: string, OriginStateFips: string, OriginStateName: string, OriginWac: string, DestAirportID: string, DestAirportSeqID: string, DestCityMarketID: string, Dest: string, DestCityName: string, DestState: string, DestStateFips: string, DestStateName: string, DestWac: string, CRSDepTime: string, DepTime: string, DepDelay: string, DepDelayMinutes: string, DepDel15: string, DepartureDelayGroups: string, DepTimeBlk: string, TaxiOut: string, WheelsOff: string, WheelsOn: string, TaxiIn: string, CRSArrTime: string, ArrTime: string, ArrDelay: string, ArrDelayMinutes: string, Arr

In [53]:
# Print rows of the Spark DataFrame as a text table (very wide, since all columns are shown).
sparkDF.show()

+----------+----+-------+-----+----------+---------+----------+-----------------+------------------------+---------------------------+-----------+-------------------------------+---------------+------------------+------------------+------+--------------------+-----------+---------------+---------------+---------+-------------+----------------+----------------+----+--------------------+---------+-------------+--------------+-------+----------+-------+--------+---------------+--------+--------------------+----------+-------+---------+--------+------+----------+-------+--------+---------------+--------+------------------+----------+---------+----------------+--------+--------------+-----------------+-------+-------+--------+-------------+------------+------------+--------+-------------+-----------------+------------+-------------+---------------+------------------+--------------+--------------------+-----------+-----------+-----------+-------------+----------------+------------+----------

In [56]:
# Keep only the date and flight-identification columns in a narrower Spark DataFrame.
sparknewdf = sparkDF.select(
    "Year",
    "Month",
    "DayofMonth",
    "FlightDate",
    "Tail_Number",
    "Flight_Number_Reporting_Airline",
)

In [57]:
# Print the schema of the narrowed DataFrame to confirm which columns were selected.
sparknewdf.printSchema()

root
 |-- Year: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- DayofMonth: string (nullable = true)
 |-- FlightDate: string (nullable = true)
 |-- Tail_Number: string (nullable = true)
 |-- Flight_Number_Reporting_Airline: string (nullable = true)



<h1> Create Pandas Dataframe from Spark Dataframe </h1>

In [59]:
# Convert the narrowed Spark DataFrame back into a pandas DataFrame.
pdf = sparknewdf.toPandas()

In [60]:
# Show the resulting pandas DataFrame; the recorded rendering is truncated to the
# first and last rows.
pdf

Unnamed: 0,Year,Month,DayofMonth,FlightDate,Tail_Number,Flight_Number_Reporting_Airline
0,1998,1,2,1998-01-02,N297US,675
1,2009,5,28,2009-05-28,N946AT,671
2,2013,6,29,2013-06-29,N665MQ,3297
3,2010,8,31,2010-08-31,N6705Y,1806
4,2006,1,15,2006-01-15,N504AU,465
...,...,...,...,...,...,...
49996,1999,10,12,1999-10-12,N311,1723
49997,2010,11,1,2010-11-01,N940CA,6601
49998,2006,4,30,2006-04-30,N509UA,493
49999,2009,7,8,2009-07-08,N804FR,573


In [61]:
# Create a DataFrame from an external dataset: read the CSV file with Spark,
# treating its first line as the header row.
df = spark.read.csv("airlines1.csv", header=True)

In [62]:
# Print rows of the CSV-backed Spark DataFrame as a text table.
df.show()

+---+----+-------+-----+----------+---------+----------+-----------------+------------------------+---------------------------+-----------+-------------------------------+---------------+------------------+------------------+------+--------------------+-----------+---------------+---------------+---------+-------------+----------------+----------------+----+--------------------+---------+-------------+--------------+-------+----------+-------+--------+---------------+--------+--------------------+----------+-------+---------+--------+------+----------+-------+--------+---------------+--------+------------------+----------+---------+----------------+--------+--------------+-----------------+-------+-------+--------+-------------+------------+------------+--------+-------------+-----------------+------------+-------------+---------------+------------------+--------------+--------------------+-----------+-----------+-----------+-------------+----------------+------------+--------------+--