In [1]:
# Local (non-cluster) setup: let findspark locate the Spark installation
# before pyspark is imported in the next cell.
import findspark
findspark.init()
findspark.find()

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *

In [2]:
# Create (or reuse) a Hive-enabled session on the 'local' master, named
# 'initDataFrames'.
spark = (
    SparkSession.builder
    .master('local')
    .appName('initDataFrames')
    .enableHiveSupport()
    .getOrCreate()
)

In [1]:
# Display the session object to confirm it was created.
spark

''

In [4]:
# Keep a handle on the SparkContext; the RDD examples further down use it
# (sc.parallelize, sc.textFile).
sc = spark.sparkContext

In [5]:
# Load the orders CSV with the reader's default options (no header or schema
# arguments are passed here).
orders = spark.read.csv('retail_db//orders.csv')

### Creation of Dataframes in PySpark - Method 1

In [6]:
# A DataFrame can be created from a list of tuples. A tuple with a single
# element needs a trailing comma, as in (1,); without it, (1) is just the
# integer 1 in parentheses.
spark.createDataFrame([(1,)]).show()

+---+
| _1|
+---+
|  1|
+---+



#### Using List

In [10]:
# A list holding one (name, age) tuple. No column names are supplied, so the
# output below shows the auto-generated names _1 and _2.
l = [('Alice', 1)]
spark.createDataFrame(l).show()

+-----+---+
|   _1| _2|
+-----+---+
|Alice|  1|
+-----+---+



In [11]:
# Passing a list of names as the second argument labels the columns.
spark.createDataFrame(l, ['name', 'age']).show()

+-----+---+
| name|age|
+-----+---+
|Alice|  1|
+-----+---+



#### Using Dictionary (deprecated)

In [None]:
# A list with one dict per row; the dict keys carry the column names.
d = [{'name': 'Alice', 'age': 1}]

In [None]:
# Build a DataFrame straight from the list of dicts (the approach the heading
# above marks as deprecated).
spark.createDataFrame(d).show()

#### Using rdd

In [None]:
# Turn the local list into an RDD, then build a DataFrame from that RDD.
# No column names are given in this call.
l = [('Alice', 1)]
rdd = sc.parallelize(l)
spark.createDataFrame(rdd).show()

In [None]:
# Same RDD, this time with explicit column names.
spark.createDataFrame(rdd, ['name', 'age']).show()

In [None]:
# The schema may also be given as a string of "name: type" pairs, which sets
# both the column names (a, b) and their types (string, int).
spark.createDataFrame(rdd, "a: string, b: int").show()

#### Using Row

In [None]:
# Row('name', 'age') acts as a row template with two named fields.
Person = Row('name', 'age')
# Fill the template with the values of each tuple held in the RDD.
person = rdd.map(lambda pair: Person(*pair))
spark.createDataFrame(person).show()

### Creation of Pyspark Dataframes - Method 2
#### By loading the file as RDD and explicitly manipulating the fields and specifying datatypes

In [None]:
# Load the orders file as an RDD via sc.textFile; the next cell splits each
# element on commas.
ordersRDD = sc.textFile('retail_db//orders.csv')

In [None]:
# Column names for the four fields kept from every line.
ORDER_COLUMNS = ['order_id', 'order_date', 'order_customer_id', 'order_status']


def parse_order(line):
    """Split one comma-separated line and return its first four fields.

    The line is split only once. Fields beyond the fourth are ignored, and a
    line with fewer than four fields raises ValueError from the unpacking.
    All values stay strings; no type conversion happens here.
    """
    order_id, order_date, order_customer_id, order_status = line.split(',')[:4]
    return (order_id, order_date, order_customer_id, order_status)


# RDD.toDF takes the column names as a single list, so the RDD is converted
# and named in one call instead of the former .toDF().toDF(...) pair.
orders = ordersRDD.map(parse_order).toDF(ORDER_COLUMNS)
orders.printSchema()

In [None]:
# Preview the rows of the DataFrame built from the RDD.
orders.show()

### Creation of Pyspark Dataframes - Method 3
#### By loading the file as csv and specifying the schema

In [None]:
# Explicit schema: every field is declared non-nullable, with the type listed
# next to its name.
ordersSchema = StructType([
    StructField(field_name, field_type, nullable=False)
    for field_name, field_type in (
        ('order_id', IntegerType()),
        ('order_date', TimestampType()),
        ('order_customer_id', IntegerType()),
        ('order_status', StringType()),
    )
])
# Read the CSV with that schema; header=True is passed to the reader.
orders = spark.read.csv(
    'retail_db//orders.csv',
    schema=ordersSchema,
    header=True,
)
orders.printSchema()

In [None]:
# Preview the rows read with the explicit schema.
orders.show()

#### By loading the file as csv and modifying datatypes

In [None]:
# Read the CSV (header=True, no schema supplied) and then give the four
# columns their names via toDF.
orders = (
    spark.read
    .csv('retail_db//orders.csv', header=True)
    .toDF('order_id', 'order_date', 'order_customer_id', 'order_status')
)
orders.printSchema()

In [None]:
# Cast each column to its intended type.
# withColumn returns a new DataFrame rather than modifying `orders` in place,
# so the result must be assigned back; otherwise printSchema() below (and the
# show() in the next cell) would still see the uncast frame.
# order_id is cast from order_id itself (it was previously derived from
# order_date by mistake).
orders = (
    orders
    .withColumn('order_id', orders.order_id.cast('int'))
    .withColumn('order_date', orders.order_date.cast('timestamp'))
    .withColumn('order_customer_id', orders.order_customer_id.cast('int'))
    .withColumn('order_status', orders.order_status.cast('string'))
)
orders.printSchema()

In [None]:
# Preview the rows of `orders` after the cast cell above.
orders.show()

#### By loading the file as csv and inferring schema automatically


In [None]:
# Let the reader work out the column types itself (inferSchema=True); the
# separator and header options are passed explicitly.
orders = spark.read.csv(
    'retail_db//orders.csv',
    sep=',',
    header=True,
    inferSchema=True,
)
orders.printSchema()

In [None]:
# Preview the rows read with the inferred schema.
orders.show()