In [2]:
import findspark as fs
fs.init()
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import isnan, when, count, col, translate
from pyspark.sql.types import IntegerType, FloatType
from pyspark.sql import functions as F
# Build (or reuse) a SparkSession running on the 'local' master, then expose
# the SparkContext and a SQLContext under the short names used in this notebook.
spark = (
    SparkSession.builder
    .appName('GoogleData')
    .master('local')
    .getOrCreate()
)
sc = spark.sparkContext
# NOTE(review): SQLContext is a legacy entry point in recent Spark releases;
# confirm whether `sql` is still needed or whether `spark` alone is enough.
sql = SQLContext(sc)

In [3]:
# orders.csv begins with a header row (order_id, order_date, order_customer_id,
# order_status). header=True keeps that row out of the data and uses it for the
# column names instead of Spark's positional defaults.
orders = spark.read.csv('retail_db//orders.csv', header=True)

### Creation of Dataframes in PySpark - Method 1

In [10]:
# createDataFrame accepts a list of tuples, one tuple per row; the schema given
# here is simply a tuple of column names.
# Python note: a one-element tuple needs a trailing comma, e.g. (1,), otherwise
# the parentheses are treated as plain grouping.
rows = [(1, 2), (3, 4)]
column_labels = ("1", "2")
spark.createDataFrame(data=rows, schema=column_labels).show()

+---+---+
| _1| _2|
+---+---+
|  1|  2|
|  3|  4|
+---+---+



#### Using List

In [11]:
# A plain list of tuples with no schema: the output below shows the columns
# coming back with the positional names _1 and _2.
l = [('Alice', 1)]
unnamed_df = spark.createDataFrame(l)
unnamed_df.show()

+-----+---+
|   _1| _2|
+-----+---+
|Alice|  1|
+-----+---+



In [12]:
# Same list as before, this time with explicit column names.
spark.createDataFrame(data=l, schema=['name', 'age']).show()

+-----+---+
| name|age|
+-----+---+
|Alice|  1|
+-----+---+



#### Using Dictionary (deprecated)

In [None]:
# One record expressed as a dict: the keys act as the field names.
d = [dict(name='Alice', age=1)]

In [None]:
# Build a DataFrame from the list of dicts defined in the previous cell.
spark.createDataFrame(data=d).show()

#### Using an RDD

In [13]:
# Turn the local list into an RDD first, then build the DataFrame from the RDD.
l = [('Alice', 1)]
rdd = sc.parallelize(l)
rdd_df = spark.createDataFrame(data=rdd)
rdd_df.show()

+-----+---+
|   _1| _2|
+-----+---+
|Alice|  1|
+-----+---+



In [17]:
# Only column names are supplied, so the types are left for Spark to work out
# (the schema printed below shows string / long).
spark.createDataFrame(data=rdd, schema=['name', 'age']).printSchema()


root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



In [18]:
# A schema string fixes both the column names and their types
# (Age is printed as integer below rather than long).
spark.createDataFrame(data=rdd, schema="Name: string, Age: int").printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)



### Creation of PySpark DataFrames - Method 2
#### By loading the file as CSV and specifying the schema

In [33]:
# Declare the schema up front (DDL string) instead of asking Spark to infer it:
# the column types are fixed by the notebook rather than by the file contents,
# and Spark does not need an extra pass over the data for inference.
# order_date is kept as a string here; a later section casts it to a timestamp.
orders_schema = 'order_id int, order_date string, order_customer_id int, order_status string'
orders = spark.read.csv('retail_db//orders.csv', header=True, schema=orders_schema)
orders.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [30]:
# Preview the first 20 rows; long cell values are truncated (see order_date below).
orders.show(n=20, truncate=True)

+--------+--------------------+-----------------+---------------+
|order_id|          order_date|order_customer_id|   order_status|
+--------+--------------------+-----------------+---------------+
|       1|2013-07-25 00:00:...|            11599|         CLOSED|
|       2|2013-07-25 00:00:...|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|            12111|       COMPLETE|
|       4|2013-07-25 00:00:...|             8827|         CLOSED|
|       5|2013-07-25 00:00:...|            11318|       COMPLETE|
|       6|2013-07-25 00:00:...|             7130|       COMPLETE|
|       7|2013-07-25 00:00:...|             4530|       COMPLETE|
|       8|2013-07-25 00:00:...|             2911|     PROCESSING|
|       9|2013-07-25 00:00:...|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|             1837|         CLOSED|
|      13|

#### By loading the file as CSV and modifying the data types

In [42]:
# No schema and no inference: the printed schema below shows every column as a
# string. toDF then sets the four column names explicitly.
order_columns = ['order_id', 'order_date', 'order_customer_id', 'order_status']
orders = spark.read.csv('retail_db//orders.csv', header=True).toDF(*order_columns)
orders.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_customer_id: string (nullable = true)
 |-- order_status: string (nullable = true)



In [49]:
# Cast each column to its target type, replacing the column in place.
target_types = {
    'order_id': 'int',
    'order_date': 'timestamp',
    'order_customer_id': 'int',
    'order_status': 'string',
}
for column_name, type_name in target_types.items():
    orders = orders.withColumn(column_name, orders[column_name].cast(type_name))
orders.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [47]:
# Preview the first 20 rows of the re-typed DataFrame.
orders.show(n=20, truncate=True)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

#### By loading the file as csv and inferring schema automatically


In [51]:
# Let Spark infer the column types from the data; the header row supplies the
# column names and ',' is the field separator.
orders = (
    spark.read
    .options(header=True, inferSchema=True, sep=',')
    .csv('retail_db//orders.csv')
)
orders.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [52]:
# Preview the first 20 rows; long cell values are truncated (see order_date below).
orders.show(n=20, truncate=True)

+--------+--------------------+-----------------+---------------+
|order_id|          order_date|order_customer_id|   order_status|
+--------+--------------------+-----------------+---------------+
|       1|2013-07-25 00:00:...|            11599|         CLOSED|
|       2|2013-07-25 00:00:...|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|            12111|       COMPLETE|
|       4|2013-07-25 00:00:...|             8827|         CLOSED|
|       5|2013-07-25 00:00:...|            11318|       COMPLETE|
|       6|2013-07-25 00:00:...|             7130|       COMPLETE|
|       7|2013-07-25 00:00:...|             4530|       COMPLETE|
|       8|2013-07-25 00:00:...|             2911|     PROCESSING|
|       9|2013-07-25 00:00:...|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|             1837|         CLOSED|
|      13|