In [5]:
# Check that the SparkContext (sc) is available.
# Evaluating it displays the SparkContext summary: Spark UI link, Spark version and app name.
# SparkContext is the entry point of a Spark application; every RDD in this notebook is created through it.
# NOTE(review): sc is never created in the visible cells - presumably supplied by the pyspark kernel/shell startup; confirm.
sc

In [3]:
# Import Row, which represents a single record (row) of a DataFrame
# and is used below to build records with named fields
from pyspark.sql.types import Row
# Import datetime for the timestamp values used in the complex records below
from datetime import datetime

In [3]:
# sc.parallelize() turns a local Python list into an RDD.
list_data = sc.parallelize([1, 'Alice', 50])
list_data
# The repr shown in the output confirms that the RDD was created.

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:195

In [4]:
# count() returns the number of elements in the RDD.
list_data.count()
# list_data holds three items.

3

In [5]:
# first() returns the first element of the RDD.
list_data.first()
# This is the item at index 0 of the original list.

1

In [6]:
# take(n) returns the first n elements, counted from the start of the RDD.
list_data.take(2)
# Here: the first two items.

[1, 'Alice']

In [7]:
# collect() returns every element of the RDD as a Python list.
list_data.collect()
# All items of list_data are returned.

# count(), first(), take() and collect() are actions, and actions can be expensive on large data.

[1, 'Alice', 50]

In [8]:
# Spark 2 works mainly with DataFrames, so RDDs are usually converted into DataFrames.
# Only an RDD whose elements are records with a proper schema (tabular form) can be converted.
# list_data holds bare scalars (1, 'Alice', 50), so schema inference is expected to fail here.
# The error is the point of this cell: it is caught and printed so that the notebook
# still runs from top to bottom (Restart Kernel -> Run All) instead of stopping here.
try:
    df = list_data.toDF()
except TypeError as err:
    print("TypeError:", err)

TypeError: Can not infer schema for type: <class 'int'>

In [9]:
# Structured data: every element is a record with the same three fields.
records = sc.parallelize([
    [1, 'Alice', 50],
    [2, 'Bob', 75],
    [3, 'Tuna', 100],
])
records

ParallelCollectionRDD[8] at parallelize at PythonRDD.scala:195

In [10]:
# Display the contents of the records RDD.
records.collect()
# The data is structured: every record has the same three fields (id, name, pay).

[[1, 'Alice', 50], [2, 'Bob', 75], [3, 'Tuna', 100]]

In [11]:
# Convert the structured RDD into a DataFrame.
dataframe = records.toDF()
dataframe
# Spark infers the schema from the data; with no names supplied the columns are called _1, _2, _3.

DataFrame[_1: bigint, _2: string, _3: bigint]

In [12]:
# show() prints the DataFrame contents as a text table.
dataframe.show()
# The rows appear under the auto-generated column names.

+---+-----+---+
| _1|   _2| _3|
+---+-----+---+
|  1|Alice| 50|
|  2|  Bob| 75|
|  3| Tuna|100|
+---+-----+---+



In [13]:
# Auto-generated column names (_1, _2, _3) are hard to read, so build each record
# as a Row with named fields; those names become the DataFrame column labels.
# NOTE(review): all three rows share id=1 - presumably a copy-paste slip; confirm whether ids should be 1, 2, 3.
row = sc.parallelize([
    Row(id=1, name='Alice', score=50),
    Row(id=1, name='Bob', score=100),
    Row(id=1, name='Charlie', score=80),
])
row

ParallelCollectionRDD[19] at parallelize at PythonRDD.scala:195

In [14]:
# Collect the Row objects to inspect them.
row.collect()

[Row(id=1, name='Alice', score=50),
 Row(id=1, name='Bob', score=100),
 Row(id=1, name='Charlie', score=80)]

In [15]:
# Convert the RDD of Rows into a DataFrame; the Row field names become the column names.
data = row.toDF()
data

DataFrame[id: bigint, name: string, score: bigint]

In [16]:
# Display the DataFrame.
data.show()
# The columns now carry the readable names id, name and score.

+---+-------+-----+
| id|   name|score|
+---+-------+-----+
|  1|  Alice|   50|
|  1|    Bob|  100|
|  1|Charlie|   80|
+---+-------+-----+



In [17]:
# Records with nested values: every Row carries a list, a dict, a nested Row and a datetime.
complex_record = sc.parallelize([
    Row(col_list=[1, 50],
        col_dict={'name': 'Bob', 'score': 100},
        col_row=Row(a=10, b=20, c=30),
        col_time=datetime(2019, 7, 19, 1, 7)),
    Row(col_list=[1, 30],
        col_dict={'name': 'Marry', 'score': 100, 'age': 30},
        col_row=Row(a=50, b=20, c=30),
        col_time=datetime(2019, 9, 10, 1, 3)),
    Row(col_list=[1, 40],
        col_dict={'name': 'Bob', 'score': 100},
        col_row=Row(a=90, b=20, c=30),
        col_time=datetime(2019, 7, 19, 1, 7)),
    Row(col_list=[1, 50, 100],
        col_dict={'name': 'July', 'score': 100},
        col_row=Row(a=600, b=20, c=30),
        col_time=datetime(2019, 5, 22, 6, 2)),
])
complex_record

ParallelCollectionRDD[30] at parallelize at PythonRDD.scala:195

In [18]:
# Convert the RDD of complex Rows into a DataFrame.
# The inferred schema (see output) maps list -> array, dict -> map, nested Row -> struct and datetime -> timestamp.
# col_dict is inferred as map<string,string>, so its integer values (score, age) end up as strings.
complex_df = complex_record.toDF()
complex_df

DataFrame[col_dict: map<string,string>, col_list: array<bigint>, col_row: struct<a:bigint,b:bigint,c:bigint>, col_time: timestamp]

In [19]:
# Display the whole DataFrame; long cell values are truncated in the printed table.
complex_df.show()

+--------------------+------------+-------------+-------------------+
|            col_dict|    col_list|      col_row|           col_time|
+--------------------+------------+-------------+-------------------+
|[name -> Bob, sco...|     [1, 50]| [10, 20, 30]|2019-07-19 01:07:00|
|[name -> Marry, s...|     [1, 30]| [50, 20, 30]|2019-09-10 01:03:00|
|[name -> Bob, sco...|     [1, 40]| [90, 20, 30]|2019-07-19 01:07:00|
|[name -> July, sc...|[1, 50, 100]|[600, 20, 30]|2019-05-22 06:02:00|
+--------------------+------------+-------------+-------------------+



In [20]:
# Read a single cell: collect() returns a list of Rows, indexed here by row position and then column position.
cell_data = complex_df.collect()[0][2]
cell_data
# First row (index 0), third column (index 2, col_row).

Row(a=10, b=20, c=30)

In [21]:
# Fourth row (index 3), second column (index 1, col_list).
cell_list = complex_df.collect()[3][1]
cell_list

[1, 50, 100]

In [22]:
# Append an item to the collected list; cell_list is a plain Python list, so only this local copy changes.
cell_list.append(200)
cell_list

[1, 50, 100, 200]

In [62]:
complex_df.show()
# The DataFrame is unchanged: the append only modified the collected Python list, not complex_df.

+--------------------+------------+-------------+-------------------+
|            col_dict|    col_list|      col_row|           col_time|
+--------------------+------------+-------------+-------------------+
|[name -> Bob, sco...|     [1, 50]| [10, 20, 30]|2019-07-19 01:07:00|
|[name -> Marry, s...|     [1, 30]| [50, 20, 30]|2019-09-10 01:03:00|
|[name -> Bob, sco...|     [1, 40]| [90, 20, 30]|2019-07-19 01:07:00|
|[name -> July, sc...|[1, 50, 100]|[600, 20, 30]|2019-05-22 06:02:00|
+--------------------+------------+-------------+-------------------+



In [31]:
# Pick specific columns through the underlying RDD: map each Row to a (col_list, col_row) tuple.
(
    complex_df.rdd
    .map(lambda record: (record.col_list, record.col_row))
    .collect()
)

[([1, 50], Row(a=10, b=20, c=30)),
 ([1, 30], Row(a=50, b=20, c=30)),
 ([1, 40], Row(a=90, b=20, c=30)),
 ([1, 50, 100], Row(a=600, b=20, c=30))]

In [32]:
# The same projection with the DataFrame API: select() keeps only the named columns.
complex_df.select('col_list', 'col_row').show()

+------------+-------------+
|    col_list|      col_row|
+------------+-------------+
|     [1, 50]| [10, 20, 30]|
|     [1, 30]| [50, 20, 30]|
|     [1, 40]| [90, 20, 30]|
|[1, 50, 100]|[600, 20, 30]|
+------------+-------------+



In [35]:
# Convert the Spark DataFrame into a pandas DataFrame.
# NOTE: per the PySpark docs, toPandas() loads all rows into driver memory - keep it to small DataFrames.
panda_data = complex_df.toPandas()
panda_data

Unnamed: 0,col_dict,col_list,col_row,col_time
0,"{'name': 'Bob', 'score': '100'}","[1, 50]","(10, 20, 30)",2019-07-19 01:07:00
1,"{'name': 'Marry', 'score': '100', 'age': '30'}","[1, 30]","(50, 20, 30)",2019-09-10 01:03:00
2,"{'name': 'Bob', 'score': '100'}","[1, 40]","(90, 20, 30)",2019-07-19 01:07:00
3,"{'name': 'July', 'score': '100'}","[1, 50, 100]","(600, 20, 30)",2019-05-22 06:02:00


In [23]:
 # SQLContext is built on top of the SparkContext and provides the DataFrame helpers used below (range, createDataFrame).
# NOTE(review): SQLContext is not imported in the visible cells - presumably supplied by the pyspark shell startup along with sc; confirm.
sqlContext =  SQLContext(sc)
sqlContext

<pyspark.sql.context.SQLContext at 0x7f67aa3d5f98>

In [24]:
# range(n) creates a DataFrame directly, without going through an RDD.
sql_df = sqlContext.range(5)
sql_df
# The result has a single column named id of type bigint.

DataFrame[id: bigint]

In [25]:
# Display the DataFrame.
sql_df.show()
# range(5) produced a single-column DataFrame with ids 0 to 4.

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [26]:
# DataFrames created through sqlContext support the same actions as before, e.g. count().
sql_df.count()

5

In [7]:
# Plain Python list of Rows (no RDD involved) to feed into createDataFrame.
# NOTE(review): all three rows share id=1 - presumably a copy-paste slip; confirm whether ids should be 1, 2, 3.
sql_data = [
    Row(id=1, name='Alice', score=50),
    Row(id=1, name='Bob', score=100),
    Row(id=1, name='Charlie', score=80),
]
sql_data

[Row(id=1, name='Alice', score=50),
 Row(id=1, name='Bob', score=100),
 Row(id=1, name='Charlie', score=80)]

In [9]:
# Build a DataFrame from the list of Rows with sqlContext.createDataFrame and display it.
sqlContext.createDataFrame(sql_data).show()
# The DataFrame is created directly from the local data, with no explicit RDD step.

+---+-------+-----+
| id|   name|score|
+---+-------+-----+
|  1|  Alice|   50|
|  1|    Bob|  100|
|  1|Charlie|   80|
+---+-------+-----+



In [36]:
# Convert the pandas DataFrame back into a Spark DataFrame.
# show() only prints the table and returns None, so the DataFrame is stored in
# pandas_df first and displayed in a separate step; otherwise pandas_df would be None.
pandas_df = sqlContext.createDataFrame(panda_data)
pandas_df.show()

+--------------------+------------+-------------+-------------------+
|            col_dict|    col_list|      col_row|           col_time|
+--------------------+------------+-------------+-------------------+
|[name -> Bob, sco...|     [1, 50]| [10, 20, 30]|2019-07-19 01:07:00|
|[name -> Marry, s...|     [1, 30]| [50, 20, 30]|2019-09-10 01:03:00|
|[name -> Bob, sco...|     [1, 40]| [90, 20, 30]|2019-07-19 01:07:00|
|[name -> July, sc...|[1, 50, 100]|[600, 20, 30]|2019-05-22 06:02:00|
+--------------------+------------+-------------+-------------------+

