In [2]:
import pyspark
from pyspark.sql import SparkSession

In [3]:
# Start a Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("PySpark Data Frame")
    .getOrCreate()
)

In [5]:
# Open the Spark web UI to check the cores and executors

In [4]:
# Evaluate the session object as the cell's last expression so the notebook
# displays its summary.
spark

In [6]:
# defaultParallelism is the public way to ask how many cores Spark uses by
# default: the cores granted to a local master, or roughly the total across
# executors on a cluster.
# Counting the entries of getExecutorMemoryStatus() (through the private _jsc
# handle) gives the number of executors/block managers instead, which is 1 in
# local mode regardless of how many cores are available.
cores = spark.sparkContext.defaultParallelism
print("We are working with", cores, "core(s)")

We are working with 1 core(s)


In [8]:
# Reading data
# A DataFrame is equivalent to a relational table in Spark SQL, and can be created using various functions in SparkSession.
# First let's try reading in a csv file containing a list of students and their grades.
# 
# **Source:** https://www.kaggle.com/spscientist/students-performance-in-exams


In [9]:
# Load the students' exam-score CSV. The first row supplies the column names
# and Spark infers each column's type from the data.
path = "/home/nyalazone/Desktop/pyspark/Pyspark_Module/data/StudentsPerformance.csv"
dataset = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(path)
)
# Evaluating the DataFrame shows its column names and types, not its rows.
dataset

DataFrame[gender: string, race/ethnicity: string, parental level of education: string, lunch: string, test preparation course: string, math score: int, reading score: int, writing score: int]

In [10]:
# display data in pandas view
# dataset.show(12) like pandas.head(12)

In [13]:
# Cap the result at 12 rows before converting, so only that small slice is
# turned into a pandas DataFrame for display.
(
    dataset
    .limit(12)
    .toPandas()
)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
5,female,group B,associate's degree,standard,none,71,83,78
6,female,group B,some college,standard,completed,88,95,92
7,male,group B,some college,free/reduced,none,40,43,39
8,male,group D,high school,free/reduced,completed,64,64,67
9,female,group B,high school,free/reduced,none,38,60,50


In [14]:
# Read a Parquet file

In [15]:
# Directory holding the sample data files; note that this rebinds `path`,
# which previously pointed at the CSV file itself.
path = '/home/nyalazone/Desktop/pyspark/Pyspark_Module/data/'
partitioned = spark.read.parquet(f"{path}userdata1.parquet")
# List the column names stored in the Parquet file.
partitioned.columns

['registration_dttm',
 'id',
 'first_name',
 'last_name',
 'email',
 'gender',
 'ip_address',
 'cc',
 'country',
 'birthdate',
 'salary',
 'title',
 'comments']

In [16]:
# Read a dataset that is split across multiple files (wildcard match)

In [17]:
# The trailing * is a glob: every path under the data directory whose name
# starts with 'userdata1' is read into one DataFrame.
partitioned2 = spark.read.parquet(f"{path}userdata1*")
partitioned2.show(5)

+-------------------+---+----------+---------+--------------------+------+--------------+----------------+------------+---------+---------+--------------------+--------+
|  registration_dttm| id|first_name|last_name|               email|gender|    ip_address|              cc|     country|birthdate|   salary|               title|comments|
+-------------------+---+----------+---------+--------------------+------+--------------+----------------+------------+---------+---------+--------------------+--------+
|2016-02-03 13:25:29|  1|    Amanda|   Jordan|    ajordan0@com.com|Female|   1.197.201.2|6759521864920116|   Indonesia| 3/8/1971| 49756.53|    Internal Auditor|   1E+02|
|2016-02-03 22:34:03|  2|    Albert|  Freeman|     afreeman1@is.gd|  Male|218.111.175.34|                |      Canada|1/16/1968|150280.17|       Accountant IV|        |
|2016-02-03 06:39:31|  3|    Evelyn|   Morgan|emorgan2@altervis...|Female|  7.161.136.94|6767119071901597|      Russia| 2/1/1960|144972.51| Structural

In [18]:
# Render the first 4 rows as a pandas table.
(
    partitioned2
    .limit(4)
    .toPandas()
)

Unnamed: 0,registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments
0,2016-02-03 13:25:29,1,Amanda,Jordan,ajordan0@com.com,Female,1.197.201.2,6759521864920116.0,Indonesia,3/8/1971,49756.53,Internal Auditor,100.0
1,2016-02-03 22:34:03,2,Albert,Freeman,afreeman1@is.gd,Male,218.111.175.34,,Canada,1/16/1968,150280.17,Accountant IV,
2,2016-02-03 06:39:31,3,Evelyn,Morgan,emorgan2@altervista.org,Female,7.161.136.94,6767119071901597.0,Russia,2/1/1960,144972.51,Structural Engineer,
3,2016-02-03 06:06:21,4,Denise,Riley,driley3@gmpg.org,Female,140.35.109.83,3576031598965625.0,China,4/8/1997,90263.05,Senior Cost Accountant,


In [19]:
# Read multiple files by listing their names explicitly

In [20]:
# parquet() accepts several paths; the named files are combined into a single
# DataFrame.
partitioned1_2 = spark.read.parquet(
    path + 'userdata1.parquet',
    path + 'userdata2.parquet',
)
# Render the first 5 rows as a pandas table.
(
    partitioned1_2
    .limit(5)
    .toPandas()
)

Unnamed: 0,registration_dttm,id,first_name,last_name,email,gender,ip_address,cc,country,birthdate,salary,title,comments
0,2016-02-03 13:25:29,1,Amanda,Jordan,ajordan0@com.com,Female,1.197.201.2,6759521864920116.0,Indonesia,3/8/1971,49756.53,Internal Auditor,100.0
1,2016-02-03 22:34:03,2,Albert,Freeman,afreeman1@is.gd,Male,218.111.175.34,,Canada,1/16/1968,150280.17,Accountant IV,
2,2016-02-03 06:39:31,3,Evelyn,Morgan,emorgan2@altervista.org,Female,7.161.136.94,6767119071901597.0,Russia,2/1/1960,144972.51,Structural Engineer,
3,2016-02-03 06:06:21,4,Denise,Riley,driley3@gmpg.org,Female,140.35.109.83,3576031598965625.0,China,4/8/1997,90263.05,Senior Cost Accountant,
4,2016-02-03 10:35:31,5,Carlos,Burns,cburns4@miitbeian.gov.cn,,169.113.235.40,5602256255204850.0,South Africa,,,,


In [21]:
# check data type of dataset

In [22]:
# toPandas() collects the whole Spark DataFrame into a pandas DataFrame.
studentsPdf = dataset.toPandas()

# Compare the two types: Spark DataFrame first, pandas DataFrame second.
print(type(dataset))
print(type(studentsPdf))

<class 'pyspark.sql.dataframe.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [23]:
# A Solid Summary of your data:

In [24]:
# Print each column's name, data type and nullability as a tree.
dataset.printSchema()

root
 |-- gender: string (nullable = true)
 |-- race/ethnicity: string (nullable = true)
 |-- parental level of education: string (nullable = true)
 |-- lunch: string (nullable = true)
 |-- test preparation course: string (nullable = true)
 |-- math score: integer (nullable = true)
 |-- reading score: integer (nullable = true)
 |-- writing score: integer (nullable = true)



In [25]:
# Columns of Dataframe

In [26]:
# List of the column names, in DataFrame order.
dataset.columns

['gender',
 'race/ethnicity',
 'parental level of education',
 'lunch',
 'test preparation course',
 'math score',
 'reading score',
 'writing score']

In [29]:
# Summary statistics (count, mean, stddev, min, max) for every column,
# rendered as a pandas table.
(
    dataset
    .describe()
    .toPandas()
)

Unnamed: 0,summary,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,count,1000,1000,1000,1000,1000,1000.0,1000.0,1000.0
1,mean,,,,,,66.089,69.169,68.054
2,stddev,,,,,,15.163080096009454,14.600191937252225,15.19565701086966
3,min,female,group A,associate's degree,free/reduced,completed,0.0,17.0,10.0
4,max,male,group E,some high school,standard,none,100.0,100.0,100.0


In [30]:
# If you need to get the type of just ONE column by name you can use this function:

In [32]:
# Look the column up in the schema by name and read its Spark data type.
# Swap in another column name (for example 'gender') to inspect that one.
dataset.schema['math score'].dataType

IntegerType

In [33]:
# describe() restricted to one column: count, mean, stddev, min and max of
# the math score only.
dataset.describe('math score').show()

+-------+------------------+
|summary|        math score|
+-------+------------------+
|  count|              1000|
|   mean|            66.089|
| stddev|15.163080096009454|
|    min|                 0|
|    max|               100|
+-------+------------------+



In [34]:
## Summary function

In [35]:
# summary() lets us choose the statistics to report; here the count, the
# extremes and the 25th/75th percentiles of the three score columns.
(
    dataset
    .select("math score", "reading score", "writing score")
    .summary("count", "min", "25%", "75%", "max")
    .show()
)

+-------+----------+-------------+-------------+
|summary|math score|reading score|writing score|
+-------+----------+-------------+-------------+
|  count|      1000|         1000|         1000|
|    min|         0|           17|           10|
|    25%|        57|           59|           57|
|    75%|        77|           79|           79|
|    max|       100|          100|          100|
+-------+----------+-------------+-------------+



In [36]:
# ## How to specify data types as you read in datasets.
# 
# Some data types make it easier to infer schema (like tabular formats such as csv which we will show later). 
# 
# However, you often have to set the schema yourself if you are dealing with a .read method that doesn't have an inferSchema option built in.
# 
# Spark has all the tools you need for this, it just requires a very specific structure:

In [37]:
from pyspark.sql.types import StructField,StringType,IntegerType,StructType,DateType
# Next we need to create the list of Structure fields
#     * :param name: string, name of the field.
#     * :param dataType: :class:`DataType` of the field.
#     * :param nullable: boolean, whether the field can be null (None) or not.

# One StructField per column of the people data set, built from
# (field name, Spark type) pairs. Every field is declared nullable.
# Note that the "timestamp" column is typed as a date, not a timestamp.
schema_definition = [
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("name", StringType()),
        ("email", StringType()),
        ("city", StringType()),
        ("mac", StringType()),
        ("timestamp", DateType()),
        ("creditcard", StringType()),
        ("Price", IntegerType()),
    )
]

In [38]:
# Wrap the list of fields in a StructType, the object that the reader's
# schema argument expects.
final_struc = StructType(schema_definition)
# Show the assembled schema.
final_struc

StructType(List(StructField(name,StringType,true),StructField(email,StringType,true),StructField(city,StringType,true),StructField(mac,StringType,true),StructField(timestamp,DateType,true),StructField(creditcard,StringType,true),StructField(Price,IntegerType,true)))

In [39]:

# Read the JSON file with the explicit schema instead of letting Spark infer
# one from the data.
json_path = '/home/nyalazone/Desktop/pyspark/Pyspark_Module/data/people.json'
people = spark.read.schema(final_struc).json(json_path)
# NOTE(review): the displayed result has an all-null first row and a null
# Price in every row, which suggests the file does not fully match this
# schema. Confirm against the raw file.
(
    people
    .limit(10)
    .toPandas()
)

Unnamed: 0,name,email,city,mac,timestamp,creditcard,Price
0,,,,,,,
1,Keeley Bosco,katlyn@jenkinsmaggio.net,Lake Gladysberg,08:fd:0b:cd:77:f7,2015-04-25,1228-1221-1221-1431,
2,Rubye Jerde,juvenal@johnston.name,,90:4d:fa:42:63:a2,2015-04-25,1228-1221-1221-1431,
3,Miss Darian Breitenberg,,,f9:0e:d3:40:cb:e9,2015-04-25,,
4,Celine Ankunding,emery_kunze@rogahn.net,,3a:af:c9:0b:5c:08,2015-04-25,1228-1221-1221-1431,
5,Dr. Araceli Lang,mavis_lehner@jacobi.name,Yvettemouth,9e:ea:28:41:2a:50,2015-04-25,1211-1221-1234-2201,
6,Esteban Von,,,2d:e4:f0:dd:90:96,2015-04-25,,
7,Everette Swift,gielle_jacobs@flatleyboehm.biz,,29:e0:54:7a:b7:ca,2015-04-25,,
8,Terrell Boyle,augustine.conroy@keebler.name,Port Reaganfort,c5:32:09:5a:f7:15,2015-04-25,1228-1221-1221-1431,
9,Miss Emmie Muller,,Kaleyhaven,be:dc:d2:57:81:8b,2015-04-25,,


In [40]:
# Show the schema that was applied when reading the JSON file.
people.printSchema()

root
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- city: string (nullable = true)
 |-- mac: string (nullable = true)
 |-- timestamp: date (nullable = true)
 |-- creditcard: string (nullable = true)
 |-- Price: integer (nullable = true)



In [42]:
# Write the DataFrame out as CSV.
# Spark creates a directory at the target path (here named 'user.csv') and
# puts the part files inside it. "overwrite" replaces any existing output.
# NOTE(review): no header option is set, so column names are not written.
# Confirm that is intended.
writeFile = "/home/nyalazone/Desktop/pyspark/writeFile/"
people.write.csv(writeFile + 'user.csv', mode="overwrite")

In [43]:
#  Create DataFrame:

In [44]:
# Build a DataFrame from in-memory tuples; the second argument supplies the
# column names and the column types are inferred from the values.
values = [
    ('Pear', 10),
    ('Orange', 36),
    ('Banana', 123),
    ('Kiwi', 48),
    ('Peach', 16),
    ('Strawberry', 1),
]
data_frame = spark.createDataFrame(values, schema=['fruit', 'quantity'])
data_frame.toPandas()

Unnamed: 0,fruit,quantity
0,Pear,10
1,Orange,36
2,Banana,123
3,Kiwi,48
4,Peach,16
5,Strawberry,1


In [45]:
# Build a DataFrame from a list of dictionaries: the keys become the column
# names and the column types are inferred from the values.
emp = [
    {
        "Name": "Khan",
        "city": "Meerut",
        "salary": 1010101,
        "dob": "02-11-1991",
    },
    {
        "Name": "Mango",
        "city": "Delhi",
        "salary": 123345,
        "dob": "02-11-1991",
    },
    {
        "Name": "Orange",
        "city": "Kanpur",
        "salary": 91928,
        "dob": "02-11-1991",
    },
]
df1 = spark.createDataFrame(emp)
df1.show()



+------+------+----------+-------+
|  Name|  city|       dob| salary|
+------+------+----------+-------+
|  Khan|Meerut|02-11-1991|1010101|
| Mango| Delhi|02-11-1991| 123345|
|Orange|Kanpur|02-11-1991|  91928|
+------+------+----------+-------+



In [46]:
df1.printSchema()

root
 |-- Name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- salary: long (nullable = true)

