# PySpark DataFrame Tutorial: Introduction to DataFrames

### Source
https://www.edureka.co/blog/pyspark-dataframe-tutorial/#why

In [3]:
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/37/98/244399c0daa7894cdf387e7007d5e8b3710a79b67f3fd991c0b0b644822d/pyspark-2.4.3.tar.gz (215.6MB)
[K     |████████████████████████████████| 215.6MB 115kB/s 
[?25hCollecting py4j==0.10.7 (from pyspark)
[?25l  Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)
[K     |████████████████████████████████| 204kB 37.7MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-2.4.3-py2.py3-none-any.whl size=215964963 sha256=3904df0a1a143c449c7066e86f2433a18c22d98cd31e711d18b991fd7713e235
  Stored in directory: /root/.cache/pip/wheels/8d/20/f0/b30e2024226dc112e256930dd2cd4f06d00ab053c86278dcf3
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.7 py

In [0]:
from IPython.display import display
from pyspark.sql import *
# Obtain the session named 'test' (created on first use, reused afterwards);
# every later cell reads data and builds DataFrames through `spark`.
spark = (
    SparkSession.builder
    .appName('test')
    .getOrCreate()
)

### Create Employee and Department

In [0]:
# Calling Row with bare field names produces a record template; calling that
# template with positional values produces a populated Row.
Employee = Row('firstName', 'lastName', 'email', 'salary')

# Build the five sample employees in one pass. Each tuple follows the field
# order declared above; the third employee has no lastName (None).
employee1, employee2, employee3, employee4, employee5 = (
    Employee(*fields)
    for fields in (
        ('Basher', 'armbrust', 'bash@edureka.co', 10000),
        ('Daniel', 'meng', 'daniel@standford.edu', 12000),
        ('Muriel', None, 'muriel@waterloo.edu', 14000),
        ('Rachel', 'wendell', 'rach_3@edureka.co', 16000),
        ('Zach', 'galifianakis', 'zach_g@edureka.co', 16000),
    )
)

In [6]:
# Indexing the Employee template returns the field name stored at that position.
Employee[0]

'firstName'

In [7]:
# Inspect a populated row; lastName was passed as None for this employee.
employee3

Row(firstName='Muriel', lastName=None, email='muriel@waterloo.edu', salary=14000)

In [0]:
# Department records are keyword-built Rows with a string id and a short name.
department1, department2, department3, department4 = (
    Row(id=dept_id, name=dept_name)
    for dept_id, dept_name in (
        ('123456', 'HR'),
        ('789012', 'OPS'),
        ('345678', 'FN'),
        ('901234', 'DEV'),
    )
)

In [0]:
# Nest each department Row together with the list of employee Rows assigned
# to it; an employee may appear under more than one department.
departmentWithEmployees1 = Row(
    department=department1,
    employees=[employee1, employee2, employee5],
)
departmentWithEmployees2 = Row(
    department=department2,
    employees=[employee3, employee4],
)
departmentWithEmployees3 = Row(
    department=department3,
    employees=[employee1, employee4, employee3],
)
departmentWithEmployees4 = Row(
    department=department4,
    employees=[employee2, employee3],
)

In [10]:
# Only the first two department records are loaded into the DataFrame.
departmentWithEmployees_Seq = [
    departmentWithEmployees1,
    departmentWithEmployees2,
]
df = spark.createDataFrame(departmentWithEmployees_Seq)

# display() renders the DataFrame's repr (its schema); show() prints the rows.
display(df)
df.show()

DataFrame[department: struct<id:string,name:string>, employees: array<struct<firstName:string,lastName:string,email:string,salary:bigint>>]

+-------------+--------------------+
|   department|           employees|
+-------------+--------------------+
| [123456, HR]|[[Basher, armbrus...|
|[789012, OPS]|[[Muriel,, muriel...|
+-------------+--------------------+



### PySpark DataFrames Example 1: FIFA World Cup Dataset

In [0]:
# Load the players CSV, taking column names from the header row and letting
# Spark infer each column's type, then preview the rows.
df_fifa = spark.read.csv(
    'fifa_players.csv',
    header=True,
    inferSchema=True,
)
df_fifa.show()

In [0]:
# Schema: print the column names and types of the loaded DataFrame
df_fifa.printSchema()

In [0]:
# A notebook cell only echoes its final bare expression, so each value is
# passed to display() explicitly to make all three appear in the output.
display(df_fifa.columns)       # column names
display(df_fifa.count())       # row count
display(len(df_fifa.columns))  # column count

In [0]:
# Statistics: summary statistics for one named column at a time
df_fifa.describe('Coach Name').show()
df_fifa.describe('Position').show()

In [0]:
# Select: project two columns, first as-is and then de-duplicated
df_fifa.select('Player Name', 'Coach Name').show()
df_fifa.select('Player Name', 'Coach Name').distinct().show() # distinct (Player Name, Coach Name) pairs only

In [0]:
# Filter
# Build the MatchID filter once so the rows shown and the count come from the
# same DataFrame.
match_1096 = df_fifa.filter(df_fifa.MatchID == '1096')
match_1096.show()
# A bare expression in the middle of a cell is not echoed, so display it.
display(match_1096.count())

# Combine conditions with & and parenthesise each comparison.
df_fifa.filter((df_fifa.Position == 'C') & (df_fifa.Event == 'G40')).show()

In [0]:
# Sort: order the rows by the MatchID column before showing them
df_fifa.orderBy(df_fifa.MatchID).show()

### PySpark DataFrames Example 2: Superheroes Dataset

In [0]:
# Load the superhero CSV, taking column names from the header row and letting
# Spark infer each column's type, then preview the rows.
df_superhero = spark.read.csv(
    'superhero.csv',
    header=True,
    inferSchema=True,
)
df_superhero.show()

In [0]:
# Filter
# Count rows per gender. Both counts go through display() because a notebook
# cell only echoes its final bare expression.
display(df_superhero.filter(df_superhero.Gender == 'Male').count())
display(df_superhero.filter(df_superhero.Gender == 'Female').count())

In [0]:
# Grouping
# Keep the grouped DataFrame in df_race and show it separately: show() returns
# None, so chaining it into the assignment would leave df_race bound to None.
df_race = df_superhero.groupby('Race').count()
df_race.show()

In [0]:
# SQL queries
# Expose the DataFrame to SQL as a temporary view, then run queries through the
# SparkSession (`spark`) created at the top of the notebook. This notebook
# never defines `sqlContext`, and registerTempTable is the deprecated spelling
# of createOrReplaceTempView.
df_superhero.createOrReplaceTempView('superhero_table')
spark.sql('SELECT * FROM superhero_table').show()

# Run the DISTINCT query once and reuse it for both the listing and the count.
distinct_eye_colors = spark.sql('SELECT DISTINCT(Eye_color) FROM superhero_table')
distinct_eye_colors.show()
# A bare expression in the middle of a cell is not echoed, so display it.
display(distinct_eye_colors.count())

spark.sql('SELECT MAX(Weight) FROM superhero_table').show()