In [1]:
# In this session we are going to cover some basic functionality of PySpark
#  and introduce PySpark's SQL functions library:
#  - Select method
#  - Order By
#  - Like Operator (for searching a string)
#  - Substring Search
#  - Is In Operator
#  - Starts with, Ends with
#  - Slicing
#  - Filtering
#  - Collecting Results as Objects

In [2]:
import pyspark
from pyspark.sql import SparkSession

In [3]:
# Build the SparkSession used throughout the notebook (getOrCreate reuses a running one).
spark = (
    SparkSession.builder
    .appName("Pyspark SQL function")
    .getOrCreate()
)

In [4]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

In [12]:
# ## About this dataframe
# 
# The **fifa19.csv** dataset includes a list of all the FIFA 2019 players and their attributes listed below: 
# 
#  - **General**: Age, Nationality, Overall, Potential, Club
#  - **Metrics:** Value, Wage
#  - **Player Descriptive:** Preferred Foot, International Reputation, Weak Foot, Skill Moves, Work Rate, Position, Jersey Number, Joined, Loaned From, Contract Valid Until, Height, Weight
#  - **Position:** LS, ST, RS, LW, LF, CF, RF, RW, LAM, CAM, RAM, LM, LCM, CM, RCM, RM, LWB, LDM, CDM, RDM, RWB, LB, LCB, CB, RCB, RB
#  - **Other:** Crossing, Finishing, HeadingAccuracy, ShortPassing, Volleys, Dribbling, Curve, FKAccuracy, LongPassing, BallControl, Acceleration, SprintSpeed, Agility, Reactions, Balance, ShotPower, Jumping, Stamina, Strength, LongShots, Aggression, Interceptions, Positioning, Vision, Penalties, Composure, Marking, StandingTackle, SlidingTackle, GKDiving, GKHandling, GKKicking, GKPositioning, GKReflexes, and Release Clause.
# 
# **Source:** https://www.kaggle.com/karangadiya/fifa19

In [6]:
import os

# Directory that holds the course data files. The original author's location stays the
# default so existing setups keep working; set the FIFA_DATA_DIR environment variable to
# run the notebook on another machine without editing this cell.
file_path = os.environ.get("FIFA_DATA_DIR", "/home/nyalazone/Desktop/pyspark/Pyspark_Module/data/")

# os.path.join copes with a directory given with or without a trailing slash.
# header=True takes column names from the first line; inferSchema=True lets Spark infer
# the column types instead of reading every column as a string.
fifa = spark.read.csv(os.path.join(file_path, 'fifa19.csv'), inferSchema=True, header=True)

In [7]:
# Preview the first five rows, converted to pandas so the notebook renders a table.
fifa.limit(5).toPandas()

Unnamed: 0,_c0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,...,96,33,28,26,6,11,15,14,8,€226.5M
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,...,95,28,31,23,7,11,15,14,11,€127.1M
2,2,190871,Neymar Jr,26,https://cdn.sofifa.org/players/4/19/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,93,Paris Saint-Germain,...,94,27,24,33,9,9,15,15,11,€228.1M
3,3,193080,De Gea,27,https://cdn.sofifa.org/players/4/19/193080.png,Spain,https://cdn.sofifa.org/flags/45.png,91,93,Manchester United,...,68,15,21,13,90,85,87,88,94,€138.6M
4,4,192985,K. De Bruyne,27,https://cdn.sofifa.org/players/4/19/192985.png,Belgium,https://cdn.sofifa.org/flags/7.png,91,92,Manchester City,...,88,68,58,51,15,13,5,10,13,€196.4M


In [8]:
# Number of rows in the DataFrame
fifa.count()

18207

In [11]:
# Column names, returned as a plain Python list of strings
fifa.columns

['_c0',
 'ID',
 'Name',
 'Age',
 'Photo',
 'Nationality',
 'Flag',
 'Overall',
 'Potential',
 'Club',
 'Club Logo',
 'Value',
 'Wage',
 'Special',
 'Preferred Foot',
 'International Reputation',
 'Weak Foot',
 'Skill Moves',
 'Work Rate',
 'Body Type',
 'Real Face',
 'Position',
 'Jersey Number',
 'Joined',
 'Loaned From',
 'Contract Valid Until',
 'Height',
 'Weight',
 'LS',
 'ST',
 'RS',
 'LW',
 'LF',
 'CF',
 'RF',
 'RW',
 'LAM',
 'CAM',
 'RAM',
 'LM',
 'LCM',
 'CM',
 'RCM',
 'RM',
 'LWB',
 'LDM',
 'CDM',
 'RDM',
 'RWB',
 'LB',
 'LCB',
 'CB',
 'RCB',
 'RB',
 'Crossing',
 'Finishing',
 'HeadingAccuracy',
 'ShortPassing',
 'Volleys',
 'Dribbling',
 'Curve',
 'FKAccuracy',
 'LongPassing',
 'BallControl',
 'Acceleration',
 'SprintSpeed',
 'Agility',
 'Reactions',
 'Balance',
 'ShotPower',
 'Jumping',
 'Stamina',
 'Strength',
 'LongShots',
 'Aggression',
 'Interceptions',
 'Positioning',
 'Vision',
 'Penalties',
 'Composure',
 'Marking',
 'StandingTackle',
 'SlidingTackle',
 'GKDiving',
 

In [14]:
# Schema of the dataset.
# printSchema() writes the schema tree to stdout itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line after the schema.
fifa.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Photo: string (nullable = true)
 |-- Nationality: string (nullable = true)
 |-- Flag: string (nullable = true)
 |-- Overall: integer (nullable = true)
 |-- Potential: integer (nullable = true)
 |-- Club: string (nullable = true)
 |-- Club Logo: string (nullable = true)
 |-- Value: string (nullable = true)
 |-- Wage: string (nullable = true)
 |-- Special: integer (nullable = true)
 |-- Preferred Foot: string (nullable = true)
 |-- International Reputation: integer (nullable = true)
 |-- Weak Foot: integer (nullable = true)
 |-- Skill Moves: integer (nullable = true)
 |-- Work Rate: string (nullable = true)
 |-- Body Type: string (nullable = true)
 |-- Real Face: string (nullable = true)
 |-- Position: string (nullable = true)
 |-- Jersey Number: integer (nullable = true)
 |-- Joined: string (nullable = true)
 |-- Loaned From: string (nu

In [15]:
# ## PySpark Select function
# There are a variety of functions you can import from pyspark.sql.functions. Check out the documentation for the full list available:
# http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#module-pyspark.sql.functions

In [17]:
# NOTE(review): this wildcard import binds the public names of pyspark.sql.functions in the
# notebook namespace, which rebinds Python built-ins of the same name (e.g. sum, min, max,
# abs, round). Prefer `from pyspark.sql import functions as F` in new code.
from pyspark.sql.functions import *
# Since this is a sql function, the calls are pretty intuitive....

In [19]:
# Project four columns; show() truncates long strings by default.
fifa.select('Nationality', 'Name', 'Age', 'Photo').show(5)

+-----------+-----------------+---+--------------------+
|Nationality|             Name|Age|               Photo|
+-----------+-----------------+---+--------------------+
|  Argentina|         L. Messi| 31|https://cdn.sofif...|
|   Portugal|Cristiano Ronaldo| 33|https://cdn.sofif...|
|     Brazil|        Neymar Jr| 26|https://cdn.sofif...|
|      Spain|           De Gea| 27|https://cdn.sofif...|
|    Belgium|     K. De Bruyne| 27|https://cdn.sofif...|
+-----------+-----------------+---+--------------------+
only showing top 5 rows



In [20]:
# Pass truncate=False so long values such as the Photo URL are printed in full
fifa.select('Nationality', 'Name', 'Age', 'Photo').show(5, truncate=False)

+-----------+-----------------+---+----------------------------------------------+
|Nationality|Name             |Age|Photo                                         |
+-----------+-----------------+---+----------------------------------------------+
|Argentina  |L. Messi         |31 |https://cdn.sofifa.org/players/4/19/158023.png|
|Portugal   |Cristiano Ronaldo|33 |https://cdn.sofifa.org/players/4/19/20801.png |
|Brazil     |Neymar Jr        |26 |https://cdn.sofifa.org/players/4/19/190871.png|
|Spain      |De Gea           |27 |https://cdn.sofifa.org/players/4/19/193080.png|
|Belgium    |K. De Bruyne     |27 |https://cdn.sofifa.org/players/4/19/192985.png|
+-----------+-----------------+---+----------------------------------------------+
only showing top 5 rows



In [21]:
# Project only the three descriptive columns
fifa.select('Nationality', 'Name', 'Age').show(5)

+-----------+-----------------+---+
|Nationality|             Name|Age|
+-----------+-----------------+---+
|  Argentina|         L. Messi| 31|
|   Portugal|Cristiano Ronaldo| 33|
|     Brazil|        Neymar Jr| 26|
|      Spain|           De Gea| 27|
|    Belgium|     K. De Bruyne| 27|
+-----------+-----------------+---+
only showing top 5 rows



In [22]:
# orderBy
# Who are the youngest players? (orderBy sorts ascending by default)
fifa.select('Nationality', 'Name', 'Age').orderBy('Age').show(5)

+-------------+------------+---+
|  Nationality|        Name|Age|
+-------------+------------+---+
|       Sweden|   B. Nygren| 16|
|       Sweden|H. Andersson| 16|
|       Turkey|    A. Doğan| 16|
|United States|  C. Bassett| 16|
|      England|    B. Mumba| 16|
+-------------+------------+---+
only showing top 5 rows



In [28]:
# orderBy in descending order: oldest players first
fifa.select('Nationality', 'Name', 'Age').orderBy(fifa.Age.desc()).show(6)


+-----------------+-------------+---+
|      Nationality|         Name|Age|
+-----------------+-------------+---+
|           Mexico|     O. Pérez| 45|
|Trinidad & Tobago|    T. Warner| 44|
|          England|K. Pilkington| 44|
|            Japan|  S. Narazaki| 42|
|         Paraguay|    J. Villar| 41|
|     Saudi Arabia| H. Sulaimani| 41|
+-----------------+-------------+---+
only showing top 6 rows



In [30]:
# Wildcard search
# To find every player whose club name contains "Barcelona", use the SQL-style
# like operator, where % matches any run of characters.
fifa.select("Name", "Club").filter(fifa["Club"].like("%Barcelona%")).show(5, truncate=False)

+---------------+------------+
|Name           |Club        |
+---------------+------------+
|L. Messi       |FC Barcelona|
|L. Suárez      |FC Barcelona|
|M. ter Stegen  |FC Barcelona|
|Sergio Busquets|FC Barcelona|
|Coutinho       |FC Barcelona|
+---------------+------------+
only showing top 5 rows



In [32]:
# **Substrings**
# 
# .substr(starting position, length)
# 
# Use this if you want to return a particular portion of a string.
# First, look at the raw Photo values that the next cells take substrings of.
fifa.select('Photo').limit(3).toPandas()

Unnamed: 0,Photo
0,https://cdn.sofifa.org/players/4/19/158023.png
1,https://cdn.sofifa.org/players/4/19/20801.png
2,https://cdn.sofifa.org/players/4/19/190871.png


In [33]:
# A negative start counts from the end of the string: take the last four characters
fifa.select("Photo", fifa["Photo"].substr(-4, 4)).show(5, truncate=False)

+----------------------------------------------+-----------------------+
|Photo                                         |substring(Photo, -4, 4)|
+----------------------------------------------+-----------------------+
|https://cdn.sofifa.org/players/4/19/158023.png|.png                   |
|https://cdn.sofifa.org/players/4/19/20801.png |.png                   |
|https://cdn.sofifa.org/players/4/19/190871.png|.png                   |
|https://cdn.sofifa.org/players/4/19/193080.png|.png                   |
|https://cdn.sofifa.org/players/4/19/192985.png|.png                   |
+----------------------------------------------+-----------------------+
only showing top 5 rows



In [35]:
 # We can also give a name to the computed substring column with .alias().
# The statement below sits at column 0: an indented statement following a comment line
# raises "IndentationError: unexpected indent" when the cell is run as plain Python.
fifa.select("Photo", fifa.Photo.substr(-4, 4).alias('sub_str')).show(5, False)

+----------------------------------------------+-------+
|Photo                                         |sub_str|
+----------------------------------------------+-------+
|https://cdn.sofifa.org/players/4/19/158023.png|.png   |
|https://cdn.sofifa.org/players/4/19/20801.png |.png   |
|https://cdn.sofifa.org/players/4/19/190871.png|.png   |
|https://cdn.sofifa.org/players/4/19/193080.png|.png   |
|https://cdn.sofifa.org/players/4/19/192985.png|.png   |
+----------------------------------------------+-------+
only showing top 5 rows



In [38]:
# isin() keeps the rows whose column value matches any of the listed options.
fifa.filter(fifa["Nationality"].isin("Argentina", "Portugal")).limit(4).toPandas()

Unnamed: 0,_c0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,...,96,33,28,26,6,11,15,14,8,€226.5M
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,...,95,28,31,23,7,11,15,14,11,€127.1M
2,15,211110,P. Dybala,24,https://cdn.sofifa.org/players/4/19/211110.png,Argentina,https://cdn.sofifa.org/flags/52.png,89,94,Juventus,...,84,23,20,20,5,4,4,5,8,€153.5M
3,23,153079,S. Agüero,30,https://cdn.sofifa.org/players/4/19/153079.png,Argentina,https://cdn.sofifa.org/flags/52.png,89,89,Manchester City,...,90,30,20,12,13,15,6,11,14,€119.3M


In [39]:
# Restrict the output to a few columns
(
    fifa.filter(fifa["Nationality"].isin("Argentina", "Portugal"))
    .select('Name', 'Age', 'Nationality')
    .limit(4)
    .toPandas()
)

Unnamed: 0,Name,Age,Nationality
0,L. Messi,31,Argentina
1,Cristiano Ronaldo,33,Portugal
2,P. Dybala,24,Argentina
3,S. Agüero,30,Argentina


In [40]:
# **Starts with Ends with**
# 
# Search for a specific case - begins with "x" and ends with "x"

In [41]:
# No action (show / collect / toPandas) is called here, so the cell only displays the
# DataFrame's column names and types, not any rows.
fifa.select("Name","Club").where(fifa.Name.startswith("L"))

DataFrame[Name: string, Club: string]

In [42]:
# Chain both conditions: the name starts with "L" and ends with "i"
(
    fifa.select("Name", "Club")
    .filter(fifa["Name"].startswith("L"))
    .filter(fifa["Name"].endswith("i"))
    .limit(4)
    .toPandas()
)

Unnamed: 0,Name,Club
0,L. Messi,FC Barcelona
1,L. Bonucci,Juventus
2,L. Fabiański,West Ham United
3,L. Pellegrini,Roma


In [43]:
#### Slicing a Dataframe

In [48]:
# Slice rows: limit(n) returns a new DataFrame restricted to n rows
df2 = fifa.limit(300)
# Counting the rows confirms the size of the slice
print('Sliced row count:',df2.count())


Sliced row count: 300


In [49]:
# Slice columns: take the first five column names and project onto them
cols_list = fifa.columns[:5]
df3 = fifa.select(*cols_list)
print('Sliced column count:', len(df3.columns))

Sliced column count: 5


In [50]:
# **Slicing Method**
# 
# pyspark.sql.functions.slice(x, start, length) <br>
# Returns an array containing all the elements in x from index start (or starting from the end if start is negative) with the specified length.  <br>
# <br>
# *Note: indexing starts at 1 here*

In [51]:
# NOTE: this import rebinds the name `slice` (a Python built-in) to the Spark SQL function
from pyspark.sql.functions import slice
# Small demo frame with a single array column "x"
df = spark.createDataFrame([([1, 2, 3],), ([4, 5],)], ['x']) 
df.show()

+---------+
|        x|
+---------+
|[1, 2, 3]|
|   [4, 5]|
+---------+



In [52]:
# Take up to two elements of each array, starting at position 2 (positions are 1-based)
df.select(slice(df["x"], 2, 2).alias("sliced")).show()

+------+
|sliced|
+------+
|[2, 3]|
|   [5]|
+------+



In [54]:
# Another look at the first five rows of the full frame before filtering it
fifa.limit(5).toPandas()

Unnamed: 0,_c0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,...,96,33,28,26,6,11,15,14,8,€226.5M
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,...,95,28,31,23,7,11,15,14,11,€127.1M
2,2,190871,Neymar Jr,26,https://cdn.sofifa.org/players/4/19/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,93,Paris Saint-Germain,...,94,27,24,33,9,9,15,15,11,€228.1M
3,3,193080,De Gea,27,https://cdn.sofifa.org/players/4/19/193080.png,Spain,https://cdn.sofifa.org/flags/45.png,91,93,Manchester United,...,68,15,21,13,90,85,87,88,94,€138.6M
4,4,192985,K. De Bruyne,27,https://cdn.sofifa.org/players/4/19/192985.png,Belgium,https://cdn.sofifa.org/flags/7.png,91,92,Manchester City,...,88,68,58,51,15,13,5,10,13,€196.4M


In [53]:
# ## Filtering Data
# 
# A large part of working with DataFrames is the ability to quickly filter out data based on conditions. Spark DataFrames are built on top of the Spark SQL platform, which means that if you already know SQL, you can quickly and easily grab that data using SQL commands, or using the DataFrame methods (which is what we focus on in this course).


In [56]:
# Filter with a SQL expression string: players rated below 50 overall
fifa.where('Overall<50').limit(4).toPandas()

Unnamed: 0,_c0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,18118,243725,D. Collins,17,https://cdn.sofifa.org/players/4/19/243725.png,Republic of Ireland,https://cdn.sofifa.org/flags/25.png,49,62,Sligo Rovers,...,50,39,29,27,6,9,5,13,8,€109K
1,18119,240668,J. Egan,19,https://cdn.sofifa.org/players/4/19/240668.png,England,https://cdn.sofifa.org/flags/14.png,49,62,Carlisle United,...,42,23,25,27,12,9,10,8,12,€143K
2,18120,241443,Xie Xiaofan,20,https://cdn.sofifa.org/players/4/19/241443.png,China PR,https://cdn.sofifa.org/flags/155.png,49,61,Jiangsu Suning FC,...,42,53,40,42,13,12,7,12,12,€118K
3,18121,246051,B. Buckley,17,https://cdn.sofifa.org/players/4/19/246051.png,England,https://cdn.sofifa.org/flags/14.png,49,61,Grimsby Town,...,49,30,40,40,10,8,14,11,11,€98K


In [57]:
# SQL-string filter combined with .select()
(
    fifa.where("Overall>80")
    .select('ID', 'Name', 'Nationality', 'Overall')
    .limit(4)
    .toPandas()
)

Unnamed: 0,ID,Name,Nationality,Overall
0,158023,L. Messi,Argentina,94
1,20801,Cristiano Ronaldo,Portugal,94
2,190871,Neymar Jr,Brazil,92
3,193080,De Gea,Spain,91


In [58]:

# Players rated above 70, best first
(
    fifa.select('Nationality', 'Name', 'Age', 'Overall')
    .where("Overall>70")
    .orderBy(fifa.Overall.desc())
    .show()
)


+-----------+-----------------+---+-------+
|Nationality|             Name|Age|Overall|
+-----------+-----------------+---+-------+
|  Argentina|         L. Messi| 31|     94|
|   Portugal|Cristiano Ronaldo| 33|     94|
|     Brazil|        Neymar Jr| 26|     92|
|    Belgium|     K. De Bruyne| 27|     91|
|    Belgium|        E. Hazard| 27|     91|
|    Croatia|        L. Modrić| 32|     91|
|    Uruguay|        L. Suárez| 31|     91|
|      Spain|     Sergio Ramos| 32|     91|
|      Spain|           De Gea| 27|     91|
|   Slovenia|         J. Oblak| 25|     90|
|    Uruguay|         D. Godín| 32|     90|
|    Germany|         T. Kroos| 28|     90|
|     Poland|   R. Lewandowski| 29|     90|
|      Spain|      David Silva| 32|     90|
|     France|         N. Kanté| 27|     89|
|  Argentina|        P. Dybala| 24|     89|
|    England|          H. Kane| 24|     89|
|    Belgium|      T. Courtois| 26|     89|
|     France|     A. Griezmann| 27|     89|
|      Spain|  Sergio Busquets| 

In [61]:
# ### Collecting Results as Objects
# The last thing we need to cover is collecting results as objects. To work with individual values from an output (for example, to print a player's name), we need to pull the rows out of the DataFrame into Python objects, like this:

# Collecting results as Python objects
# you need the ".collect()" call at the end to "collect" the results

In [62]:
# collect() returns the matching rows as a Python list of Row objects
result = (
    fifa.select('Nationality', 'Name', 'Age', 'Overall')
    .where("Overall>70")
    .orderBy(fifa.Overall.desc())
    .collect()
)


In [64]:
# Each collected element is a pyspark Row object
type(result[0])

pyspark.sql.types.Row

In [65]:
# `result` was collected in descending Overall order, so result[0] is the top-rated row and
# result[-1] is the last row that still passed the "Overall>70" filter.
# Row fields are read by column name rather than by position, so these prints stay correct
# even if the column order of the select() that produced `result` changes.
print("Best Player Over 70: ", result[0]['Name'])
print("Nationality of Best Player Over 70: ", result[0]['Nationality'])
print("")
print("Worst Player Over 70: ", result[-1]['Name'])
print("Nationality of Worst Player Over 70: ", result[-1]['Nationality'])

Best Player Over 70:  L. Messi
Nationality of Best Player Over 70:  Argentina

Worst Player Over 70:  Zapater
Nationality of Worst Player Over 70:  Spain


In [72]:
# A Row can be converted to a plain dict keyed by column name when needed
result[0].asDict()

{'Nationality': 'Argentina', 'Name': 'L. Messi', 'Age': 31, 'Overall': 94}

In [77]:
# Build a list of dictionaries (one per collected Row) when plain Python objects are needed.
# A comprehension replaces the manual append loop and leaves no loop variable behind.
list1 = [row.asDict() for row in result]

In [None]:

# Check out this link for more info on other methods:
# http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark-sql-module