## Search and Filter DataFrames 

In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName("Data Frame with filter").getOrCreate()
spark

In [3]:
path = "../../data/fifa19.csv"
dataset = spark.read.csv(path,header = True, inferSchema = True)

### About this dataframe

**The fifa19.csv dataset includes a list of all the FIFA 2019 players and their attributes listed below:**
<ul>
<li>General: Age, Nationality, Overall, Potential, Club</li>
<li>**Metrics:** Value, Wage</li>
<li>**Player Descriptive:** Preferred Foot, International Reputation, Weak Foot, Skill Moves, Work Rate, Position, Jersey Number, Joined, Loaned From, Contract Valid Until, Height, Weight</li>
<li> **Possition:** LS, ST, RS, LW, LF, CF, RF, RW, LAM, CAM, RAM, LM, LCM, CM, RCM, RM, LWB, LDM, CDM, RDM, RWB, LB, LCB, CB, RCB, RB, </li>
<li> **Other:** Crossing, Finishing, Heading, Accuracy, ShortPassing, Volleys, Dribbling, Curve, FKAccuracy, LongPassing, BallControl, Acceleration, SprintSpeed, Agility, Reactions, Balance, ShotPower, Jumping, Stamina, Strength, LongShots, Aggression, Interceptions, Positioning, Vision, Penalties, Composure, Marking, StandingTackle, SlidingTackle, GKDiving, GKHandling, GKKicking, GKPositioning, GKReflexes, and Release Clause.</li>
<\ul>
**Source:** <a href="https://www.kaggle.com/karangadiya/fifa19">Fifa</a>

In [4]:
dataset.limit(3).toPandas()

Unnamed: 0,_c0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,...,96,33,28,26,6,11,15,14,8,€226.5M
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,...,95,28,31,23,7,11,15,14,11,€127.1M
2,2,190871,Neymar Jr,26,https://cdn.sofifa.org/players/4/19/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,93,Paris Saint-Germain,...,94,27,24,33,9,9,15,15,11,€228.1M


In [6]:
#dataset.printSchema()

In [7]:
from pyspark.sql.functions import *

In [8]:

dataset.limit(3).toPandas()

Unnamed: 0,_c0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,...,96,33,28,26,6,11,15,14,8,€226.5M
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,...,95,28,31,23,7,11,15,14,11,€127.1M
2,2,190871,Neymar Jr,26,https://cdn.sofifa.org/players/4/19/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,93,Paris Saint-Germain,...,94,27,24,33,9,9,15,15,11,€228.1M


In [9]:
#Select the Name and Position of each player in the dataframe
dataset.select(['Name','Position']).show(4,False)

+-----------------+--------+
|Name             |Position|
+-----------------+--------+
|L. Messi         |RF      |
|Cristiano Ronaldo|ST      |
|Neymar Jr        |LW      |
|De Gea           |GK      |
+-----------------+--------+
only showing top 4 rows



In [10]:
#Display the same results from above sorted by the players names
dataset.select(['Name','Position']).orderBy(dataset['Name'].desc()).show(4,False)


+--------------+--------+
|Name          |Position|
+--------------+--------+
|Óscar Whalley |GK      |
|Óscar Valentín|CDM     |
|Óscar Plano   |LM      |
|Óscar Pinchi  |LM      |
+--------------+--------+
only showing top 4 rows



In [11]:
#Select only the players who belong to a club begining with FC
dataset.select(['Name','Position','Club']).where(dataset.Club.startswith("FC")).show(3)


+--------------+--------+-----------------+
|          Name|Position|             Club|
+--------------+--------+-----------------+
|      L. Messi|      RF|     FC Barcelona|
|     L. Suárez|      RS|     FC Barcelona|
|R. Lewandowski|      ST|FC Bayern München|
+--------------+--------+-----------------+
only showing top 3 rows



In [12]:
# Who is the oldest player in the dataset and how old are they?
dataset.select(['Name','Age']).orderBy(dataset.Age.desc()).limit(3).toPandas()

Unnamed: 0,Name,Age
0,O. Pérez,45
1,T. Warner,44
2,K. Pilkington,44


In [13]:
#Select only the following players from the dataframe:
#L. Messi
#Cristiano Ronaldo

dataset.select("Name","Age").where(dataset.Name.like("%Cristiano Ronaldo%")).show(3, False)

+-----------------+---+
|Name             |Age|
+-----------------+---+
|Cristiano Ronaldo|33 |
+-----------------+---+



In [59]:
# Can you select the first character from the Release Clause variable which indicates the currency used?
dataset.select("Release Clause",dataset['Release Clause'].substr(0,1)).show(3)


+--------------+-------------------------------+
|Release Clause|substring(Release Clause, 0, 1)|
+--------------+-------------------------------+
|       €226.5M|                              €|
|       €127.1M|                              €|
|       €228.1M|                              €|
+--------------+-------------------------------+
only showing top 3 rows



In [14]:
# Can you select only the players who are over the age of 40?
dataset.select(['Age']).filter('Age >40').limit(3).toPandas()

Unnamed: 0,Age
0,41
1,41
2,45


In [None]:
#==========================End Work Great==========================