# La Liga

### Importing the necessaries dependencies

In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
pd.options.display.max_columns = 80
pd.set_option("display.max_rows", 600)
import missingno as mnso

### Loading the dataset

datase provided by : Álvaro Bartolomé
github acount : alvarobartt

In [73]:
filename = 'https://raw.githubusercontent.com/alvarobartt/laliga-dataset/master/dataset/laliga_player_stats_english.csv'

In [74]:
df = pd.read_csv(filename, index_col = None)

### Checking for Missing Values

In [75]:
mask = df.isnull()
total= mask.sum()
percent = 100*mask.mean()
missing_data = pd.concat([total, percent], axis= 1, join = 'outer', keys=['count_missing', 'perc_missing'])
missing_data.sort_values(by= 'perc_missing', ascending= False, inplace = True)
missing_data.head()

Unnamed: 0,count_missing,perc_missing
Shirt number,30,5.395683
Team,0,0.0
Goals with right foot,0,0.0
Penalties won,0,0.0
Penalties given away,0,0.0


In [76]:
### Plotting the Nullity Matrix to visualize the absence of the 30 values in shirt number

In [77]:
nullable_columns = df.columns[mask.any()].tolist()

In [78]:
#fig = msno.matrix(df[nullable_columns].sample(500))

In [79]:
#fig_copy = fig.get_figure() 

#fig_copy.savefig('./Images/nullitymatrix.png')

### Checking data types

In [80]:
df.dtypes

Team                                      object
Position                                  object
Shirt number                             float64
Name                                      object
Minutes played                           float64
Games played                               int64
Percentage of games played                object
Full games played                          int64
Percentage of full games played           object
Games started                              int64
Percentage of games started               object
Games where substituted                    int64
Percentage of games where substituted     object
Yellow Cards                               int64
Red Cards                                  int64
Second Yellows                             int64
Goals scored                               int64
Penalties scored                           int64
Own goals                                  int64
Goals conceded while player on pitch       int64
Tackles             

In [81]:
### Filling missing values with '0'
df['Shirt number']= df['Shirt number'].fillna(value = 0)



In [82]:
# Most data types should be integer, not float 

In [83]:
for col in df[[col for col in df.columns if df[col].dtypes == 'float64']]:
    df[col] = df[col].astype('int64')

In [90]:
df

Unnamed: 0,Team,Position,Shirt number,Name,Minutes played,Games played,Percentage of games played,Full games played,Percentage of full games played,Games started,Percentage of games started,Games where substituted,Percentage of games where substituted,Yellow Cards,Red Cards,Second Yellows,Goals scored,Penalties scored,Own goals,Goals conceded while player on pitch,Tackles,Interceptions,Recoveries,Clearances,Successful tackles,Unssuccessful tackles,Last man,Successful duels,Duels lost,Successful aerial challenges,Unsuccessful aerial challenges,Offsides,Fouls suffered,Fouls committed,Penalties won,Penalties given away,Handballs committed,Fouls committed per card,Shots,Shots on target,Assists,Successful dribbles,Unsuccessful dribbles,Goals scored.1,From inside the area,From outside the area,Goals with left foot,Goals with right foot,Penalties scored.1,Goals scored with header,Goals from set piece,Crosses,Corners,Tackles.1,Duels,Man-to-man duels,Aerial duels,Passes,Short passes,Long passes,Through balls,Goals scored per attempt,Float_Game_Played,Float_Game_Started,Float_Game_Substituted
0,Athletic Club,Goalkeeper,0,Hodei Oleaga,0,0,"0,0%",0,"0,0%",0,"0,0%",0,"0,0%",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0
1,Athletic Club,Goalkeeper,1,A. Remiro,0,0,"0,0%",0,"0,0%",0,"0,0%",0,"0,0%",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0
2,Athletic Club,Goalkeeper,13,Herrerín,2,31,"82,0%",31,"82,0%",31,"82,0%",0,"0,0%",1,0,0,0,0,0,32,0,0,228,27,0,0,0,3,3,18,1,0,4,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,25,6,19,887,128,759,1,0,0.82,0.82,0.0
3,Athletic Club,Goalkeeper,25,Unai Simón,630,7,"18,0%",7,"18,0%",7,"18,0%",0,"0,0%",2,0,0,0,0,0,13,0,1,54,3,0,0,0,0,2,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,2,1,155,49,106,0,0,0.18,0.18,0.0
4,Athletic Club,Defender,3,Núñez,1,12,"32,0%",11,"29,0%",11,"29,0%",1,"3,0%",4,0,0,0,0,0,15,12,28,50,57,11,4,0,19,19,43,26,0,2,16,0,0,1,7,9,3,0,2,0,0,0,0,0,0,0,0,0,1,0,15,107,38,69,536,457,78,1,0,0.29,0.29,0.03
5,Athletic Club,Defender,4,I. Martínez,2,33,"87,0%",32,"84,0%",33,"87,0%",1,"3,0%",12,0,0,0,0,0,40,14,68,182,143,26,27,0,108,54,94,73,0,55,38,2,3,1,21,21,4,0,4,1,0,0,0,0,0,0,0,0,15,0,53,329,162,167,1,1,361,5,0,0.84,0.87,0.03
6,Athletic Club,Defender,5,Yeray,2,30,"79,0%",28,"74,0%",30,"79,0%",2,"5,0%",10,0,0,0,0,0,30,16,63,138,144,30,25,2,76,50,101,69,0,24,31,0,0,0,21,7,3,0,5,1,0,0,0,0,0,0,0,0,4,0,55,296,126,170,1,1,233,0,0,0.74,0.79,0.05
7,Athletic Club,Defender,6,San José,1,33,"87,0%",15,"39,0%",16,"42,0%",18,"47,0%",7,0,0,0,0,0,23,6,21,99,34,25,17,0,52,89,66,36,2,8,34,0,0,0,16,11,3,0,5,7,0,0,0,0,0,0,0,0,11,0,42,243,141,102,707,637,64,6,0,0.39,0.42,0.47
8,Athletic Club,Defender,12,Yuri B.,3,35,"92,0%",33,"87,0%",35,"92,0%",2,"5,0%",9,1,0,2,0,0,40,9,43,270,71,32,15,0,154,100,78,41,3,78,38,0,0,0,12,22,6,2,32,24,2,0,1,1,0,0,0,0,100,2,48,373,254,119,1,1,92,2,1,0.87,0.92,0.05
9,Athletic Club,Defender,15,I. Lekue,210,4,"11,0%",2,"5,0%",2,"5,0%",2,"5,0%",1,0,0,0,0,0,4,0,4,11,10,2,3,0,10,10,5,4,0,3,1,0,0,0,1,0,0,0,3,1,0,0,0,0,0,0,0,0,10,2,5,29,20,9,85,75,10,0,0,0.05,0.05,0.05


In [85]:
#Convert Shirt number to category 
df['Shirt number'] = df['Shirt number'] .astype('category')

In [86]:
#Converting Percentage to float

In [91]:
def p2f(x):
    return float(x.split(',')[0])/100

df['Float_Game_Played'] = df['Percentage of games played'].apply(p2f)

df['Float_Full_ Game_Played'] = df['Percentage of full games played'].apply(p2f)

df['Float_Game_Started'] = df['Percentage of games started'].apply(p2f)

df['Float_Game_Substituted'] = df['Percentage of games where substituted'].apply(p2f)

In [92]:
df['Float_Game_Played'].dtypes

dtype('float64')

In [93]:
df.to_csv('laliga_prepared.csv', index = False)