<a href="https://colab.research.google.com/github/vamsimalineni96/Fifa18_data_analytics/blob/main/FIFA18_data_analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans as kmeans
from sklearn.preprocessing import StandardScaler as ss
from sklearn.preprocessing import OneHotEncoder 


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
source_path=r"/content/drive/MyDrive/FIFA18_official_data.csv"
players=pd.read_csv(source_path)

## Displaying column names from the data frame

In [4]:
print(players.columns)

Index(['ID', 'Name', 'Age', 'Photo', 'Nationality', 'Flag', 'Overall',
       'Potential', 'Club', 'Club Logo', 'Value', 'Wage', 'Special',
       'Preferred Foot', 'International Reputation', 'Weak Foot',
       'Skill Moves', 'Work Rate', 'Body Type', 'Real Face', 'Position',
       'Jersey Number', 'Joined', 'Loaned From', 'Contract Valid Until',
       'Height', 'Weight', 'Crossing', 'Finishing', 'HeadingAccuracy',
       'ShortPassing', 'Volleys', 'Dribbling', 'Curve', 'FKAccuracy',
       'LongPassing', 'BallControl', 'Acceleration', 'SprintSpeed', 'Agility',
       'Reactions', 'Balance', 'ShotPower', 'Jumping', 'Stamina', 'Strength',
       'LongShots', 'Aggression', 'Interceptions', 'Positioning', 'Vision',
       'Penalties', 'Composure', 'Marking', 'StandingTackle', 'SlidingTackle',
       'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes',
       'Best Position', 'Best Overall Rating', 'Release Clause'],
      dtype='object')


## Dropping the irrelevant data from the players attributes

In [5]:
drop_list=['ID','Flag','Special','Real Face','Club Logo']

players.drop(drop_list,inplace=True,axis=1)
print(players.columns)

Index(['Name', 'Age', 'Photo', 'Nationality', 'Overall', 'Potential', 'Club',
       'Value', 'Wage', 'Preferred Foot', 'International Reputation',
       'Weak Foot', 'Skill Moves', 'Work Rate', 'Body Type', 'Position',
       'Jersey Number', 'Joined', 'Loaned From', 'Contract Valid Until',
       'Height', 'Weight', 'Crossing', 'Finishing', 'HeadingAccuracy',
       'ShortPassing', 'Volleys', 'Dribbling', 'Curve', 'FKAccuracy',
       'LongPassing', 'BallControl', 'Acceleration', 'SprintSpeed', 'Agility',
       'Reactions', 'Balance', 'ShotPower', 'Jumping', 'Stamina', 'Strength',
       'LongShots', 'Aggression', 'Interceptions', 'Positioning', 'Vision',
       'Penalties', 'Composure', 'Marking', 'StandingTackle', 'SlidingTackle',
       'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes',
       'Best Position', 'Best Overall Rating', 'Release Clause'],
      dtype='object')


In [6]:
def position_dataframe(players):
    '''
    Removing NANs from the dataframe, converting the scraped text for position
    into interpretable ones.
    '''
    # Dropping the rows with Nan's in position column from the main dataframe 
    players=players[players['Position'].notna()]
    print("Check if there are nans: ",players['Position'].isnull().values.any()) 
    
    # Removing unnecessary info from the positions column
    pos=players['Position']
    post=pos.tolist()
    a=[]
    for i in range(len(post)):
        ind=post[i].index('>')
        a.append(post[i][ind+1:])
    posit=pd.DataFrame(a,columns=['Position'])

    # Deleting position column from the main data frame
    del players['Position']
    
    # concating the refined position column in the main dataframe
    players=pd.concat([players, posit], axis=1)
    
    # Dropping the nans in Position column
    players=players[players['Position'].notna()]
    
    return players

In [7]:
def refining_dataframe(df):
    ''' 
    Removing NANs from the respective columns :
    'Preferred Foot' ,'Work Rate', 'Body Type'
    '''
    df1=df[df['Preferred Foot'].notna()]
    df2=df1[df1['Work Rate'].notna()]
    df3=df2[df2['Body Type'].notna()]
    
    return df3

In [8]:
'''
Modifying the positions column for the main dataframe.
'''
# Use players_refined for future operations

players_refined = position_dataframe(players)
print(players_refined.columns)

Check if there are nans:  False
Index(['Name', 'Age', 'Photo', 'Nationality', 'Overall', 'Potential', 'Club',
       'Value', 'Wage', 'Preferred Foot', 'International Reputation',
       'Weak Foot', 'Skill Moves', 'Work Rate', 'Body Type', 'Jersey Number',
       'Joined', 'Loaned From', 'Contract Valid Until', 'Height', 'Weight',
       'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys',
       'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing', 'BallControl',
       'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance',
       'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots',
       'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
       'Composure', 'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving',
       'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes',
       'Best Position', 'Best Overall Rating', 'Release Clause', 'Position'],
      dtype='object')


In [9]:
# Refining the dataframe further with relevant categorical datatypes
players_refined=refining_dataframe(players_refined)

In [47]:
# making a copy of refined players data frame
test=players_refined.copy()
# Dropping the Best Position column, as it is not revelant for analysis
test.drop(['Best Position'],inplace=True,axis=1)

In [48]:
def categorical_marking(df):
    '''
    This function takes the input as the column of the data frame 
    and returns the categorically marked column dataframe
    '''
    #Finding the list of unique features in the categorical column dataframe
    unq_list=df.unique()
    # Assigning integer markings to the unique features
    arange=[i for i in range(len(unq_list))]
    dicti= dict(zip(unq_list,arange))
    df_list=df.tolist()
    df_final=[]
    
    for i in range(len(df_list)):
        df_final.append(dicti[df_list[i]])
    return pd.DataFrame(df_final)

In [49]:
# These are the columns that must be included into the test dataframe, these are categorically marked
hot_body=categorical_marking(test['Body Type'])
hot_foot=categorical_marking(test['Preferred Foot'])
hot_work=categorical_marking(test['Work Rate'])
hot_posi=categorical_marking(test['Position'])
# Appending the above categorically marked columns into single dataframe and naming the columns
cat_cols=pd.concat([hot_body,hot_work,hot_foot,hot_posi],axis=1)
cat_cols.columns=['Body Type','Work Rate','Preferred Foot','Position']

In [50]:
# Excluding the players who's current position is either reserves or substitutes
res_sub=['RES','SUB']
test=test[~test.Position.isin(res_sub)]

In [51]:
# Extracting the dataframe containing only the numeric data from the main dataframe
test_numeric=test.select_dtypes([np.number])

In [53]:
# Concatenating the categorically marked columns to the numeric dataframe
test_numeric=pd.concat([test_numeric,cat_cols],axis=1)
print(test_numeric.head())
print("The number of players in test_numeric dataframe:",len(test_numeric))

    Age  Overall  Potential  International Reputation  Weak Foot  Skill Moves  \
0  30.0     92.0       92.0                       5.0        4.0          4.0   
1  29.0     86.0       86.0                       3.0        3.0          3.0   
2  26.0     82.0       83.0                       3.0        3.0          3.0   
3  25.0     85.0       87.0                       4.0        4.0          3.0   
4  24.0     88.0       92.0                       4.0        4.0          5.0   

   Jersey Number  Crossing  Finishing  HeadingAccuracy  ...  GKReflexes  \
0            9.0      77.0       94.0             77.0  ...        37.0   
1            4.0      73.0       76.0             59.0  ...        11.0   
2           24.0      77.0       76.0             70.0  ...        10.0   
3           27.0      81.0       63.0             75.0  ...         9.0   
4            6.0      80.0       73.0             75.0  ...         3.0   

   Best Overall Rating  Body Type  Work Rate  Preferred Foot  

In [54]:
# Checking if there are any NaNs in the test_numeric dataframe
print("Checking if there are any Nans in the dataframe:",test_numeric.isnull().values.any())
# Finding the columns which have NaNs in the test_numeric dataframe
nan_columns = test_numeric.columns[test_numeric.isna().any()].tolist()
print("The number of columns which have nans are :", len(nan_columns))
# 

Checking if there are any Nans in the dataframe: True
The number of columns which have nans are : 42


In [55]:
# this is indicating that all the columns have nans in the test_numeric dataframe, now drop the rows with nans
print("Dropping the rows which have NaN values")
test_numeric=test_numeric.dropna()
print("Checking if there are any Nans in the dataframe:",test_numeric.isnull().values.any())
print("The number of rows after removing Nans:",len(test_numeric))

Dropping the rows which have NaN values
Checking if there are any Nans in the dataframe: False
The number of rows after removing Nans: 7143


# Numeric data

In [56]:
# Printing the names of the columns in the test_numeric dataframe
print(test_numeric.columns)

Index(['Age', 'Overall', 'Potential', 'International Reputation', 'Weak Foot',
       'Skill Moves', 'Jersey Number', 'Crossing', 'Finishing',
       'HeadingAccuracy', 'ShortPassing', 'Volleys', 'Dribbling', 'Curve',
       'FKAccuracy', 'LongPassing', 'BallControl', 'Acceleration',
       'SprintSpeed', 'Agility', 'Reactions', 'Balance', 'ShotPower',
       'Jumping', 'Stamina', 'Strength', 'LongShots', 'Aggression',
       'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Composure',
       'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving', 'GKHandling',
       'GKKicking', 'GKPositioning', 'GKReflexes', 'Best Overall Rating',
       'Body Type', 'Work Rate', 'Preferred Foot', 'Position', 'Body Type',
       'Work Rate', 'Preferred Foot', 'Position'],
      dtype='object')


In [59]:
'''
Performing Kmeans clustering on the numeric data to find the clusters in the given data
'''
# standardising the numeric data available
scaler=ss()
scaler.fit(test_numeric)
test_scaled=scaler.transform(test_numeric)
print(test_scaled)

[[ 1.02818921  3.5505257   3.20782783 ... -1.49247437 -0.57373994
  -1.48239702]
 [ 0.80882375  2.66001129  2.23523341 ... -0.91415472 -0.57373994
  -1.32528674]
 [ 0.15072738  2.06633502  1.74893619 ... -1.49247437 -0.57373994
  -1.16817645]
 ...
 [-1.82356174 -2.23781795 -0.19625266 ...  0.24248458 -0.57373994
  -0.53973531]
 [-1.60419628 -1.79256075 -1.00674802 ...  0.24248458 -0.57373994
   0.40292641]
 [-1.82356174 -2.38623702 -0.19625266 ...  0.24248458 -0.57373994
  -0.53973531]]


## Creating a database for Real Madrid for analysis

In [None]:
# Finding the number of clubs in the database
club_list=(pd.unique(players['Club']))
print("Number of clubs in the database:",len(club_list))
club='Club'
club_count=players.groupby(club).size()
print("\nThe number of players in each club in the database:")
print(club_count)

Number of clubs in the database: 634

The number of players in each club in the database:
Club
1. FC Heidenheim    25
1. FC Köln          26
1. FC Nürnberg      28
1. FSV Mainz 05     32
1860 München        28
                    ..
Águilas Doradas     28
Çaykur Rizespor     30
Örebro SK           23
Östersunds FK       23
Śląsk Wrocław       23
Length: 634, dtype: int64


In [None]:
print(club_list)

['Real Madrid' 'FC Barcelona' 'FC Bayern' 'Manchester Utd' 'Chelsea'
 'Arsenal' 'Juventus' 'PSG' 'Manchester City' 'Atlético Madrid'
 'Bor. Dortmund' 'Spurs' 'Inter' 'Bayer 04' 'Liverpool' 'Roma'
 'Olym. Lyonnais' 'Everton' 'Olym. Marseille' 'Napoli' 'AS Saint-Étienne'
 'Athletic Bilbao' 'Leicester City' 'AS Monaco' 'Villarreal CF'
 'FC Schalke 04' 'Crystal Palace' 'Real Sociedad' 'Milan' 'Sporting CP'
 'FC Porto' '1. FC Köln' 'Sevilla FC' 'Genoa' 'Celta Vigo' 'Free Agents'
 'Toronto FC' "Bor. M'gladbach" 'SL Benfica' 'West Ham' 'Valencia CF'
 'Fiorentina' 'Torino' 'Galatasaray' 'Beşiktaş' 'UD Las Palmas'
 'Southampton' 'Stoke City' 'Sassuolo' 'Lazio' 'VfL Wolfsburg'
 'Bournemouth' 'Shakhtar Donetsk' 'Zenit' 'Swansea City' 'Udinese'
 'Lokomotiv Moscow' 'Sampdoria' 'Stade Rennais' 'RCD Espanyol' 'Atalanta'
 'LOSC Lille' 'Orlando City' 'NYCFC' 'Middlesbrough' '1899 Hoffenheim'
 'Spartak Moscow' 'Watford' 'Eint. Frankfurt' 'West Brom' 'Málaga CF'
 'CSKA Moscow' 'OGC Nice' 'Fenerbahçe' 'Re

## Analysing players of the club

In [None]:
def return_club_data(players,club_name):
    # extracting the details of the players for the respective club
    main_clubs=[club_name]
    club=players[players['Club'].isin(main_clubs)]
    
    # Creating different dataframes for subsitutes, reserve players and starting 
    subs=['Sub']
    res =['Res']
    start=subs+res
    drop_list=['Club_Position']
    
    starting     =club[~club['Club_Position'].isin(start)]
    
    substitutes  =club[club['Club_Position'].isin(subs)]
    substitutes.drop(drop_list,inplace=True,axis=1)
    
    reserves     =club[club['Club_Position'].isin(res)]
    reserves.drop(drop_list,inplace=True,axis=1)
    
    details=[starting, substitutes,reserves]
    return details

In [None]:
real_madrid=return_club_data(players,'Real Madrid')
real_starting    =real_madrid[0]
real_substitutes =real_madrid[1]
real_reserves    =real_madrid[2]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
features=['Preffered_Position','Rating']