<a href="https://colab.research.google.com/github/vamsimalineni96/Fifa18_data_analytics/blob/main/FIFA18_data_analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans as kmeans
from sklearn.preprocessing import StandardScaler as ss
from sklearn.preprocessing import OneHotEncoder 


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
source_path=r"/content/drive/MyDrive/FIFA18_official_data.csv"
players=pd.read_csv(source_path)

## Displaying column names from the data frame

In [None]:
print(players.columns)

Index(['ID', 'Name', 'Age', 'Photo', 'Nationality', 'Flag', 'Overall',
       'Potential', 'Club', 'Club Logo', 'Value', 'Wage', 'Special',
       'Preferred Foot', 'International Reputation', 'Weak Foot',
       'Skill Moves', 'Work Rate', 'Body Type', 'Real Face', 'Position',
       'Jersey Number', 'Joined', 'Loaned From', 'Contract Valid Until',
       'Height', 'Weight', 'Crossing', 'Finishing', 'HeadingAccuracy',
       'ShortPassing', 'Volleys', 'Dribbling', 'Curve', 'FKAccuracy',
       'LongPassing', 'BallControl', 'Acceleration', 'SprintSpeed', 'Agility',
       'Reactions', 'Balance', 'ShotPower', 'Jumping', 'Stamina', 'Strength',
       'LongShots', 'Aggression', 'Interceptions', 'Positioning', 'Vision',
       'Penalties', 'Composure', 'Marking', 'StandingTackle', 'SlidingTackle',
       'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes',
       'Best Position', 'Best Overall Rating', 'Release Clause'],
      dtype='object')


## Dropping the irrelevant data from the players attributes

In [None]:
drop_list=['ID','Flag','Special','Real Face','Club Logo']

players.drop(drop_list,inplace=True,axis=1)
print(players.columns)

Index(['Name', 'Age', 'Photo', 'Nationality', 'Overall', 'Potential', 'Club',
       'Value', 'Wage', 'Preferred Foot', 'International Reputation',
       'Weak Foot', 'Skill Moves', 'Work Rate', 'Body Type', 'Position',
       'Jersey Number', 'Joined', 'Loaned From', 'Contract Valid Until',
       'Height', 'Weight', 'Crossing', 'Finishing', 'HeadingAccuracy',
       'ShortPassing', 'Volleys', 'Dribbling', 'Curve', 'FKAccuracy',
       'LongPassing', 'BallControl', 'Acceleration', 'SprintSpeed', 'Agility',
       'Reactions', 'Balance', 'ShotPower', 'Jumping', 'Stamina', 'Strength',
       'LongShots', 'Aggression', 'Interceptions', 'Positioning', 'Vision',
       'Penalties', 'Composure', 'Marking', 'StandingTackle', 'SlidingTackle',
       'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes',
       'Best Position', 'Best Overall Rating', 'Release Clause'],
      dtype='object')


In [None]:
def position_dataframe(players):
    '''
    Removing NANs from the dataframe, converting the scraped text for position
    into interpretable ones.
    '''
    # Dropping the rows with Nan's in position column from the main dataframe 
    players=players[players['Position'].notna()]
    print("Check if there are nans: ",players['Position'].isnull().values.any()) 
    
    # Removing unnecessary info from the positions column
    pos=players['Position']
    post=pos.tolist()
    a=[]
    for i in range(len(post)):
        ind=post[i].index('>')
        a.append(post[i][ind+1:])
    posit=pd.DataFrame(a,columns=['Position'])

    # Deleting position column from the main data frame
    del players['Position']
    
    # concating the refined position column in the main dataframe
    players=pd.concat([players, posit], axis=1)
    
    # Dropping the nans in Position column
    players=players[players['Position'].notna()]
    
    return players

In [None]:
def refining_dataframe(df):
    ''' 
    Removing NANs from the respective columns :
    'Preferred Foot' ,'Work Rate', 'Body Type'
    '''
    df1=df[df['Preferred Foot'].notna()]
    df2=df1[df1['Work Rate'].notna()]
    df3=df2[df2['Body Type'].notna()]
    
    return df3

In [None]:
'''
Modifying the positions column for the main dataframe.
'''
# Use players_refined for future operations

players_refined = position_dataframe(players)
print(players_refined.columns)

Check if there are nans:  False
Index(['Name', 'Age', 'Photo', 'Nationality', 'Overall', 'Potential', 'Club',
       'Value', 'Wage', 'Preferred Foot', 'International Reputation',
       'Weak Foot', 'Skill Moves', 'Work Rate', 'Body Type', 'Jersey Number',
       'Joined', 'Loaned From', 'Contract Valid Until', 'Height', 'Weight',
       'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys',
       'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing', 'BallControl',
       'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance',
       'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots',
       'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
       'Composure', 'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving',
       'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes',
       'Best Position', 'Best Overall Rating', 'Release Clause', 'Position'],
      dtype='object')


In [None]:
# Refining the dataframe further with relevant categorical datatypes

players_refined=refining_dataframe(players_refined)

In [None]:
# Finding the list of unique features in the categorical datatypes
pref_foot =players_refined['Preferred Foot'].unique()
work_rate =players_refined['Work Rate'].unique()
body_type =players_refined['Body Type'].unique()
position  =players_refined['Position'].unique()

# making a copy of refined players data frame
test=players_refined.copy()

In [None]:
print(test)

                 Name   Age                                             Photo  \
0           L. Suárez  30.0  https://cdn.sofifa.com/players/176/580/18_60.png   
1       R. Nainggolan  29.0  https://cdn.sofifa.com/players/178/518/18_60.png   
2         A. Florenzi  26.0  https://cdn.sofifa.com/players/203/551/18_60.png   
3            D. Alaba  25.0  https://cdn.sofifa.com/players/197/445/18_60.png   
4            P. Pogba  24.0  https://cdn.sofifa.com/players/195/864/18_60.png   
...               ...   ...                                               ...   
17903         C. Heim  16.0  https://cdn.sofifa.com/players/242/765/18_60.png   
17904     B. Gradecki  17.0  https://cdn.sofifa.com/players/242/145/18_60.png   
17905    Jesús Godino  21.0  https://cdn.sofifa.com/players/243/033/18_60.png   
17906       M. Sutton  17.0  https://cdn.sofifa.com/players/242/086/18_60.png   
17907     Y. Arboleda  18.0  https://cdn.sofifa.com/players/241/604/18_60.png   

       Nationality  Overall

In [None]:
def categorical_marking(df):
    '''
    This function takes the input as the column of the data frame 
    and returns the categorically marked column dataframe
    '''
    unq_list=df.unique()
    arange=[i for i in range(len(unq_list))]
    dicti= dict(zip(unq_list,arange))
    df_list=df.tolist()
    df_final=[]
    for i in range(len(df_list)):
        df_final.append(dicti[df_list[i]])
    return pd.DataFrame(df_final)

In [None]:
hot_body=categorical_marking(test['Body Type'])
hot_foot=categorical_marking(test['Preferred Foot'])
hot_work=categorical_marking(test['Work Rate'])
hot_posi=categorical_marking(test['Position'])

In [None]:
# one hot encoding preferred foot in the test dataframe
# this method is a brute force method, youll have to manually type all the
# cases not good!
# you can use a dictionary and check with the elements of dictionary.

bod=[i for i in range(len(body_type))]
body_dict=(dict(zip(body_type,bod)))
body_df=test['Body Type'].tolist()
final_body=[]
for i in range(len(body_df)):
    final_body.append(body_dict[body_df[i]])

print(body_dict)
print(body_df[150])
print(final_body[150])

{'Normal (170-185)': 0, 'Stocky (170-185)': 1, 'Lean (170-185)': 2, 'Lean (185+)': 3, 'Normal (170-)': 4, 'Normal (185+)': 5, 'C. Ronaldo': 6, 'Lean (170-)': 7, 'Messi': 8, 'Shaqiri': 9, 'Neymar': 10, 'Normal (190+)': 11, 'Stocky (170-)': 12, 'Stocky (185+)': 13, 'Akinfenwa': 14, 'Courtois': 15}
Stocky (170-185)
1


In [None]:
# Excluding the players who's current position is either reserves or substitutes
res_sub=['RES','SUB']
test=test[~test.Position.isin(res_sub)]

In [None]:
test.drop(['Best Position'],inplace=True,axis=1)

In [None]:
print(test.columns)

Index(['Name', 'Age', 'Photo', 'Nationality', 'Overall', 'Potential', 'Club',
       'Value', 'Wage', 'Preferred Foot', 'International Reputation',
       'Weak Foot', 'Skill Moves', 'Work Rate', 'Body Type', 'Jersey Number',
       'Joined', 'Loaned From', 'Contract Valid Until', 'Height', 'Weight',
       'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys',
       'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing', 'BallControl',
       'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance',
       'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots',
       'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
       'Composure', 'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving',
       'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes',
       'Best Overall Rating', 'Release Clause', 'Position'],
      dtype='object')


In [None]:
'''
Finding out the relevant categorical datatypes and encoding them
''' 

players_cat = players_refined.select_dtypes(exclude=['float64','int64'])
print(players_cat.head())


'''
The relevant categorical datatypes are:
Preferred Foot, Work Rate, Body Type, Position 
These are to be one hot encoded.

''' 

             Name                                             Photo  \
0       L. Suárez  https://cdn.sofifa.com/players/176/580/18_60.png   
1   R. Nainggolan  https://cdn.sofifa.com/players/178/518/18_60.png   
2     A. Florenzi  https://cdn.sofifa.com/players/203/551/18_60.png   
3        D. Alaba  https://cdn.sofifa.com/players/197/445/18_60.png   
4        P. Pogba  https://cdn.sofifa.com/players/195/864/18_60.png   

  Nationality               Club   Value   Wage Preferred Foot     Work Rate  \
0     Uruguay       FC Barcelona    €97M  €500K          Right  High/ Medium   
1     Belgium               Roma  €42.5M  €130K          Right    High/ High   
2       Italy               Roma    €21M   €70K          Right  High/ Medium   
3     Austria  FC Bayern München  €38.5M  €130K           Left  High/ Medium   
4      France  Manchester United    €76M  €250K          Right     High/ Low   

          Body Type        Joined Loaned From Contract Valid Until Height  \
0  Normal (170-

'\nThe relevant categorical datatypes are:\nPreferred Foot, Work Rate, Body Type, Position \nThese are to be one hot encoded.\n\n'

# Numeric data

In [None]:
'''
Creating a dataframe exclusively for numeric data 
'''
# Drop 

# Dropping the alphabetic attributes
players_numeric=players_refined.select_dtypes([np.number])

# Drop the irrelevant attributes from the players for clustering
irrelevant_data=['Potential','International Reputation','Jersey Number',
                'Best Overall Rating']
players_numeric.drop(irrelevant_data,inplace=True,axis=1)

print(players_numeric)

        Age  Overall  Weak Foot  Skill Moves  Crossing  Finishing  \
0      30.0     92.0        4.0          4.0      77.0       94.0   
1      29.0     86.0        3.0          3.0      73.0       76.0   
2      26.0     82.0        3.0          3.0      77.0       76.0   
3      25.0     85.0        4.0          3.0      81.0       63.0   
4      24.0     88.0        4.0          5.0      80.0       73.0   
...     ...      ...        ...          ...       ...        ...   
17903  16.0     48.0        2.0          1.0      13.0        6.0   
17904  17.0     52.0        2.0          1.0      10.0        8.0   
17905  21.0     53.0        3.0          1.0      11.0       10.0   
17906  17.0     50.0        2.0          1.0      14.0        7.0   
17907  18.0     52.0        2.0          1.0      14.0        6.0   

       HeadingAccuracy  ShortPassing  Volleys  Dribbling  ...  Penalties  \
0                 77.0          83.0     88.0       86.0  ...       85.0   
1                 5

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
# Scanning the data frame for nan values
check_nan=players_numeric.isnull().values.any()

print(check_nan)

False


## Creating a database for Real Madrid for analysis

In [None]:
# Finding the number of clubs in the database
club_list=(pd.unique(players['Club']))
print("Number of clubs in the database:",len(club_list))
club='Club'
club_count=players.groupby(club).size()
print("\nThe number of players in each club in the database:")
print(club_count)

Number of clubs in the database: 634

The number of players in each club in the database:
Club
1. FC Heidenheim    25
1. FC Köln          26
1. FC Nürnberg      28
1. FSV Mainz 05     32
1860 München        28
                    ..
Águilas Doradas     28
Çaykur Rizespor     30
Örebro SK           23
Östersunds FK       23
Śląsk Wrocław       23
Length: 634, dtype: int64


In [None]:
print(club_list)

['Real Madrid' 'FC Barcelona' 'FC Bayern' 'Manchester Utd' 'Chelsea'
 'Arsenal' 'Juventus' 'PSG' 'Manchester City' 'Atlético Madrid'
 'Bor. Dortmund' 'Spurs' 'Inter' 'Bayer 04' 'Liverpool' 'Roma'
 'Olym. Lyonnais' 'Everton' 'Olym. Marseille' 'Napoli' 'AS Saint-Étienne'
 'Athletic Bilbao' 'Leicester City' 'AS Monaco' 'Villarreal CF'
 'FC Schalke 04' 'Crystal Palace' 'Real Sociedad' 'Milan' 'Sporting CP'
 'FC Porto' '1. FC Köln' 'Sevilla FC' 'Genoa' 'Celta Vigo' 'Free Agents'
 'Toronto FC' "Bor. M'gladbach" 'SL Benfica' 'West Ham' 'Valencia CF'
 'Fiorentina' 'Torino' 'Galatasaray' 'Beşiktaş' 'UD Las Palmas'
 'Southampton' 'Stoke City' 'Sassuolo' 'Lazio' 'VfL Wolfsburg'
 'Bournemouth' 'Shakhtar Donetsk' 'Zenit' 'Swansea City' 'Udinese'
 'Lokomotiv Moscow' 'Sampdoria' 'Stade Rennais' 'RCD Espanyol' 'Atalanta'
 'LOSC Lille' 'Orlando City' 'NYCFC' 'Middlesbrough' '1899 Hoffenheim'
 'Spartak Moscow' 'Watford' 'Eint. Frankfurt' 'West Brom' 'Málaga CF'
 'CSKA Moscow' 'OGC Nice' 'Fenerbahçe' 'Re

## Analysing players of the club

In [None]:
def return_club_data(players,club_name):
    # extracting the details of the players for the respective club
    main_clubs=[club_name]
    club=players[players['Club'].isin(main_clubs)]
    
    # Creating different dataframes for subsitutes, reserve players and starting 
    subs=['Sub']
    res =['Res']
    start=subs+res
    drop_list=['Club_Position']
    
    starting     =club[~club['Club_Position'].isin(start)]
    
    substitutes  =club[club['Club_Position'].isin(subs)]
    substitutes.drop(drop_list,inplace=True,axis=1)
    
    reserves     =club[club['Club_Position'].isin(res)]
    reserves.drop(drop_list,inplace=True,axis=1)
    
    details=[starting, substitutes,reserves]
    return details

In [None]:
real_madrid=return_club_data(players,'Real Madrid')
real_starting    =real_madrid[0]
real_substitutes =real_madrid[1]
real_reserves    =real_madrid[2]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
features=['Preffered_Position','Rating']