In [1]:
import pandas as pd

## Reading data

In [2]:
year = "2021" # change this accordingly!

# Prefixes for this season's raw input files and processed output files.
importPath = "../data/uefa-data/raw/" + year + "_"
exportPath = "../data/uefa-data/processed/" + year + "_"

# The 2020 market values are re-exported on every run, independent of `year`,
# with the "transm id" and "Player" columns removed.
mv2020 = (
    pd.read_csv("../data/uefa-data/raw/2020_" + "mv.csv")
    .drop(columns=["transm id", "Player"])
)
mv2020.to_csv("../data/uefa-data/processed/" + "2020_mv.csv", index=False)

mv = pd.read_csv(importPath + "mv.csv")

# Every stats export carries a two-row header (rows 0 and 1); row 2 of the
# file is skipped while reading.
defense, goalkeeping, passing, shooting = (
    pd.read_csv(importPath + table + ".csv", header=[0, 1], skiprows=[2])
    for table in ("defense", "goalkeeping", "passing", "shooting")
)

def clean_header(data: list):
    """Flatten two-level column labels into single strings.

    `data` is an iterable of (top, bottom) label pairs, such as the columns
    of a frame read with a two-row header. When the top label starts with
    'Unnamed: ' only the bottom label is kept; otherwise both labels are
    joined with a single space.

    Returns a list with one flattened label per input pair, in input order.
    """
    return [
        pair[1] if pair[0].startswith('Unnamed: ') else ' '.join(pair)
        for pair in data
    ]

# Replace the two-level column labels of every stats table with flat names.
for statsTable in (defense, goalkeeping, passing, shooting):
    statsTable.columns = clean_header(statsTable.columns)



## Creating player stats file

In this step, we combine the basic player information that is common to all data files with the market values (`mv`) into one file.

The predictors 'Nation', 'Squad' and 'Born' are removed as they are not very helpful in predicting market values.

In [3]:
# Basic player info = the first seven columns of the passing table.
# .copy() makes playerStats an independent frame, so the column assignments
# below do not write into a slice of `passing` (SettingWithCopyWarning).
playerStats = passing.iloc[:, 0:7].copy()
print(playerStats)
# players will be identified by their fbref id (the part after the backslash)
playerStats["Player"] = playerStats["Player"].apply(lambda x: x.split("\\")[1])
playerStats["mv"] = mv["mv"]

# quick sanity check because everything should be in order:
# report every row whose id in mv differs from the id in playerStats.
# Series.ne aligns on the index, so rows present in only one of the two
# frames are reported as mismatches instead of raising a KeyError.
idMismatch = mv["fbref id"].ne(playerStats["Player"])
for i in idMismatch[idMismatch].index:
    print(str(i) + ": id mismatch")

# separate positions into one 0/1 indicator column per position
validPos = ["DF", "FW", "GK", "MF"]
isPos = {key: [0] * len(playerStats) for key in validPos}

for i, pos in enumerate(playerStats["Pos"]):
    # Pos is a concatenation of two-letter codes (e.g. "DFMF");
    # a code outside validPos raises a KeyError here
    for j in range(0, len(pos), 2):
        isPos[pos[j:j + 2]][i] = 1

# build the indicator frame on playerStats' own index so the column-wise
# concat lines rows up by label rather than by an implicit 0..n-1 index
playerStats = pd.concat(
    [playerStats.drop(columns = "Pos"), pd.DataFrame(isPos, index = playerStats.index)],
    axis = 1,
)

# remove unnecessary predictors
playerStats = playerStats.drop(columns = ["Nation", "Squad", "Born"])

playerStats.to_csv(exportPath + "player_stats.csv", index = False)
# kept as the last expression of the cell so the preview is actually rendered
playerStats.head()


                        Player   Nation   Pos             Squad  Age  Born  \
0     Pape Abou Cisse\8bc373ea   sn SEN    DF     gr Olympiacos   24  1995   
1       Tammy Abraham\f586779e  eng ENG    FW       eng Chelsea   22  1997   
2    Francesco Acerbi\b96b595c   it ITA    DF          it Lazio   32  1988   
3        Marcos Acuña\81442ecb   ar ARG    DF        es Sevilla   28  1991   
4         Tyler Adams\2b09d998   us USA  DFMF     de RB Leipzig   21  1999   
..                         ...      ...   ...               ...  ...   ...   
731    Joshua Zirkzee\028e70b9   nl NED    MF  de Bayern Munich   19  2001   
732      Hakim Ziyech\6622454d   ma MAR  FWMF       eng Chelsea   27  1993   
733        Kurt Zouma\ce4246f5   fr FRA    DF       eng Chelsea   25  1994   
734  Oleksandr Zubkov\adfbe1f5   ua UKR  FWMF    hu Ferencváros   23  1996   
735   Martin Ødegaard\79300479   no NOR    MF    es Real Madrid   21  1998   

     90s  
0    5.0  
1    2.6  
2    8.0  
3    3.6  
4    3.2

## Cleaning other files

In [4]:



# The column names below are hard-coded to match the flattened headers of the
# raw stats files; they have to be revisited if the export layout changes.

def to_per_90(raw, minutesCol, keepAsIs):
    """Build the per-90 version of one raw stats table.

    Keeps the player's fbref id (the part of "Player" after the backslash)
    plus every column from position 6 onwards, then divides each column by
    `minutesCol` and rounds to 2 decimals. "Player", `minutesCol` itself and
    the columns listed in `keepAsIs` are left untouched.

    raw        -- stats frame with flattened column names
    minutesCol -- name of the column holding the number of 90s played
    keepAsIs   -- list of column names that must not be divided

    Returns the new frame (still containing `minutesCol`).
    """
    table = pd.concat([raw["Player"].apply(lambda x: x.split("\\")[1]), raw.iloc[:, 6:]], axis = 1)
    # convertTo90s holds every column that IS divided by the 90s column
    convertTo90s = table.columns.difference(["Player", minutesCol] + keepAsIs)
    # NOTE(review): rows with 0 in minutesCol presumably end up as inf/NaN here - confirm
    table[convertTo90s] = table[convertTo90s].div(table[minutesCol], axis = 0).round(decimals = 2)
    return table


# change to per 90 stats; the columns listed per table are kept as-is
playerDefense = to_per_90(defense, "90s", ["Vs D Tkl%", "Pres %"])
playerShooting = to_per_90(shooting, "90s", ["Stan SoT%", "Stan Sh/90", "Stan SoT/90",
    "Stan G/Sh", "Stan G/SoT", "Stan Dist", "Expe npxG/Sh"])
playerPassing = to_per_90(passing, "90s", ["Tota Cmp%", "Shor Cmp%", "Medi Cmp%", "Long Cmp%"])
playerGoalkeeping = to_per_90(goalkeeping, "Play 90s", ["Play MP", "Play Starts", "Play Min",
    "Perf GA90", "Perf Save%", "Perf CS%"])

# the 90s column was only needed as the divisor, so it is dropped on export
playerGoalkeeping.drop(columns = "Play 90s").to_csv(exportPath + "goalkeeping.csv", index = False)
playerShooting.drop(columns = "90s").to_csv(exportPath + "shooting.csv", index = False)
playerPassing.drop(columns = "90s").to_csv(exportPath + "passing.csv", index = False)
playerDefense.drop(columns = "90s").to_csv(exportPath + "defense.csv", index = False)


