In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs
import os, re
import math
import networkx as nx
import warnings
warnings.filterwarnings("ignore")

### 1. Build cleaned transfer histories for each country's players

In [2]:
# Exploratory example: read one player's raw file and parse it with BeautifulSoup.
player_name = 'Edouard Mendy'
raw_page_path = f'players_history_raw/{player_name}_raw.txt'
with open(raw_page_path, encoding='utf-8') as f:
    raw_page = f.read()
soup = bs(raw_page)
    
def get_market_value(x):
    """
        Converts a market-value string (e.g. '€70.00m', '€500k') into a number.

        All digits and dots found in ``str(x)`` are joined and parsed as a float,
        which is then scaled by the letter suffix: 'm' means millions and 'k'
        thousands (matched case-insensitively). Any other suffix leaves the
        number unscaled.

        Returns ``np.nan`` when no parsable number is present.
    """
    text = str(x)
    try:
        num = float("".join(re.findall(r"[\d.]+", text)))
    except ValueError:
        # No digits at all (e.g. '-', 'nan') or a malformed number such as '1.2.3'.
        return np.nan

    unit = "".join(re.findall(r"[a-zA-Z]+", text)).lower()
    multipliers = {"m": 1_000_000, "k": 1_000}
    return num * multipliers.get(unit, 1)



In [3]:
def get_transfer_history(player_name: str) -> pd.DataFrame:
    """
        Reads in the raw file of one player and parses the transfer history.

        Parameters
        ----------
        player_name : str
            Player name; ``players_history_raw/{player_name}_raw.txt`` is read.

        Returns
        -------
        pd.DataFrame
            The frame produced by ``clean_transfer_history`` restricted to the
            columns name, from_date, to_date, old_team, new_team, market_value,
            transfer_fee and current_league.

        Raises
        ------
        ValueError
            If the page has no ``<div class="box">`` element to read the grid from.
    """
    # Output column name -> CSS class of the grid cells that hold that value.
    columns = {
        'season': 'tm-player-transfer-history-grid__season',
        'date': 'tm-player-transfer-history-grid__date',
        'old_team': 'tm-player-transfer-history-grid__old-club',
        'new_team': 'tm-player-transfer-history-grid__new-club',
        'market_value': 'tm-player-transfer-history-grid__market-value',
        'transfer_fee': 'tm-player-transfer-history-grid__fee'
    }

    with open(f'players_history_raw/{player_name}_raw.txt', encoding='utf-8') as f:
        # NOTE(review): no parser is named, so BeautifulSoup picks whichever one is
        # installed; consider pinning one so results are reproducible across machines.
        soup = bs(f.read())

    transfer_history = soup.find('div', {'class': 'box'})
    if transfer_history is None:
        raise ValueError(f"No transfer history box found in the raw file of '{player_name}'")

    # One list of stripped cell texts per column, in the order of `columns`.
    t_table = [
        [cell.text.strip() for cell in transfer_history.find_all('div', {'class': css_class})]
        for css_class in columns.values()
    ]

    league_tag = soup.find('span', {'class': 'data-header__league'})
    current_league = league_tag.text.strip() if league_tag is not None else np.nan

    # Transpose so every row is one grid line; the first line is skipped
    # (presumably the grid's header row -- TODO confirm).
    df = pd.DataFrame(t_table).T[1:]
    df.columns = columns.keys()
    df = clean_transfer_history(df)
    df['name'] = player_name
    df['current_league'] = current_league
    df = df[['name', 'from_date', 'to_date', 'old_team', 'new_team', 'market_value', 'transfer_fee', 'current_league']]

    return df

def _parse_money_cell(value):
    """
        Converts one market-value / fee cell into a number. Cells that are missing,
        mention a loan, or contain a '?' become NaN.
    """
    if pd.isnull(value):
        return np.nan
    text = str(value)
    if 'loan' in text.lower() or '?' in text:
        return np.nan
    return get_market_value(value)


def clean_transfer_history(df):
    """
        Cleans the transfer history and only keeps relevant values.

        Steps:
          * rows containing any missing value are dropped;
          * an all-NaN row is prepended, and its ``old_team`` is set to the
            ``new_team`` of the row that follows it (when there is one);
          * ``date`` becomes ``to_date`` and ``from_date`` is the ``date`` of the
            next row; ``season`` is dropped;
          * '-' cells become NaN and 'free transfer' becomes 0;
          * ``market_value`` and ``transfer_fee`` are converted to numbers.

        The input frame is not modified; a new DataFrame is returned.
    """
    # Work on a fresh frame so the caller's DataFrame is left untouched.
    df = df.dropna()

    # DataFrame.append was removed in pandas 2.0, so the blank leading row is
    # attached with pd.concat. It is created as object dtype to match text columns.
    blank_row = pd.DataFrame([[np.nan] * len(df.columns)], columns=df.columns, dtype=object)
    df = pd.concat([blank_row, df], ignore_index=True)

    df['from_date'] = df['date'].shift(-1)
    df = df.rename(columns={'date': 'to_date'}).drop(columns=['season'])

    # An empty history leaves only the blank row, so there is nothing to copy from.
    if len(df) > 1:
        df.loc[0, 'old_team'] = df.loc[1, 'new_team']

    df = df.replace('-', np.nan).replace('free transfer', 0)

    for c in ['market_value', 'transfer_fee']:
        # Strip the currency sign and thousands separators before parsing.
        df[c] = df[c].replace('[€,]', '', regex=True).map(_parse_money_cell)

    return df

def clean_countries_players(csv):
    """
        Loads one country's player file from ``countries_players/`` and tidies it:
        the 'Unnamed: 0' and 'url' columns are removed and the hyphenated column
        names are replaced by snake_case ones.
    """
    renamed_columns = {
        'tm-shirt-number': 'shirt_number',
        'current-team': 'current_team',
        'market-value': 'current_market_value',
    }
    players = pd.read_csv(f'countries_players/{csv}')
    players = players.drop(columns=['Unnamed: 0', 'url'])
    return players.rename(columns=renamed_columns)

In [4]:
# Write one CSV per country that joins the squad list with every player's history.
os.makedirs('country_player_history/', exist_ok=True)

for country_csv in os.listdir('countries_players/'):
    # Skip stray entries such as macOS '.DS_Store', the same filter the
    # network cells apply when they list 'country_player_history'.
    if '.DS' in country_csv:
        continue

    country_df = clean_countries_players(country_csv)
    dfs = [get_transfer_history(player) for player in country_df['name']]

    # A single inner merge on 'name' attaches the squad columns to every history
    # row; it keeps the squad order and each player's row order.
    master_df = country_df.merge(pd.concat(dfs), how='inner', on='name').reset_index(drop=True)
    master_df.to_csv(f"country_player_history/{country_csv.split('_')[0]}.csv", index=False, header=True)

### 2. Build the club-to-club transfer network

In [5]:
# Total market value moved along each directed club pair, keyed 'new  <-  old'.
edges = {}

for fn in os.listdir('country_player_history'):
    if '.DS' in fn:
        continue
    # NOTE(review): these CSVs are written with index=False, so index_col=0 turns
    # their first data column into the index -- confirm this is intended.
    df = pd.read_csv('country_player_history/' + fn, index_col = 0)

    for name in sorted(set(df.name.to_list())):
        # Rows without a market value carry no weight and are skipped.
        df_name = df[df.name == name].dropna(subset = ['market_value'])
        transfers = zip(df_name.new_team.to_list(),
                        df_name.old_team.to_list(),
                        df_name.market_value.to_list())
        for new_team, old_team, market_value in transfers:
            edge = new_team +  '  <-  ' + old_team
            edges[edge] = edges.get(edge, 0) + market_value

# Directed graph: old club -> new club, weighted by the summed market value.
D = nx.DiGraph()
for uv, w in edges.items():
    u, v = uv.split('  <-  ')
    D.add_edge(v, u, weight = w)
ww = sum(edges.values())

nx.write_gexf(D, 'club_migration_network.gexf')
D.number_of_nodes(), D.number_of_edges(), ww

(1054, 3135, 21750265000.0)

### 3. Build the teammate network

In [6]:
def get_year(x, default=2022):
    """
        Returns the earliest four-digit year found in the string ``x``.

        ``default`` is returned when ``x`` is not a string (e.g. a NaN date) or
        contains no run of four digits.
    """
    if not isinstance(x, str):
        return default
    years = [int(a) for a in re.findall(r"[0-9]{4}", x)]
    return min(years) if years else default


# Stack every country's history file into one frame and list the distinct players.
history_files = [fn for fn in os.listdir('country_player_history') if '.DS' not in fn]
dfs = pd.concat(
    [pd.read_csv('country_player_history/' + fn, index_col = 0) for fn in history_files]
)
names = sorted(set(dfs.name.to_list()))
len(names)

830

In [7]:
# Years each player spent at each club, computed once per player instead of
# re-filtering `dfs` for every pair of players.
# NOTE(review): only the first listed row per (player, club) is used, so any
# further spell at the same club is ignored -- confirm whether spells should be merged.
club_years = {}
for player, player_rows in dfs.dropna(subset=['old_team']).groupby('name', sort=False):
    first_spells = player_rows.drop_duplicates(subset=['old_team'])
    club_years[player] = {
        team: set(range(get_year(start), get_year(end) + 1))
        for team, start, end in zip(first_spells.old_team,
                                    first_spells.from_date,
                                    first_spells.to_date)
    }

# Pair of players (tab-joined, sorted) -> number of shared club-years.
edges = {}

for nnd, name1 in enumerate(names):

    if nnd % 50 == 0:
        print(nnd)

    clubs1 = club_years.get(name1, {})

    for name2 in names[nnd+1:]:
        clubs2 = club_years.get(name2, {})

        for shared_team in clubs1.keys() & clubs2.keys():
            shared_years = clubs1[shared_team] & clubs2[shared_team]
            if len(shared_years)>0:
                edge = '\t'.join(sorted([name1, name2]))
                # Add each overlap exactly once; initialising the entry and then
                # incrementing it would count a pair's first overlap twice.
                edges[edge] = edges.get(edge, 0) + len(shared_years)

               
      


0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800


In [7]:
# Undirected teammate graph: one weighted edge per player pair stored in `edges`.
G = nx.Graph()
for pair, shared in edges.items():
    player_a, player_b = pair.split('\t')
    G.add_edge(player_a, player_b, weight = shared)

G.number_of_nodes(), G.number_of_edges()

(818, 6442)

In [9]:
# Export the teammate graph G to a GEXF file.
nx.write_gexf(G, 'player_teammate_network.gexf')

In [21]:
# Node attribute table: one row per player with the parsed current market value
# and its log(value + 1), indexed by player name under the label 'Id'.
df_value = (
    dfs.drop_duplicates(subset = ['name'])
       .set_index('name')[['current_market_value']]
       .assign(current_market_value=lambda d: d['current_market_value'].apply(get_market_value))
       .assign(log_market_value=lambda d: [math.log(x+1) for x in d['current_market_value']])
       .rename_axis('Id')
)
df_value.to_csv('node_values.csv')
df_value

Unnamed: 0_level_0,current_market_value,log_market_value
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
Andries Noppert,2000000.0,14.508658
Matthijs de Ligt,70000000.0,18.064006
Jurrien Timber,45000000.0,17.622173
Stefan de Vrij,15000000.0,16.523561
Daley Blind,6000000.0,15.607270
...,...,...
Mohamed Ali Ben Romdhane,2700000.0,14.808763
Ghaylen Chaaleli,1200000.0,13.997833
Naïm Sliti,6000000.0,15.607270
Issam Jebali,500000.0,13.122365
