In [13]:
from requests import get
from bs4 import BeautifulSoup
from html.parser import HTMLParser
import pandas as pd
import numpy as np
import pickle 


## Scrape data

In [55]:
# Download the FiveThirtyEight global club soccer rankings page.
# Only the URL is passed to get(): its second positional argument is `params`
# (the query string), not a parser name, so passing 'lxml' there would append
# '?lxml' to the requested URL.
rankings_response = get(
    'https://projects.fivethirtyeight.com/global-club-soccer-rankings/',
    timeout=30,  # do not hang the notebook forever on a stalled connection
)
rankings_response.raise_for_status()  # fail loudly instead of parsing an error page

# Grabbing the team names, scores, leagues and countries
club_power_index = BeautifulSoup(rankings_response.text, 'html.parser')

team_list = [
    tag.text
    for tag in club_power_index.find_all('div', attrs={'class': 'name'})
]
# Keeps only the 'num' cells whose text is longer than 3 characters --
# presumably this selects the overall rating (e.g. '94.1') among the other
# numeric cells of a row; TODO confirm against the page markup.
score_list = [
    tag.text
    for tag in club_power_index.find_all('td', attrs={'class': 'num'})
    if len(tag.text) > 3
]
league_list = [
    tag.text.strip()
    for tag in club_power_index.find_all('td', attrs={'class': 'league drop-5'})
]
country_list = [
    tag.text.strip()
    for tag in club_power_index.find_all('td', attrs={'class': 'country drop-1'})
]


In [56]:
# Keep at most the first 600 entries of every scraped list.
team_list, score_list, league_list, country_list = (
    scraped[:600]
    for scraped in (team_list, score_list, league_list, country_list)
)


In [57]:
# One row per club: (score, league, country), indexed by the club name.
club_rows = list(zip(score_list, league_list, country_list))
club_score_df = pd.DataFrame(
    club_rows,
    index=team_list,
    columns=["score", "league", "country"],
)
club_score_df.head(10)

Unnamed: 0,score,league,country
Liverpool,94.1,Premier League,England
Man. City,93.9,Premier League,England
Bayern Munich,93.4,Bundesliga,Germany
PSG,90.5,Ligue 1,France
Real Madrid,90.1,La Liga,Spain
Barcelona,89.0,La Liga,Spain
RB Leipzig,86.3,Bundesliga,Germany
Atlético Madrid,86.1,La Liga,Spain
Chelsea,85.0,Premier League,England
Juventus,85.0,Serie A,Italy


## Clean raw data

In [58]:
# Move the club names out of the index into a regular 'club' column.
club_score_df = (
    club_score_df
    .reset_index(level=0)
    .rename(columns={'index': 'club'})
)

In [59]:
# Persist the club/score columns of the unfiltered table.
# The context manager guarantees the file is flushed and closed.
# NOTE(review): a later cell writes the cleaned frame to this same path,
# overwriting this file -- confirm this intermediate dump is still needed.
with open('./temporary_pkl/club_score_df.pkl', 'wb') as pkl_out:
    pickle.dump(club_score_df[['club', 'score']], pkl_out)

In [60]:
# Keep only clubs from the 7 major European football countries.
# League names are ambiguous: the Austrian top division is also called
# "Bundesliga" and the Russian one "Premier League". To avoid mixing them up,
# we select on the country instead of the league name. Note that this keeps
# every division of those countries present in the table, not only the top one.
mask2 = club_score_df['country'].isin(
    ['England', 'Germany', 'France', 'Spain', 'Italy', 'Netherlands', 'Portugal']
)

# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not raise pandas' SettingWithCopyWarning.
club_score_df = club_score_df.loc[mask2].copy()

# Scores were scraped as text; convert them to numbers.
club_score_df['score'] = pd.to_numeric(club_score_df['score'])



In [61]:
# Inspect the filtered frame; the rendered output elides the middle rows.
club_score_df

Unnamed: 0,club,score,league,country
0,Liverpool,94.1,Premier League,England
1,Man. City,93.9,Premier League,England
2,Bayern Munich,93.4,Bundesliga,Germany
3,PSG,90.5,Ligue 1,France
4,Real Madrid,90.1,La Liga,Spain
...,...,...,...,...
585,Wimbledon,15.2,League One,England
586,Colchester,15.1,League Two,England
594,Tranmere Rovers,13.5,League One,England
597,Northampton,13.3,League Two,England


In [62]:
# Load the transfer data, used later for merging.
# The context manager closes the file even if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code -- only load pickle
# files that were produced by this project.
with open('./temporary_pkl/transfer_span_reduced.pkl', 'rb') as pkl_file:
    transfer_span_reduced = pickle.load(pkl_file)

In [63]:
# Distinct club names found in the transfer data, as a plain Python list.
club_std_name = [*transfer_span_reduced['club_name'].unique()]

#club_std_name looks like this:
# ['Arsenal', 'Stoke City', 'Wolverhampton Wanderers', 'Chelsea',
#        'Newcastle United', 'Fulham', 'West Bromwich Albion',
#        'Manchester United', 'Wigan Athletic', 'Aston Villa',
#        'West Ham United', 'Everton', 'Liverpool', 'Tottenham Hotspur',
#        'Blackburn Rovers', 'Manchester City', 'Sunderland AFC',
#        'Birmingham City', 'Bolton Wanderers', 'Blackpool', 'AC Milan',
#        'AS Roma', 'Brescia Calcio', 'Internazionale', 'Parma', 'GenoaC',
#        'AS Bari', 'SS Lazio', 'Udinese Calcio', 'AFiorentina',
#        'US Palermo', 'Juventus', 'Chievo Verona', 'US Lecce', 'Bologna ',
#        'UC Sampdoria', 'Cagliari Calcio', 'AC Cesena', 'Calcio Catania',
#        'SSC Napoli', 'Girondins Bordeaux', 'AS Monaco',
#        'Olympique Marseille', 'Stade Rennais', 'AJ Auxerre', 'Toulouse',
#        'OGC Nice', 'Paris Saint-Germain', 'AS Saint-Étienne',
#        'Sochaux-Montbéliard', 'RC Lens', 'HSC Montpellier',
#        'Olympique Lyon', 'LOSC Lille', 'Lorient', 'AS Nancy-Lorraine',
#        'SM Caen', 'Valenciennes', 'Stade Brest ',
#        'Athlétic Club Arlésien', 'SL Benfica', 'Sporting CP', 'Porto',
#        'CD Nacional', 'SC Braga', 'Vitória Setúbal', 'CS Marítimo',
#        'SC Beira-Mar', 'Vitória Guimarães SC', 'Rio Ave',
#        'União de Leiria', 'Académica Coimbra', 'Paços de Ferreira',
#        'SC Olhanense', 'Naval º de Maio', 'Portimonense SC',
#        'Borussia Dortmund', 'Bayern Munich', 'Schalke ',
#        'Borussia Mönchengladbach', 'Bayer  Leverkusen', 'VfB Stuttgart',
#        'Hannover ', 'VfL Wolfsburg', 'SV Werder Bremen', '.Nuremberg',
#        'Eintracht Frankfurt', 'TSG  Hoffenheim', 'SC Freiburg',
#        '.FSV Mainz ', 'Hamburger SV', '. Köln', '.Kaiserslautern',
#        'St. Pauli', 'Atlético Madrid', 'Barcelona', 'Real Zaragoza',
#        'RCD Mallorca', 'CA Osasuna', 'Sevilla', 'Real Madrid',
#        'Racing Santander', 'Real Sociedad', 'RCD Espanyol Barcelona',
#        'Deportivo de La Coruña', 'Valencia', 'Villarreal', 'Málaga',
#        'Sporting Gijón', 'UD Almería', 'Levante UD', 'Getafe', 'Hércules',
#        'Athletic Bilbao', 'NAC Breda', 'Roda JC Kerkrade', 'Utrecht',
#        'Groningen', 'Feyenoord Rotterdam', 'SC Heerenveen',
#        'Twente Enschede', 'PSV Eindhoven', 'Willem II Tilburg',
#        'NEC Nijmegen', 'Vitesse Arnhem', 'Ajax Amsterdam',
#        'De Graafschap Doetinchem', 'SBV Excelsior Rotterdam',
#        'AZ Alkmaar', 'ADO Den Haag', 'Heracles Almelo', 'VVV-Venlo',
#        'Queens Park Rangers', 'Norwich City', 'Swansea City',
#        'Atalanta BC', 'AC Siena', 'Novara Calcio ', 'AC Ajaccio',
#        'FCO Dijon', 'Évian Thonon Gaillard', 'Gil Vicente', 'CD Feirense',
#        'Augsburg', 'Hertha BSC', 'Real Betis Balompié', 'Rayo Vallecano',
#        'Granada', 'RKC Waalwijk', 'Southampton', 'Reading',
#        'Delfino Pescara ', 'Torino', 'SC Bastia', 'ES Troyes AC',
#        'Stade Reims', 'GD Estoril', 'Moreirense', 'SpVgg Greuther Fürth',
#        'Fortuna Düsseldorf', 'Real Valladolid', 'Celta de Vigo',
#        'PEC Zwolle', 'Cardiff City', 'Hull City', 'Crystal Palace',
#        'Hellas Verona', 'AS Livorno', 'US Sassuolo', 'EA Guingamp',
#        'Nantes', 'Belenenses SAD', 'Arouca', 'Eintracht Braunschweig',
#        'Elche', 'SC Cambuur-Leeuwarden', 'Go Ahead Eagles Deventer',
#        'Burnley', 'Leicester City', 'Empoli', 'Inter Milan', 'Metz',
#        'Penafiel', 'Boavista Porto', 'SC Paderborn ', 'SD Eibar',
#        'Córdoba', 'Dordrecht', 'ABournemouth', 'Watford', 'Carpi ',
#        'Frosinone Calcio', 'SCO Angers', 'GAjaccio', 'CD Tondela',
#        'União Madeira', 'Ingolstadt ', 'SV Darmstadt ', 'UD Las Palmas',
#        'Middlesbrough', 'Crotone', 'GD Chaves', 'RB Leipzig',
#        'Deportivo Alavés', 'CD Leganés', 'Sparta Rotterdam',
#        'Brighton & Hove Albion', 'Huddersfield Town', 'Benevento Calcio',
#        'SPAL ', 'SC Amiens', 'RC Strasbourg Alsace', 'Desportivo Aves',
#        'Girona', 'Parma Calcio ', 'SPAL', 'Nîmes Olympique',
#        'CD Santa Clara', 'SD Huesca', 'Emmen', 'Fortuna Sittard',
#        'Sheffield United', 'Famalicão', '.Union Berlin']

from fuzzywuzzy import process
def rename(club, choices, score_cutoff=0):
    """Map a scraped club name onto the closest name in ``choices``.

    Parameters
    ----------
    club : str
        Club name to standardise.
    choices : collection of str
        Candidate standard names handed to ``process.extractOne``.
    score_cutoff : int, optional
        Minimum fuzzy-match score a candidate must reach to be accepted.
        The default of 0 accepts the best candidate whatever its score.

    Returns
    -------
    str
        'Paris Saint-Germain' for the hard-coded alias 'PSG'; otherwise the
        best-scoring candidate, or ``club`` itself when no candidate
        qualifies.
    """
    # Hard-coded alias: the abbreviation is mapped explicitly instead of
    # being left to fuzzy matching.
    if club == 'PSG':
        return 'Paris Saint-Germain'

    best_match = process.extractOne(club, choices, score_cutoff=score_cutoff)
    if best_match is None:
        # extractOne returns None when there is nothing to choose from or no
        # candidate reaches the cutoff; keep the name instead of crashing on
        # None[0].
        return club
    return best_match[0]

def _to_standard_name(club_name):
    """Return the name unchanged if already standard, else its fuzzy match."""
    if club_name in club_std_name:
        return club_name
    return rename(club_name, club_std_name)


# Work on a copy so the filtered score table keeps its original club names.
club_score_cleaned_df = club_score_df.copy()
club_score_cleaned_df['club'] = club_score_df['club'].apply(_to_standard_name)




In [64]:
# Several scraped names can map to the same standard name; keep the first row of each.
club_score_cleaned_df = club_score_cleaned_df.drop_duplicates(subset='club', keep='first')

In [65]:
# Save the cleaned club/score table for future use.
# The context manager guarantees the file is flushed and closed.
with open('./temporary_pkl/club_score_df.pkl', 'wb') as pkl_out:
    pickle.dump(club_score_cleaned_df[['club', 'score']], pkl_out)

In [68]:
# Some clubs do not appear in the scraped score list, so their score has to be
# filled in with a reasonable value. Since these are typically small, little-known
# clubs, we use the lowest score observed in their league minus 10.
# (Despite its name, the dictionary holds "league minimum - 10", not an average.)
lowest_score_by_league = club_score_cleaned_df.groupby('league')['score'].min()
avrg_sc_by_league_dict = (lowest_score_by_league - 10).round(3).to_dict()
avrg_sc_by_league_dict

{'2. Bundesliga': 20.5,
 'Bundesliga': 46.4,
 'Championship': 34.2,
 'Eredivisie': 16.7,
 'La Liga': 49.7,
 'La Liga 2': 17.3,
 'League One': 10.7,
 'Ligue 1': 39.3,
 'Ligue 2': 12.9,
 'Premier League': 50.1,
 'Primeira Liga': 25.9,
 'Serie A': 40.0,
 'Serie B': 9.5}

In [67]:
# Persist the per-league fallback scores.
# The context manager guarantees the file is flushed and closed.
with open('./temporary_pkl/avrg_sc_by_league_dict.pkl', 'wb') as pkl_out:
    pickle.dump(avrg_sc_by_league_dict, pkl_out)