<a href="https://colab.research.google.com/github/vincenzoaltavilla/thesis_project/blob/main/thesis_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **DOWNLOAD DATASET DA KAGGLE**

LINK: https://www.kaggle.com/code/chrisferentinos/premier-league-championship-team-analysis/notebook

In [1]:
import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response on each read while streaming the archive.
CHUNK_SIZE = 40960
# Comma-separated '<directory>:<url-encoded download URL>' entries; the download loop below
# splits each entry on ':' and url-decodes the second part.
# NOTE(review): the URL carries X-Goog-Date=20240517T080818Z and X-Goog-Expires=259200, so it
# looks like a time-limited signed link that must be regenerated once that window has passed.
DATA_SOURCE_MAPPING = 'player-scores:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1071543%2F7910805%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240517%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240517T080818Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Da9360169bfbea2d38416811175e5f9018a5ba7681ba64f751e3b5db34f640f6693393c1d1c1352812cac21916535a9dc740fb2fbccb96d254845b3622a06a6d331c85c0d9abd02d0451df318e5eb40b236d1f4db40442ae9ad5ddc18dde976f4ad28d7d271f418d0dce9f2ef64934e7f0d25e6bef4b63c21d6eb633a45a99c911dad1bf3683aa00c68de3139a336e66dc6f79ae988dc67925328931c247e9f3d1acb12e99dc514bef475b34b38f6501fd6d1409fd9380e0a8eaa1982132721e3d2fbf2673000ef1c83d1cfb252f974a1083136efafac663264f591705b158884b7c5e96d414e54008e4d1e2563e88522cc0e6e6f1c4ededd36e54b773e9e22ff'

# Directories created below; the CSV files are later read from under KAGGLE_INPUT_PATH.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced by any other visible cell of this notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every '<directory>:<url-encoded URL>' entry of DATA_SOURCE_MAPPING and unpack the
# archive into KAGGLE_INPUT_PATH/<directory>. A source that fails to download is reported and
# skipped so that the remaining sources are still processed.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so an unencoded ':' inside the URL cannot break unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header may be absent; in that case only the byte counter is shown
            # instead of failing on int(None).
            expected_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if expected_bytes > 0:
                    done = min(50, int(50 * dl / expected_bytes))
                    progress = f"[{'=' * done}{' ' * (50-done)}] "
                else:
                    progress = ''
                sys.stdout.write(f"\r{progress}{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by name.
            tfile.flush()
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              # Bind the archive to its own name; reusing `tarfile` would overwrite the
              # module and break the next iteration of this loop.
              with tarfile.open(tfile.name) as tar_archive:
                tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is handled before the broader OSError so the more specific hint is printed.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue


Downloading player-scores, 152669203 bytes compressed
Downloaded and uncompressed: player-scores


# **IMPORT LIBRERIE, ACQUISIZIONE DATASET E AGGIUSTAMENTI PRELIMINARI SUI TIPI DI DATI**



In [2]:
import os
import warnings

# Installed before the third-party imports so that their import-time warnings are hidden too.
# NOTE: this silences every warning for the rest of the notebook, pandas warnings included.
warnings.filterwarnings('ignore')

from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns
from plotly.subplots import make_subplots

# Render plotly figures with the Colab renderer.
pio.renderers.default = 'colab'

# Show every column when displaying a DataFrame. The full option name is used: the previous
# 'display.max.columns' spelling only worked because option names are matched as a regex.
pd.set_option('display.max_columns', None)

# Folder holding every CSV table of the dataset.
PLAYER_SCORES_DIR = "/kaggle/input/player-scores"


def _read_table(table_name):
    """Read `<table_name>.csv` from PLAYER_SCORES_DIR into a DataFrame."""
    return pd.read_csv(f"{PLAYER_SCORES_DIR}/{table_name}.csv")


appearances = _read_table("appearances")
club_games = _read_table("club_games")
clubs = _read_table("clubs")
competitions = _read_table("competitions")
game_events = _read_table("game_events")
game_lineups = _read_table("game_lineups")
games = _read_table("games")
player_valuations = _read_table("player_valuations")
players = _read_table("players")

# Convert the date columns used later on from text to datetime, in place.
for frame, date_column in (
    (players, 'date_of_birth'),
    (appearances, 'date'),
    (game_events, 'date'),
    (player_valuations, 'date'),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

# **INIZIO LAVORO**

## **Data cleaning & feature engineering**

### **Players**

In [3]:
# Columns of `players` that are not used by the rest of the notebook.
unused_player_columns = [
    'url', 'image_url', 'agent_name', 'last_season', 'current_club_id',
    'player_code', 'country_of_birth', 'city_of_birth', 'current_club_name',
    'current_club_domestic_competition_id', 'market_value_in_eur',
    'highest_market_value_in_eur', 'contract_expiration_date',
    'first_name', 'last_name',
]

players = (
    players
    .assign(
        # Missing heights take the mean height; the integer cast drops the fractional part.
        height_in_cm=lambda df: df['height_in_cm'].fillna(df['height_in_cm'].mean()).astype(int),
        # A missing preferred foot is recorded as 'both'.
        foot=lambda df: df['foot'].fillna('both'),
    )
    .drop(columns=unused_player_columns)
)

### **Players appearances**

In [4]:
# Inner join on player_id: only ids present in both tables survive.
# `name` is dropped because `player_name` is the column selected further down.
players_appearances = (
    players
    .merge(appearances, on='player_id', how='inner')
    .drop(columns=['player_current_club_id', 'appearance_id', 'name', 'competition_id'])
)

In [5]:
# Club attributes that are discarded right after the join.
unused_club_columns = [
    'player_club_id', 'club_code', 'squad_size', 'average_age',
    'foreigners_number', 'foreigners_percentage', 'national_team_players',
    'stadium_name', 'stadium_seats', 'net_transfer_record', 'last_season',
    'filename', 'url', 'coach_name', 'total_market_value',
]

# Attach the club recorded on each appearance (player_club_id -> club_id); rows whose club
# is not listed in `clubs` are removed by the inner join.
players_appearances = (
    players_appearances
    .merge(clubs, left_on='player_club_id', right_on='club_id', how='inner')
    .drop(columns=unused_club_columns)
    .rename(columns={"name": "club_name"})
)

In [6]:
# Competition attributes that are discarded right after the join.
unused_competition_columns = [
    'domestic_competition_id', 'competition_code', 'name', 'sub_type', 'type',
    'country_id', 'competition_id', 'domestic_league_code', 'confederation', 'url',
]

# Left join: every appearance row is kept even when the club's domestic competition
# has no match in `competitions`.
players_appearances = (
    players_appearances
    .merge(competitions, left_on='domestic_competition_id', right_on='competition_id', how='left')
    .drop(columns=unused_competition_columns)
)

In [7]:
# Final column order of the appearance table.
appearance_columns = [
    'player_id', 'player_name', 'position', 'sub_position',
    'foot', 'height_in_cm', 'country_of_citizenship',
    'date_of_birth', 'club_id', 'club_name',
    'club_national_league', 'top5_leagues', 'game_id', 'date',
    'yellow_cards', 'red_cards', 'goals', 'assists',
    'minutes_played',
]

# Give the two competition columns their analysis names, then keep only the listed columns.
renamed_appearances = players_appearances.rename(
    columns={"country_name": "club_national_league",
             "is_major_national_league": "top5_leagues"}
)
players_appearances = renamed_appearances[appearance_columns]

In [None]:
# Display the merged player/appearance table built by the cells above.
players_appearances

### **Different types of goals**

In [9]:
# Keep only goal events and drop the event columns that are not needed afterwards.
goal_events = game_events[game_events['type'] == 'Goals'].drop(
    columns=['game_event_id', 'player_in_id', 'type', 'player_assist_id', 'minute']
)

# Discard own goals. `na=False` makes a missing description count as "not an own goal"
# instead of putting NaN in the mask, which would make the `~` negation fail.
mask = goal_events['description'].str.contains('Own-goal', case=False, na=False)

# `.copy()` gives an independent frame, so the in-place edits made by the following cells
# do not operate on a slice of the previous frame.
goal_events = goal_events[~mask].copy()

In [10]:
# Goal labels, tested in this order: once a description has been replaced by a label it no
# longer matches any later label, so the first label found in a description wins.
kind_of_goals = ['Header', 'Right-footed', 'Left-footed', 'Penalty', 'Long distance kick', 'Free kick']

for kind_of_goal in kind_of_goals:
    # Literal, case-insensitive substring search. `na=False` keeps the mask strictly boolean
    # when a description is missing (a NaN in the mask would make `.loc` raise).
    mask = goal_events['description'].str.contains(kind_of_goal, case=False, na=False, regex=False)
    goal_events.loc[mask, 'description'] = kind_of_goal

import re  # not used by this cell; retained only in case another cell relies on the name

# After the loop every matched description equals one of the labels exactly, so whatever is
# not a label (missing descriptions included) did not match any kind of goal.
mask = ~goal_events['description'].isin(kind_of_goals)
goal_events.loc[mask, 'description'] = 'Other'

In [11]:
# Indicator column -> description label it flags. The insertion order is the order in which
# the columns are appended to `goal_events`.
goal_indicator_labels = {
    'right_footed_goals': 'Right-footed',
    'left_footed_goals': 'Left-footed',
    'long_distance_goals': 'Long distance kick',
    'direct_free_kick_goals': 'Free kick',
    'penalty_goals': 'Penalty',
    'header_goals': 'Header',
    'other_kind_of_goals': 'Other',
}

# One 0/1 integer column per kind of goal: 1 where the description equals the label.
for indicator_column, label in goal_indicator_labels.items():
    goal_events[indicator_column] = (goal_events['description'] == label).astype(int)

In [12]:
# Collapse the goal events to one row per (date, game, player).
grouping_keys = ['date', 'game_id', 'player_id']
# Taken from the first event of each group.
carried_columns = ['club_id', 'description']
# Summed, giving the number of goals of each kind.
counter_columns = [
    'right_footed_goals', 'left_footed_goals', 'long_distance_goals',
    'direct_free_kick_goals', 'penalty_goals', 'header_goals',
    'other_kind_of_goals',
]

goal_events = (
    goal_events
    .groupby(grouping_keys)
    .agg({**dict.fromkeys(carried_columns, 'first'),
          **dict.fromkeys(counter_columns, 'sum')})
    .reset_index()
)

# Prefix the five leading columns (keys + carried columns) with 'goal_'.
goal_events = goal_events.rename(
    columns={column: f'goal_{column}' for column in grouping_keys + carried_columns}
)

In [None]:
# Display the aggregated goal table.
# To look at a single player, filter it: goal_events[goal_events['goal_player_id'] == 10]
goal_events

## **First important join**

In [35]:
# Left join: every appearance is kept; the goal-kind columns are filled only for the
# (game, player) pairs that have a row in `goal_events`.
players_appearances_detailed = (
    players_appearances
    .merge(goal_events,
           left_on=['game_id', 'player_id'],
           right_on=['goal_game_id', 'goal_player_id'],
           how='left')
    .drop(columns=['goal_date', 'goal_game_id', 'goal_player_id', 'goal_club_id'])
)

In [38]:
# Per-kind goal counters brought in by the join with `goal_events`.
goal_count_columns = [
    'right_footed_goals', 'left_footed_goals', 'long_distance_goals',
    'direct_free_kick_goals', 'penalty_goals', 'header_goals',
    'other_kind_of_goals',
]

# An appearance with zero goals gets zero for every kind of goal.
scored_nothing = players_appearances_detailed['goals'] == 0
players_appearances_detailed.loc[scored_nothing, goal_count_columns] = 0

# The textual label is not needed once the counters exist.
players_appearances_detailed = players_appearances_detailed.drop(columns=['goal_description'])

# Counters still missing (no matching goal event) become 0, then everything is cast to int.
players_appearances_detailed[goal_count_columns] = (
    players_appearances_detailed[goal_count_columns].fillna(0).astype(int)
)

In [39]:
# Columns (and their order) carried into the join with `club_games`.
first_stats_columns = [
    'player_id', 'player_name', 'position', 'sub_position',
    'foot', 'height_in_cm', 'country_of_citizenship',
    'date_of_birth', 'club_id', 'club_name',
    'club_national_league', 'top5_leagues', 'date',
    'game_id', 'minutes_played', 'goals', 'assists',
    'right_footed_goals', 'left_footed_goals',
    'header_goals', 'direct_free_kick_goals',
    'penalty_goals', 'long_distance_goals',
    'other_kind_of_goals', 'yellow_cards',
    'red_cards',
]
first_stats = players_appearances_detailed[first_stats_columns]

In [None]:
# Display the detailed appearances, highest-scoring appearances first.
players_appearances_detailed.sort_values(by=['goals'], ascending=False)

## **Second important join**

In [40]:
# Columns of `club_games` that are discarded right after the join.
unused_club_game_columns = [
    'own_goals', 'own_position', 'own_manager_name',
    'opponent_id', 'opponent_position',
    'opponent_manager_name', 'hosting',
]

# Inner join on (game_id, club_id): appearances without a matching `club_games` row are dropped.
intermediate_stats = (
    first_stats
    .merge(club_games, on=['game_id', 'club_id'], how='inner')
    .drop(columns=unused_club_game_columns)
)

In [41]:
intermediate_stats = intermediate_stats.assign(
    # 1 unless the opponent scored at least one goal.
    is_clean_sheet=lambda df: (~(df['opponent_goals'] > 0)).astype(int),
    # Calendar year of the game.
    year=lambda df: df['date'].dt.year,
    # Difference between calendar years only; a missing birth date yields age 0.
    age=lambda df: (df['year'] - df['date_of_birth'].dt.year).fillna(0).astype(int),
)

In [42]:
# Oldest games first: the 'first' aggregations of the yearly group-by below take their
# values from the earliest rows of each group.
intermediate_stats = intermediate_stats.sort_values('date', ascending=True)

## **First group by**

In [43]:
# Attributes taken from the first row of each (player, year) group.
first_value_columns = [
    'player_name', 'position', 'sub_position', 'foot', 'height_in_cm',
    'country_of_citizenship', 'date_of_birth', 'club_id', 'club_name',
    'club_national_league', 'top5_leagues', 'date',
]
# Per-game figures added up over the year.
summed_columns = [
    'minutes_played', 'goals', 'assists', 'right_footed_goals',
    'left_footed_goals', 'header_goals', 'direct_free_kick_goals',
    'penalty_goals', 'long_distance_goals', 'other_kind_of_goals',
    'yellow_cards', 'red_cards', 'opponent_goals', 'is_win',
    'is_clean_sheet',
]

# The dict order fixes the column order of the result; game_id is counted (games per year).
yearly_aggregations = {
    **dict.fromkeys(first_value_columns, 'first'),
    'game_id': 'count',
    **dict.fromkeys(summed_columns, 'sum'),
    'age': 'first',
}

grouped_stats = (
    intermediate_stats
    .groupby(['player_id', 'year'])
    .agg(yearly_aggregations)
    .reset_index()
)


In [44]:
# Columns (and their order) of the final per-player, per-year table.
stats_columns = [
    'player_id', 'year', 'player_name', 'position', 'sub_position',
    'foot', 'height_in_cm', 'country_of_citizenship',
    'date_of_birth', 'club_id', 'club_name',
    'club_national_league', 'top5_leagues',
    'age', 'game_id', 'minutes_played', 'is_win', 'goals', 'assists',
    'right_footed_goals', 'left_footed_goals',
    'header_goals', 'direct_free_kick_goals',
    'penalty_goals', 'long_distance_goals',
    'other_kind_of_goals', 'opponent_goals', 'is_clean_sheet', 'yellow_cards',
    'red_cards',
]

# `rename` returns a new frame, so the assignments below no longer act on a column-subset of
# `grouped_stats` (the previous in-place rename did, which is a SettingWithCopy hazard).
stats = grouped_stats[stats_columns].rename(
    columns={"game_id": "games_played", "is_win": "games_won"}
)

# Clean sheets are only counted for goalkeepers.
stats.loc[stats['position'] != "Goalkeeper", 'is_clean_sheet'] = 0

# Ratios over the games played in the year, rounded to two decimals.
stats = stats.assign(
    percentage_of_win=lambda df: ((df['games_won'] / df['games_played']) * 100).round(2),
    percentage_of_clean_sheets=lambda df: ((df['is_clean_sheet'] / df['games_played']) * 100).round(2),
    conceded_goals_per_match=lambda df: (df['opponent_goals'] / df['games_played']).round(2),
)

# **Print**

In [47]:
# Display the yearly rows of one player as a spot check.
stats[stats['player_name'] == 'Alessio Romagnoli']
# Alternative view: stats.sort_values(by=['goals'], ascending=False).head(50)

Unnamed: 0,player_id,year,player_name,position,sub_position,foot,height_in_cm,country_of_citizenship,date_of_birth,club_id,club_name,club_national_league,top5_leagues,age,games_played,minutes_played,games_won,goals,assists,right_footed_goals,left_footed_goals,header_goals,direct_free_kick_goals,penalty_goals,long_distance_goals,other_kind_of_goals,opponent_goals,is_clean_sheet,yellow_cards,red_cards,percentage_of_win,percentage_of_clean_sheets,conceded_goals_per_match
54556,197747,2012,Alessio Romagnoli,Defender,Centre-Back,left,185,Italy,1995-01-12,12,Associazione Sportiva Roma,Italy,True,17,2,100,2,0,0,0,0,0,0,0,0,0,2,0,0,0,100.0,0.0,1.0
54557,197747,2013,Alessio Romagnoli,Defender,Centre-Back,left,185,Italy,1995-01-12,12,Associazione Sportiva Roma,Italy,True,18,1,90,1,1,0,0,0,1,0,0,0,0,1,0,0,0,100.0,0.0,1.0
54558,197747,2014,Alessio Romagnoli,Defender,Centre-Back,left,185,Italy,1995-01-12,12,Associazione Sportiva Roma,Italy,True,19,21,1442,13,1,1,0,0,1,0,0,0,0,17,0,7,0,61.9,0.0,0.81
54559,197747,2015,Alessio Romagnoli,Defender,Centre-Back,left,185,Italy,1995-01-12,1038,UC Sampdoria,Italy,True,20,39,3455,15,1,1,0,0,1,0,0,0,0,46,0,13,0,38.46,0.0,1.18
54560,197747,2016,Alessio Romagnoli,Defender,Centre-Back,left,185,Italy,1995-01-12,5,Associazione Calcio Milan,Italy,True,21,38,3411,19,1,0,1,0,0,0,0,0,0,41,0,6,0,50.0,0.0,1.08
54561,197747,2017,Alessio Romagnoli,Defender,Centre-Back,left,185,Italy,1995-01-12,5,Associazione Calcio Milan,Italy,True,22,34,2916,14,4,1,1,1,1,0,0,0,1,38,0,13,0,41.18,0.0,1.12
54562,197747,2018,Alessio Romagnoli,Defender,Centre-Back,left,185,Italy,1995-01-12,5,Associazione Calcio Milan,Italy,True,23,40,3420,20,2,0,0,2,0,0,0,0,0,44,0,9,0,50.0,0.0,1.1
54563,197747,2019,Alessio Romagnoli,Defender,Centre-Back,left,185,Italy,1995-01-12,5,Associazione Calcio Milan,Italy,True,24,39,3509,17,0,0,0,0,0,0,0,0,0,41,0,8,1,43.59,0.0,1.05
54564,197747,2020,Alessio Romagnoli,Defender,Centre-Back,left,185,Italy,1995-01-12,5,Associazione Calcio Milan,Italy,True,25,36,3192,23,2,1,0,0,2,0,0,0,0,43,0,3,0,63.89,0.0,1.19
54565,197747,2021,Alessio Romagnoli,Defender,Centre-Back,left,185,Italy,1995-01-12,5,Associazione Calcio Milan,Italy,True,26,34,2736,18,1,0,0,0,1,0,0,0,0,40,0,7,1,52.94,0.0,1.18


In [None]:
# Write the final table to stats.csv in the current working directory, without the index column.
stats.to_csv('stats.csv', index=False)