<a href="https://colab.research.google.com/github/vincenzoaltavilla/thesis_project/blob/main/thesis_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **DOWNLOAD DATASET DA KAGGLE**

LINK: https://www.kaggle.com/code/chrisferentinos/premier-league-championship-team-analysis/notebook

In [2]:
import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read() call in the download loop.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<percent-encoded download URL>" entries.
# NOTE(review): the URL is a signed link carrying X-Goog-Date=20240513T123038Z and
# X-Goog-Expires=259200 (seconds), so it presumably stops working three days after
# that timestamp and has to be regenerated before the notebook is re-run — confirm.
DATA_SOURCE_MAPPING = 'player-scores:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F1071543%2F7910805%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240513%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240513T123038Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D5dbaaf017c12182f634d8c1fec203fa58901853c20e56ee075e36388012649764b07773d41a09a399aa4ae884b0dafebf5c2b5d8ad2d508e2f4a459ad35be5ddf5d497c07d3f30b52a4b53e42df4b020adae2c4e31ffc23856841c074fb9d64d4079704f7b18d61c88f4d63687fb5ed7594171c021fefc262f47ba860174a11363cee1a78a9c335dd4e6b10e21f4278614918f4b7f00d5ef915a05d6264f444286f2b1d0737abd3eb4ce7728c2c53676855107ed5635c8c047276bb7390407adf7e1964056462a0d1894b13e608177cace3e8bf7f35945feeb48d31b1536279ab4d670a35451d9a0a105e4c8732f3624d293fcbcabe955cd268e0612fcb4cfc0'

# Directories that hold the downloaded input data and the working files.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible part of the notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<encoded url>" entry of DATA_SOURCE_MAPPING into a
# temporary file and unpack it under KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so an extra ':' in the URL part cannot break unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header can be missing or zero; in that case the
            # progress bar stays empty instead of raising TypeError/ZeroDivisionError.
            total_bytes = int(total_length) if total_length and total_length.isdigit() else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch re-opens the file by name and
            # would otherwise see a truncated archive.
            tfile.flush()
            # NOTE(review): extractall() trusts the member paths stored in the archive;
            # only use it with archives from a trusted source.
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # The handle must not be named `tarfile`: that would rebind the module
                # name and break the next non-zip archive in the loop.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is handled before OSError (its base class) to give the expiry hint.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

Downloading player-scores, 152669203 bytes compressed
Downloaded and uncompressed: player-scores


# **ACQUISIZIONE DATASET**



In [3]:
import os
import warnings
# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')

import plotly.io as pio
# Render Plotly figures with the 'colab' renderer by default.
pio.renderers.default='colab'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio  # duplicate of the plotly.io import above
from datetime import datetime


# Load every table of the "player-scores" dataset.
# The dataset directory is defined once so that a different location only has to be
# changed in a single place; the resulting file paths are the same as before.
PLAYER_SCORES_DIR = "/kaggle/input/player-scores"

appearances = pd.read_csv(f"{PLAYER_SCORES_DIR}/appearances.csv")
club_games = pd.read_csv(f"{PLAYER_SCORES_DIR}/club_games.csv")
clubs = pd.read_csv(f"{PLAYER_SCORES_DIR}/clubs.csv")
competitions = pd.read_csv(f"{PLAYER_SCORES_DIR}/competitions.csv")
game_events = pd.read_csv(f"{PLAYER_SCORES_DIR}/game_events.csv")
game_lineups = pd.read_csv(f"{PLAYER_SCORES_DIR}/game_lineups.csv")
games = pd.read_csv(f"{PLAYER_SCORES_DIR}/games.csv")
player_valuations = pd.read_csv(f"{PLAYER_SCORES_DIR}/player_valuations.csv")
players = pd.read_csv(f"{PLAYER_SCORES_DIR}/players.csv")

# **AGGIUSTAMENTI PRELIMINARI SUI TIPI DI DATI**

In [6]:
# Convert the date columns of the tables to datetime so the .dt accessor can be used.
players['date_of_birth'] = pd.to_datetime(players['date_of_birth'])
appearances['date'] = pd.to_datetime(appearances['date'])
game_events['date'] = pd.to_datetime(game_events['date'])
player_valuations['date'] = pd.to_datetime(player_valuations['date'])

# Age of every player in completed years as of today.
# A plain difference of calendar years over-counts by one for players whose birthday
# has not occurred yet this year, so that case is subtracted explicitly.
# Missing birth dates still produce a missing age.
today = datetime.now()
birth_dates = players['date_of_birth']
birthday_still_ahead = (birth_dates.dt.month > today.month) | (
    (birth_dates.dt.month == today.month) & (birth_dates.dt.day > today.day)
)
players['age'] = today.year - birth_dates.dt.year - birthday_still_ahead.astype(int)

# **INIZIO LAVORO SUI DATASET**

In [7]:
# Players whose last recorded season is 2023 (original note: players belonging to a
# European competition as of 31/08/2023, 6499 rows), ordered by last name and reduced
# to the columns listed below.
player_columns = [
    'player_id', 'first_name', 'last_name', 'name', 'age', 'country_of_citizenship',
    'date_of_birth', 'height_in_cm', 'current_club_name',
    'current_club_id', 'contract_expiration_date',
    'position', 'sub_position',
    'current_club_domestic_competition_id',
    'market_value_in_eur',
    'highest_market_value_in_eur',
]
last_updated_players = (
    players.loc[players['last_season'] == 2023]
    .sort_values(by='last_name')
)[player_columns]

In [8]:
# Appearances of every player in last_updated_players (original note: appearances
# from 05-07-2012 to 17-03-2024 of the players belonging to a European competition
# as of 31/08/2023), one row per appearance, ordered chronologically.
appearance_columns = [
    'player_id', 'first_name', 'last_name', 'name', 'country_of_citizenship',
    'date_of_birth', 'current_club_name', 'current_club_id',
    'current_club_domestic_competition_id', 'contract_expiration_date',
    'position', 'sub_position',
    'market_value_in_eur', 'highest_market_value_in_eur',
    'appearance_id', 'game_id', 'player_club_id',
    'date', 'competition_id', 'yellow_cards',
    'red_cards', 'goals', 'assists', 'minutes_played',
]
last_updated_players_appearances = (
    last_updated_players
    .merge(appearances, on='player_id', how='inner')
    .loc[:, appearance_columns]
    # productivity = goals + assists for the single appearance
    .assign(productivity=lambda frame: frame['goals'] + frame['assists'])
    .sort_values(by='date', ascending=True)
)

In [53]:
# Year-by-year appearances, statistics and market value of a single player.
player_name = "Kylian Mbappé"

# Resolve the player once and fail with a clear message when the name is unknown,
# instead of an IndexError raised by .iat[0] on an empty selection.
# When several players share the name, the first matching row is used.
player_rows = players[players['name'] == player_name]
if player_rows.empty:
    raise ValueError(f"No player named {player_name!r} found in players")
player_id = player_rows['player_id'].iat[0]
print(player_id)

player_position = player_rows['sub_position'].iloc[0]

if player_position == "Goalkeeper":
  pass # placeholder: switch the analysed features depending on the player's role

# Market value per year. The rows are put in chronological order first so that 'last'
# really is the latest valuation recorded in each year, and .assign() works on a new
# frame instead of writing a column into a filtered slice of player_valuations.
player_valuation = (
    player_valuations[player_valuations['player_id'] == player_id]
    .sort_values('date', kind='stable')
    .assign(year=lambda frame: frame['date'].dt.year)
    .groupby('year')
    .agg({'market_value_in_eur': 'last'})
    .reset_index()
)

# Per-year totals of the player's appearances.
last_updated_player_appearances = (
    last_updated_players_appearances[last_updated_players_appearances['player_id'] == player_id]
    .assign(year=lambda frame: frame['date'].dt.year)
    .groupby('year')
    .agg({
        'game_id': 'count',  # number of appearance rows, i.e. games played
        'player_club_id': 'first',
        'minutes_played': 'sum',
        'goals': 'sum',
        'assists': 'sum',
        'yellow_cards': 'sum',
        'red_cards': 'sum',
        'productivity': 'sum'
    })
    .reset_index()
    .rename(columns={"game_id": "games_played"})
)

342229


In [54]:
# Display the per-year statistics table of the selected player.
last_updated_player_appearances

Unnamed: 0,year,games_played,player_club_id,minutes_played,goals,assists,yellow_cards,red_cards,productivity
0,2015,3,162,56,0,1,0,0,1
1,2016,23,162,792,4,7,3,0,11
2,2017,46,162,3311,30,15,3,0,45
3,2018,33,583,2550,21,10,4,1,31
4,2019,37,583,2818,38,12,4,0,50
5,2020,31,583,2286,21,12,2,0,33
6,2021,47,583,4037,34,17,10,0,51
7,2022,41,583,3545,41,16,9,0,57
8,2023,43,583,3580,37,6,5,0,43
9,2024,11,583,785,10,3,1,0,13


In [55]:
# Display the per-year market value table of the selected player.
player_valuation

Unnamed: 0,year,market_value_in_eur
0,2015,50000
1,2016,4000000
2,2017,90000000
3,2018,200000000
4,2019,200000000
5,2020,180000000
6,2021,160000000
7,2022,180000000
8,2023,180000000


In [56]:
# Line chart of the player's yearly productivity (the 'productivity' column is
# goals + assists).
fig = px.line(last_updated_player_appearances, x='year', y='productivity', title=f'{player_name}\'s goals and assists over time')

# px.line traces are drawn as lines only, so the marker styling had no visible effect;
# mode='lines+markers' makes the yearly data points appear.
fig.update_traces(mode='lines+markers',
                  line=dict(color='red', width=2),
                  marker=dict(color='red', size=8))

# One tick per year on the x axis; the y axis is labelled as goals + assists because
# that is what the plotted column contains (it previously read just 'Goals').
fig.update_layout(title_font=dict(size=24, family='Arial', color='red'),
                  xaxis=dict(title='Time', tickmode='linear', dtick=1, tickfont=dict(size=12), showgrid=True, gridcolor='LightPink'),
                  yaxis=dict(title='Goals + assists', tickfont=dict(size=12), showgrid=True, gridcolor='LightPink'),
                  legend=dict(title='', font=dict(size=12, color='red')))

fig.show()

In [57]:
# Line chart of the player's market value per year.
# player_valuation holds the 'last' valuation of each year, not a mean, so the title
# and axis label no longer call it an average.
fig = px.line(player_valuation, x='year', y='market_value_in_eur', title=f'{player_name}\'s market value per year')

# px.line traces are drawn as lines only, so the marker styling had no visible effect;
# mode='lines+markers' makes the yearly data points appear.
fig.update_traces(mode='lines+markers',
                  line=dict(color='red', width=2),
                  marker=dict(color='red', size=8))

fig.update_layout(title_font=dict(size=24, family='Arial', color='red'),
                  xaxis=dict(title='Year', tickmode='linear', dtick=1, tickfont=dict(size=12), showgrid=True, gridcolor='LightPink'),
                  yaxis=dict(title='Market value (EUR)', tickfont=dict(size=12), showgrid=True, gridcolor='LightPink'),
                  legend=dict(title='', font=dict(size=12, color='red')))

fig.show()

# **ALTRI TEST**

In [26]:
# Career totals of every player in last_updated_players_appearances, i.e. players
# with at least one appearance (original note: players belonging to a European
# competition as of 31/08/2023; 6132 rows when grouped by name).
# Rows are grouped on player_id, not on name: names are not guaranteed to be unique,
# and grouping on 'name' silently adds up the statistics of different players who
# share one. The result is still indexed by 'name', with the same columns as before.
last_updated_players_statistics = (
    last_updated_players_appearances
    .groupby('player_id', as_index=False)
    .agg({
        'name': 'first',
        'first_name': 'first',
        'last_name': 'first',
        'position': 'first',
        'game_id': 'count',  # number of appearance rows, i.e. games played
        'sub_position': 'first',
        'market_value_in_eur': 'first',
        'country_of_citizenship': 'first',
        'date_of_birth': 'first',
        'current_club_name': 'first',
        'current_club_domestic_competition_id': 'first',
        'contract_expiration_date': 'first',
        'minutes_played': 'sum',
        'goals': 'sum',
        'assists': 'sum',
        'yellow_cards': 'sum',
        'red_cards': 'sum'
    })
    .set_index('name')
)

# Age in completed years as of today: a plain difference of calendar years would
# over-count by one for players whose birthday is still ahead this year.
stats_today = datetime.now()
stats_birth_dates = last_updated_players_statistics['date_of_birth']
stats_birthday_ahead = (stats_birth_dates.dt.month > stats_today.month) | (
    (stats_birth_dates.dt.month == stats_today.month) & (stats_birth_dates.dt.day > stats_today.day)
)
last_updated_players_statistics['age'] = (
    stats_today.year - stats_birth_dates.dt.year - stats_birthday_ahead.astype(int)
)

# Youngest first; ties broken by more goals, more assists, then fewer red and yellow cards.
last_updated_players_statistics = last_updated_players_statistics.sort_values(by=['date_of_birth', 'goals', 'assists', 'red_cards', 'yellow_cards'], ascending=[False, False, False, True, True])

In [27]:
# Career summary rows of the player(s) with the given last name.
player_last_name = "Giroud"
last_name_matches = last_updated_players_statistics['last_name'].eq(player_last_name)
last_updated_player_statistic = last_updated_players_statistics.loc[last_name_matches]

In [29]:
# Career summary rows of all players whose current club is the given team.
team = "Associazione Calcio Milan"
plays_for_team = last_updated_players_statistics['current_club_name'].eq(team)
last_updated_team_statistic = last_updated_players_statistics.loc[plays_for_team]