## Contexto:
### Foi coletado estatisticas resumos até a rodada 29 das 20 equipes da liga Serie A Italiana.

### O objetivo é predizer para quandos os confrontos forem acontecer, quais as probabilidades das zonas de chute(Direita,Esquerda,Centro) para a equipe da casa e a equipe visitante se enfrentando.


# **Modelo predição chutes pela esquerda time da casa**

In [29]:
import pandas as pd
import numpy as np
import itertools
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [30]:
data= pd.read_csv("data_consolidado.csv", sep = ';')
data.head(10) #visualizando as 10 primeiras linhas do dataset

Unnamed: 0,Team_Id,Team,Att_Away_Left_Side,Att_Away_Middle_of_the_pitch,Att_Away_Right_Side,Att_Home_Left_Side,Att_Home_Middle_of_the_pitch,Att_Home_Right_Side,Away_Yellow_Mean,Away_Red_Mean,...,Away_red_card,Away_yellow_card,Goals_Home,Home_Shots_pg,Home_Possession%,Home_Pass%,Home_AerialsWon,Home_General_Rating,Home_red_card,Home_yellow_card
0,1,Lecce,0.41,0.23,0.36,0.4,0.25,0.35,2.7,0.3,...,4.0,40.0,15.0,15.1,45.8,79.1,14.3,6.5,0.0,32.0
1,2,Atalanta,0.41,0.23,0.36,0.39,0.24,0.37,2.7,0.0,...,0.0,38.0,31.0,16.7,52.2,83.1,17.9,6.87,1.0,21.0
2,3,Empoli,0.41,0.23,0.36,0.41,0.24,0.35,2.4,0.0,...,0.0,33.0,9.0,12.4,44.2,77.9,11.6,6.43,0.0,33.0
3,4,Lazio,0.4,0.28,0.32,0.42,0.23,0.35,2.7,0.1,...,1.0,40.0,14.0,11.1,51.2,84.8,10.6,6.54,4.0,33.0
4,5,Fiorentina,0.39,0.28,0.33,0.37,0.3,0.33,2.7,0.0,...,0.0,38.0,26.0,15.2,57.0,83.0,16.9,6.7,0.0,23.0
5,6,Inter,0.39,0.27,0.33,0.4,0.27,0.33,1.6,0.0,...,0.0,22.0,36.0,17.6,58.2,88.3,13.6,6.91,0.0,16.0
6,7,Juventus,0.39,0.27,0.34,0.4,0.26,0.35,2.6,0.0,...,0.0,37.0,22.0,15.3,47.9,84.0,13.6,6.71,1.0,27.0
7,8,Monza,0.39,0.26,0.36,0.36,0.25,0.39,2.2,0.1,...,2.0,31.0,18.0,13.1,55.2,86.0,10.7,6.56,1.0,31.0
8,9,Bologna,0.38,0.23,0.39,0.39,0.23,0.38,2.4,0.1,...,1.0,34.0,26.0,14.0,59.0,85.8,11.1,6.74,0.0,33.0
9,10,Cagliari,0.37,0.24,0.39,0.35,0.27,0.38,2.1,0.1,...,2.0,31.0,20.0,14.4,41.9,76.3,14.6,6.51,2.0,23.0


In [31]:
# Dividindo o dataset em dados de casa e de visitante
home_data = data.filter(regex='Home|Team_Id|Team')
away_data = data.filter(regex='Away|Team_Id|Team')

### Por termos um conjunto de dados com apenas 20 linhas com as estatisticas por equiepes, a abordagem aqui foi gerar todos os confrontos possiveis entre as equipes(38 Rodadas, 380 Partidas)

In [32]:
# Criando um dicionário para mapear Team_Id para Team
team_id_to_name = pd.Series(data.Team.values,index=data.Team_Id).to_dict()

# Criando uma lista de todos os IDs de equipe
team_ids = data['Team_Id'].unique()

# Gerando todos os possíveis pares de confrontos
confrontos = list(itertools.product(team_ids, repeat=2))

# Removendo pares onde uma equipe enfrenta a si mesma
confrontos = [par for par in confrontos if par[0] != par[1]]

# Criando um DataFrame vazio para armazenar os dados dos confrontos
confrontos_df = pd.DataFrame()

In [33]:
for home_team, away_team in confrontos:
    home_stats = home_data[home_data['Team_Id'] == home_team].drop(['Team_Id', 'Team'], axis=1).add_prefix('Home_')
    away_stats = away_data[away_data['Team_Id'] == away_team].drop(['Team_Id', 'Team'], axis=1).add_prefix('Away_')

    confronto_row = pd.concat([home_stats.reset_index(drop=True), away_stats.reset_index(drop=True)], axis=1)
    confronto_row['Home_Team_Id'] = home_team
    confronto_row['Away_Team_Id'] = away_team

    # Corrigindo a adição dos nomes das equipes
    confronto_row['Home_Team_Name'] = team_id_to_name[home_team]
    confronto_row['Away_Team_Name'] = team_id_to_name[away_team]

    confrontos_df = pd.concat([confrontos_df, confronto_row], ignore_index=True)

In [34]:
#Dataset com todos confrontos possiveis
confrontos_df

Unnamed: 0,Home_Att_Home_Left_Side,Home_Att_Home_Middle_of_the_pitch,Home_Att_Home_Right_Side,Home_Home_Yellow_Mean,Home_Home_Red_Mean,Home_Def_Home_ShotsBlocked_pg,Home_Def_Home_Shots_pg,Home_Def_Home_Tackles_pg,Home_Def_Home_Interceptions_pg,Home_Def_Home_Fouls_pg,...,Away_Away_Possession%,Away_Away_Pass%,Away_Away_AerialsWon,Away_Away_General_Rating,Away_Away_red_card,Away_Away_yellow_card,Home_Team_Id,Away_Team_Id,Home_Team_Name,Away_Team_Name
0,0.40,0.25,0.35,2.3,0.0,2.4,11.6,16.4,7.9,12.8,...,48.4,80.9,15.4,6.58,0.0,38.0,1,2,Lecce,Atalanta
1,0.40,0.25,0.35,2.3,0.0,2.4,11.6,16.4,7.9,12.8,...,44.6,80.1,14.2,6.48,0.0,33.0,1,3,Lecce,Empoli
2,0.40,0.25,0.35,2.3,0.0,2.4,11.6,16.4,7.9,12.8,...,52.2,84.4,10.5,6.57,1.0,40.0,1,4,Lecce,Lazio
3,0.40,0.25,0.35,2.3,0.0,2.4,11.6,16.4,7.9,12.8,...,56.0,83.2,15.5,6.52,0.0,38.0,1,5,Lecce,Fiorentina
4,0.40,0.25,0.35,2.3,0.0,2.4,11.6,16.4,7.9,12.8,...,53.2,85.8,15.1,6.88,0.0,22.0,1,6,Lecce,Inter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,0.34,0.29,0.37,2.1,0.1,2.8,9.8,16.9,7.1,12.9,...,58.8,87.3,10.5,6.62,5.0,38.0,20,15,Roma,AC Milan
376,0.34,0.29,0.37,2.1,0.1,2.8,9.8,16.9,7.1,12.9,...,44.7,77.1,13.2,6.35,0.0,26.0,20,16,Roma,Salernitana
377,0.34,0.29,0.37,2.1,0.1,2.8,9.8,16.9,7.1,12.9,...,48.4,78.9,11.8,6.36,2.0,27.0,20,17,Roma,Sassuolo
378,0.34,0.29,0.37,2.1,0.1,2.8,9.8,16.9,7.1,12.9,...,41.5,77.2,16.4,6.50,2.0,34.0,20,18,Roma,Genoa


### Criação de novas features

In [35]:
# Ataques
confrontos_df['Diff_Attack_Left'] = confrontos_df['Home_Att_Home_Left_Side'] - confrontos_df['Away_Att_Away_Left_Side']
confrontos_df['Diff_Attack_Middle'] = confrontos_df['Home_Att_Home_Middle_of_the_pitch'] - confrontos_df['Away_Att_Away_Middle_of_the_pitch']
confrontos_df['Diff_Attack_Right'] = confrontos_df['Home_Att_Home_Right_Side'] - confrontos_df['Away_Att_Away_Right_Side']

In [36]:
# Shots for home
confrontos_df['Diff_Attack_x_Defend_Left_Home'] = confrontos_df['Home_ShotSide_Home_For_Left_Side'] - confrontos_df['Away_ShotSide_Away_Against_Right_Side']
confrontos_df['Diff_Attack_X_Defend_Middle_Home'] = confrontos_df['Home_ShotSide_Home_For_Attempts_middle'] - confrontos_df['Away_ShotSide_Away_Against_Attempts_middle']
confrontos_df['Diff_Attack_X_Defend_Right_Home'] = confrontos_df['Home_ShotSide_Home_For_Right_Side'] - confrontos_df['Away_ShotSide_Away_Against_Left_Side']
# Shots for Away
confrontos_df['Diff_Attack_x_Defend_Left_Away'] = confrontos_df['Away_ShotSide_Away_For_Left_Side'] - confrontos_df['Home_ShotSide_Against_Home_Right_Side']
confrontos_df['Diff_Attack_X_Defend_Middle_Away'] = confrontos_df['Away_ShotSide_Away_For_Attempts_middle'] - confrontos_df['Home_ShotSide_Against_Home_Attempts_middle']
confrontos_df['Diff_Attack_X_Defend_Right_Away'] = confrontos_df['Away_ShotSide_Away_For_Right_Side'] - confrontos_df['Home_ShotSide_Against_Home_For_Left_Side']

In [37]:
# Shots Home x Def
confrontos_df['Diff_Shot_Home_Away'] = confrontos_df['Home_Offensive_Home_Shots_pg'] - confrontos_df['Away_Def_Away_Shots_pg']
confrontos_df['Diff_Shot_Away_Home'] = confrontos_df['Away_Offensive_Away_Shots_pg'] - confrontos_df['Home_Def_Home_Shots_pg']
confrontos_df['Diff_Shot_OT_Home'] = confrontos_df['Home_Offensive_Home_Shots_OT_pg'] - confrontos_df['Away_Def_Away_ShotsBlocked_pg']
confrontos_df['Diff_Shot_OT_Away'] = confrontos_df['Away_Offensive_Away_Shots_OT_pg'] - confrontos_df['Home_Def_Home_ShotsBlocked_pg']

In [38]:
# Chutes a favor por zona
confrontos_df['Diff_Shot_For_Left'] = confrontos_df['Home_ShotSide_Home_For_Left_Side'] - confrontos_df['Away_ShotSide_Away_For_Left_Side']
confrontos_df['Diff_Shot_For_Middle'] = confrontos_df['Home_ShotSide_Home_For_Attempts_middle'] - confrontos_df['Away_ShotSide_Away_For_Attempts_middle']
confrontos_df['Diff_Shot_For_Right'] = confrontos_df['Home_ShotSide_Home_For_Right_Side'] - confrontos_df['Away_ShotSide_Away_For_Right_Side']

In [39]:
# Médias de cartões
confrontos_df['Diff_Yellow_Cards'] = confrontos_df['Home_Home_Yellow_Mean'] - confrontos_df['Away_Away_Yellow_Mean']
confrontos_df['Diff_Red_Cards'] = confrontos_df['Home_Home_Red_Mean'] - confrontos_df['Away_Away_Red_Mean']

In [40]:
# Considerando que o desempenho defensivo e ofensivo é importante, vamos calcular as médias das estatísticas defensivas e ofensivas.
defensive_features = ['Shots_pg', 'Tackles_pg', 'Interceptions_pg', 'Fouls_pg', 'Offsides_pg']
offensive_features = ['Shots_pg', 'Shots_OT_pg', 'Dribbles_pg', 'Fouled_pg']

In [41]:
# Médias defensivas
for feature in defensive_features:
    home_feature = 'Home_Def_Home_' + feature
    away_feature = 'Away_Def_Away_' + feature
    confrontos_df['Diff_Def_' + feature] = confrontos_df[home_feature] - confrontos_df[away_feature]

In [42]:
# Médias ofensivas
for feature in offensive_features:
    home_feature = 'Home_Offensive_Home_' + feature
    away_feature = 'Away_Offensive_Away_' + feature
    confrontos_df['Diff_Off_' + feature] = confrontos_df[home_feature] - confrontos_df[away_feature]

In [43]:
confrontos_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 91 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Home_Att_Home_Left_Side                     380 non-null    float64
 1   Home_Att_Home_Middle_of_the_pitch           380 non-null    float64
 2   Home_Att_Home_Right_Side                    380 non-null    float64
 3   Home_Home_Yellow_Mean                       380 non-null    float64
 4   Home_Home_Red_Mean                          380 non-null    float64
 5   Home_Def_Home_ShotsBlocked_pg               380 non-null    float64
 6   Home_Def_Home_Shots_pg                      380 non-null    float64
 7   Home_Def_Home_Tackles_pg                    380 non-null    float64
 8   Home_Def_Home_Interceptions_pg              380 non-null    float64
 9   Home_Def_Home_Fouls_pg                      380 non-null    float64
 10  Home_Def_Home_

In [44]:
confrontos_engineered_model = confrontos_df

In [45]:
confrontos_engineered_model = confrontos_engineered_model.drop(['Home_Team_Id'], axis=1)
confrontos_engineered_model = confrontos_engineered_model.drop(['Away_Team_Id'], axis=1)
confrontos_engineered_model = confrontos_engineered_model.drop(['Home_Team_Name'], axis=1)
confrontos_engineered_model = confrontos_engineered_model.drop(['Away_Team_Name'], axis=1)

In [47]:
X = confrontos_engineered_model.drop(['Home_ShotSide_Home_For_Left_Side'], axis=1)
y = confrontos_engineered_model['Home_ShotSide_Home_For_Left_Side']
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 380 entries, 0 to 379
Data columns (total 86 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Home_Att_Home_Left_Side                     380 non-null    float64
 1   Home_Att_Home_Middle_of_the_pitch           380 non-null    float64
 2   Home_Att_Home_Right_Side                    380 non-null    float64
 3   Home_Home_Yellow_Mean                       380 non-null    float64
 4   Home_Home_Red_Mean                          380 non-null    float64
 5   Home_Def_Home_ShotsBlocked_pg               380 non-null    float64
 6   Home_Def_Home_Shots_pg                      380 non-null    float64
 7   Home_Def_Home_Tackles_pg                    380 non-null    float64
 8   Home_Def_Home_Interceptions_pg              380 non-null    float64
 9   Home_Def_Home_Fouls_pg                      380 non-null    float64
 10  Home_Def_Home_

In [51]:
#Modelagem
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instanciando o modelo de Gradient Boosting
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
model.fit(X_train, y_train)

In [52]:
# Executando a validação cruzada
scores = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')

# Calculando a média dos scores
mean_mse = np.mean(scores)

print(f"Mean MSE from cross-validation: {mean_mse}")

Mean MSE from cross-validation: -0.0009218260299695737


In [53]:
#importância das características
feature_importance = model.feature_importances_

# Criar um DataFrame para visualizar a importância
features_df = pd.DataFrame({'Features': X.columns, 'Importance': feature_importance})
features_df = features_df.sort_values(by='Importance', ascending=False)

features_df

Unnamed: 0,Features,Importance
19,Home_ShotSide_Home_For_Attempts_middle,0.558699
28,Home_Home_yellow_card,0.214066
17,Home_ShotSide_Against_Home_Attempts_middle,0.034771
62,Diff_Attack_x_Defend_Left_Home,0.031851
20,Home_ShotSide_Home_For_Right_Side,0.025960
...,...,...
56,Away_Away_General_Rating,0.000000
58,Away_Away_yellow_card,0.000000
33,Away_Away_Red_Mean,0.000000
39,Away_Def_Away_Offsides_pg,0.000000


In [54]:
# Definindo o espaço de parâmetros
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.1, 0.05],
    'max_depth': [3, 5]
}

# Criando o objeto GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Executando a busca
grid_search.fit(X, y)

# Melhores parâmetros e score
print("Melhores Parâmetros:", grid_search.best_params_)
print("Melhor MSE:", grid_search.best_score_)

Melhores Parâmetros: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Melhor MSE: -0.0009218260299695737


In [55]:
# Fazendo previsões no conjunto de teste
y_pred = model.predict(X_test)

In [56]:
# Avaliando o modelo
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 1.9698338439052637e-07
R^2 Score: 0.9998980998756681


### Simulação de uma rodada

In [57]:
jogos_rodada = [(19, 12),(10, 16), (17, 14), (18, 8), (1, 13), (15, 3),
                (7, 2), (5, 20), (9, 6), (4, 11)]

In [58]:
jogos_df = confrontos_df[confrontos_df.apply(lambda x: (x['Home_Team_Id'], x['Away_Team_Id']) in jogos_rodada, axis=1)]

In [59]:
X_jogos_rodada = jogos_df.drop(['Home_ShotSide_Home_For_Left_Side','Home_Team_Id', 'Away_Team_Id', 'Home_Team_Name', 'Away_Team_Name'], axis=1)

In [60]:
jogos_df['Predicted_Home_ShotSide_Left_Side'] = model.predict(X_jogos_rodada)

# Mostrar as previsões para a rodada
previsoes_rodada = jogos_df[['Home_Team_Id', 'Home_Team_Name', 'Away_Team_Id', 'Away_Team_Name', 'Predicted_Home_ShotSide_Left_Side']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  jogos_df['Predicted_Home_ShotSide_Left_Side'] = model.predict(X_jogos_rodada)


In [61]:
previsoes_rodada

Unnamed: 0,Home_Team_Id,Home_Team_Name,Away_Team_Id,Away_Team_Name,Predicted_Home_ShotSide_Left_Side
11,1,Lecce,13,Verona,0.240133
66,4,Lazio,11,Udinese,0.169908
94,5,Fiorentina,20,Roma,0.139991
115,7,Juventus,2,Atalanta,0.210086
157,9,Bologna,6,Inter,0.251216
185,10,Cagliari,16,Salernitana,0.150035
268,15,AC Milan,3,Empoli,0.240036
317,17,Sassuolo,14,Frosinone,0.280095
330,18,Genoa,8,Monza,0.180306
353,19,Napoli,12,Torino,0.199948
