In [6]:
# Required Libraries
import pandas as pd
import understatapi
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
client = understatapi.UnderstatClient()

In [7]:
# Importing Haaland Shot Data
player_data = client.player(player = '8260').get_shot_data()
player_data[0]

{'id': '354876',
 'minute': '58',
 'result': 'Goal',
 'X': '0.8880000305175781',
 'Y': '0.6659999847412109',
 'xG': '0.07933320105075836',
 'player': 'Erling Haaland',
 'h_a': 'a',
 'player_id': '8260',
 'situation': 'OpenPlay',
 'season': '2019',
 'shotType': 'LeftFoot',
 'match_id': '12562',
 'h_team': 'Augsburg',
 'a_team': 'Borussia Dortmund',
 'h_goals': '3',
 'a_goals': '5',
 'date': '2020-01-18 14:30:00',
 'player_assisted': 'Jadon Sancho',
 'lastAction': 'Throughball'}

In [8]:
# Creating a dataframe in pandas
df = pd.DataFrame(player_data)
df

Unnamed: 0,id,minute,result,X,Y,xG,player,h_a,player_id,situation,season,shotType,match_id,h_team,a_team,h_goals,a_goals,date,player_assisted,lastAction
0,354876,58,Goal,0.8880000305175781,0.6659999847412109,0.07933320105075836,Erling Haaland,a,8260,OpenPlay,2019,LeftFoot,12562,Augsburg,Borussia Dortmund,3,5,2020-01-18 14:30:00,Jadon Sancho,Throughball
1,354881,69,Goal,0.98,0.48900001525878906,0.9206209778785706,Erling Haaland,a,8260,OpenPlay,2019,LeftFoot,12562,Augsburg,Borussia Dortmund,3,5,2020-01-18 14:30:00,Thorgan Hazard,Pass
2,354883,78,Goal,0.8830000305175781,0.34700000762939454,0.32283100485801697,Erling Haaland,a,8260,OpenPlay,2019,LeftFoot,12562,Augsburg,Borussia Dortmund,3,5,2020-01-18 14:30:00,Marco Reus,Throughball
3,355527,65,BlockedShot,0.8859999847412109,0.639000015258789,0.11918099969625473,Erling Haaland,h,8260,OpenPlay,2019,LeftFoot,12566,Borussia Dortmund,FC Cologne,5,1,2020-01-24 19:30:00,Jadon Sancho,Pass
4,355531,76,Goal,0.955,0.495,0.7466409802436829,Erling Haaland,h,8260,OpenPlay,2019,LeftFoot,12566,Borussia Dortmund,FC Cologne,5,1,2020-01-24 19:30:00,,Rebound
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
480,593830,94,SavedShot,0.9169999694824219,0.5070000076293946,0.11966534703969955,Erling Haaland,a,8260,OpenPlay,2024,Head,26652,Newcastle United,Manchester City,1,1,2024-09-28 11:30:00,Sávio,Cross
481,595454,3,MissedShots,0.9219999694824219,0.6680000305175782,0.11882532387971878,Erling Haaland,h,8260,OpenPlay,2024,LeftFoot,26664,Manchester City,Fulham,3,2,2024-10-05 14:00:00,Phil Foden,Throughball
482,595455,6,BlockedShot,0.7780000305175782,0.3529999923706055,0.06815728545188904,Erling Haaland,h,8260,DirectFreekick,2024,LeftFoot,26664,Manchester City,Fulham,3,2,2024-10-05 14:00:00,,Standard
483,595456,7,SavedShot,0.7980000305175782,0.47099998474121096,0.06431261450052261,Erling Haaland,h,8260,OpenPlay,2024,RightFoot,26664,Manchester City,Fulham,3,2,2024-10-05 14:00:00,Phil Foden,TakeOn


In [9]:
# Convert appropriate columns to their respective dtypes
df['id'] = df['id'].astype(str)  # string
df['minute'] = df['minute'].astype(int)  # Convert to integer
df['X'] = df['X'].astype(float)  # Convert to float
df['Y'] = df['Y'].astype(float)  # Convert to float
df['xG'] = df['xG'].astype(float)  # Convert to float
df['player'] = df['player'].astype(str)  # string
df['h_a'] = df['h_a'].astype(str)  # string
df['player_id'] = df['player_id'].astype(str)  # string
df['situation'] = df['situation'].astype(str)  # string
df['season'] = df['season'].astype(str)  # string
df['shotType'] = df['shotType'].astype(str)  # string
df['match_id'] = df['match_id'].astype(str)  # string
df['h_team'] = df['h_team'].astype(str)  # string
df['a_team'] = df['a_team'].astype(str)  # string
df['h_goals'] = df['h_goals'].astype(int)  # Convert to integer
df['a_goals'] = df['a_goals'].astype(int)  # Convert to integer
df['date'] = pd.to_datetime(df['date'])  # Convert to datetime
df['player_assisted'] = df['player_assisted'].astype(str)  # string
df['lastAction'] = df['lastAction'].astype(str)  # Kstring

print(df.dtypes)

id                         object
minute                      int64
result                     object
X                         float64
Y                         float64
xG                        float64
player                     object
h_a                        object
player_id                  object
situation                  object
season                     object
shotType                   object
match_id                   object
h_team                     object
a_team                     object
h_goals                     int64
a_goals                     int64
date               datetime64[ns]
player_assisted            object
lastAction                 object
dtype: object


In [10]:
# Encode the target variable
df['goal'] = df['result'].apply(lambda x: 1 if x == 'Goal' else 0)

# Select features and target variable
X = df[['minute', 'X', 'Y', 'xG', 'h_goals', 'a_goals']]
y = df['goal']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(classification_report(y_test, y_pred))

Accuracy: 0.82
              precision    recall  f1-score   support

           0       0.85      0.93      0.89        73
           1       0.71      0.50      0.59        24

    accuracy                           0.82        97
   macro avg       0.78      0.72      0.74        97
weighted avg       0.81      0.82      0.81        97



In [12]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=RandomForestClassifier(class_weight='balanced', random_state=42),
                           param_grid=param_grid,
                           cv=5,
                           scoring='f1_weighted',
                           verbose=1)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters
print("Best Parameters:", grid_search.best_params_)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}


540 fits failed out of a total of 1620.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/yuvrajbains/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/yuvrajbains/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/Users/yuvrajbains/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Users/yuvrajbains/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Initialize the model with best parameters
best_model = RandomForestClassifier(
    n_estimators=200,
    max_features='sqrt',
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=1,
    class_weight='balanced',
    random_state=42
)

# Fit the model
best_model.fit(X_train, y_train)

# Evaluate the model using cross-validation
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='f1_weighted')
print(f'Cross-validated F1 Score: {cv_scores.mean()}')

# Feature importances
importances = best_model.feature_importances_
print("Feature Importances:", importances)

Cross-validated F1 Score: 0.8041528392493518
Feature Importances: [0.11819944 0.12820355 0.13765964 0.45687902 0.09549724 0.06356111]
