In [1]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [30]:
import pandas as pd

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv('/content/drive/MyDrive/concatenated_final_dataset.csv')

In [3]:
# View the first few rows of the DataFrame
print("First few rows of the dataset:")
print(df.head())
#summary about the dataset before doing any work
print("summary about the dataset before doing any work")
print(df.describe())

First few rows of the dataset:
        id  minute       result      X      Y        xG               player  \
0  30358.0     5.0  MissedShots  0.923  0.464  0.141077           Leroy Sané   
1  30359.0     6.0  BlockedShot  0.764  0.427  0.012746     Roman Neustädter   
2  30360.0     8.0         Goal  0.929  0.490  0.530869  Klaas-Jan Huntelaar   
3  30362.0    13.0  MissedShots  0.829  0.555  0.075300          Dennis Aogo   
4  30363.0    14.0    SavedShot  0.889  0.610  0.232375           Leroy Sané   

  h_a  player_id situation  season   shotType  match_id      h_team  \
0   h      337.0  OpenPlay  2014.0   LeftFoot    5159.0  Schalke 04   
1   h      346.0  OpenPlay  2014.0  RightFoot    5159.0  Schalke 04   
2   h      340.0  OpenPlay  2014.0  RightFoot    5159.0  Schalke 04   
3   h      334.0  OpenPlay  2014.0  RightFoot    5159.0  Schalke 04   
4   h      337.0  OpenPlay  2014.0   LeftFoot    5159.0  Schalke 04   

          a_team  h_goals  a_goals                 date  \
0 

In [35]:
def preprocess(df, xg):
    # Keep only the desired columns
    if not xg:
        desired_columns = ['minute', 'X', 'Y', 'h_team', 'a_team', 'situation', 'shotType', 'lastAction', 'result', 'player_assisted']
    else:
        desired_columns = ['minute', 'X', 'Y', 'h_team', 'a_team', 'situation', 'shotType', 'lastAction', 'result', 'xG', 'player_assisted']

    df_subset = df[desired_columns]

    # Drop the rows where the value in any row of the 'result' column is 'Son Heung-Min'
    df_subset = df_subset.drop(df_subset[df_subset['result'] == 'Son Heung-Min'].index, axis=0)

    # Replace values in the 'result' column
    df_subset.loc[:, 'result'] = df_subset['result'].replace({'MissedShots': 0, 'BlockedShot': 0, 'SavedShot': 0, 'ShotOnPost': 0, 'OwnGoal': 0, 'Goal': 1})

    # Convert the 'result' column to integers
    df_subset['result'] = df_subset['result'].astype(int)

    # Initialize 'player_assisted' column with zeros
    df_subset['player_assisted'] = 0

    # Set 'player_assisted' to 1 where there's a player name present
    df_subset.loc[df_subset['player_assisted'].notna(), 'player_assisted'] = 1

    # Drop rows with NaN values in any column
    df_subset = df_subset.dropna()

    return df_subset


In [36]:
#feature_engineering
import numpy as np
from sklearn.preprocessing import LabelEncoder

def feature_engineering(df, catboost):
    # Make a copy of the DataFrame to avoid modifying the original
    df_copy = df.copy()

    # Define a function to calculate XDTGi (Distance To Goal)
    def calculate_XDTGi(X, Y):
        return np.sqrt((105 - (X * 105))**2 + (32.5 - (Y * 32.5))**2)

    # Define a function to calculate XATGi (Angle To Goal)
    def calculate_XATGi(X, Y):
        ai = np.arctan(7.32 * (105 - (X * 105)))
        bi = ((105 - (X * 105))**2 + (32.5 - (Y * 68))**2 - (7.32/2)**2)
        return np.abs((ai / bi) * (180 / np.pi))

    # Define a function to categorize y values
    def categorize_y(y_value):
        if y_value <= 0.4:
            return "right"
        elif y_value <= 0.6:
            return "middle"
        else:
            return "left"

    # Define a function to determine foot shot
    def determine_foot_shot(row):
        if row['position'] == 'right' and row['shotType'] == 'LeftFoot':
            return 1
        elif row['position'] == 'left' and row['shotType'] == 'RightFoot':
            return 1
        elif row['position'] == 'middle':
            return 1
        else:
            return 0

    # Apply the functions to the dataset using .loc on the copy
    df_copy.loc[:, 'XDTGi'] = calculate_XDTGi(df_copy['X'], df_copy['Y'])
    df_copy.loc[:, 'XATGi'] = calculate_XATGi(df_copy['X'], df_copy['Y'])

    # Add the categorical feature y_category
    df_copy['position'] = df_copy['Y'].apply(categorize_y)

    # Add the new column footShot
    df_copy['footShot'] = df_copy.apply(determine_foot_shot, axis=1)

    # Apply label encoding to object type columns if not using CatBoost
    if (not catboost):
        label_encoder = LabelEncoder()
        for column in df_copy.columns:
            if df_copy[column].dtype == 'object':
                df_copy[column] = label_encoder.fit_transform(df_copy[column])

    return df_copy

In [37]:
df_subset=preprocess(df,False)
df_subset_regular=feature_engineering(df_subset,False)
df_subset_catboost=feature_engineering(df_subset,True)

In [38]:
#dataset after preprocessing & feature_engineering
print("First few rows of the dataset:")
print((df_subset.head()))
print("summary about the dataset after preprocessing & feature_engineering")
print(df_subset.describe())

# Check for missing values in the entire dataset
print("Missing values per column:")
print(df_subset.isna().sum())

First few rows of the dataset:
   minute      X      Y      h_team         a_team situation   shotType  \
0     5.0  0.923  0.464  Schalke 04  VfB Stuttgart  OpenPlay   LeftFoot   
1     6.0  0.764  0.427  Schalke 04  VfB Stuttgart  OpenPlay  RightFoot   
2     8.0  0.929  0.490  Schalke 04  VfB Stuttgart  OpenPlay  RightFoot   
3    13.0  0.829  0.555  Schalke 04  VfB Stuttgart  OpenPlay  RightFoot   
4    14.0  0.889  0.610  Schalke 04  VfB Stuttgart  OpenPlay   LeftFoot   

    lastAction  result  player_assisted  
0         Pass       0                1  
1       Aerial       0                1  
2  BlockedPass       1                1  
3         Pass       0                1  
4       TakeOn       0                1  
summary about the dataset after preprocessing & feature_engineering
              minute              X              Y         result  \
count  338034.000000  338034.000000  338034.000000  338034.000000   
mean       48.655319       0.846020       0.504946       0.1

In [8]:
#dataset after preprocessing & feature_engineering
print("First few rows of the dataset:")
print((df_subset_catboost.head()))
print(df_subset.head())

First few rows of the dataset:
   minute      X      Y      h_team         a_team situation   shotType  \
0     5.0  0.923  0.464  Schalke 04  VfB Stuttgart  OpenPlay   LeftFoot   
1     6.0  0.764  0.427  Schalke 04  VfB Stuttgart  OpenPlay  RightFoot   
2     8.0  0.929  0.490  Schalke 04  VfB Stuttgart  OpenPlay  RightFoot   
3    13.0  0.829  0.555  Schalke 04  VfB Stuttgart  OpenPlay  RightFoot   
4    14.0  0.889  0.610  Schalke 04  VfB Stuttgart  OpenPlay   LeftFoot   

    lastAction  result      XDTGi     XATGi position  footShot  
0         Pass       0  19.204780  1.683970   middle         1  
1       Aerial       0  30.997513  0.146387   middle         1  
2  BlockedPass       1  18.174367  2.075666   middle         1  
3         Pass       0  23.055279  0.266208   middle         1  
4       TakeOn       0  17.219019  0.439860     left         0  
   minute      X      Y      h_team         a_team situation   shotType  \
0     5.0  0.923  0.464  Schalke 04  VfB Stuttgart  O

In [9]:
print(df_subset_catboost.dtypes)

minute        float64
X             float64
Y             float64
h_team         object
a_team         object
situation      object
shotType       object
lastAction     object
result          int64
XDTGi         float64
XATGi         float64
position       object
footShot        int64
dtype: object


In [39]:
from sklearn.model_selection import train_test_split
from catboost import Pool

# Define the features and target variable
X = df_subset_regular.drop('result', axis=1)  # Features
X_catboost=df_subset_catboost.drop('result', axis=1)
y = df_subset['result']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_catboost, X_test_catboost, y_train_catboost, y_test_catboost = train_test_split(X_catboost, y, test_size=0.2, random_state=42)



In [14]:
# Display the shapes of the train and test sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (270427, 12)
Shape of X_test: (67607, 12)
Shape of y_train: (270427,)
Shape of y_test: (67607,)


In [15]:
print(y_train.unique())
print(y_train.dtype)

[0 1]
int64


In [40]:
# Convert object-type columns to string type in X_train_catboost and X_test_catboost
X_train_catboost = X_train_catboost.astype(str)
X_test_catboost = X_test_catboost.astype(str)

In [41]:
# Convert categorical features to string type in the train and test pools
train_pool = Pool(data=X_train_catboost, label=y_train_catboost, cat_features=['h_team', 'a_team', 'situation', 'shotType', 'lastAction','position'], feature_names=list(X_train_catboost.columns))
test_pool = Pool(data=X_test_catboost, label=y_test_catboost, cat_features=['h_team', 'a_team', 'situation', 'shotType', 'lastAction','position'], feature_names=list(X_test_catboost.columns))


In [42]:
from imblearn.over_sampling import SMOTE

#Balancing the train dataset
# Initialize SMOTE
smote = SMOTE(random_state=42)

# Apply SMOTE to the labels
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
# Apply SMOTE to balance the classes
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)


In [43]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Initialize models
rf_model = RandomForestClassifier()
catboost_model = CatBoostClassifier()
xgboost_model = XGBClassifier()
lgbm_model = LGBMClassifier()
adaboost_model = AdaBoostClassifier()

# Train models
rf_model.fit(X_train_balanced, y_train_balanced)
catboost_model.fit(train_pool)
xgboost_model.fit(X_train_balanced, y_train_balanced)
lgbm_model.fit(X_train_balanced, y_train_balanced)
adaboost_model.fit(X_train_balanced, y_train_balanced)

# Predict probabilities using trained models
rf_probabilities = rf_model.predict_proba(X_test)
catboost_probabilities = catboost_model.predict_proba(test_pool)
xgboost_probabilities = xgboost_model.predict_proba(X_test)
lgbm_probabilities = lgbm_model.predict_proba(X_test)
adaboost_probabilities = adaboost_model.predict_proba(X_test)

Learning rate set to 0.112569
0:	learn: 0.5864071	total: 431ms	remaining: 7m 10s
1:	learn: 0.5079868	total: 798ms	remaining: 6m 37s
2:	learn: 0.4499257	total: 1.17s	remaining: 6m 29s
3:	learn: 0.4054511	total: 1.57s	remaining: 6m 32s
4:	learn: 0.3758651	total: 2s	remaining: 6m 38s
5:	learn: 0.3555217	total: 2.24s	remaining: 6m 10s
6:	learn: 0.3371990	total: 2.6s	remaining: 6m 9s
7:	learn: 0.3237179	total: 3s	remaining: 6m 11s
8:	learn: 0.3140555	total: 3.32s	remaining: 6m 5s
9:	learn: 0.3054551	total: 3.64s	remaining: 6m
10:	learn: 0.2991867	total: 4.01s	remaining: 6m
11:	learn: 0.2947886	total: 4.39s	remaining: 6m 1s
12:	learn: 0.2907107	total: 4.79s	remaining: 6m 3s
13:	learn: 0.2874237	total: 5.2s	remaining: 6m 6s
14:	learn: 0.2850041	total: 5.61s	remaining: 6m 8s
15:	learn: 0.2830651	total: 6s	remaining: 6m 8s
16:	learn: 0.2813607	total: 6.33s	remaining: 6m 5s
17:	learn: 0.2803254	total: 6.79s	remaining: 6m 10s
18:	learn: 0.2789196	total: 7.35s	remaining: 6m 19s
19:	learn: 0.278035

In [44]:
# Evaluate models (using validation data or cross-validation)

(X_val, y_val)=(X_test,y_test)

# Evaluate RandomForestClassifier
rf_accuracy = rf_model.score(X_val, y_val)
# Evaluate CatBoostClassifier
catboost_accuracy = catboost_model.score(test_pool)
# Evaluate XGBClassifier
xgboost_accuracy = xgboost_model.score(X_val, y_val)
# Evaluate LGBMClassifier
lgbm_accuracy = lgbm_model.score(X_val, y_val)
# Evaluate AdaBoostClassifier
adaboost_accuracy = adaboost_model.score(X_val, y_val)

# Print model accuracies
print("RandomForestClassifier Accuracy:", rf_accuracy)
print("CatBoostClassifier Accuracy:", catboost_accuracy)
print("XGBClassifier Accuracy:", xgboost_accuracy)
print("LGBMClassifier Accuracy:", lgbm_accuracy)
print("AdaBoostClassifier Accuracy:", adaboost_accuracy)


RandomForestClassifier Accuracy: 0.8690816039759196
CatBoostClassifier Accuracy: 0.9050542103628323
XGBClassifier Accuracy: 0.8671439347996509
LGBMClassifier Accuracy: 0.8625142366914669
AdaBoostClassifier Accuracy: 0.7941041608117503


In [45]:
from sklearn.metrics import mean_squared_error

# Evaluate RandomForestClassifier
rf_mse = mean_squared_error(y_test, rf_probabilities[:, 1])
rf_error = 1 - rf_accuracy
print("RandomForestClassifier MSE:", rf_mse)
print("RandomForestClassifier Error:", rf_error)

# Evaluate CatBoostClassifier
catboost_mse = mean_squared_error(y_test, catboost_probabilities[:, 1])
catboost_error = 1 - catboost_accuracy
print("CatBoostClassifier MSE:", catboost_mse)
print("CatBoostClassifier Error:", catboost_error)

# Evaluate XGBClassifier
xgboost_mse = mean_squared_error(y_test, xgboost_probabilities[:, 1])
xgboost_error = 1 - xgboost_accuracy
print("XGBClassifier MSE:", xgboost_mse)
print("XGBClassifier Error:", xgboost_error)

# Evaluate LGBMClassifier
lgbm_mse = mean_squared_error(y_test, lgbm_probabilities[:, 1])
lgbm_error = 1 - lgbm_accuracy
print("LGBMClassifier MSE:", lgbm_mse)
print("LGBMClassifier Error:", lgbm_error)

# Evaluate AdaBoostClassifier
adaboost_mse = mean_squared_error(y_test, adaboost_probabilities[:,1])
adaboost_error = 1 - adaboost_accuracy
print("AdaBoostClassifier MSE:", adaboost_mse)
print("AdaBoostClassifier Error:", adaboost_error)


RandomForestClassifier MSE: 0.10038217720222922
RandomForestClassifier Error: 0.1309183960240804
CatBoostClassifier MSE: 0.07596409154392585
CatBoostClassifier Error: 0.09494578963716771
XGBClassifier MSE: 0.10001467327952576
XGBClassifier Error: 0.1328560652003491
LGBMClassifier MSE: 0.10289725096496838
LGBMClassifier Error: 0.13748576330853313
AdaBoostClassifier MSE: 0.2439309024143107
AdaBoostClassifier Error: 0.20589583918824972


In [None]:
# Importing required libraries
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Initialize the deep learning model
dl_model = Sequential()

# Add layers to the model
dl_model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
dl_model.add(Dropout(0.5))
dl_model.add(Dense(32, activation='relu'))
dl_model.add(Dropout(0.5))
dl_model.add(Dense(1, activation='sigmoid'))

# Compile the model
dl_model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.01), metrics=['accuracy'])

# Train the model
dl_model.fit(X_train_balanced, y_train_balanced, epochs=100, batch_size=32, verbose=1)

# Evaluate the model
dl_accuracy = dl_model.evaluate(X_test, y_test, verbose=0)
print("Deep Learning Model Accuracy:", dl_accuracy[1])


In [None]:
original_xG = pd.read_csv('/content/drive/MyDrive/concatenated_final_dataset.csv')
original_xG=preprocess(original_xG,True)
original_xG=feature_engineering(original_xG)
predict_xG=(original_xG.drop(['xG'],axis=1))
original_xG=original_xG.reset_index()
original_xG=original_xG.drop(['index'],axis=1)
predict_xG=predict_xG.reset_index()
predict_xG=predict_xG.drop(['index'],axis=1)

In [None]:
# Calculate the predicted probabilities of xG using the trained models
X = predict_xG.drop('result', axis=1)
rf_predicted_xG = rf_model.predict_proba(X)
catboost_predicted_xG = catboost_model.predict_proba(X)
xgboost_predicted_xG = xgboost_model.predict_proba(X)
lgbm_predicted_xG = lgbm_model.predict_proba(X)
adaboost_predicted_xG = adaboost_model.predict_proba(X)

# Create a DataFrame to store the predicted xG values
predicted_xG_df = pd.DataFrame({
    'RF_predicted_xG': rf_predicted_xG[:, 1],
    'CatBoost_predicted_xG': catboost_predicted_xG[:, 1],
    'XGBoost_predicted_xG': xgboost_predicted_xG[:, 1],
    'LGBM_predicted_xG': lgbm_predicted_xG[:, 1],
    'AdaBoost_predicted_xG': adaboost_predicted_xG[:, 1]
})

# Extract the original xG values
xG = original_xG['xG']
# Merge the original xG values with the predicted xG values
merged_xG = pd.concat([xG, predicted_xG_df], axis=1)

# Compare the predicted xG values with the original xG values
merged_xG['RF_xG_difference'] = merged_xG['RF_predicted_xG'] - merged_xG['xG']
merged_xG['CatBoost_xG_difference'] = merged_xG['CatBoost_predicted_xG'] - merged_xG['xG']
merged_xG['XGBoost_xG_difference'] = merged_xG['XGBoost_predicted_xG'] - merged_xG['xG']
merged_xG['LGBM_xG_difference'] = merged_xG['LGBM_predicted_xG'] - merged_xG['xG']
merged_xG['AdaBosst_xG_difference'] = merged_xG['AdaBoost_predicted_xG'] - merged_xG['xG']

# Display the merged xG data with predicted xG values and differences
print(merged_xG)


              xG  RF_predicted_xG  CatBoost_predicted_xG  \
0       0.141077             0.79               0.388178   
1       0.012746             0.01               0.030785   
2       0.530869             0.63               0.579303   
3       0.075300             0.16               0.192653   
4       0.232375             0.21               0.289411   
...          ...              ...                    ...   
338029  0.054789             0.06               0.103768   
338030  0.127258             0.24               0.420007   
338031  0.080335             0.19               0.261630   
338032  0.031638             0.05               0.125812   
338033  0.450002             0.87               0.488116   

        XGBoost_predicted_xG  LGBM_predicted_xG  AdaBoost_predicted_xG  \
0                   0.761209           0.784651               0.510516   
1                   0.063485           0.064712               0.480712   
2                   0.732128           0.792962          

In [None]:
from sklearn.metrics import mean_squared_error
print(mean_squared_error(merged_xG['RF_predicted_xG'] , merged_xG['xG']))
print(mean_squared_error(merged_xG['CatBoost_predicted_xG'] , merged_xG['xG']))
print(mean_squared_error(merged_xG['XGBoost_predicted_xG'] , merged_xG['xG']))
print(mean_squared_error(merged_xG['LGBM_predicted_xG'] , merged_xG['xG']))
print(mean_squared_error(merged_xG['AdaBoost_predicted_xG'] , merged_xG['xG']))

0.038576178515317444
0.009724966984891857
0.03607229691957938
0.04072413427507297
0.17239806247332545


In [None]:
merged_xG.describe()

Unnamed: 0,xG,RF_predicted_xG,CatBoost_predicted_xG,XGBoost_predicted_xG,LGBM_predicted_xG,AdaBoost_predicted_xG,RF_xG_difference,CatBoost_xG_difference,XGBoost_xG_difference,LGBM_xG_difference,AdaBosst_xG_difference
count,338034.0,338034.0,338034.0,338034.0,338034.0,338034.0,338034.0,338034.0,338034.0,338034.0,338034.0
mean,0.110556,0.167563,0.141035,0.230017,0.248196,0.494896,0.057006,0.030479,0.119461,0.13764,0.384339
std,0.162089,0.247153,0.16323,0.221611,0.22144,0.007817,0.187954,0.093787,0.147653,0.147578,0.157103
min,0.0,0.0,0.000112,0.001321,0.009525,0.460646,-0.827863,-0.718899,-0.731059,-0.636474,-0.479032
25%,0.024697,0.02,0.039118,0.063022,0.078876,0.489254,-0.017831,5e-06,0.0211,0.037903,0.402387
50%,0.050862,0.07,0.079726,0.146462,0.164205,0.494857,0.007776,0.020886,0.07859,0.09668,0.44375
75%,0.097514,0.18,0.176235,0.331862,0.35599,0.500065,0.073263,0.05793,0.181543,0.200295,0.464185
max,0.979887,1.0,0.99541,0.995742,0.992529,0.529031,0.969385,0.915441,0.945901,0.940431,0.50708


In [None]:
# Save the merged_xG DataFrame to a CSV file
merged_xG.to_csv('merged_xG.csv', index=False)


In [None]:
from google.colab import files
files.download('merged_xG.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>