In [26]:
import pandas as pd

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv('/content/drive/MyDrive/concatenated_final_dataset.csv')

# View the first few rows of the DataFrame
print("First few rows of the dataset:")
print(df.head())

print(df.describe())
# View the last few rows
#print("Last few rows of the dataset:")
#print(df.tail())


First few rows of the dataset:
        id  minute       result      X      Y        xG               player  \
0  30358.0     5.0  MissedShots  0.923  0.464  0.141077           Leroy Sané   
1  30359.0     6.0  BlockedShot  0.764  0.427  0.012746     Roman Neustädter   
2  30360.0     8.0         Goal  0.929  0.490  0.530869  Klaas-Jan Huntelaar   
3  30362.0    13.0  MissedShots  0.829  0.555  0.075300          Dennis Aogo   
4  30363.0    14.0    SavedShot  0.889  0.610  0.232375           Leroy Sané   

  h_a  player_id situation  season   shotType  match_id      h_team  \
0   h      337.0  OpenPlay  2014.0   LeftFoot    5159.0  Schalke 04   
1   h      346.0  OpenPlay  2014.0  RightFoot    5159.0  Schalke 04   
2   h      340.0  OpenPlay  2014.0  RightFoot    5159.0  Schalke 04   
3   h      334.0  OpenPlay  2014.0  RightFoot    5159.0  Schalke 04   
4   h      337.0  OpenPlay  2014.0   LeftFoot    5159.0  Schalke 04   

          a_team  h_goals  a_goals                 date  \
0 

In [31]:
#Preprocessing
from sklearn.preprocessing import LabelEncoder

def preprocess(df,xg):
    # Keep only the desired columns
    if(not xg):
      desired_columns = ['minute', 'X', 'Y' , 'h_team', 'a_team', 'situation', 'shotType', 'lastAction' , 'result']
      df_subset = df[desired_columns]
    else:
      desired_columns = ['minute', 'X', 'Y' , 'h_team', 'a_team', 'situation', 'shotType', 'lastAction' , 'result','xG']
      df_subset = df[desired_columns]
    # Drop the rows where the value in any row of the 'result' column is 'Son Heung-Min'
    df_subset = df_subset.drop(df_subset[df_subset['result'] == 'Son Heung-Min'].index, axis=0)

    # Replace values in the 'result' column
    df_subset.loc[:, 'result'] = df_subset['result'].replace({'MissedShots': 0, 'BlockedShot': 0, 'SavedShot': 0, 'ShotOnPost': 0, 'OwnGoal': 0, 'Goal': 1})

    # Convert the 'result' column to integers
    df_subset['result'] = df_subset['result'].astype(int)

    # Drop rows with NaN values in any column
    df_subset = df_subset.dropna()

    return df_subset


In [32]:
#feature_engineering
import numpy as np
from sklearn.preprocessing import LabelEncoder

def feature_engineering(df):
    # Define a function to calculate XDTGi (Distance To Goal)
    def calculate_XDTGi(X, Y):
        return np.sqrt((105 - (X * 105))**2 + (32.5 - (Y * 32.5))**2)

    # Define a function to calculate XATGi (Angle To Goal)
    def calculate_XATGi(X, Y):
        ai = np.arctan(7.32 * (105 - (X * 105)))
        bi = ((105 - (X * 105))**2 + (32.5 - (Y * 68))**2 - (7.32/2)**2)
        return np.abs((ai / bi) * (180 / np.pi))

    # Apply the functions to the dataset using .loc
    df.loc[:, 'XDTGi'] = calculate_XDTGi(df['X'], df['Y'])
    df.loc[:, 'XATGi'] = calculate_XATGi(df['X'], df['Y'])

    # Apply label encoding to object type columns
    label_encoder = LabelEncoder()
    for column in df.columns:
        if df[column].dtype == 'object':
            df[column] = label_encoder.fit_transform(df[column])

    return df


In [35]:
df_subset=preprocess(df,False)
df_subset=feature_engineering(df_subset)

In [36]:
print((df_subset.head()))

   minute      X      Y  h_team  a_team  situation  shotType  lastAction  \
0     5.0  0.923  0.464     148     172          2         1          26   
1     6.0  0.764  0.427     148     172          2         3           0   
2     8.0  0.929  0.490     148     172          2         3           3   
3    13.0  0.829  0.555     148     172          2         3          26   
4    14.0  0.889  0.610     148     172          2         1          38   

   result      XDTGi     XATGi  
0       0  19.204780  1.683970  
1       0  30.997513  0.146387  
2       1  18.174367  2.075666  
3       0  23.055279  0.266208  
4       0  17.219019  0.439860  


In [37]:
from sklearn.model_selection import train_test_split

# Define the features and target variable
X = df_subset.drop('result', axis=1)  # Features
y = df_subset['result']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the train and test sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)


Shape of X_train: (270427, 10)
Shape of X_test: (67607, 10)
Shape of y_train: (270427,)
Shape of y_test: (67607,)


In [9]:
print(y_train.unique())
print(y_train.dtype)

[0 1]
int64


In [10]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Initialize SMOTE
smote = SMOTE(random_state=42)
# Apply SMOTE to the labels
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
# Apply SMOTE to balance the classes
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Initialize models
rf_model = RandomForestClassifier()
catboost_model = CatBoostClassifier()
xgboost_model = XGBClassifier()
lgbm_model = LGBMClassifier()

# Train models
rf_model.fit(X_train_balanced, y_train_balanced)
catboost_model.fit(X_train_balanced, y_train_balanced)
xgboost_model.fit(X_train_balanced, y_train_balanced)
lgbm_model.fit(X_train_balanced, y_train_balanced)

# Predict probabilities using trained models
rf_probabilities = rf_model.predict_proba(X_test)
catboost_probabilities = catboost_model.predict_proba(X_test)
xgboost_probabilities = xgboost_model.predict_proba(X_test)
lgbm_probabilities = lgbm_model.predict_proba(X_test)

In [39]:
# Evaluate models (using validation data or cross-validation)

(X_val, y_val)=(X_test,y_test)
# Assuming X_val contains categorical variables
label_encoder_X = LabelEncoder()
for column in X_val.columns:
    if X_val[column].dtype == object:
        X_val[column] = label_encoder_X.fit_transform(X_val[column])
# Evaluate RandomForestClassifier
rf_accuracy = rf_model.score(X_val, y_val)
# Evaluate CatBoostClassifier
catboost_accuracy = catboost_model.score(X_val, y_val)
# Evaluate XGBClassifier
xgboost_accuracy = xgboost_model.score(X_val, y_val)
# Evaluate LGBMClassifier
lgbm_accuracy = lgbm_model.score(X_val, y_val)

# Print model accuracies
print("RandomForestClassifier Accuracy:", rf_accuracy)
print("CatBoostClassifier Accuracy:", catboost_accuracy)
print("XGBClassifier Accuracy:", xgboost_accuracy)
print("LGBMClassifier Accuracy:", lgbm_accuracy)


RandomForestClassifier Accuracy: 0.8577809990089783
CatBoostClassifier Accuracy: 0.9009274187584126
XGBClassifier Accuracy: 0.8567308118981762
LGBMClassifier Accuracy: 0.849394293490319


In [40]:
from sklearn.metrics import mean_squared_error

# Evaluate RandomForestClassifier
rf_mse = mean_squared_error(y_test, rf_probabilities[:, 1])
rf_error = 1 - rf_accuracy
print("RandomForestClassifier MSE:", rf_mse)
print("RandomForestClassifier Error:", rf_error)

# Evaluate CatBoostClassifier
catboost_mse = mean_squared_error(y_test, catboost_probabilities[:, 1])
catboost_error = 1 - catboost_accuracy
print("CatBoostClassifier MSE:", catboost_mse)
print("CatBoostClassifier Error:", catboost_error)

# Evaluate XGBClassifier
xgboost_mse = mean_squared_error(y_test, xgboost_probabilities[:, 1])
xgboost_error = 1 - xgboost_accuracy
print("XGBClassifier MSE:", xgboost_mse)
print("XGBClassifier Error:", xgboost_error)

# Evaluate LGBMClassifier
lgbm_mse = mean_squared_error(y_test, lgbm_probabilities[:, 1])
lgbm_error = 1 - lgbm_accuracy
print("LGBMClassifier MSE:", lgbm_mse)
print("LGBMClassifier Error:", lgbm_error)


RandomForestClassifier MSE: 0.10717067683449936
RandomForestClassifier Error: 0.1422190009910217
CatBoostClassifier MSE: 0.08035488570562702
CatBoostClassifier Error: 0.09907258124158735
XGBClassifier MSE: 0.10761984509321595
XGBClassifier Error: 0.14326918810182376
LGBMClassifier MSE: 0.1123481878845159
LGBMClassifier Error: 0.15060570650968097


In [None]:
# Importing required libraries
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Initialize the deep learning model
dl_model = Sequential()

# Add layers to the model
dl_model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
dl_model.add(Dropout(0.5))
dl_model.add(Dense(64, activation='relu'))
dl_model.add(Dropout(0.5))
dl_model.add(Dense(1, activation='sigmoid'))

# Compile the model
dl_model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])

# Train the model
dl_model.fit(X_train_balanced, y_train_balanced, epochs=20, batch_size=32, verbose=1)

# Evaluate the model
dl_accuracy = dl_model.evaluate(X_test, y_test, verbose=0)
print("Deep Learning Model Accuracy:", dl_accuracy[1])


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

In [71]:
original_xG = pd.read_csv('/content/drive/MyDrive/concatenated_final_dataset.csv')
original_xG=preprocess(original_xG,True)
original_xG=feature_engineering(original_xG)
predict_xG=(original_xG.drop(['xG'],axis=1))
original_xG=original_xG.reset_index()
original_xG=original_xG.drop(['index'],axis=1)
predict_xG=predict_xG.reset_index()
predict_xG=predict_xG.drop(['index'],axis=1)

In [72]:
# Calculate the predicted probabilities of xG using the trained models
X = predict_xG.drop('result', axis=1)
rf_predicted_xG = rf_model.predict_proba(X)
catboost_predicted_xG = catboost_model.predict_proba(X)
xgboost_predicted_xG = xgboost_model.predict_proba(X)
lgbm_predicted_xG = lgbm_model.predict_proba(X)

# Create a DataFrame to store the predicted xG values
predicted_xG_df = pd.DataFrame({
    'RF_predicted_xG': rf_predicted_xG[:, 1],
    'CatBoost_predicted_xG': catboost_predicted_xG[:, 1],
    'XGBoost_predicted_xG': xgboost_predicted_xG[:, 1],
    'LGBM_predicted_xG': lgbm_predicted_xG[:, 1]
})

# Extract the original xG values
xG = original_xG['xG']
# Merge the original xG values with the predicted xG values
merged_xG = pd.concat([xG, predicted_xG_df], axis=1)

# Compare the predicted xG values with the original xG values
merged_xG['RF_xG_difference'] = merged_xG['RF_predicted_xG'] - merged_xG['xG']
merged_xG['CatBoost_xG_difference'] = merged_xG['CatBoost_predicted_xG'] - merged_xG['xG']
merged_xG['XGBoost_xG_difference'] = merged_xG['XGBoost_predicted_xG'] - merged_xG['xG']
merged_xG['LGBM_xG_difference'] = merged_xG['LGBM_predicted_xG'] - merged_xG['xG']

# Display the merged xG data with predicted xG values and differences
print(merged_xG)


              xG  RF_predicted_xG  CatBoost_predicted_xG  \
0       0.141077             0.81               0.388178   
1       0.012746             0.00               0.030785   
2       0.530869             0.62               0.579303   
3       0.075300             0.12               0.192653   
4       0.232375             0.24               0.289411   
...          ...              ...                    ...   
338029  0.054789             0.05               0.103768   
338030  0.127258             0.24               0.420007   
338031  0.080335             0.25               0.261630   
338032  0.031638             0.06               0.125812   
338033  0.450002             0.79               0.488116   

        XGBoost_predicted_xG  LGBM_predicted_xG  RF_xG_difference  \
0                   0.761209           0.784651          0.668923   
1                   0.063485           0.064712         -0.012746   
2                   0.732128           0.792962          0.089131   
3  

In [76]:
from sklearn.metrics import mean_squared_error
print(mean_squared_error(merged_xG['RF_predicted_xG'] , merged_xG['xG']))
print(mean_squared_error(merged_xG['CatBoost_predicted_xG'] , merged_xG['xG']))
print(mean_squared_error(merged_xG['XGBoost_predicted_xG'] , merged_xG['xG']))
print(mean_squared_error(merged_xG['LGBM_predicted_xG'] , merged_xG['xG']))

0.03849068218853166
0.009724966984891857
0.03607229691957938
0.04072413427507297


In [75]:
merged_xG.describe()

Unnamed: 0,xG,RF_predicted_xG,CatBoost_predicted_xG,XGBoost_predicted_xG,LGBM_predicted_xG,RF_xG_difference,CatBoost_xG_difference,XGBoost_xG_difference,LGBM_xG_difference
count,338034.0,338034.0,338034.0,338034.0,338034.0,338034.0,338034.0,338034.0,338034.0
mean,0.110556,0.167534,0.141035,0.230017,0.248196,0.056978,0.030479,0.119461,0.13764
std,0.162089,0.246968,0.16323,0.221611,0.22144,0.187735,0.093787,0.147653,0.147578
min,0.0,0.0,0.000112,0.001321,0.009525,-0.78087,-0.718899,-0.731059,-0.636474
25%,0.024697,0.02,0.039118,0.063022,0.078876,-0.01781,5e-06,0.0211,0.037903
50%,0.050862,0.07,0.079726,0.146462,0.164205,0.00793,0.020886,0.07859,0.09668
75%,0.097514,0.18,0.176235,0.331862,0.35599,0.073171,0.05793,0.181543,0.200295
max,0.979887,1.0,0.99541,0.995742,0.992529,0.969385,0.915441,0.945901,0.940431


In [61]:
#len(xG)
#len(X)
#len(rf_predicted_xG)

338034

In [77]:
# Save the merged_xG DataFrame to a CSV file
merged_xG.to_csv('merged_xG.csv', index=False)


In [78]:
from google.colab import files
files.download('merged_xG.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>