In [1]:
# club

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, roc_auc_score, matthews_corrcoef
from imblearn.over_sampling import RandomOverSampler
import statsmodels.api as sm

data = pd.read_csv("C:\\Users\\co279\\penalty.csv")

data = data.iloc[:, 4:]
data.dropna(inplace=True)

# data preprocessing
data = pd.get_dummies(data, columns=['position'])

# label encoding 'score' column
label_encoder = LabelEncoder()
data['score'] = label_encoder.fit_transform(data['score'])

scaler = StandardScaler()
data[['goals']] = scaler.fit_transform(data[['goals']])

# data spliting (X, y)
X = data.drop(columns=['score'])
y = data['score']

# data spliting (train, test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=411, stratify=y)

# oversampling
oversampler = RandomOverSampler(sampling_strategy='minority', random_state=411)
X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)

# model fitting (statsmodel)
X_resampled = sm.add_constant(X_resampled)  # add constant
model = sm.GLM(y_resampled, X_resampled, family=sm.families.Binomial())
result = model.fit()

# result summary
print(result.summary())

X_test = sm.add_constant(X_test)

y_pred_probs = result.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype(int)

# model evaluation
accuracy = accuracy_score(y_test, y_pred)
print("accuracy:", accuracy)

f1 = f1_score(y_test, y_pred)
print("f1_score:", f1)

roc = roc_auc_score(y_test, y_pred)
print("roc_auc:", roc)

mcc = matthews_corrcoef(y_test, y_pred)
print("MCC:", mcc)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

coefficients = pd.DataFrame(result.params, columns=['Coefficient'])
print(coefficients)

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  score   No. Observations:                 2332
Model:                            GLM   Df Residuals:                     2323
Model Family:                Binomial   Df Model:                            8
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -1554.5
Date:                Wed, 07 Aug 2024   Deviance:                       3109.1
Time:                        14:01:55   Pearson chi2:                 2.33e+03
No. Iterations:                     5   Pseudo R-squ. (CS):            0.05169
Covariance Type:            nonrobust                                         
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
const           0.0126      0.285      0.044      

In [2]:
# international

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, roc_auc_score, matthews_corrcoef
from imblearn.over_sampling import RandomOverSampler
import statsmodels.api as sm

data = pd.read_csv("C:\\Users\\co279\\penalty_international.csv")

data = data.iloc[:, 4:]
data.dropna(inplace=True)

# data preprocessing
data = pd.get_dummies(data, columns=['position'])

# label encoding 'score' column
label_encoder = LabelEncoder()
data['score'] = label_encoder.fit_transform(data['score'])

scaler = StandardScaler()
data[['goals']] = scaler.fit_transform(data[['goals']])

# data spliting (X, y)
X = data.drop(columns=['score'])
y = data['score']

# data spliting (train, test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=411, stratify=y)

# oversampling
oversampler = RandomOverSampler(sampling_strategy='minority', random_state=411)
X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)

# model fitting (statsmodel)
X_resampled = sm.add_constant(X_resampled)  # add constant
model = sm.GLM(y_resampled, X_resampled, family=sm.families.Binomial())
result = model.fit()

# result summary
print(result.summary())

X_test = sm.add_constant(X_test)

y_pred_probs = result.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype(int)

# model evaluation
accuracy = accuracy_score(y_test, y_pred)
print("accuracy:", accuracy)

f1 = f1_score(y_test, y_pred)
print("f1_score:", f1)

roc = roc_auc_score(y_test, y_pred)
print("roc_auc:", roc)

mcc = matthews_corrcoef(y_test, y_pred)
print("MCC:", mcc)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

coefficients = pd.DataFrame(result.params, columns=['Coefficient'])
print(coefficients)

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  score   No. Observations:                  510
Model:                            GLM   Df Residuals:                      501
Model Family:                Binomial   Df Model:                            8
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -331.66
Date:                Wed, 07 Aug 2024   Deviance:                       663.33
Time:                        14:03:53   Pearson chi2:                     523.
No. Iterations:                     5   Pseudo R-squ. (CS):            0.08208
Covariance Type:            nonrobust                                         
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
const          -1.2447      0.717     -1.735      