In [4]:
# club

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_auc_score, matthews_corrcoef
import statsmodels.api as sm

# data loading
data = pd.read_csv("C:\\Users\\co279\\penalty.csv")

# data preprocessing
data = pd.get_dummies(data, columns=['position'])

label_encoder = LabelEncoder()
data['score'] = label_encoder.fit_transform(data['score'])

data = data.iloc[:, 4:]

data.dropna(inplace=True)

X = data.drop(columns=['score'])
y = data['score']

# data spliting (train, test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=411, stratify=y)

# model training
model = LogisticRegression(class_weight='balanced', random_state=411)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# model evaluation
accuracy = accuracy_score(y_test, y_pred)
print("accuracy:", accuracy)

f1 = f1_score(y_test, y_pred)
print("f1_score:", f1)

roc = roc_auc_score(y_test, y_pred)
print("roc_auc:", roc)

mcc = matthews_corrcoef(y_test, y_pred)
print("MCC:", mcc)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


coefficients = pd.DataFrame(model.coef_.T, index=X.columns, columns=['Coefficient'])
print(coefficients)

# logistic regression model fitting (statsmodel)
X_train_sm = sm.add_constant(X_train)  # add constant
logit_model = sm.Logit(y_train, X_train_sm)
result = logit_model.fit()

# result summary
print(result.summary())


accuracy: 0.5909090909090909
f1_score: 0.4375
roc_auc: 0.6146198169698339
MCC: 0.1961348492884834
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.57      0.68       334
           1       0.33      0.66      0.44       106

    accuracy                           0.59       440
   macro avg       0.58      0.61      0.56       440
weighted avg       0.72      0.59      0.62       440

Confusion Matrix:
[[190 144]
 [ 36  70]]
             Coefficient
age             0.001489
goals          -0.007481
period         -0.018060
order          -0.107875
last            0.815829
position_df     0.085331
position_fw     0.162029
position_gk    -0.055033
position_mf     0.095676
Optimization terminated successfully.
         Current function value: 0.541602
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                  score   No. Observations:                 1757
M

In [9]:
# international

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, roc_auc_score, matthews_corrcoef
from imblearn.over_sampling import RandomOverSampler
import statsmodels.api as sm

data = pd.read_csv("C:\\Users\\co279\\penalty_international.csv")

data=data.iloc[:,4:]
data.dropna(inplace=True)

# data preprocessing
data = pd.get_dummies(data, columns=['position'])

# labelencoding 'score' column
label_encoder = LabelEncoder()
data['score'] = label_encoder.fit_transform(data['score'])

scaler = StandardScaler()
data[['goals']] = scaler.fit_transform(data[['goals']])

# data spliting (X, y)
X = data.drop(columns=['score'])
y = data['score']

# data spliting (train, test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=411, stratify=y)

# oversamling
oversampler = RandomOverSampler(sampling_strategy='minority', random_state=411)
X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)

# logistic regression model fitting (statsmodel)
X_resampled1 = sm.add_constant(X_resampled)  # add constant
model = sm.Logit(y_resampled, X_resampled1)
result = model.fit()

# result summary
print(result.summary())

model1 = LogisticRegression(random_state=411)
result1 = model1.fit(X_resampled, y_resampled)

y_pred = model1.predict(X_test)

# model evaluation
accuracy = accuracy_score(y_test, y_pred)
print("accuracy:", accuracy)

f1 = f1_score(y_test, y_pred)
print("f1_score:", f1)

roc = roc_auc_score(y_test, y_pred)
print("roc_auc:", roc)

mcc = matthews_corrcoef(y_test, y_pred)
print("MCC:", mcc)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

coefficients = pd.DataFrame(model1.coef_.T, index=X.columns, columns=['Coefficient'])
print(coefficients)

Optimization terminated successfully.
         Current function value: 0.650323
         Iterations 12
                           Logit Regression Results                           
Dep. Variable:                  score   No. Observations:                  510
Model:                          Logit   Df Residuals:                      501
Method:                           MLE   Df Model:                            8
Date:                Wed, 07 Aug 2024   Pseudo R-squ.:                 0.06178
Time:                        13:31:24   Log-Likelihood:                -331.66
converged:                       True   LL-Null:                       -353.51
Covariance Type:            nonrobust   LLR p-value:                 6.538e-07
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
const          -1.2447   1.19e+07  -1.05e-07      1.000   -2.33e+07    2.33e+07
age             0.0795   

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
