# Logistic Regression

## 1. Importing the libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import config.ConnectionConfig as cc
from pyspark.sql import SparkSession
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import joblib

## 2. Setting up the Spark session

In [2]:
# Prepare the runtime through the project's ConnectionConfig helper.
# NOTE(review): what setupEnvironment() actually configures is not visible in
# this notebook — confirm in config/ConnectionConfig.
cc.setupEnvironment()
# Start the local cluster via the project helper, passing the application name.
# NOTE(review): presumably this returns a SparkSession — confirm against
# startLocalCluster's definition.
spark = cc.startLocalCluster("UFC_Logistic_Regression_Training")
# The return value of this call is neither assigned nor used in this cell.
spark.getActiveSession()
# Rebind `spark` through the session builder; this is the object the later
# cells use. NOTE(review): per the PySpark docs getOrCreate() hands back an
# already-existing session when there is one, so the "UFC_Fights" app name may
# not take effect if the helper above already created a session — verify.
spark = SparkSession.builder.appName("UFC_Fights").getOrCreate()

## 3. Importing the dataset

In [3]:
# Load the combined fight data from CSV with Spark. The first row supplies the
# column names and the column types are inferred from the values.
csv_reader = spark.read
data = csv_reader.csv(
    '../processed_data/fight_total.csv',
    header=True,
    inferSchema=True,
)
# Print a preview of the loaded rows as a quick sanity check.
data.show()

## 3. Splitting the dataset into the Training set and Test set

In [4]:
# The two fighter-name columns are not needed as features, so they are removed
# on the Spark side before the frame is collected into pandas.
data = data.drop('fighter1', 'fighter2').toPandas()

# 'outcome' is the target; every remaining column is a feature.
y = data['outcome']
X = data.drop(columns='outcome')

# Hold out 10% of the rows for testing; the fixed random_state keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=0
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## 4. Feature Scaling

In [5]:
# Standardise the features. The scaler is fitted on the training rows only and
# that same fitted transform is then applied to both the training and the test
# rows, so no test-set statistics enter the fit.
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## 5. Hyperparameter tuning

In [6]:

# Values of C to search over (the same five values for every penalised model).
c_values = [0.1, 0.5, 1, 5, 10]

# Each LogisticRegression solver supports only some penalties, so the search
# space is a list of grids that contain supported pairings only. A single
# crossed grid would also try pairings such as 'l1' + 'lbfgs'; those fits
# raise, get scored as NaN, and only waste CV folds and produce warnings.
parameters = [
    # L2 is supported by every solver.
    {
        'penalty': ['l2'],
        'C': c_values,
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    },
    # L1 is only supported by 'liblinear' and 'saga'.
    {
        'penalty': ['l1'],
        'C': c_values,
        'solver': ['liblinear', 'saga'],
    },
    # Elastic-net is only supported by 'saga' and needs an l1_ratio; without
    # one every elastic-net fit fails.
    {
        'penalty': ['elasticnet'],
        'C': c_values,
        'solver': ['saga'],
        'l1_ratio': [0.25, 0.5, 0.75],
    },
    # No penalty: not supported by 'liblinear', and C has no effect, so it is
    # left out to avoid fitting the same model five times.
    # NOTE(review): scikit-learn >= 1.2 spells this option `None` instead of
    # the string 'none' (the string was removed in 1.4) — adjust to match the
    # installed version.
    {
        'penalty': ['none'],
        'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
    },
]

# 10-fold cross-validated search scored on accuracy, using all CPU cores.
grid_search = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
grid_search = grid_search.fit(X_train, y_train)

print("Best Accuracy: {:.2f} %".format(grid_search.best_score_ * 100))
print("Best Parameters:", grid_search.best_params_)

# Keep the estimator configured with the best parameter combination found.
model = grid_search.best_estimator_

## 6. Training the Logistic Regression model on the Training set

In [7]:
# Fit the selected estimator on the scaled training data, then report the
# value of model.score() on the held-out test data as a percentage.
model.fit(X_train, y_train)
test_score = model.score(X_test, y_test)
print("Model Score: {:f} %".format(test_score * 100))

## 7. Making Predictions

In [8]:
# Predict an outcome label for every row of the test set.
y_pred = model.predict(X_test)

# Compare the predicted labels with the true ones and print the accuracy as a
# percentage.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on Test Set: {:.2f} %".format(test_accuracy * 100))

## 8. Checking the results and probabilities

In [9]:
# Per-class probabilities for each test row; the columns follow the order of
# model.classes_.
pred_prob = model.predict_proba(X_test)

# Divide every row by its own total so each row's probabilities sum to 1.
row_totals = pred_prob.sum(axis=1, keepdims=True)
norm_prob = pred_prob / row_totals

# Column order wanted for display: D/D, then NC/NC, then W/L, then L/W.
# According to the original author's note the classes arrive as
# DD, LW, NCNC, WL, hence this index permutation (it assumes exactly those
# four classes are present).
display_order = [0, 2, 3, 1]
norm_prob = norm_prob[:, display_order]
pos_outcomes = model.classes_[display_order]

# One row per test sample: the reordered probabilities followed by the
# predicted and the actual outcome.
prob_norm_df = pd.DataFrame(norm_prob, columns=pos_outcomes)
prob_norm_df = prob_norm_df.assign(
    outcome_predicted=y_pred,
    outcome_actual=y_test.values,
)
prob_norm_df

## 9. Saving the model

In [10]:
# Persist the trained classifier to disk with joblib.
# NOTE: the model was trained on features transformed by the StandardScaler
# fitted earlier in this notebook; that scaler is not saved here, so whoever
# loads this file must apply the same scaling before calling predict().
import os

model_file = 'models/ufc_logistic_regression_model.pkl'
# Create the target directory first: joblib.dump does not create missing
# directories and would otherwise fail with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(model_file), exist_ok=True)
joblib.dump(model, model_file)