In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold,train_test_split,StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.utils import class_weight
from imblearn.over_sampling import RandomOverSampler
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.preprocessing import PolynomialFeatures

In [7]:
# Read & Adjust Data from CSV file

# We are trying here linear and 2nd, and 3rd order polynomial regression with oversampling our data to compromise for the inbalance.
# We also applied pca  (a trail to improve performance) on scaled data (as it affects pca severily), to decrease its huge dimensions. As, it's very suspecious to have usless/correlated features.

data = pd.read_csv("C:/Users/Mahmoud/Documents/GitHub/Machine-Learning-Classification-Project/Preprocessing/2017_Accidents_UK_Clean _Mahmoud.csv",dtype=float)
x = data.iloc[:, :-1] 
y = data.iloc[:, -1]  

scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

priciple_component_analayzer = PCA(n_components=15)  
x_pca = priciple_component_analayzer.fit_transform(x_scaled)

oversampler = RandomOverSampler(random_state=10)
x_resampled, y_resampled = oversampler.fit_resample(x_pca, y)
x=pd.DataFrame(x_resampled)
y=y_resampled

x_training, x_testing, y_training, y_testing = train_test_split(x, y, test_size=0.15, random_state=10)
x=x_training
y=y_training

In [8]:
# We try different parameters and apply K-folding on them. Our evaluation metrices will be accuracy, precision, and recall.
# We weren't able to apply ROC, as it's a multi-class output, and having 1v2 roc isn't the best metric. 
# We store the results to choose the best one later.
# This is the linear one.

penalty_options = ['l1',None, 'l2']
C_values = [0.001, 0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64, 1.28, 2.56, 5.12, 10.24]
solver_options = {
    'lbfgs': ['l2', None],
    'liblinear': ['l1', 'l2'],
    'newton-cg': ['l2', None],
    'newton-cholesky': ['l2', None],
    'sag': ['l2', None],
    'saga': ['elasticnet', 'l1', 'l2', None]
}
results = []
warnings.filterwarnings("ignore", category=ConvergenceWarning)

for penalty in penalty_options:
    for C in C_values:
        for solver in solver_options.keys():
            if penalty in solver_options[solver]:
                accuracy_scores = []
                precision_scores = []
                recall_scores = []

                kf = StratifiedKFold(n_splits=5, random_state=10, shuffle=True)

                for train_index, test_index in kf.split(x, y):
                    X_train_fold, X_cross = x.iloc[train_index], x.iloc[test_index]
                    y_train_fold, y_cross = y.iloc[train_index], y.iloc[test_index]
                    
                    logistic_model = LogisticRegression(penalty=penalty, C=C, solver=solver, max_iter=100)
                    logistic_model.fit(X_train_fold, y_train_fold)
                    y_pred = logistic_model.predict(X_cross)

                    accuracy = accuracy_score(y_cross, y_pred)
                    precision = precision_score(y_cross, y_pred, average='macro', zero_division=0)
                    recall = recall_score(y_cross, y_pred, average='macro')

                    accuracy_scores.append(accuracy)
                    precision_scores.append(precision)
                    recall_scores.append(recall)

                this_result = {
                    'penalty': penalty,
                    'C': C,
                    'solver': solver,
                    'accuracy': np.mean(accuracy_scores),
                    'precision': np.mean(precision_scores),
                    'recall': np.mean(recall_scores)
                }
                results.append(this_result)
                print(this_result)


{'penalty': 'l1', 'C': 0.001, 'solver': 'liblinear', 'accuracy': 0.47939477914598305, 'precision': 0.46828655015585685, 'recall': 0.47942007993648617}
{'penalty': 'l1', 'C': 0.001, 'solver': 'saga', 'accuracy': 0.47976562373480885, 'precision': 0.47031824838950503, 'recall': 0.4797871769347588}
{'penalty': 'l1', 'C': 0.01, 'solver': 'liblinear', 'accuracy': 0.4797070654402941, 'precision': 0.4693070081194829, 'recall': 0.4797309528173274}
{'penalty': 'l1', 'C': 0.01, 'solver': 'saga', 'accuracy': 0.48071810562968664, 'precision': 0.4719775535784977, 'recall': 0.48073874451745924}
{'penalty': 'l1', 'C': 0.02, 'solver': 'liblinear', 'accuracy': 0.4798280776869716, 'precision': 0.46944187784226754, 'recall': 0.4798519010742358}
{'penalty': 'l1', 'C': 0.02, 'solver': 'saga', 'accuracy': 0.4805190200760808, 'precision': 0.47181589379621214, 'recall': 0.48053957774129447}
{'penalty': 'l1', 'C': 0.04, 'solver': 'liblinear', 'accuracy': 0.47985540336725396, 'precision': 0.4694857952203888, 're



{'penalty': None, 'C': 0.001, 'solver': 'lbfgs', 'accuracy': 0.48034335669741246, 'precision': 0.47168689300303424, 'recall': 0.480363850058697}




{'penalty': None, 'C': 0.001, 'solver': 'newton-cg', 'accuracy': 0.4802848032029453, 'precision': 0.4716356968378765, 'recall': 0.4803052853634047}




{'penalty': None, 'C': 0.001, 'solver': 'newton-cholesky', 'accuracy': 0.4799959339787591, 'precision': 0.4696405408282721, 'recall': 0.48001968385635596}




{'penalty': None, 'C': 0.001, 'solver': 'sag', 'accuracy': 0.480284803126754, 'precision': 0.4716322151075147, 'recall': 0.48030528650609483}




{'penalty': None, 'C': 0.001, 'solver': 'saga', 'accuracy': 0.4803082238540581, 'precision': 0.4716529785489773, 'recall': 0.48032870890923995}




{'penalty': None, 'C': 0.01, 'solver': 'lbfgs', 'accuracy': 0.48034335669741246, 'precision': 0.47168689300303424, 'recall': 0.480363850058697}




{'penalty': None, 'C': 0.01, 'solver': 'newton-cg', 'accuracy': 0.4802848032029453, 'precision': 0.4716356968378765, 'recall': 0.4803052853634047}




{'penalty': None, 'C': 0.01, 'solver': 'newton-cholesky', 'accuracy': 0.4799959339787591, 'precision': 0.4696405408282721, 'recall': 0.48001968385635596}




{'penalty': None, 'C': 0.01, 'solver': 'sag', 'accuracy': 0.4802769961160644, 'precision': 0.4716264540188117, 'recall': 0.4802974798761472}




{'penalty': None, 'C': 0.01, 'solver': 'saga', 'accuracy': 0.48024186289175386, 'precision': 0.47159207744577075, 'recall': 0.48026233872669016}




{'penalty': None, 'C': 0.02, 'solver': 'lbfgs', 'accuracy': 0.48034335669741246, 'precision': 0.47168689300303424, 'recall': 0.480363850058697}




{'penalty': None, 'C': 0.02, 'solver': 'newton-cg', 'accuracy': 0.4802848032029453, 'precision': 0.4716356968378765, 'recall': 0.4803052853634047}




{'penalty': None, 'C': 0.02, 'solver': 'newton-cholesky', 'accuracy': 0.4799959339787591, 'precision': 0.4696405408282721, 'recall': 0.48001968385635596}




{'penalty': None, 'C': 0.02, 'solver': 'sag', 'accuracy': 0.48028089946902675, 'precision': 0.4716309367719645, 'recall': 0.480301382619776}




{'penalty': None, 'C': 0.02, 'solver': 'saga', 'accuracy': 0.4803394533444501, 'precision': 0.47168656076314897, 'recall': 0.48035994297252477}




{'penalty': None, 'C': 0.04, 'solver': 'lbfgs', 'accuracy': 0.48034335669741246, 'precision': 0.47168689300303424, 'recall': 0.480363850058697}




{'penalty': None, 'C': 0.04, 'solver': 'newton-cg', 'accuracy': 0.4802848032029453, 'precision': 0.4716356968378765, 'recall': 0.4803052853634047}




{'penalty': None, 'C': 0.04, 'solver': 'newton-cholesky', 'accuracy': 0.4799959339787591, 'precision': 0.4696405408282721, 'recall': 0.48001968385635596}




{'penalty': None, 'C': 0.04, 'solver': 'sag', 'accuracy': 0.4803394541063624, 'precision': 0.4716840527450946, 'recall': 0.4803599459444559}




{'penalty': None, 'C': 0.04, 'solver': 'saga', 'accuracy': 0.48028870571780413, 'precision': 0.4716358381377891, 'recall': 0.4803091908485527}




{'penalty': None, 'C': 0.08, 'solver': 'lbfgs', 'accuracy': 0.48034335669741246, 'precision': 0.47168689300303424, 'recall': 0.480363850058697}




{'penalty': None, 'C': 0.08, 'solver': 'newton-cg', 'accuracy': 0.4802848032029453, 'precision': 0.4716356968378765, 'recall': 0.4803052853634047}




{'penalty': None, 'C': 0.08, 'solver': 'newton-cholesky', 'accuracy': 0.4799959339787591, 'precision': 0.4696405408282721, 'recall': 0.48001968385635596}




{'penalty': None, 'C': 0.08, 'solver': 'sag', 'accuracy': 0.4802887068606726, 'precision': 0.47163930526231057, 'recall': 0.48030918924972354}




{'penalty': None, 'C': 0.08, 'solver': 'saga', 'accuracy': 0.4802887067844813, 'precision': 0.4716378300861986, 'recall': 0.4803091881070335}




{'penalty': None, 'C': 0.16, 'solver': 'lbfgs', 'accuracy': 0.48034335669741246, 'precision': 0.47168689300303424, 'recall': 0.480363850058697}




{'penalty': None, 'C': 0.16, 'solver': 'newton-cg', 'accuracy': 0.4802848032029453, 'precision': 0.4716356968378765, 'recall': 0.4803052853634047}




{'penalty': None, 'C': 0.16, 'solver': 'newton-cholesky', 'accuracy': 0.4799959339787591, 'precision': 0.4696405408282721, 'recall': 0.48001968385635596}




{'penalty': None, 'C': 0.16, 'solver': 'sag', 'accuracy': 0.48027699588749073, 'precision': 0.4716208909273722, 'recall': 0.4802974798761472}




{'penalty': None, 'C': 0.16, 'solver': 'saga', 'accuracy': 0.4802848032029453, 'precision': 0.4716356968378765, 'recall': 0.4803052853634047}




{'penalty': None, 'C': 0.32, 'solver': 'lbfgs', 'accuracy': 0.48034335669741246, 'precision': 0.47168689300303424, 'recall': 0.480363850058697}




{'penalty': None, 'C': 0.32, 'solver': 'newton-cg', 'accuracy': 0.4802848032029453, 'precision': 0.4716356968378765, 'recall': 0.4803052853634047}




{'penalty': None, 'C': 0.32, 'solver': 'newton-cholesky', 'accuracy': 0.4799959339787591, 'precision': 0.4696405408282721, 'recall': 0.48001968385635596}




{'penalty': None, 'C': 0.32, 'solver': 'sag', 'accuracy': 0.48028480312675403, 'precision': 0.4716347574936114, 'recall': 0.48030528650609483}




{'penalty': None, 'C': 0.32, 'solver': 'saga', 'accuracy': 0.4802808993928355, 'precision': 0.4716265247194785, 'recall': 0.48030138490515606}




{'penalty': None, 'C': 0.64, 'solver': 'lbfgs', 'accuracy': 0.48034335669741246, 'precision': 0.47168689300303424, 'recall': 0.480363850058697}




{'penalty': None, 'C': 0.64, 'solver': 'newton-cg', 'accuracy': 0.4802848032029453, 'precision': 0.4716356968378765, 'recall': 0.4803052853634047}




{'penalty': None, 'C': 0.64, 'solver': 'newton-cholesky', 'accuracy': 0.4799959339787591, 'precision': 0.4696405408282721, 'recall': 0.48001968385635596}




{'penalty': None, 'C': 0.64, 'solver': 'sag', 'accuracy': 0.4802496701310172, 'precision': 0.47160343055224085, 'recall': 0.4802701453566378}




{'penalty': None, 'C': 0.64, 'solver': 'saga', 'accuracy': 0.48033164602899553, 'precision': 0.4716761071062995, 'recall': 0.4803521386279573}




{'penalty': None, 'C': 1.28, 'solver': 'lbfgs', 'accuracy': 0.48034335669741246, 'precision': 0.47168689300303424, 'recall': 0.480363850058697}




{'penalty': None, 'C': 1.28, 'solver': 'newton-cg', 'accuracy': 0.4802848032029453, 'precision': 0.4716356968378765, 'recall': 0.4803052853634047}




{'penalty': None, 'C': 1.28, 'solver': 'newton-cholesky', 'accuracy': 0.4799959339787591, 'precision': 0.4696405408282721, 'recall': 0.48001968385635596}




{'penalty': None, 'C': 1.28, 'solver': 'sag', 'accuracy': 0.4803589712521304, 'precision': 0.47170449049675967, 'recall': 0.48037946331859216}




{'penalty': None, 'C': 1.28, 'solver': 'saga', 'accuracy': 0.4803745862639957, 'precision': 0.47171097789913147, 'recall': 0.48039508640736184}




{'penalty': None, 'C': 2.56, 'solver': 'lbfgs', 'accuracy': 0.48034335669741246, 'precision': 0.47168689300303424, 'recall': 0.480363850058697}




{'penalty': None, 'C': 2.56, 'solver': 'newton-cg', 'accuracy': 0.4802848032029453, 'precision': 0.4716356968378765, 'recall': 0.4803052853634047}




{'penalty': None, 'C': 2.56, 'solver': 'newton-cholesky', 'accuracy': 0.4799959339787591, 'precision': 0.4696405408282721, 'recall': 0.48001968385635596}




{'penalty': None, 'C': 2.56, 'solver': 'sag', 'accuracy': 0.48029651394755346, 'precision': 0.4716448694201866, 'recall': 0.4803169970223612}




{'penalty': None, 'C': 2.56, 'solver': 'saga', 'accuracy': 0.48036287483366646, 'precision': 0.4717084369718589, 'recall': 0.480383367204911}




{'penalty': None, 'C': 5.12, 'solver': 'lbfgs', 'accuracy': 0.48034335669741246, 'precision': 0.47168689300303424, 'recall': 0.480363850058697}




{'penalty': None, 'C': 5.12, 'solver': 'newton-cg', 'accuracy': 0.4802848032029453, 'precision': 0.4716356968378765, 'recall': 0.4803052853634047}




{'penalty': None, 'C': 5.12, 'solver': 'newton-cholesky', 'accuracy': 0.4799959339787591, 'precision': 0.4696405408282721, 'recall': 0.48001968385635596}




{'penalty': None, 'C': 5.12, 'solver': 'sag', 'accuracy': 0.4803316470194815, 'precision': 0.4716779892320406, 'recall': 0.48035213588643816}




{'penalty': None, 'C': 5.12, 'solver': 'saga', 'accuracy': 0.4802887068606726, 'precision': 0.4716372759181091, 'recall': 0.48030918924972354}




{'penalty': None, 'C': 10.24, 'solver': 'lbfgs', 'accuracy': 0.48034335669741246, 'precision': 0.47168689300303424, 'recall': 0.480363850058697}




{'penalty': None, 'C': 10.24, 'solver': 'newton-cg', 'accuracy': 0.4802848032029453, 'precision': 0.4716356968378765, 'recall': 0.4803052853634047}




{'penalty': None, 'C': 10.24, 'solver': 'newton-cholesky', 'accuracy': 0.4799959339787591, 'precision': 0.4696405408282721, 'recall': 0.48001968385635596}




{'penalty': None, 'C': 10.24, 'solver': 'sag', 'accuracy': 0.4804097178120991, 'precision': 0.4717514124610691, 'recall': 0.4804302170413933}




{'penalty': None, 'C': 10.24, 'solver': 'saga', 'accuracy': 0.4802848029743716, 'precision': 0.47163136144131956, 'recall': 0.4803052876487849}
{'penalty': 'l2', 'C': 0.001, 'solver': 'lbfgs', 'accuracy': 0.4808781543012546, 'precision': 0.47218551333435677, 'recall': 0.48089875015114253}
{'penalty': 'l2', 'C': 0.001, 'solver': 'liblinear', 'accuracy': 0.48006619593209754, 'precision': 0.46971841997032415, 'recall': 0.4800901830721063}




{'penalty': 'l2', 'C': 0.001, 'solver': 'newton-cg', 'accuracy': 0.4808898651982452, 'precision': 0.4721973939232377, 'recall': 0.48091046181009905}
{'penalty': 'l2', 'C': 0.001, 'solver': 'newton-cholesky', 'accuracy': 0.47975391253305333, 'precision': 0.46946074158415535, 'recall': 0.47977764975074455}
{'penalty': 'l2', 'C': 0.001, 'solver': 'sag', 'accuracy': 0.48087425079590973, 'precision': 0.4721809994955687, 'recall': 0.48089484740751376}
{'penalty': 'l2', 'C': 0.001, 'solver': 'saga', 'accuracy': 0.48088205788279065, 'precision': 0.47219072511603316, 'recall': 0.4809026551801514}
{'penalty': 'l2', 'C': 0.01, 'solver': 'lbfgs', 'accuracy': 0.4805619590158301, 'precision': 0.4718886437143759, 'recall': 0.48058248301104933}
{'penalty': 'l2', 'C': 0.01, 'solver': 'liblinear', 'accuracy': 0.480003740075154, 'precision': 0.46964561709825137, 'recall': 0.48002751791407705}




{'penalty': 'l2', 'C': 0.01, 'solver': 'newton-cg', 'accuracy': 0.48055415185275796, 'precision': 0.4718807559911677, 'recall': 0.4805746752384118}
{'penalty': 'l2', 'C': 0.01, 'solver': 'newton-cholesky', 'accuracy': 0.4800310666697311, 'precision': 0.46967044554154896, 'recall': 0.48005482797474597}
{'penalty': 'l2', 'C': 0.01, 'solver': 'sag', 'accuracy': 0.48055024827122195, 'precision': 0.471875871501498, 'recall': 0.48057077249478297}
{'penalty': 'l2', 'C': 0.01, 'solver': 'saga', 'accuracy': 0.48055805551048536, 'precision': 0.47188473773200207, 'recall': 0.4805785791247306}
{'penalty': 'l2', 'C': 0.02, 'solver': 'lbfgs', 'accuracy': 0.48052682640104943, 'precision': 0.47185770920696657, 'recall': 0.4805473398053125}
{'penalty': 'l2', 'C': 0.02, 'solver': 'liblinear', 'accuracy': 0.4800310660602013, 'precision': 0.4696700156485064, 'recall': 0.48005484008950894}




{'penalty': 'l2', 'C': 0.02, 'solver': 'newton-cg', 'accuracy': 0.48052292281951337, 'precision': 0.471852818402423, 'recall': 0.4805434370616837}
{'penalty': 'l2', 'C': 0.02, 'solver': 'newton-cholesky', 'accuracy': 0.48000764464717616, 'precision': 0.4696393627708778, 'recall': 0.4800314133419198}
{'penalty': 'l2', 'C': 0.02, 'solver': 'sag', 'accuracy': 0.48052292281951337, 'precision': 0.4718537288773385, 'recall': 0.4805434359189936}
{'penalty': 'l2', 'C': 0.02, 'solver': 'saga', 'accuracy': 0.4805190191617861, 'precision': 0.4718488100756268, 'recall': 0.48053953317536485}
{'penalty': 'l2', 'C': 0.04, 'solver': 'lbfgs', 'accuracy': 0.4805307300587767, 'precision': 0.47185865327512533, 'recall': 0.4805512448343213}
{'penalty': 'l2', 'C': 0.04, 'solver': 'liblinear', 'accuracy': 0.48003106621258373, 'precision': 0.469670198319381, 'recall': 0.48005482889042367}




{'penalty': 'l2', 'C': 0.04, 'solver': 'newton-cg', 'accuracy': 0.4805307300587767, 'precision': 0.47185865327512533, 'recall': 0.4805512448343213}
{'penalty': 'l2', 'C': 0.04, 'solver': 'newton-cholesky', 'accuracy': 0.4800271626310476, 'precision': 0.4696705233385285, 'recall': 0.48005091677585304}
{'penalty': 'l2', 'C': 0.04, 'solver': 'sag', 'accuracy': 0.4805229228957046, 'precision': 0.4718503659217836, 'recall': 0.48054343820437373}
{'penalty': 'l2', 'C': 0.04, 'solver': 'saga', 'accuracy': 0.4805307300587767, 'precision': 0.47185865327512533, 'recall': 0.4805512448343213}
{'penalty': 'l2', 'C': 0.08, 'solver': 'lbfgs', 'accuracy': 0.48048779066187997, 'precision': 0.47181676010589013, 'recall': 0.4805083036839378}
{'penalty': 'l2', 'C': 0.08, 'solver': 'liblinear', 'accuracy': 0.4800232591257029, 'precision': 0.46966653465991837, 'recall': 0.48004701174684417}




{'penalty': 'l2', 'C': 0.08, 'solver': 'newton-cg', 'accuracy': 0.48050730849336903, 'precision': 0.4718325227940191, 'recall': 0.48052782494447854}
{'penalty': 'l2', 'C': 0.08, 'solver': 'newton-cholesky', 'accuracy': 0.4800154518102483, 'precision': 0.4696532716328585, 'recall': 0.48003920854496684}
{'penalty': 'l2', 'C': 0.08, 'solver': 'sag', 'accuracy': 0.48050730856956025, 'precision': 0.4718318347533751, 'recall': 0.48052782608716865}
{'penalty': 'l2', 'C': 0.08, 'solver': 'saga', 'accuracy': 0.4805112120749051, 'precision': 0.4718365026053931, 'recall': 0.48053172883079737}
{'penalty': 'l2', 'C': 0.16, 'solver': 'lbfgs', 'accuracy': 0.4803550674420206, 'precision': 0.47169516402410866, 'recall': 0.4803755628603435}
{'penalty': 'l2', 'C': 0.16, 'solver': 'liblinear', 'accuracy': 0.4800037412942138, 'precision': 0.4696518791772113, 'recall': 0.48002748934361356}
{'penalty': 'l2', 'C': 0.16, 'solver': 'newton-cg', 'accuracy': 0.48046827260181724, 'precision': 0.47179535636127473, 



{'penalty': 'l2', 'C': 0.32, 'solver': 'newton-cg', 'accuracy': 0.48027309222976344, 'precision': 0.47161848365348896, 'recall': 0.48029357713251847}
{'penalty': 'l2', 'C': 0.32, 'solver': 'newton-cholesky', 'accuracy': 0.48002716263104767, 'precision': 0.46966775596457583, 'recall': 0.48005091677585315}
{'penalty': 'l2', 'C': 0.32, 'solver': 'sag', 'accuracy': 0.4802848030505628, 'precision': 0.4716313202154203, 'recall': 0.4803052876487849}
{'penalty': 'l2', 'C': 0.32, 'solver': 'saga', 'accuracy': 0.4803355496105316, 'precision': 0.4716782087758567, 'recall': 0.48035604137158605}
{'penalty': 'l2', 'C': 0.64, 'solver': 'lbfgs', 'accuracy': 0.4803511638604846, 'precision': 0.4716947797016691, 'recall': 0.4803716578313346}
{'penalty': 'l2', 'C': 0.64, 'solver': 'liblinear', 'accuracy': 0.47998031957642373, 'precision': 0.4696211204698718, 'recall': 0.48000407288184094}
{'penalty': 'l2', 'C': 0.64, 'solver': 'newton-cg', 'accuracy': 0.4803199361224909, 'precision': 0.47166355602995935, 



{'penalty': 'l2', 'C': 1.28, 'solver': 'newton-cg', 'accuracy': 0.4802848032029453, 'precision': 0.4716356968378765, 'recall': 0.4803052853634047}
{'penalty': 'l2', 'C': 1.28, 'solver': 'newton-cholesky', 'accuracy': 0.4799959339787591, 'precision': 0.4696405408282721, 'recall': 0.48001968385635596}
{'penalty': 'l2', 'C': 1.28, 'solver': 'sag', 'accuracy': 0.4802887066320989, 'precision': 0.47163874925739224, 'recall': 0.4803091903924136}
{'penalty': 'l2', 'C': 1.28, 'solver': 'saga', 'accuracy': 0.4802848030505628, 'precision': 0.4716339784358393, 'recall': 0.4803052876487849}
{'penalty': 'l2', 'C': 2.56, 'solver': 'lbfgs', 'accuracy': 0.48034726027894853, 'precision': 0.471690839465032, 'recall': 0.4803677539450158}
{'penalty': 'l2', 'C': 2.56, 'solver': 'liblinear', 'accuracy': 0.4799842231579598, 'precision': 0.46962544066793493, 'recall': 0.48000797562546965}




{'penalty': 'l2', 'C': 2.56, 'solver': 'newton-cg', 'accuracy': 0.4802848032029453, 'precision': 0.4716356968378765, 'recall': 0.4803052853634047}
{'penalty': 'l2', 'C': 2.56, 'solver': 'newton-cholesky', 'accuracy': 0.4799959339787591, 'precision': 0.4696405408282721, 'recall': 0.48001968385635596}
{'penalty': 'l2', 'C': 2.56, 'solver': 'sag', 'accuracy': 0.48034335799266337, 'precision': 0.4716823528379727, 'recall': 0.48036385485973526}
{'penalty': 'l2', 'C': 2.56, 'solver': 'saga', 'accuracy': 0.48024186296794513, 'precision': 0.4715955376920345, 'recall': 0.48026233758400017}
{'penalty': 'l2', 'C': 5.12, 'solver': 'lbfgs', 'accuracy': 0.48034335669741246, 'precision': 0.47168689300303424, 'recall': 0.480363850058697}
{'penalty': 'l2', 'C': 5.12, 'solver': 'liblinear', 'accuracy': 0.4799842231579598, 'precision': 0.46962544066793493, 'recall': 0.48000797562546965}




{'penalty': 'l2', 'C': 5.12, 'solver': 'newton-cg', 'accuracy': 0.4802848032029453, 'precision': 0.4716356968378765, 'recall': 0.4803052853634047}
{'penalty': 'l2', 'C': 5.12, 'solver': 'newton-cholesky', 'accuracy': 0.4799959339787591, 'precision': 0.4696405408282721, 'recall': 0.48001968385635596}
{'penalty': 'l2', 'C': 5.12, 'solver': 'sag', 'accuracy': 0.4805619602348898, 'precision': 0.47188622091282584, 'recall': 0.4805824862120004}
{'penalty': 'l2', 'C': 5.12, 'solver': 'saga', 'accuracy': 0.4802926103660174, 'precision': 0.47164271491569004, 'recall': 0.4803130942787325}
{'penalty': 'l2', 'C': 10.24, 'solver': 'lbfgs', 'accuracy': 0.48034335669741246, 'precision': 0.47168689300303424, 'recall': 0.480363850058697}
{'penalty': 'l2', 'C': 10.24, 'solver': 'liblinear', 'accuracy': 0.4799842231579598, 'precision': 0.46962544066793493, 'recall': 0.48000797562546965}
{'penalty': 'l2', 'C': 10.24, 'solver': 'newton-cg', 'accuracy': 0.4802848032029453, 'precision': 0.4716356968378765, '

In [14]:
# We favor precision and recall over accuracy ; however, we still take accuracy into consideration. 
# So we take the top 10 accuracy scores passing the average precision and recall values.
# After this we, take the top 5 of them based on precision. Then we take the one having the maximum recall.

precision_values = [result['precision'] for result in results]
all_models_average_precision_scores = np.mean(precision_values)

recall_values = [result['recall'] for result in results]
all_models_average_recall_scores = np.mean(recall_values)

filtered_results = []
for result in results:
    if result['precision'] >= all_models_average_precision_scores and result['recall'] >= all_models_average_recall_scores :
        filtered_results.append(result)
        
top_10_results = sorted(filtered_results, key=lambda x: x['accuracy'], reverse=True)[:10]

precision_values = [result['precision'] for result in top_10_results]
top_10_results_average_precision_scores = np.mean(precision_values)


filtered_results_2 = []
for result in top_10_results:
    if result['precision'] >= top_10_results_average_precision_scores:
        filtered_results_2.append(result)
top_5_results = sorted(filtered_results_2, key=lambda x: x['precision'], reverse=True)[:5]

best_model_1 = max(top_5_results, key=lambda x: x['recall'])

print(best_model_1)

{'penalty': 'l2', 'C': 0.001, 'solver': 'newton-cg', 'accuracy': 0.4808898651982452, 'precision': 0.4721973939232377, 'recall': 0.48091046181009905}


In [16]:
# We try different parameters and apply K-folding on them. Our evaluation metrices will be accuracy, precision, and recall.
# We weren't able to apply ROC, as it's a multi-class output, and having 1v2 roc isn't the best metric. 
# We store the results to choose the best one later.
# This is the 2nd order one; however, we interrupted the kernel, as it was showing very slight improvement, and we already tried
# this on weighting data in the other file, and got no improvement. It took there lots of time, so we didn't wanna go through the same thing.

penalty_options = ['l1', None, 'l2']
C_values = [0.001, 0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64, 1.28, 2.56, 5.12, 10.24]
solver_options = {
    'lbfgs': ['l2', None],
    'liblinear': ['l1', 'l2'],
    'newton-cg': ['l2', None],
    'newton-cholesky': ['l2', None],
    'sag': ['l2', None],
    'saga': ['elasticnet', 'l1', 'l2', None]
}
results = []
warnings.filterwarnings("ignore", category=ConvergenceWarning)

polynomial_transformer = PolynomialFeatures(degree=2)
x_poly = polynomial_transformer.fit_transform(x)  

for penalty in penalty_options:
    for C in C_values:
        for solver in solver_options.keys():
            if penalty in solver_options[solver]:
                accuracy_scores = []
                precision_scores = []
                recall_scores = []

                kf = StratifiedKFold(n_splits=5, random_state=10, shuffle=True)

                for train_index, test_index in kf.split(x_poly, y):  # Use the polynomial features (x_poly)
                    X_train_fold, X_cross = x_poly[train_index], x_poly[test_index]
                    y_train_fold, y_cross = y.iloc[train_index], y.iloc[test_index]

                    logistic_model = LogisticRegression(penalty=penalty, C=C, solver=solver, max_iter=100, class_weight=class_weight_dictionary)
                    logistic_model.fit(X_train_fold, y_train_fold)
                    y_pred = logistic_model.predict(X_cross)

                    accuracy = accuracy_score(y_cross, y_pred)
                    precision = precision_score(y_cross, y_pred, average='macro', zero_division=0)
                    recall = recall_score(y_cross, y_pred, average='macro')

                    accuracy_scores.append(accuracy)
                    precision_scores.append(precision)
                    recall_scores.append(recall)

                this_result = {
                    'penalty': penalty,
                    'C': C,
                    'solver': solver,
                    'accuracy': np.mean(accuracy_scores),
                    'precision': np.mean(precision_scores),
                    'recall': np.mean(recall_scores)
                }
                results.append(this_result)
                print(this_result)


{'penalty': 'l1', 'C': 0.001, 'solver': 'liblinear', 'accuracy': 0.4965628554666174, 'precision': 0.4850331298520678, 'recall': 0.4965884756630777}
{'penalty': 'l1', 'C': 0.001, 'solver': 'saga', 'accuracy': 0.4960983256828385, 'precision': 0.4863139564034234, 'recall': 0.49612051884594405}
{'penalty': 'l1', 'C': 0.01, 'solver': 'liblinear', 'accuracy': 0.5036479120313112, 'precision': 0.4925271514221919, 'recall': 0.5036724848137919}
{'penalty': 'l1', 'C': 0.01, 'solver': 'saga', 'accuracy': 0.5024494997758053, 'precision': 0.49198789172499335, 'recall': 0.5024728340782166}
{'penalty': 'l1', 'C': 0.02, 'solver': 'liblinear', 'accuracy': 0.5042568797415018, 'precision': 0.49317626903345263, 'recall': 0.50428141909731}
{'penalty': 'l1', 'C': 0.02, 'solver': 'saga', 'accuracy': 0.5030155298414976, 'precision': 0.49255395073133157, 'recall': 0.5030388964747246}
{'penalty': 'l1', 'C': 0.04, 'solver': 'liblinear', 'accuracy': 0.5052054532547958, 'precision': 0.49408138015092185, 'recall': 0

KeyboardInterrupt: 

In [17]:
# We favor precision and recall over accuracy ; however, we still take accuracy into consideration. 
# So we take the top 10 accuracy scores passing the average precision and recall values.
# After this we, take the top 5 of them based on precision. Then we take the one having the maximum recall.

precision_values = [result['precision'] for result in results]

all_models_average_precision_scores = np.mean(precision_values)

recall_values = [result['recall'] for result in results]
all_models_average_recall_scores = np.mean(recall_values)

filtered_results = []
for result in results:
    if result['precision'] >= all_models_average_precision_scores and result['recall'] >= all_models_average_recall_scores:
        filtered_results.append(result)     
top_10_results = sorted(filtered_results, key=lambda x: x['accuracy'], reverse=True)[:10]

precision_values = [result['precision'] for result in top_10_results]
top_10_results_average_precision_scores = np.mean(precision_values)


filtered_results_2 = []
for result in top_10_results:
    if result['precision'] >= top_10_results_average_precision_scores:
        filtered_results_2.append(result)
top_5_results = sorted(filtered_results_2, key=lambda x: x['precision'], reverse=True)[:5]

best_model_2 = max(top_5_results, key=lambda x: x['recall'])

print(best_model_2)

{'penalty': 'l1', 'C': 0.32, 'solver': 'liblinear', 'accuracy': 0.5063335989854894, 'precision': 0.4951384724878055, 'recall': 0.5063583383101693}


In [18]:
#Compare betwen the 2 models (2nd, and first order polynomial logistic regression models )

print(best_model_1)
print(best_model_2)


{'penalty': 'l2', 'C': 0.001, 'solver': 'newton-cg', 'accuracy': 0.4808898651982452, 'precision': 0.4721973939232377, 'recall': 0.48091046181009905}
{'penalty': 'l1', 'C': 0.32, 'solver': 'liblinear', 'accuracy': 0.5063335989854894, 'precision': 0.4951384724878055, 'recall': 0.5063583383101693}


In [19]:
# Create the final model with the best parameters

# On trying lots of different parameters, the model always performs poorly.
# Here is some suggestions that may improve the performance.
# Logistic regression may be affected to outliers, as it's very sensitive to them so using a clustering algorithm to kick outliers, and grouping similar data.
# Also trying different thresholds, adding/removing some features (domain knowledge), or trying even higher order models.
# However, at this point I think moving to another algorithm is more efficient, as they already show way better results, and
# this models is computationally expensive, and takes too long to train.

final_logistic_model = LogisticRegression(penalty=best_model_1['penalty'], C=best_model_1['C'], solver=best_model_1['solver']
                    , max_iter=5000, class_weight=class_weight_dictionary)

final_logistic_model.fit(x,y)
y_prediction = final_logistic_model.predict(x_testing)

final_accuracy = accuracy_score(y_testing, y_prediction)
final_precision = precision_score(y_testing, y_prediction, average='macro',zero_division=0)
final_recall = recall_score(y_testing, y_prediction, average='macro')
my_confusion_matrix = confusion_matrix(y_testing, y_prediction, labels=[0, 1, 2])

print("Testing Accuracy:", final_accuracy)
print("Testing Precision:", final_precision)
print("Testing Recall:", final_recall)

print("Confusion Matrix:")
print("True Fatal: " , my_confusion_matrix[0,0],  "False Serious: ", my_confusion_matrix[0,1], "False Slight: ",my_confusion_matrix[0,2] )
print("False Fatal: " , my_confusion_matrix[1,0],  "True Serious: ", my_confusion_matrix[1,1], "False Slight: ",my_confusion_matrix[1,2])
print("False Fatal: " , my_confusion_matrix[2,0],  "False Serious: ", my_confusion_matrix[2,1], "True Slight: ",my_confusion_matrix[2,2])


# confusion matrix explanation (applies to more dimensions)
#                  Predicted bird                         Predicted cat
# Actual bird       True bird                               False Cat 
# Actual cat        False Bird                             True Cat  

Testing Accuracy: 0.4825801313955803
Testing Precision: 0.47362605679348485
Testing Recall: 0.48246178167525694
Confusion Matrix:
True Fatal:  9416 False Serious:  3010 False Slight:  2656
False Fatal:  5095 True Serious:  4638 False Slight:  5317
False Fatal:  3518 False Serious:  3795 True Slight:  7762
