In [None]:
import numpy
import pandas

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix

In [None]:
# Location of the training CSV; kept in one named constant so the path is easy to retarget.
TRAINING_CSV_PATH = "/content/drive/MyDrive/MLAA/Assignment 2/Datasets/training.csv"

# Read the raw training data into memory.
dataFrame = pandas.read_csv(TRAINING_CSV_PATH)

In [None]:
# Lookup tables that turn the categorical columns into integer codes.
# A missing value (NaN) gets its own code in the gender and age-band tables.

genderDictionary = {
    "Male": 1,
    "Female": 2,
    numpy.nan: 3,
}

# Age bands are numbered 1..7 in the order listed; NaN is coded as 0.
ageBandDictionary = {
    **{
        bandLabel: bandCode
        for bandCode, bandLabel in enumerate(
            (
                "1. <25",
                "2. 25 to 34",
                "3. 35 to 44",
                "4. 45 to 54",
                "5. 55 to 64",
                "6. 65 to 74",
                "7. 75+",
            ),
            start=1,
        )
    },
    numpy.nan: 0,
}

# Car segments are numbered 1..4 in the order listed.
carSegmentDictionary = dict(
    zip(("Small/Medium", "Large/SUV", "LCV", "Other"), range(1, 5))
)

# "model_1" .. "model_19" map to their numeric suffix.
carModelDictionary = {f"model_{modelNumber}": modelNumber for modelNumber in range(1, 20)}

In [None]:
# Encode the categorical columns with the lookup tables defined for them.
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `dataFrame[col]`: that form mutates a
# column selection, which does not update `dataFrame` under pandas
# Copy-on-Write and is flagged as chained assignment by pandas 2.x.
dataFrame["gender"] = dataFrame["gender"].replace(genderDictionary)
dataFrame["age_band"] = dataFrame["age_band"].replace(ageBandDictionary)
dataFrame["car_model"] = dataFrame["car_model"].replace(carModelDictionary)
dataFrame["car_segment"] = dataFrame["car_segment"].replace(carSegmentDictionary)

In [None]:
# Discard every row that still contains at least one missing value.
dataFrame = dataFrame.dropna(axis=0, how="any")

# Per-column count of remaining missing values (displayed as the cell output).
dataFrame.isnull().sum()

ID                           0
Target                       0
age_band                     0
gender                       0
car_model                    0
car_segment                  0
age_of_vehicle_years         0
sched_serv_warr              0
non_sched_serv_warr          0
sched_serv_paid              0
non_sched_serv_paid          0
total_paid_services          0
total_services               0
mth_since_last_serv          0
annualised_mileage           0
num_dealers_visited          0
num_serv_dealer_purchased    0
dtype: int64

In [None]:
# Separate the label from the predictors.
# NOTE: `pop` removes "Target" from `dataFrame`, so this cell can only be run
# once per data load; re-running it raises KeyError.
y = dataFrame.pop("Target")

# Keep only the numeric service/usage columns as model inputs.
dataFrame = dataFrame[[
    "age_of_vehicle_years",
    "sched_serv_warr",
    "non_sched_serv_warr",
    "sched_serv_paid",
    "total_paid_services",
    "total_services",
    "mth_since_last_serv",
    "annualised_mileage",
]]

# 80/20 hold-out split.
# - random_state pins the shuffle so the scores below are reproducible on a
#   Restart & Run All.
# - stratify=y keeps the class ratio the same in both partitions; the target is
#   heavily imbalanced (a majority-class guess already scores ~0.97 accuracy),
#   so an unstratified split can shift the minority share between runs.
X_train, X_test, y_train, y_test = train_test_split(
    dataFrame,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Majority-class baseline: predict the most frequent training label for every row.
# `Series.mode()` returns a Series (several values on a tie), so take the first
# entry to get a scalar fill value. The mode is taken from y_train only, so the
# baseline does not look at the held-out labels.
y_mode = y_train.mode().iloc[0]
y_base = numpy.full(y_train.shape, y_mode)

print("Accuracy Score : ", accuracy_score(y_train, y_base))
print("F1 Score : ", f1_score(y_train, y_base, average='weighted'))

Accuracy Score :  0.9733889158552951
F1 Score :  0.960262798576927


In [None]:
# Logistic regression with scikit-learn's default settings.
logisticRegression = LogisticRegression()
logisticRegression.fit(X_train, y_train)

# Predictions for both partitions.
y_train_preds = logisticRegression.predict(X_train)
y_test_preds = logisticRegression.predict(X_test)

# (label, true values, predicted values) for each partition, in report order.
evaluationSplits = (
    ("Training", y_train, y_train_preds),
    ("Testing", y_test, y_test_preds),
)

# Accuracy and weighted F1 for each partition.
for splitName, actual, predicted in evaluationSplits:
    print(f"\n {splitName} Data :")
    print("Accuracy Score : ", accuracy_score(actual, predicted))
    print("F1 Score : ", f1_score(actual, predicted, average='weighted'))

# Confusion matrices, with rows/columns ordered by the fitted model's classes.
for splitName, actual, predicted in evaluationSplits:
    print(
        f"\nConfusion Matrix for {splitName} :\n",
        confusion_matrix(actual, predicted, labels=logisticRegression.classes_),
    )


 Training Data :
Accuracy Score :  0.9740265920490344
F1 Score :  0.964205967147454

 Testing Data :
Accuracy Score :  0.973008984315517
F1 Score :  0.962833694759908

Confusion Matrix for Training :
 [[102123    150]
 [  2579    217]]

Confusion Matrix for Testing :
 [[25503    40]
 [  669    56]]


In [None]:
# Logistic regression refit with class_weight="balanced".
logisticRegression = LogisticRegression(class_weight="balanced")
logisticRegression.fit(X_train, y_train)

# Predictions for both partitions.
y_train_preds = logisticRegression.predict(X_train)
y_test_preds = logisticRegression.predict(X_test)

# (label, true values, predicted values) for each partition, in report order.
evaluationSplits = (
    ("Training", y_train, y_train_preds),
    ("Testing", y_test, y_test_preds),
)

# Accuracy and weighted F1 for each partition.
for splitName, actual, predicted in evaluationSplits:
    print(f"\n {splitName} Data :")
    print("Accuracy Score : ", accuracy_score(actual, predicted))
    print("F1 Score : ", f1_score(actual, predicted, average='weighted'))

# Confusion matrices, with rows/columns ordered by the fitted model's classes.
for splitName, actual, predicted in evaluationSplits:
    print(
        f"\nConfusion Matrix for {splitName} :\n",
        confusion_matrix(actual, predicted, labels=logisticRegression.classes_),
    )


 Training Data :
Accuracy Score :  0.7538665067717405
F1 Score :  0.8372960936744062

 Testing Data :
Accuracy Score :  0.7513324196741282
F1 Score :  0.8349031809825402

Confusion Matrix for Training :
 [[76785 25488]
 [  373  2423]]

Confusion Matrix for Testing :
 [[19099  6444]
 [   88   637]]


In [None]:
# Logistic regression with class_weight="balanced", fitted with the
# "newton-cholesky" solver instead of the default one.
logisticRegression = LogisticRegression(solver="newton-cholesky", class_weight="balanced")
logisticRegression.fit(X_train, y_train)

# Predictions for both partitions.
y_train_preds = logisticRegression.predict(X_train)
y_test_preds = logisticRegression.predict(X_test)

# (label, true values, predicted values) for each partition, in report order.
evaluationSplits = (
    ("Training", y_train, y_train_preds),
    ("Testing", y_test, y_test_preds),
)

# Accuracy and weighted F1 for each partition.
for splitName, actual, predicted in evaluationSplits:
    print(f"\n {splitName} Data :")
    print("Accuracy Score : ", accuracy_score(actual, predicted))
    print("F1 Score : ", f1_score(actual, predicted, average='weighted'))

# Confusion matrices, with rows/columns ordered by the fitted model's classes.
for splitName, actual, predicted in evaluationSplits:
    print(
        f"\nConfusion Matrix for {splitName} :\n",
        confusion_matrix(actual, predicted, labels=logisticRegression.classes_),
    )


 Training Data :
Accuracy Score :  0.7538665067717405
F1 Score :  0.8372960936744062

 Testing Data :
Accuracy Score :  0.7512943505405817
F1 Score :  0.8348776894830401

Confusion Matrix for Training :
 [[76785 25488]
 [  373  2423]]

Confusion Matrix for Testing :
 [[19098  6445]
 [   88   637]]
