In [None]:
import numpy
import pandas

from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix

In [None]:
# Load the raw training data from a Google Drive location.
# NOTE(review): assumes Drive is already mounted at /content/drive — confirm.
trainingCsvPath = "/content/drive/MyDrive/MLAA/Assignment 2/Datasets/training.csv"
dataFrame = pandas.read_csv(trainingCsvPath)

In [None]:
# Integer codes for the "gender" column; a missing value (NaN) gets its own code.
genderDictionary = dict(Male=1, Female=2)
genderDictionary[numpy.nan] = 3

# Integer codes for the "age_band" column: bands are numbered 1..7 in
# ascending age order, and a missing value (NaN) is coded as 0.
ageBandLabels = [
    "1. <25",
    "2. 25 to 34",
    "3. 35 to 44",
    "4. 45 to 54",
    "5. 55 to 64",
    "6. 65 to 74",
    "7. 75+",
]
ageBandDictionary = {label: code for code, label in enumerate(ageBandLabels, start=1)}
ageBandDictionary[numpy.nan] = 0

# Integer codes for the "car_segment" column (no code is defined for NaN here).
carSegmentDictionary = dict(
    zip(("Small/Medium", "Large/SUV", "LCV", "Other"), range(1, 5))
)

# Integer codes for the "car_model" column: "model_<k>" maps to k for k = 1..19
# (no code is defined for NaN here).
carModelDictionary = {f"model_{number}": number for number in range(1, 20)}

In [None]:
# Encode the categorical columns as integers using the lookup dictionaries.
#
# The result is assigned back to the column rather than calling
# ``replace(..., inplace=True)`` on ``dataFrame[col]``: the in-place form acts
# on an intermediate Series, which triggers a chained-assignment FutureWarning
# in pandas 2.x and no longer updates ``dataFrame`` once Copy-on-Write is
# enabled (the default in pandas 3.0).
dataFrame["gender"] = dataFrame["gender"].replace(genderDictionary)
dataFrame["age_band"] = dataFrame["age_band"].replace(ageBandDictionary)
dataFrame["car_model"] = dataFrame["car_model"].replace(carModelDictionary)
dataFrame["car_segment"] = dataFrame["car_segment"].replace(carSegmentDictionary)

In [None]:
# Discard every row that still has at least one missing value, then show the
# per-column NaN counts as the cell output to confirm none remain.
dataFrame = dataFrame.dropna(axis="index", how="any")
dataFrame.isna().sum()

In [None]:
# Separate the label from the predictors.
# NOTE: ``pop`` removes "Target" from ``dataFrame`` and ``dataFrame`` is then
# narrowed to the feature columns, so this cell cannot be re-run without
# reloading the data first.
y = dataFrame.pop("Target")

featureColumns = [
    "age_of_vehicle_years",
    "sched_serv_warr",
    "non_sched_serv_warr",
    "sched_serv_paid",
    "total_paid_services",
    "total_services",
    "mth_since_last_serv",
    "annualised_mileage",
]
dataFrame = dataFrame[featureColumns]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    dataFrame,
    y,
    random_state=42,
    test_size=0.20,
)

In [None]:
# Majority-class baseline: predict the most frequent label for every training row.
#
# The mode is taken from the training labels only, so the baseline does not use
# the held-out rows. ``Series.mode()`` returns a Series (several values when
# there is a tie), so the first entry is used as a scalar fill value; passing
# the whole Series to ``numpy.full`` only broadcasts when there is one mode.
y_mode = y_train.mode()
y_base = numpy.full(y_train.shape, y_mode.iloc[0])

print("Accuracy Score : ", accuracy_score(y_train, y_base))
print("F1 Score : ", f1_score(y_train, y_base, average='weighted'))

Accuracy Score :  0.9727702747718167
F1 Score :  0.9593433351879643


In [None]:
# Unconstrained decision tree with balanced class weights, fitted on the
# training split. ``fit`` returns the estimator itself, so it can be chained.
baseTree = DecisionTreeClassifier(class_weight="balanced", random_state=42).fit(X_train, y_train)

y_train_preds = baseTree.predict(X_train)
y_test_preds = baseTree.predict(X_test)

# (score header, confusion-matrix header, true labels, predicted labels) per split.
baseTreeReports = [
    ("\n Training Data :", "\nConfusion Matrix for Training :\n", y_train, y_train_preds),
    ("\n Testing Data :", "\nConfusion Matrix for Testing :\n", y_test, y_test_preds),
]

# Scores for both splits first, then both confusion matrices.
for scoreHeader, _, actual, predicted in baseTreeReports:
    print(scoreHeader)
    print("Accuracy Score : ", accuracy_score(actual, predicted))
    print("F1 Score : ", f1_score(actual, predicted, average='weighted'))

for _, matrixHeader, actual, predicted in baseTreeReports:
    print(matrixHeader, confusion_matrix(actual, predicted, labels=baseTree.classes_))


 Training Data :
Accuracy Score :  0.9969067945826077
F1 Score :  0.9969860469047783

 Testing Data :
Accuracy Score :  0.980470534490635
F1 Score :  0.9809553945378519

Confusion Matrix for Training :
 [[101886    322]
 [     3   2858]]

Confusion Matrix for Testing :
 [[25316   292]
 [  221   439]]


In [None]:
# Decision tree that only splits nodes holding at least 150 samples, with
# balanced class weights. ``fit`` returns the estimator itself, so it can be chained.
minSampleTree = DecisionTreeClassifier(
    class_weight="balanced",
    min_samples_split=150,
    random_state=42,
).fit(X_train, y_train)

y_train_preds = minSampleTree.predict(X_train)
y_test_preds = minSampleTree.predict(X_test)

# (score header, confusion-matrix header, true labels, predicted labels) per split.
minSampleReports = [
    ("\n Training Data :", "\nConfusion Matrix for Training :\n", y_train, y_train_preds),
    ("\n Testing Data :", "\nConfusion Matrix for Testing :\n", y_test, y_test_preds),
]

# Scores for both splits first, then both confusion matrices.
for scoreHeader, _, actual, predicted in minSampleReports:
    print(scoreHeader)
    print("Accuracy Score : ", accuracy_score(actual, predicted))
    print("F1 Score : ", f1_score(actual, predicted, average='weighted'))

for _, matrixHeader, actual, predicted in minSampleReports:
    print(matrixHeader, confusion_matrix(actual, predicted, labels=minSampleTree.classes_))


 Training Data :
Accuracy Score :  0.941428965727284
F1 Score :  0.9555462086931491

 Testing Data :
Accuracy Score :  0.9389371097913811
F1 Score :  0.9541485684799206

Confusion Matrix for Training :
 [[96125  6083]
 [   71  2790]]

Confusion Matrix for Testing :
 [[24068  1540]
 [   64   596]]


In [None]:
# Decision tree limited to 75 leaf nodes that only splits nodes holding at
# least 150 samples, with balanced class weights.
# NOTE: this rebinds ``minSampleTree``, replacing the estimator fitted in the
# earlier cell that used min_samples_split=150 alone.
minSampleTree = DecisionTreeClassifier(
    class_weight="balanced",
    max_leaf_nodes=75,
    min_samples_split=150,
    random_state=42,
).fit(X_train, y_train)

y_train_preds = minSampleTree.predict(X_train)
y_test_preds = minSampleTree.predict(X_test)

# (score header, confusion-matrix header, true labels, predicted labels) per split.
leafLimitedReports = [
    ("\n Training Data :", "\nConfusion Matrix for Training :\n", y_train, y_train_preds),
    ("\n Testing Data :", "\nConfusion Matrix for Testing :\n", y_test, y_test_preds),
]

# Scores for both splits first, then both confusion matrices.
for scoreHeader, _, actual, predicted in leafLimitedReports:
    print(scoreHeader)
    print("Accuracy Score : ", accuracy_score(actual, predicted))
    print("F1 Score : ", f1_score(actual, predicted, average='weighted'))

for _, matrixHeader, actual, predicted in leafLimitedReports:
    print(matrixHeader, confusion_matrix(actual, predicted, labels=minSampleTree.classes_))


 Training Data :
Accuracy Score :  0.9324634287944112
F1 Score :  0.9497436963855678

 Testing Data :
Accuracy Score :  0.9331125323587636
F1 Score :  0.9506518166673206

Confusion Matrix for Training :
 [[95210  6998]
 [   98  2763]]

Confusion Matrix for Testing :
 [[23896  1712]
 [   45   615]]
