In [1]:
import numpy
import pandas

from sklearn.ensemble import ExtraTreesClassifier
from scipy.stats import randint

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error

In [2]:
dataFrame = pandas.read_csv("/content/drive/MyDrive/MLAA/Assignment 2/Datasets/training.csv")

In [3]:
genderDictionary = {"Male" : 1,
                    "Female" : 2,
                    numpy.nan : 3}

ageBandDictionary = {"1. <25" : 1,
                     "2. 25 to 34" : 2,
                     "3. 35 to 44" : 3,
                     "4. 45 to 54" : 4,
                     "5. 55 to 64" : 5,
                     "6. 65 to 74" : 6,
                     "7. 75+": 7,
                     numpy.nan : 0}

carSegmentDictionary = {"Small/Medium" : 1,
                        "Large/SUV" : 2,
                        "LCV" : 3,
                        "Other" : 4}

carModelDictionary = {"model_1" : 1,
                      "model_2" : 2,
                      "model_3" : 3,
                      "model_4" : 4,
                      "model_5" : 5,
                      "model_6" : 6,
                      "model_7" : 7,
                      "model_8" : 8,
                      "model_9" : 9,
                      "model_10" : 10,
                      "model_11" : 11,
                      "model_12" : 12,
                      "model_13" : 13,
                      "model_14" : 14,
                      "model_15" : 15,
                      "model_16" : 16,
                      "model_17" : 17,
                      "model_18" : 18,
                      "model_19" : 19}

In [4]:
dataFrame["gender"].replace(genderDictionary, inplace=True)
dataFrame["age_band"].replace(ageBandDictionary, inplace=True)
dataFrame["car_model"].replace(carModelDictionary, inplace=True)
dataFrame["car_segment"].replace(carSegmentDictionary, inplace=True)

In [5]:
dataFrame.dropna(how='any', inplace=True)
dataFrame.isna().sum()

ID                           0
Target                       0
age_band                     0
gender                       0
car_model                    0
car_segment                  0
age_of_vehicle_years         0
sched_serv_warr              0
non_sched_serv_warr          0
sched_serv_paid              0
non_sched_serv_paid          0
total_paid_services          0
total_services               0
mth_since_last_serv          0
annualised_mileage           0
num_dealers_visited          0
num_serv_dealer_purchased    0
dtype: int64

In [6]:
y = dataFrame.pop("Target")
dataFrame = dataFrame[["age_of_vehicle_years", "sched_serv_warr", "non_sched_serv_warr", "sched_serv_paid", "total_paid_services", "total_services", "mth_since_last_serv", "annualised_mileage"]]

X_train, X_test, y_train, y_test = train_test_split(dataFrame, y, test_size=0.20, random_state=42)

In [7]:
y_mode = y.mode()
y_base = numpy.full(y_train.shape, y_mode)

print("Accuracy Score : ", accuracy_score(y_train, y_base))
print("F1 Score : ", f1_score(y_train, y_base, average='weighted'))

Accuracy Score :  0.9727702747718167
F1 Score :  0.9593433351879643


In [8]:
random_ranges = {
    'n_estimators': randint(50, 250),
    'min_samples_split': randint(150, 250)
}

In [9]:
classifier = ExtraTreesClassifier(random_state=8, max_features="log2", class_weight="balanced")

In [10]:
randomSearcher = RandomizedSearchCV(classifier, random_ranges, random_state=8, scoring='neg_root_mean_squared_error', cv=5, verbose=1)

In [11]:
randomSearcher.fit(X_train, y_train)
randomSearcher.best_params_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


{'min_samples_split': 155, 'n_estimators': 186}

In [12]:
optimisedRegressor = randomSearcher.best_estimator_

In [24]:
y_train_preds = optimisedRegressor.predict(X_train)
y_test_preds = optimisedRegressor.predict(X_test)

In [25]:
print("\n Training Scores : \n")
print("Mean Squared Error : ", mean_squared_error(y_train, y_train_preds))

print("\n Testing Scores : \n")
print("Mean Squared Error : ", mean_squared_error(y_test, y_test_preds))


 Training Scores : 

Mean Squared Error :  0.06608038527063168

 Testing Scores : 

Mean Squared Error :  0.0678391959798995
