In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import sklearn.model_selection as model_selection
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import  cross_val_score
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
from sklearn.metrics import confusion_matrix




## Mapping Categorical Variables
Preprocess Categorical Data function takes in the dataframe and changes the categorical columns to mapping 0, 1 values or one hot encoding vectors.

Mapping to 0 or 1: Maps the categorical column if only two values are possible for that attribute

One Hot Encoding: One Hot encoding for the categorical column if that attribute has more than two values

In [2]:
def preprocess_categorical_data(X):
    type_list = X.dtypes
    cols = X.columns
    for col in cols:
        if X[col].dtype == np.object:
            temp = set(X[col])
            if len(temp) == 2:
                temp_dict = dict()
                for key in temp:
                    if sum(X[col] == key) > (len(X[col]) / 2):
                        temp_dict[key] = 1
                    else:
                        temp_dict[key] = 0
                X.loc[:,col] = X[col].map(temp_dict).astype(int)
            else:
                np.unique(X[col])
                emb = pd.get_dummies(X[col], columns=col, prefix=col)
                X = pd.concat([X, emb], axis=1)
                X.drop([col], axis=1, inplace=True)
    return X


## Data Imputation
This function imputes the missing rows data based on the imputed_column given 
as input


In [3]:
def impute_missing_data(X, imputed_column):
    cols = X.columns
    for col in cols:
        if X[col].dtype == np.float:
          X[col] = X.groupby(imputed_column)[col].transform(lambda grp: grp.fillna(grp.mean()))
    return X


## Dataset - Don't get Kicked
Data Source: https://www.kaggle.com/c/DontGetKicked


Data Description: The data is of cars sold at various auctions and the parameters. And the task is to predict whether the car bought is a bad buy or not.

We artificially induced missing data by replacing values with NaN using python code. Also i sampled the data for 50 models of cars to make the dataset smaller.

In [4]:
with open('carvana.csv', mode='r') as csv_file:
    df = pd.read_csv(csv_file)
df.columns

Index(['IsBadBuy', 'PurchDate', 'Auction', 'VehYear', 'VehicleAge', 'Make',
       'Model', 'Trim', 'SubModel', 'Color', 'Transmission', 'VehOdo',
       'Nationality', 'Size', 'TopThreeAmericanName',
       'MMRAcquisitionAuctionAveragePrice', 'MMRAcquisitionAuctionCleanPrice',
       'MMRAcquisitionRetailAveragePrice', 'MMRAcquisitonRetailCleanPrice',
       'MMRCurrentAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice',
       'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice', 'BYRNO',
       'VehBCost', 'IsOnlineSale', 'WarrantyCost'],
      dtype='object')

## Dropping off unnecessary columns

We are dropping off the columns which are IDs of the buyer and auction ID 

In [5]:
df_new = df.drop(columns=['BYRNO','Make','SubModel','Trim','Color'])


## Imputing the missing data

We are imputing the missing data in real value columns with the average value of the same model cars

We also removed the Model column since the variable IsBadBuy depends on the cost at which car was bought and the estimated price of the car.

In [6]:
df_new = impute_missing_data(df_new, 'Model')
df_new = df_new.drop(columns=['Model'])

## Pre-processing categorical data

Mapping or one hot encoding the categorical columns

In [7]:
df_new = preprocess_categorical_data(df_new)

## Dropping Null rows

The imputing method works if the missing values belongs to a model which has multiple columns to infer values from.  But if the model has only one row we don't have any reference values to impute from.

In [8]:
df_new = df_new.dropna()


## Data Assignments

Assigning the predictor variables to X 
Assigning the output variable to Y

In [9]:
X = df_new.drop(columns=['IsBadBuy'])
y = df_new['IsBadBuy']

## Data Splitting and Scaling

We are splitting the data to training and testing data.

Once the splitting is done we are scaling  the training data columns using the normal scaler.

And the testing data is modified using the scaling parameters of the training data.

We are using the Normal Scaler since normal scaler preserves the statistical properties of the data

In [10]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, random_state=0)
cols = X_train.columns
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train = pd.DataFrame(X_train,columns = cols)
X_test = pd.DataFrame(X_test,columns = cols)


## K Nearest Neighbors

We used grid search along with 10 Fold Cross Validation on number of neighbors to choose the best parameter based on accuracy. 

In [11]:
#KNN Classification
knn = KNeighborsClassifier()
k_range = list(range(1, 31))
param_grid = dict(n_neighbors=k_range)
grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')

In [12]:
grid.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                         13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                         23, 24, 25, 26, 27, 28, 29, 30]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

Confusion Matrix of the best K Nearest Neighbor Model.

As you can see because of the imbalanced data set we can see even though most of the positive examples are misclassified we are getting high accuracy.

One way to deal with imbalanced datasets is to use weighted models with balanced accuracy as parameter.

Or we could use a sampling strategy to deal with data imbalance.

In [13]:
best_params = grid.best_params_
knn = KNeighborsClassifier(n_neighbors=best_params['n_neighbors'])
knn.fit(X_train, y_train)
y_test_pred = knn.predict(X_test)
confusion_matrix(y_test, y_test_pred)


array([[968,   1],
       [131,   3]], dtype=int64)

Test Accuracy of the best K Nearest Neighbor Model

In [14]:
accuracy = sum(y_test == y_test_pred)/len(y_test_pred)
accuracy

0.8803263825929284

## Logistic Classifier Model

We are using lbfgs solver (Limitied Memory BFGS Model) which is a quasi-newton method for faster convergence. 

Since, LBFGS works only on l2 penalty we are performing grid search on l2 penalty with various c values and 5 fold cross validation

In [15]:
logistic_reg = LogisticRegression(multi_class="auto",solver="lbfgs", max_iter=10000, random_state=0)

In [16]:
param_grid = {  'penalty' : ['l2'],
     'C' : np.logspace(-4, 4, 20)}


In [17]:
grid = GridSearchCV(logistic_reg, param_grid, cv=5, scoring='accuracy')

In [18]:
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=10000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=0, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': array([1.0000000...,
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
      

From the confusion matrix we can see that some positive examples are correctly being classified.

Which also proves that with more data we will have a better model with high sensitivity and specificity values

In [19]:
#C = param_grid['C'][np.argmin(grid.cv_results_['rank_test_score'])]
#y_test_pred = grid.decision_function(X_test)
#y_test_pred
best_params = grid.best_params_
logistic_reg = LogisticRegression(multi_class="auto",solver="liblinear", max_iter=10000, random_state=0, penalty=best_params['penalty'], C= best_params['C'])
logistic_reg.fit(X_train, y_train)
y_test_pred = logistic_reg.predict(X_test)
confusion_matrix(y_test, y_test_pred)

array([[893,  76],
       [115,  19]], dtype=int64)

Test Accuracy of the Best logistic classifier model

In [20]:
accuracy = sum(y_test == y_test_pred)/len(y_test_pred)
accuracy

0.8268359020852222

## Linear SVC Model

We are using the Linear SVC model with squared_hinge loss with both l1 and l2 penalty.

l1 Model leads to much sparser model compared to l2 model.

Grid Search is performed on various C Values.

In [21]:
#Linear SVM
param_grid = {'penalty':['l1','l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

In [22]:
svc = LinearSVC(max_iter = 1e+4, random_state=0, loss='squared_hinge', dual=False)
grid = GridSearchCV(svc, param_grid, cv=5, scoring='accuracy')

We see this training because liblinear model is unable to fit the model with the given data in mentioned iterations limit.

Tried with various max_iterations to see if the model converges. But the model didn't converge.

In [23]:
grid.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LinearSVC(C=1.0, class_weight=None, dual=False,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=10000.0,
                                 multi_class='ovr', penalty='l2',
                                 random_state=0, tol=0.0001, verbose=0),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

Test Accuracy of the best Linear SVM Model

In [24]:
best_params = grid.best_params_
svc = LinearSVC(penalty = best_params['penalty'],C=best_params['C'], max_iter = 1e+4, random_state=0, loss='squared_hinge', dual=False)
svc.fit(X_train, y_train)
y_test_pred = svc.predict(X_test)
accuracy = sum(y_test == y_test_pred)/len(y_test_pred)
accuracy

0.8785131459655485

Confusion Matrix of the best Linear SVM Model. As you can see, since no positive samples are being classified correctly this linear svm model is not a great model despite the same accuracy levels.

In [25]:
confusion_matrix(y_test, y_test_pred)

array([[969,   0],
       [134,   0]], dtype=int64)

## RBF Kernel SVM

We are using grid search to train for different values of gamma and c parameters to find best rbf kernel svm.

In [26]:
#RBF Kernel SVM
tuned_parameters = {'gamma': [1e-3,1e-2,1e-1,1],
                     'C': [1, 10, 100, 1000]}

In [27]:
svm = SVC(random_state = 0, kernel = 'rbf')
grid = GridSearchCV(svm, tuned_parameters, cv=5, scoring='accuracy')

In [28]:
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=0, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 10, 100, 1000],
                         'gamma': [0.001, 0.01, 0.1, 1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [29]:
best_params = grid.best_params_

The accuracy of the best rbf kernel model.

In [30]:
svm = SVC(random_state = 0, kernel = 'rbf', C =best_params['C'], gamma=best_params['gamma'] )
svm.fit(X_train, y_train)
y_test_pred = svm.predict(X_test)
accuracy = sum(y_test == y_test_pred)/len(y_test_pred)
accuracy

0.8776065276518585

As seen from the confusion matrix, we can say that the rbf kernel svm is also not a good model due to low sensitivity

In [31]:
confusion_matrix(y_test, y_test_pred)

array([[967,   2],
       [133,   1]], dtype=int64)

## Linear and Polynomial Kernel SVM

We are using gridsearch with 5 fold cross validation with various degrees and C values.

In [32]:
#Polynomial Kernel SVM
tuned_parameters = {'degree': [1,2,3,4,5,6,7],
                     'C': [1, 10, 100, 1000]}

In [33]:
svm = SVC(random_state = 0, kernel = 'poly', gamma = 'auto')
grid = GridSearchCV(svm, tuned_parameters, cv=5, scoring='accuracy')

In [34]:
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto', kernel='poly', max_iter=-1,
                           probability=False, random_state=0, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 10, 100, 1000],
                         'degree': [1, 2, 3, 4, 5, 6, 7]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [35]:
best_params = grid.best_params_

The test accuracy of the best polyomial kernel svm model is 87.03%

In [36]:
svm = SVC(random_state=0, degree=best_params['degree'], C=best_params['C'], gamma='auto')
svm.fit(X_train, y_train)
y_test_pred = svm.predict(X_test)
accuracy = sum(y_test == y_test_pred)/len(y_test_pred)
accuracy

0.8703535811423391

From confusion matrix, we can see that the sensitivty of this model is very low.

In [37]:
confusion_matrix(y_test, y_test_pred)

array([[959,  10],
       [133,   1]], dtype=int64)

## Decision Tree Classifier

We use grid-search on various min_samples_split values with 10 fold cross-validation to find the best classifier.

In [38]:
#Decision Tree
dtc = DecisionTreeClassifier(random_state=0)
sample_split_range = list(range(2, 50))
param_grid = dict(min_samples_split=sample_split_range)
grid = GridSearchCV(dtc, param_grid, cv=10, scoring='balanced_accuracy')


In [39]:
grid.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=0,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
                                               12, 13, 14, 

In [40]:
best_param = grid.best_params_

The test accuracy of the decision tree classifer

In [41]:
dtc = DecisionTreeClassifier(random_state=0, min_samples_split=best_param['min_samples_split'])
dtc.fit(X_train, y_train)
y_test_pred = dtc.predict(X_test)
accuracy = sum(y_test == y_test_pred)/len(y_test_pred)
accuracy

0.8485947416137806

From the confusion matrix we can see that the sensitivty value is low but better than the SVM Models

In [42]:
confusion_matrix(y_test, y_test_pred)

array([[918,  51],
       [116,  18]], dtype=int64)

Seeing all the test accuracies and the sensitivity ratio of different models. We can say that the Decision Tree Classification Model is the best model inspite of SVM Models with higher test accuracies.

Since, the Decision tree classification model is the best model that gave high test accuracy and high sensitivity values.
