# Lecture 12

Here we use the E-car dataset on car loan applications.

Run the next cell only if you are running this notebook on Google Colab:

In [None]:
# !! Run this on Google Colab only.
# from google.colab import drive
# drive.mount('/content/drive')

Import the required modules

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing
import sklearn.neighbors
import sklearn.datasets
import sklearn.neural_network
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global random number generator so that values drawn from it are
# the same on every run of the notebook.
np.random.seed(42)

Load the dataset:

In [None]:
# Relative path to the CSV file; it is resolved against the current working
# directory.
# NOTE(review): presumably this points inside the Google Drive mounted by the
# Colab cell — confirm the file location when running elsewhere.
DATA_FILEPATH = "drive/MyDrive/e_car_data.csv"
# Read the whole CSV into a pandas DataFrame.
df = pd.read_csv(DATA_FILEPATH)

Show summary statistics:

In [None]:
# Summary statistics of the DataFrame's columns; as the last expression in the
# cell, the resulting table is displayed by the notebook.
df.describe()

Place loans into 8 unordered classes / bins, depending on their acceptance (0-3 for denied, 4-7 for accepted) and their Annual Percentage Rate (APR, into 4 classes each):

In [None]:
def loan_to_bin(accept_value, apr_value):
    """Map a loan to one of 8 class labels from its acceptance flag and APR.

    Loans with a falsy ``accept_value`` (denied) get labels 0-3 and loans with
    a truthy ``accept_value`` (accepted) get labels 4-7. Within each group the
    label is chosen by APR band: below 4, below 6, below 8, and everything
    else.

    Args:
        accept_value: Truthy if the loan was accepted, falsy otherwise.
        apr_value: Annual Percentage Rate of the loan.

    Returns:
        An integer class label in the range 0-7.
    """
    base = 4 if accept_value else 0

    # Exclusive upper bounds of the first three APR bands, in rising order;
    # the first bound the APR falls below decides the band.
    for band, upper_bound in enumerate((4, 6, 8)):
        if apr_value < upper_bound:
            return base + band

    # Not below any bound: the loan belongs to the last band.
    return base + 3

Create new labels for the dataset:

In [None]:
# Pull the two columns that determine the class label out of the DataFrame.
apr = df['apr'].values
accept = df['accept'].values

# Build one class label per loan. The two arrays are walked in lockstep with
# zip(), and no temporary is named `bin`, which would shadow the builtin.
y = []
for accept_value, apr_value in zip(accept, apr):
    y.append(loan_to_bin(accept_value, apr_value))

# Alternative in one-line with a list comprehension (but less readable):
# y = [loan_to_bin(a, r) for a, r in zip(accept, apr)]

Define the features and preprocess (or scale, or normalize) them, to help with convergence:

In [None]:
# Names of the DataFrame columns used as model features.
# NOTE(review): 'apr' is also one of the two inputs used to build the class
# labels in `y`, so the models can read part of the label directly from this
# feature — confirm that this is intended.
columns = ['tier',
           'amount',
           'apr',
           'prime',
           'fico',
           'competition apr',
           'partner bin']
# Extract the selected columns as a NumPy array (one row per loan).
x = df[columns].values

# Don't forget to scale features!
# NOTE(review): scaling is applied to the full dataset here, before it is split
# into training/validation/test sets, so the scaling statistics include the
# held-out rows — consider fitting a scaler on the training split only.
x_scaled = preprocessing.scale(x)

Split the data into three groups: training, validation, and testing (60/20/20 split):


In [None]:
# Two-stage split into training (60%), validation (20%), testing (20%).

# Stage 1: hold out 20% of all rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    x_scaled, y, test_size=0.2, random_state=1)

# Stage 2: carve the validation set out of the remaining 80%.
# 0.25 x 0.8 = 0.2, i.e. 20% of the full dataset.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.25, random_state=1)


# Multi-layer perceptron

We'll use a model from SciKit-Learn, which already has all we need (e.g., cross-entropy loss function). We define two hidden layers, with 64 and 32 neurons, and fit it to data. Notice that the accuracy has improved to around 80%.

In [None]:
# Multi-layer perceptron with two hidden layers (64 and 32 neurons) and
# logistic activations; random_state is fixed for repeatable results.
mlp = sklearn.neural_network.MLPClassifier(
    hidden_layer_sizes=(64, 32),
    activation="logistic",
    max_iter=1000,
    random_state=42,
)
mlp.fit(x_train, y_train)

# Predict the classes of the held-out test rows.
y_pred = mlp.predict(x_test)

# Fraction of test rows whose predicted class matches the true class.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy of multi-layer perceptron on test data: %.5f" % accuracy)

# Accuracy comparison of our 3 classifiers

Let's compare this to the other two classifiers we have: k-Nearest Neighbors and multinomial logistic regression.

In [None]:
# Logistic regression baseline: fit on the training split, score on the test
# split. max_iter is raised to 1e5 to give the solver room to converge.
logistic = LogisticRegression(max_iter=int(1e5))
logistic.fit(x_train, y_train)
y_logistic_pred = logistic.predict(x_test)
accuracy_logistic = metrics.accuracy_score(y_test, y_logistic_pred)
print("Accuracy of logistic regression on test data: %.5f" %
      accuracy_logistic)

# k-nearest neighbors classifier with scikit-learn's default settings,
# evaluated the same way. The message says "k-nearest neighbors" rather than
# "kNN regression", because this model is a classifier.
knn = sklearn.neighbors.KNeighborsClassifier()
knn.fit(x_train, y_train)
y_knn_pred = knn.predict(x_test)
accuracy_knn = metrics.accuracy_score(y_test, y_knn_pred)
print("Accuracy of k-nearest neighbors on test data: %.5f" %
      accuracy_knn)

# How to choose the number of neighbors / hyper-parameters

In [None]:
# Define a helper function to be DRY.
def fit_knn(k, x_train, y_train, x_test, y_test):
    """Fit a k-nearest-neighbors classifier and score it on held-out data.

    Args:
        k: Number of neighbors, passed to the classifier as ``n_neighbors``.
        x_train: Features used to fit the classifier.
        y_train: Labels used to fit the classifier.
        x_test: Features the accuracy is computed on.
        y_test: True labels for ``x_test``.

    Returns:
        A ``(classifier, accuracy)`` tuple: the fitted classifier and its
        accuracy on ``(x_test, y_test)``.
    """
    classifier = sklearn.neighbors.KNeighborsClassifier(n_neighbors=k)
    classifier.fit(x_train, y_train)
    score = metrics.accuracy_score(y_test, classifier.predict(x_test))
    return classifier, score


# Find the best k: every candidate is scored on the validation split, so the
# test split plays no part in choosing the hyper-parameter.
k_range = range(1, 100)
accuracy_range = []
for k in k_range:
    # A dedicated name keeps this loop from overwriting the `accuracy`
    # variable computed for the multi-layer perceptron.
    _, val_accuracy = fit_knn(k, x_train, y_train, x_val, y_val)
    accuracy_range.append(val_accuracy)

# Validation accuracy as a function of k.
plt.plot(k_range, accuracy_range)
plt.xlabel("Number of neighbors (k)")
plt.ylabel("Validation accuracy")

# np.argmax returns the first maximum, so ties are resolved towards the
# smallest k.
best_k = k_range[np.argmax(accuracy_range)]

# Compute accuracy with the best k.
_, acc = fit_knn(best_k, x_train, y_train, x_test, y_test)
print("Accuracy of optimized k-nearest neighbors (k=%d): %.3f" % (best_k, acc))

# Question

Why is the kNN accuracy smaller with k = 13 than before, with the default value of k? That is: why did the optimization of hyper-parameters not optimize the accuracy?

# Revisiting MNIST hand-written digits

In lecture 1 and assignment 2, we used 1-nearest neighbor for MNIST. Here we code it again, using SciKit-Learn, and compare its accuracy to that of a neural network. In case you don't have Tensorflow installed, you can use this cell (with a smaller version of the dataset):

In [None]:
# Load the digits dataset bundled with scikit-learn (the smaller alternative
# to the full MNIST download).
mnist = sklearn.datasets.load_digits()
print("Data shape:", mnist.data.shape)
x, y = mnist.data, mnist.target

# Keep 25% of the samples aside for testing; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=0)

If you have Tensorflow / Keras installed, you can use the full dataset with 60k images:

In [None]:
#!pip install keras
import keras.datasets.mnist

# load_data() returns the training and testing sets already separated.
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()
print("Shape before flattening:", x_train.shape)

# Flatten the data, for input into the Multi-Layer Perceptron: keep the first
# axis (one entry per image) and collapse all remaining axes into one.
# If we used Tensorflow, we could use a tf.keras.layers.Flatten layer
# to convert the image.
x_train = x_train.reshape(len(x_train), -1)
x_test = x_test.reshape(len(x_test), -1)
print("Shape after flattening:", x_train.shape)

Check the class counts in the training labels (the data has already been split into training and testing sets above):

In [None]:
# np.unique with return_counts=True returns the distinct training labels
# together with how many times each one occurs.
print("Target classes: ", np.unique(y_train, return_counts=True))

In [None]:
# 3-nearest-neighbors classifier on the digit images: fit on the training
# set, then measure accuracy on the test set. The message says "k-nearest
# neighbors" rather than "kNN regression", because this model is a classifier.
knn = sklearn.neighbors.KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train)
y_knn_pred = knn.predict(x_test)
accuracy_knn = metrics.accuracy_score(y_test, y_knn_pred)
print("Accuracy of k-nearest neighbors on test data: %.5f" %
      accuracy_knn)

# TODO: estimate a neural network on MNIST data

Complete this next cell.

In [None]:
import time
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the system clock and can jump if that
# clock is adjusted while the model trains.
start = time.perf_counter()

# TODO: train a neural network to recognize handwritten digits, and gauge the
# accuracy of the model



# This line prints how long it took to train the model and estimate accuracy.
minutes = (time.perf_counter() - start) / 60
print("Elapsed: %.1f minutes" % minutes)