# Supervised learning: diverse classifiers

*   Section 1. Baseline logistic regression.
*   Section 2. ML classifiers.



This notebook includes more classifiers than the seven we discussed in class.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss

In [None]:
## import the sklearn models
# logistic regression
from sklearn.linear_model import LogisticRegression

# K nearest neighbor
from sklearn.neighbors import KNeighborsClassifier

# support vector machine
from sklearn.svm import SVC, LinearSVC
# SVC: support vector classification (using kernel methods)

# decision tree
from sklearn.tree import DecisionTreeClassifier

# ensemble methods, e.g., random forest
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

# naive Bayesian
from sklearn.naive_bayes import GaussianNB

# discriminant analysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# neural network
from sklearn.neural_network import MLPClassifier


In [None]:
# Mount Google Drive into the Colab filesystem (this cell only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive/')

# Switch the working directory to the data folder so that the relative CSV
# path used when reading the data resolves.
# Note: a shell escape such as
#   !cd '/content/drive/My Drive/Colab Notebooks/data/'
# runs in a temporary subshell, so the directory change does not persist.
# The %cd magic changes the notebook's own working directory; this is unrelated to Mac vs. Windows.
%cd /content/drive/My Drive/Colab Notebooks/data/

In [None]:
# Load the dataset from Florida_ct.csv (path relative to the current working
# directory); the first CSV column is used as the row index.
df = pd.read_csv(
    'Florida_ct.csv',
    index_col=0,
)

In [None]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

In [None]:
# preprocessing
# Binary target: 1 = expensive tract, 0 = non-expensive tract.
# A tract is labelled 0 only when its median property value is strictly below
# the threshold; every other row keeps the label 1.
# NOTE(review): rows with a missing property_value_median compare as False and
# therefore end up labelled 1 -- confirm this is intended.
PROPERTY_VALUE_THRESHOLD = 200000
is_below_threshold = df['property_value_median'] < PROPERTY_VALUE_THRESHOLD
df['property_value_discrete'] = (~is_below_threshold).astype('int64')

## Section 1. Creating a baseline logistic regression in ML (lec08)

In [None]:
# Input variables used to predict the binary property-value label.
var_list = [
    'inc_median_household',
    'households',
    'travel_driving_ratio',
    'travel_pt_ratio',
    'travel_taxi_ratio',
    'travel_work_home_ratio',
    'edu_higher_edu_ratio',
    'household_size_avg',
    'vacancy_ratio',
    'rent_median',
    'race_white_ratio',
    'race_asian_ratio',
]

# Extract inputs and output from the frame as plain NumPy arrays.
X = df[var_list].values
y = df['property_value_discrete'].values


In [None]:
# Hold out 20% of the observations for testing; the fixed random_state makes
# the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=16,
)

In [None]:
# Baseline model: logistic regression with the default parameters, fitted on
# the training split only (fit() returns the fitted estimator itself).
logreg = LogisticRegression(random_state=16).fit(X_train, y_train)

# In-sample accuracy
train_predictions = logreg.predict(X_train)
acc = accuracy_score(y_train, train_predictions)
print(f"Training Accuracy: {acc:.4%}")

# Out-of-sample accuracy on the held-out split
test_predictions = logreg.predict(X_test)
acc = accuracy_score(y_test, test_predictions)
print(f"Testing Accuracy: {acc:.4%}")


## Section 2. ML classifiers

In [None]:
# Fit every classifier on the training split and record accuracy and log loss
# on both the training and the testing split.
# Every estimator that accepts a random_state gets an explicit one, so that a
# Restart & Run All reproduces the same numbers.
RANDOM_STATE = 16
classifiers = [
    LogisticRegression(random_state=RANDOM_STATE),
    KNeighborsClassifier(3),
    # probability=True is required for predict_proba; the probability
    # calibration is randomized, hence the random_state.
    SVC(kernel="rbf", C=0.025, probability=True, random_state=RANDOM_STATE),
    DecisionTreeClassifier(random_state=RANDOM_STATE),
    RandomForestClassifier(random_state=RANDOM_STATE),
    GaussianNB(),
    MLPClassifier(alpha=1e-10, hidden_layer_sizes=(20, 2), random_state=1),
    AdaBoostClassifier(random_state=RANDOM_STATE),
    GradientBoostingClassifier(random_state=RANDOM_STATE),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis()]

# Logging for Visual Comparison
log_cols = ["Classifier", "Train Accuracy", "Train Log Loss", "Test Accuracy", "Test Log Loss"]
# Rows are collected in a plain list and turned into a DataFrame once after the
# loop: concatenating onto an initially empty frame inside the loop is
# quadratic, repeats index 0 on every row, and relies on pandas' deprecated
# handling of empty frames in concat.
log_rows = []

for clf in classifiers:
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__

    print("="*30)
    print(name)

    print('****Results****')
    # training: hard labels for accuracy, class probabilities for log loss
    train_predictions = clf.predict(X_train)
    train_acc = accuracy_score(y_train, train_predictions)
    print("Training Accuracy: {:.4%}".format(train_acc))

    train_proba = clf.predict_proba(X_train)
    train_ll = log_loss(y_train, train_proba)
    print("Training Log Loss: {}".format(train_ll))

    # testing: same two metrics on the held-out split
    test_predictions = clf.predict(X_test)
    test_acc = accuracy_score(y_test, test_predictions)
    print("Testing Accuracy: {:.4%}".format(test_acc))

    test_proba = clf.predict_proba(X_test)
    test_ll = log_loss(y_test, test_proba)
    print("Testing Log Loss: {}".format(test_ll))

    # accuracies are stored as percentages, log losses as raw values
    log_rows.append([name, train_acc*100, train_ll, test_acc*100, test_ll])

# One row per classifier, numeric metric columns, unique 0..n-1 index.
log = pd.DataFrame(log_rows, columns=log_cols)


In [None]:
# Bar chart comparing the test-set accuracy of the classifiers.
sns.set_color_codes("muted")
ax = sns.barplot(data=log, x='Test Accuracy', y='Classifier', color="b")
ax.set_xlabel(' Accuracy %')
ax.set_title('Classifier Accuracy')
plt.show()

In [None]:
# Bar chart comparing the test-set log loss of the classifiers.
sns.set_color_codes("muted")
ax = sns.barplot(data=log, x='Test Log Loss', y='Classifier', color="g")
ax.set_xlabel('Log Loss')
ax.set_title('Classifier Log Loss')
plt.show()





*   Logistic regression performs reasonably well, although it is usually not the top-performing classifier.
*   The ensemble methods (e.g., random forest) achieve the highest predictive performance.





## **Exercise.** Create a dummy variable that distinguishes auto from non-auto census tracts by applying a threshold value. Then compare the performance of the ML classifiers on this new target variable.