# Week 6: Classification

### _Sushmita V Gopalan_

In [36]:
#These are from the standard library
import collections
import glob
import json
import math
import os
import os.path
import random
import re

#All these packages need to be installed from pip
import matplotlib.colors # For nice colours
import matplotlib.pyplot as plt #Plots
import nltk #For tokenizing and normalizing
import numpy as np #arrays
import pandas as pd
import requests
import scipy as sp #for interp
import seaborn #Makes plots look nice, also heatmaps

#For ML (every sklearn submodule used below is imported explicitly,
#because a bare `import sklearn` does not load submodules)
import sklearn
import sklearn.decomposition
import sklearn.ensemble
import sklearn.linear_model
import sklearn.naive_bayes
import sklearn.neighbors
import sklearn.neural_network
import sklearn.svm
import sklearn.tree

#Special module written for this class
#This provides access to data and to helper functions from previous weeks
#Make sure you update it before starting this notebook
import lucem_illud #pip install -U git+git://github.com/Computational-Content-Analysis-2018/lucem_illud.git

#This 'magic' command makes the plots work better
#in the notebook, don't use it outside of a notebook.
#Also you can ignore the warning
%matplotlib inline

In [37]:
%%time
# Build every train/test split used below: 13 artificial datasets, then 5 loaded datasets.
# The call order is fixed, so position k of `trains`/`tests` always holds dataset k+1.

# Dataset 1: lucem_illud.random() (takes no noise argument)
splits = [lucem_illud.trainTestSplit(lucem_illud.random())]

# Datasets 2-13: and / xor / target / blobs generators at noise 0.1, 0.5 and 0.9
generators = [lucem_illud.andSplit, lucem_illud.xorSplit,
              lucem_illud.targetSplit, lucem_illud.multiBlobs]
for noise in [0.1, 0.5, 0.9]:
    for generator in generators:
        splits.append(lucem_illud.trainTestSplit(generator(noise)))

# Datasets 14-18: loading other datasets
loaders = [lucem_illud.loadReddit, lucem_illud.loadNewsGroups, lucem_illud.loadSenateSmall,
           lucem_illud.loadSenateLarge, lucem_illud.loadSpam]
for loader in loaders:
    splits.append(lucem_illud.trainTestSplit(loader()))

trains = [train for train, _ in splits]
tests = [test for _, test in splits]

# Keep the numbered names too: later cells refer to train14..train18 / test14..test18
(train1, train2, train3, train4, train5, train6, train7, train8, train9,
 train10, train11, train12, train13, train14, train15, train16, train17, train18) = trains
(test1, test2, test3, test4, test5, test6, test7, test8, test9,
 test10, test11, test12, test13, test14, test15, test16, test17, test18) = tests



Loading Reddit data
Converting to vectors
Loading data for: comp.sys.mac.hardware
Loading data for: comp.windows.x
Loading data for: misc.forsale
Loading data for: rec.autos
Converting to vectors
Loading senate data
Converting to vectors
Loading senator: Kennedy
Loading senator: Kerry
Loading senator: Klobuchar
Loading senator: Kohl
Loading senator: Kyl
Converting to vectors
Loading Spam
Loading Ham
Converting to vectors
CPU times: user 2min 20s, sys: 3.52 s, total: 2min 23s
Wall time: 2min 28s


In [98]:
%%time
def auc(clf,train,test):
    """Fit `clf` on `train` and return one AUC value measured on `test`.

    The classifier is fitted in place on the stacked `train['vect']` vectors
    with `train['category']` as labels. The returned value is entry 0 of the
    'AUC' column produced by lucem_illud.evaluateClassifier(clf, test)
    (presumably the AUC for the first category; confirm against lucem_illud).
    """
    features = np.stack(train['vect'], axis=0)
    labels = train['category']
    clf.fit(features, labels)
    evaluation = lucem_illud.evaluateClassifier(clf, test)
    return evaluation['AUC'][0]

# Naive Bayes: one AUC per dataset for the first 13 entries of trains/tests
clf = sklearn.naive_bayes.GaussianNB()
naive_bayes = [auc(clf, trains[k], tests[k]) for k in range(13)]
    

CPU times: user 153 ms, sys: 4.98 ms, total: 158 ms
Wall time: 158 ms


  'precision', 'predicted', average, warn_for)


In [105]:
%%time
# SVM with a linear kernel: one AUC per dataset for the first 13 entries of trains/tests
# probability = False keeps fitting fast
clf = sklearn.svm.SVC(kernel = 'linear', probability = False)
svm_linear = [auc(clf, trains[k], tests[k]) for k in range(13)]


  'precision', 'predicted', average, warn_for)


CPU times: user 240 ms, sys: 8.98 ms, total: 249 ms
Wall time: 248 ms


In [40]:
%%time
# SVM with a cubic (degree-3 polynomial) kernel: one AUC per dataset for the first 13 entries
# Slower than the linear kernel
clf = sklearn.svm.SVC(kernel = 'poly', degree = 3, probability = False)
svm_cubic = [auc(clf, trains[k], tests[k]) for k in range(13)]

  'precision', 'predicted', average, warn_for)


CPU times: user 1.28 s, sys: 7.47 ms, total: 1.28 s
Wall time: 1.29 s


In [41]:
%%time
# k-nearest neighbours: one AUC per dataset for the first 13 entries of trains/tests
# k = 5; weights may be 'distance' or 'uniform'
clf = sklearn.neighbors.KNeighborsClassifier(5, weights='distance')
knn = [auc(clf, trains[k], tests[k]) for k in range(13)]

CPU times: user 151 ms, sys: 4.19 ms, total: 156 ms
Wall time: 155 ms


In [42]:
%%time
# Logistic regression: one AUC per dataset for the first 13 entries of trains/tests
clf = sklearn.linear_model.LogisticRegression()
logit = [auc(clf, trains[k], tests[k]) for k in range(13)]

CPU times: user 173 ms, sys: 2.83 ms, total: 176 ms
Wall time: 176 ms


  'precision', 'predicted', average, warn_for)


In [43]:
%%time
# Decision tree: one AUC per dataset for the first 13 entries of trains/tests
clf = sklearn.tree.DecisionTreeClassifier()
dt = [auc(clf, trains[k], tests[k]) for k in range(13)]

CPU times: user 170 ms, sys: 2.86 ms, total: 173 ms
Wall time: 173 ms


In [44]:
%%time
# Random forest: one AUC per dataset for the first 13 entries of trains/tests
clf = sklearn.ensemble.RandomForestClassifier()
rf = [auc(clf, trains[k], tests[k]) for k in range(13)]

CPU times: user 394 ms, sys: 3.74 ms, total: 397 ms
Wall time: 403 ms


In [45]:
%%time
# Multi-layer perceptron classifier: one AUC per dataset for the first 13 entries of trains/tests
clf = sklearn.neural_network.MLPClassifier()
mlpc = [auc(clf, trains[k], tests[k]) for k in range(13)]

  'precision', 'predicted', average, warn_for)


CPU times: user 6.69 s, sys: 771 ms, total: 7.46 s
Wall time: 7.25 s


In [73]:
%%time
# Gradient boosting: one AUC per dataset for the first 13 entries of trains/tests
clf = sklearn.ensemble.GradientBoostingClassifier()
gb = [auc(clf, trains[k], tests[k]) for k in range(13)]

CPU times: user 2.67 s, sys: 23.5 ms, total: 2.69 s
Wall time: 2.73 s


In [62]:
%%time
# AdaBoost: one AUC per dataset for the first 13 entries of trains/tests
clf = sklearn.ensemble.AdaBoostClassifier()
adaboost = [auc(clf, trains[k], tests[k]) for k in range(13)]

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


CPU times: user 1.34 s, sys: 12.5 ms, total: 1.35 s
Wall time: 1.36 s


In [66]:
# Regroup the per-classifier AUC lists by dataset family (and / xor / target / blobs)
# and by noise level, ready to be tabulated.
classifiers = [naive_bayes, svm_linear, svm_cubic, knn, logit, dt, rf, mlpc, gb]
names = ["Naive Bayes", "SVM(Linear)", "SVM(Cubic)", "K-NN","Logistic Regression","Decision Tree",
         "Random Forest","MLPC","Gradient Boost"]

# 0-based positions of each family's datasets inside a classifier's result list,
# ordered low, medium, high noise (position 0 is the random dataset).
and_positions = (1, 5, 9)
xor_positions = (2, 6, 10)
target_positions = (3, 7, 11)
blob_positions = (4, 8, 12)

# Flat lists: for every classifier in turn, its low / medium / high noise AUC.
data_and = [scores[k] for scores in classifiers for k in and_positions]
data_xor = [scores[k] for scores in classifiers for k in xor_positions]
data_target = [scores[k] for scores in classifiers for k in target_positions]
data_blob = [scores[k] for scores in classifiers for k in blob_positions]

# Every third entry of a flat list belongs to the same noise level. Stride slicing
# (instead of a hard-coded range(27) and index lists) stays correct if the number
# of entries in `classifiers` changes.
and_low, and_med, and_high = data_and[0::3], data_and[1::3], data_and[2::3]
xor_low, xor_med, xor_high = data_xor[0::3], data_xor[1::3], data_xor[2::3]
target_low, target_med, target_high = data_target[0::3], data_target[1::3], data_target[2::3]
blob_low, blob_med, blob_high = data_blob[0::3], data_blob[1::3], data_blob[2::3]



In [67]:
# Sanity check: one entry per classifier, so this should equal len(names)
len(and_low)

9

In [68]:
def _noise_table(low, med, high):
    """Return a DataFrame with one row per entry of `names` and one AUC column per noise level."""
    return pd.DataFrame({'0.Classifier':names,
                         '1. Low Noise':low,
                         '2. Medium Noise':med,
                         '3. High Noise':high})

# One table per artificial dataset family
and_df = _noise_table(and_low, and_med, and_high)
xor_df = _noise_table(xor_low, xor_med, xor_high)
target_df = _noise_table(target_low, target_med, target_high)
blob_df = _noise_table(blob_low, blob_med, blob_high)

In [69]:
# AUC of each classifier on the "and" split at low, medium and high noise
and_df

Unnamed: 0,0.Classifier,1. Low Noise,2. Medium Noise,3. High Noise
0,Naive Bayes,0.93109,0.779247,0.550628
1,SVM(Linear),0.93109,0.784054,0.535273
2,SVM(Cubic),0.870192,0.759215,0.5
3,K-NN,0.935497,0.730369,0.487862
4,Logistic Regression,0.936298,0.779247,0.535014
5,Decision Tree,0.924279,0.762821,0.505187
6,Random Forest,0.934696,0.751603,0.491804
7,MLPC,0.935897,0.77484,0.547878
8,Gradient Boost,0.939904,0.764423,0.53159


In [70]:
# AUC of each classifier on the "xor" split at low, medium and high noise
xor_df

Unnamed: 0,0.Classifier,1. Low Noise,2. Medium Noise,3. High Noise
0,Naive Bayes,0.579021,0.514701,0.534803
1,SVM(Linear),0.704634,0.479948,0.530053
2,SVM(Cubic),0.661746,0.733323,0.672917
3,K-NN,0.939045,0.810081,0.59501
4,Logistic Regression,0.51136,0.485099,0.555506
5,Decision Tree,0.939345,0.739474,0.550455
6,Random Forest,0.938745,0.790379,0.571057
7,MLPC,0.953909,0.80018,0.689569
8,Gradient Boost,0.954209,0.805031,0.629863


In [71]:
# AUC of each classifier on the "target" split at low, medium and high noise
target_df

Unnamed: 0,0.Classifier,1. Low Noise,2. Medium Noise,3. High Noise
0,Naive Bayes,0.951918,0.709084,0.553285
1,SVM(Linear),0.643478,0.546819,0.497596
2,SVM(Cubic),0.5,0.5,0.5
3,K-NN,0.956266,0.739096,0.573718
4,Logistic Regression,0.643223,0.461184,0.491186
5,Decision Tree,0.916624,0.689276,0.522837
6,Random Forest,0.946036,0.733794,0.525641
7,MLPC,0.927366,0.69968,0.500801
8,Gradient Boost,0.951918,0.688275,0.536859


In [72]:
# AUC of each classifier on the multi-blob datasets at low, medium and high noise
blob_df

Unnamed: 0,0.Classifier,1. Low Noise,2. Medium Noise,3. High Noise
0,Naive Bayes,1.0,1.0,0.99334
1,SVM(Linear),1.0,1.0,0.99334
2,SVM(Cubic),1.0,1.0,0.986679
3,K-NN,1.0,0.998731,0.986679
4,Logistic Regression,1.0,0.994924,0.993873
5,Decision Tree,1.0,0.988028,0.960731
6,Random Forest,1.0,0.998731,0.987905
7,MLPC,1.0,0.996193,0.99334
8,Gradient Boost,1.0,0.989297,0.987905


Go back through all of the cells above and generate 10 distinct artificial datasets and classify them with all of the available methods. Add a cell immediately below and describe which classifier(s) worked best with which artificially constructed data source and why. Then go through all of the empirical datasets (i.e., Newsgroups, Senate Small, Senate Large, Email Spam) and classify them with all available methods. Add a second cell immediately below and describe which classifier(s) worked best with which data set and why.
Stretch (but also required): Wander through the scikit-learn documentation (https://scikit-learn.org/stable/), paying particular attention to the classifiers. In the cells that follow, identify and implement a new classifier that we have not yet used (e.g., AdaBoost, CART) on one artificial dataset and one real dataset (used above). Then, in the next cell, describe the classifier, detail how it compares with the approaches above, and explain why it performed better or worse than the others.

In [79]:
# AdaBoost: labels (a) and AUC values (b) for the 13 artificial datasets,
# grouped by family and ordered low, medium, high noise within each family
a = ['Random'] + [family + ': ' + level + ' Noise'
                  for family in ['And', 'Xor', 'Target Split', 'Blobs']
                  for level in ['Low', 'Med', 'High']]
# Position 0 is the random dataset; each family starts at position 1-4 and
# recurs every 4 positions as the noise level increases
b = [adaboost[0]] + [adaboost[first + 4 * step]
                     for first in (1, 2, 3, 4)
                     for step in (0, 1, 2)]


In [80]:
# Tabulate the AdaBoost AUC for each labelled dataset
adaboost_columns = {'0.Dataset':a, '1. AUC':b}
adaboost_df = pd.DataFrame(adaboost_columns)
adaboost_df

Unnamed: 0,0.Dataset,1. AUC
0,Random,0.492839
1,And: Low Noise,0.935096
2,And: Med Noise,0.762821
3,And: High Noise,0.515302
4,Xor: Low Noise,0.563257
5,Xor: Med Noise,0.519052
6,Xor: High Noise,0.505301
7,Target Split: Low Noise,0.951918
8,Target Split: Med Noise,0.737895
9,Target Split: High Noise,0.522436


In [92]:
# Random dataset: AUC of every classifier on dataset 1 (position 0 of each result list).
# The list is called random_auc so that it does not shadow the imported `random` module.
random_auc = [scores[0] for scores in classifiers]
# Stored under its own name so the AdaBoost table in adaboost_df is not overwritten.
random_df = pd.DataFrame({'0.Classifier':names,
                          '1. AUC':random_auc})
random_df


Unnamed: 0,0.Classifier,1. AUC
0,Naive Bayes,0.5
1,SVM(Linear),0.5
2,SVM(Cubic),0.5
3,K-NN,0.49335
4,Logistic Regression,0.5
5,Decision Tree,0.496419
6,Random Forest,0.484399
7,MLPC,0.5
8,Gradient Boost,0.517647


## Real Datasets

In [108]:
%%time
# One fresh, default-configured instance of each classifier for the real datasets.

# Bayes
clf1 = sklearn.naive_bayes.GaussianNB()

# Analogizers
clf2 = sklearn.svm.SVC(kernel = 'linear', probability = False) # slow; probability = False speeds it up, but loses the ROC
clf3 = sklearn.svm.SVC(kernel = 'poly', degree = 3, probability = False) # slower
clf4 = sklearn.neighbors.KNeighborsClassifier(5, weights='distance')# k = 5; weights is 'distance' or 'uniform'

# Classical regression
clf5 = sklearn.linear_model.LogisticRegression()

# Symbolists
clf6 = sklearn.tree.DecisionTreeClassifier()
clf7 = sklearn.ensemble.RandomForestClassifier()

# Connectionists
clf8 = sklearn.neural_network.MLPClassifier()

# Ensemble
clf9 = sklearn.ensemble.GradientBoostingClassifier()

CPU times: user 275 µs, sys: 946 µs, total: 1.22 ms
Wall time: 1.24 ms


In [109]:
# Classifiers, in the same order as the labels in `names`
clfs = [clf1, clf2, clf3, clf4, clf5, clf6, clf7, clf8, clf9]
# Real datasets, in order: Reddit, Newsgroups, Senate (small), Senate (large), Spam
real_train = [train14, train15, train16, train17, train18]
real_test = [test14, test15, test16, test17, test18]

In [111]:
%%time
# Evaluate every classifier on every real dataset.
# Results are stored classifier by classifier: 5 consecutive AUCs per classifier.
real_auc = []
for model in clfs:
    print(str(model))
    real_auc.extend(auc(model, real_train[j], real_test[j]) for j in range(5))

GaussianNB(priors=None)
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


  'precision', 'predicted', average, warn_for)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='distance')
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
          

In [118]:
def chunk(xs, n):
    '''Split the list, xs, into exactly n contiguous chunks.

    Chunk sizes differ by at most one: when len(xs) is not a multiple of n,
    the first len(xs) % n chunks each receive one extra element. When len(xs)
    is a multiple of n, every chunk has len(xs) // n elements.

    xs: sequence to split (anything that supports len() and slicing).
    n:  number of chunks; must satisfy 0 < n <= len(xs).

    Returns a list of n slices of xs, in order.
    Raises AssertionError if n is out of range.
    '''
    L = len(xs)
    assert 0 < n <= L
    size, extra = divmod(L, n)
    chunks = []
    start = 0
    for k in range(n):
        # the first `extra` chunks absorb the remainder, one element each
        stop = start + size + (1 if k < extra else 0)
        chunks.append(xs[start:stop])
        start = stop
    return chunks

# One chunk per classifier: real_auc holds each classifier's AUCs consecutively,
# so the chunk count is taken from clfs rather than hard-coded.
ds = chunk(real_auc, len(clfs))

In [120]:
# Transpose ds (one row per classifier) into one list per real dataset:
# dsK collects entry K-1 of every row.
ds1, ds2, ds3, ds4, ds5 = [[row[k] for row in ds] for k in range(5)]

In [123]:
# AUC of every classifier (rows) on every real dataset (columns)
real_df = pd.DataFrame(dict([
    ('0.Classifier', names),
    ('1. Reddit', ds1),
    ('2. Newsgroups', ds2),
    ('3. Senators (Small)', ds3),
    ('4. Senators (Large)', ds4),
    ('5. Email', ds5),
]))



In [124]:
# AUC of each classifier on each of the five real datasets
real_df

Unnamed: 0,0.Classifier,1. Reddit,2. Newsgroups,3. Senators (Small),4. Senators (Large),5. Email
0,Naive Bayes,0.852971,0.896479,0.860712,0.760457,0.813491
1,SVM(Linear),0.991857,0.935699,1.0,0.97607,0.775007
2,SVM(Cubic),0.5,0.5,0.5,0.5,0.5
3,K-NN,0.955381,0.586614,0.860694,0.809323,0.711002
4,Logistic Regression,0.989739,0.944159,0.978873,0.94301,0.62959
5,Decision Tree,0.908311,0.875485,0.992958,0.995134,0.768618
6,Random Forest,0.914667,0.858566,0.971831,0.976392,0.764505
7,MLPC,0.997881,0.947074,0.952226,0.954392,0.82381
8,Gradient Boost,0.977359,0.912663,1.0,0.995495,0.66051
