Using KNN for classification of Star Wars Survey Dataset

In [14]:
#initial imports
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import time
from sklearn.neighbors import KNeighborsClassifier

In [15]:
# render matplotlib figures inline in the notebook output
%matplotlib inline

In [16]:
# load the numerically encoded survey responses and preview them
SURVEY_CSV = "survey_numeric.csv"
data = pd.read_csv(SURVEY_CSV)
print(data.shape)  # (rows, columns)
data.iloc[:10]  # first ten rows, shown as the cell output

(1186, 37)


Unnamed: 0,Seen a Star Wars film,Fan of Star Wars,Seen The Phantom Menace,Seen Attack of the Clones,Seen Revenge of the Sith,Seen A New Hope,Seen The Empire Strikes Back,Seen Return of the Jedi,Rank for The Phantom Menace,Rank for Attack of the Clones,...,View of Yoda,Which character shot first?,Familiar with the Expanded Universe?,Fan of the Expanded Universe?,Star Trek Fan,Gender,Age,Household Income,Education,Location (Census Region)
0,1,1,1,1,1,1,1,1,3.0,2.0,...,2,0,1,-1,-1,-1,1,0,2,1
1,0,0,0,0,0,0,0,0,0.0,0.0,...,-100,0,0,0,1,-1,1,1,4,2
2,1,-1,1,1,1,0,0,0,1.0,2.0,...,-100,0,-1,0,-1,-1,1,1,2,3
3,1,1,1,1,1,1,1,1,5.0,6.0,...,2,0,-1,0,1,-1,1,4,3,3
4,1,1,1,1,1,1,1,1,5.0,4.0,...,1,1,1,-1,-1,-1,1,4,3,3
5,1,1,1,1,1,1,1,1,1.0,4.0,...,2,-1,1,-1,1,-1,1,2,4,4
6,1,1,1,1,1,1,1,1,6.0,5.0,...,2,-1,1,-1,-1,-1,1,0,2,5
7,1,1,1,1,1,1,1,1,4.0,5.0,...,2,-1,-1,0,1,-1,1,0,2,1
8,1,1,1,1,1,1,1,1,5.0,4.0,...,1,-1,-1,0,-1,-1,1,1,3,1
9,1,-1,0,1,0,0,0,0,1.0,2.0,...,2,0,-1,0,-1,-1,1,2,3,6


In [5]:
#imports
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

Create a function that trains and scores a KNN classifier on different labels. Scaling and PCA (to reduce dimensionality) are applied through a pipeline.

In [10]:
def scoreKNN(label, cv=5, n_jobs=None):
    """Score a MinMax-scaled PCA + KNN pipeline that predicts one survey column.

    Every other column of the module-level ``data`` frame is used as a
    feature, except the ``'Unnamed: 0'`` row-index column. The number of
    principal components and the number of neighbours are tuned by an inner
    ``GridSearchCV``; performance is estimated on outer stratified folds
    (nested cross-validation). The function prints the mean outer-fold
    accuracy followed by a classification report built from the out-of-fold
    predictions.

    Parameters
    ----------
    label : str
        Name of the column in ``data`` to predict.
    cv : int, default 5
        Number of folds used for both the inner grid search and the outer
        evaluation.
    n_jobs : int or None, default None
        Passed to ``GridSearchCV``; use ``-1`` to run the search on all cores.

    Returns
    -------
    None
        Results are printed.

    Raises
    ------
    KeyError
        If ``label`` is not a column of ``data``.
    """
    # split into features and label; 'Unnamed: 0' is the row index saved with
    # the CSV (0, 1, 2, ... in the preview), not a survey answer, so it must
    # not be fed to the scaler/PCA/KNN as a feature
    features = data.drop(columns=[label]).drop(columns=['Unnamed: 0'], errors='ignore')
    target = data[label].values  # 1-D array, so no ravel() is needed later

    # pipeline that does scaling, then PCA, then KNN
    pipe = Pipeline(steps=[
        ('scaler', MinMaxScaler()),
        ('pca', PCA()),
        ('knn', KNeighborsClassifier()),
    ])

    # parameters of pipeline steps are addressed as '<step>__<parameter>'
    param_grid = {
        'pca__n_components': list(range(1, 19)),  # how many principal components to keep
        'knn__n_neighbors': list(range(1, 30)),   # the best value of k
    }

    # inner loop: tune the pipeline with a cv-fold grid search
    search = GridSearchCV(pipe, param_grid, cv=cv, n_jobs=n_jobs)

    # outer loop: run the (expensive) nested search only once and derive both
    # the per-fold accuracies and the report from the same out-of-fold
    # predictions. StratifiedKFold(n_splits=cv) is the splitter scikit-learn
    # uses for an integer cv with a classifier, so the folds are unchanged.
    outer_cv = StratifiedKFold(n_splits=cv)
    predictions = cross_val_predict(estimator=search, X=features, y=target, cv=outer_cv)
    fold_scores = [
        accuracy_score(target[test_idx], predictions[test_idx])
        for _, test_idx in outer_cv.split(features, target)
    ]

    print('Accuracy:', np.mean(fold_scores))
    print(classification_report(target, predictions))

In [13]:
# survey columns to use, one at a time, as the classification target
test_labels = [
    'Which character shot first?',
    'Gender',
    'Age',
    'Household Income',
    'Education',
    'Location (Census Region)',
    'Star Trek Fan',
]

# print a heading for each target, then its accuracy and classification report
for target_column in test_labels:
    heading = '\n' + target_column + '\n'
    print(heading)
    scoreKNN(target_column)


Which character shot first?

Accuracy: 0.6871943542488868
              precision    recall  f1-score   support

          -1       0.50      0.79      0.61       325
           0       0.83      0.83      0.83       664
           1       0.17      0.02      0.03       197

    accuracy                           0.68      1186
   macro avg       0.50      0.54      0.49      1186
weighted avg       0.63      0.68      0.64      1186


Gender





Accuracy: 0.62642353229642
              precision    recall  f1-score   support

          -1       0.56      0.52      0.54       497
           0       1.00      0.89      0.94       140
           1       0.59      0.66      0.62       549

    accuracy                           0.63      1186
   macro avg       0.72      0.69      0.70      1186
weighted avg       0.63      0.63      0.63      1186


Age





Accuracy: 0.37003681253171716
              precision    recall  f1-score   support

           0       0.95      0.90      0.92       140
           1       0.36      0.32      0.34       218
           2       0.28      0.34      0.31       268
           3       0.30      0.29      0.29       291
           4       0.32      0.29      0.30       269

    accuracy                           0.38      1186
   macro avg       0.44      0.43      0.43      1186
weighted avg       0.39      0.38      0.38      1186


Household Income



  'precision', 'predicted', average, warn_for)


Accuracy: 0.330520442219441
              precision    recall  f1-score   support

           0       0.52      0.52      0.52       328
           1       0.12      0.03      0.05       138
           2       0.22      0.19      0.20       186
           3       0.30      0.64      0.41       298
           4       0.18      0.04      0.07       141
           5       0.00      0.00      0.00        95

    accuracy                           0.34      1186
   macro avg       0.22      0.24      0.21      1186
weighted avg       0.29      0.34      0.29      1186


Education



  'precision', 'predicted', average, warn_for)


Accuracy: 0.38192817103806975
              precision    recall  f1-score   support

           0       0.97      0.85      0.90       150
           1       0.00      0.00      0.00         7
           2       0.10      0.03      0.04       105
           3       0.32      0.52      0.39       328
           4       0.31      0.31      0.31       321
           5       0.31      0.19      0.24       275

    accuracy                           0.38      1186
   macro avg       0.33      0.32      0.31      1186
weighted avg       0.37      0.38      0.37      1186


Location (Census Region)





Accuracy: 0.250428907521848
              precision    recall  f1-score   support

           0       0.96      0.91      0.94       143
           1       0.16      0.32      0.22       170
           2       0.09      0.08      0.09       110
           3       0.08      0.03      0.05        93
           4       0.12      0.11      0.11       122
           5       0.16      0.22      0.19       181
           6       0.21      0.25      0.22       175
           7       0.00      0.00      0.00        79
           8       0.11      0.01      0.02        75
           9       0.00      0.00      0.00        38

    accuracy                           0.25      1186
   macro avg       0.19      0.19      0.18      1186
weighted avg       0.23      0.25      0.23      1186


Star Trek Fan





Accuracy: 0.7657394578265804
              precision    recall  f1-score   support

          -1       0.81      0.76      0.78       641
           0       0.95      0.87      0.91       118
           1       0.68      0.77      0.72       427

    accuracy                           0.77      1186
   macro avg       0.82      0.80      0.81      1186
weighted avg       0.78      0.77      0.77      1186



