# Data Analysis of Star Wars Survey Dataset

#### Chance Mason, Nicolas Arrieche Villegas, Mitchell Walker, Tyler Wittig

## Part 4. Data Analysis - Naive Bayes

In [1]:
import warnings
warnings.simplefilter("ignore")

import pandas as pd
import numpy as np

from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, KFold

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [3]:
# Load the column names (one per line of the text file), then drop the
# RespondentID entry from the list
with open('column_names.txt', 'r') as cn:
    col_names = list(map(str.strip, cn))
col_names.remove('RespondentID')

# Read the survey responses from the csv file and preview the first rows
data = pd.read_csv('survey_numeric.csv')
data.head()

Unnamed: 0,Seen a Star Wars film,Fan of Star Wars,Seen The Phantom Menace,Seen Attack of the Clones,Seen Revenge of the Sith,Seen A New Hope,Seen The Empire Strikes Back,Seen Return of the Jedi,Rank for The Phantom Menace,Rank for Attack of the Clones,...,View of Yoda,Which character shot first?,Familiar with the Expanded Universe?,Fan of the Expanded Universe?,Star Trek Fan,Gender,Age,Household Income,Education,Location (Census Region)
0,1,1,1,1,1,1,1,1,3.0,2.0,...,2,0,1,-1,-1,-1,1,0,2,1
1,0,0,0,0,0,0,0,0,0.0,0.0,...,-100,0,0,0,1,-1,1,1,4,2
2,1,-1,1,1,1,0,0,0,1.0,2.0,...,-100,0,-1,0,-1,-1,1,1,2,3
3,1,1,1,1,1,1,1,1,5.0,6.0,...,2,0,-1,0,1,-1,1,4,3,3
4,1,1,1,1,1,1,1,1,5.0,4.0,...,1,1,1,-1,-1,-1,1,4,3,3


### 4.1 Separate Feature Columns from Labels
For now, we are going to look at each of the label columns separately to see if there are any significant results.
We may later remove some of these columns from our dataset entirely.

In [4]:
# Columns that will each be used, one at a time, as the class label
labels = ['Gender', 'Age', 'Household Income', 'Education', 'Location (Census Region)']

# Every other named column is kept as a feature (original column order preserved)
features = list(filter(lambda name: name not in labels, col_names))

data[features]

Unnamed: 0,Seen a Star Wars film,Fan of Star Wars,Seen The Phantom Menace,Seen Attack of the Clones,Seen Revenge of the Sith,Seen A New Hope,Seen The Empire Strikes Back,Seen Return of the Jedi,Rank for The Phantom Menace,Rank for Attack of the Clones,...,View of Boba Fett,View of C-3P0,View of R2 D2,View of Jar Jar Binks,View of Padme Amidala,View of Yoda,Which character shot first?,Familiar with the Expanded Universe?,Fan of the Expanded Universe?,Star Trek Fan
0,1,1,1,1,1,1,1,1,3.0,2.0,...,-100,2,2,2,2,2,0,1,-1,-1
1,0,0,0,0,0,0,0,0,0.0,0.0,...,-100,-100,-100,-100,-100,-100,0,0,0,1
2,1,-1,1,1,1,0,0,0,1.0,2.0,...,-100,-100,-100,-100,-100,-100,0,-1,0,-1
3,1,1,1,1,1,1,1,1,5.0,6.0,...,-1,2,2,2,2,2,0,-1,0,1
4,1,1,1,1,1,1,1,1,5.0,4.0,...,2,1,1,-2,1,1,1,1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1181,1,1,1,1,1,1,1,1,5.0,4.0,...,1,2,2,1,1,2,-1,-1,0,1
1182,1,1,1,1,1,1,1,1,4.0,5.0,...,-100,1,2,-1,-1,2,0,-1,0,1
1183,0,0,0,0,0,0,0,0,0.0,0.0,...,-100,-100,-100,-100,-100,-100,0,0,0,-1
1184,1,1,1,1,1,1,1,1,4.0,3.0,...,2,1,1,2,1,2,-1,-1,0,1


### 4.2 Naive Bayes Classifications
Below is a function that automates Gaussian Naive Bayes classification with 10-fold cross-validation. It predicts whichever label column is passed in as a parameter, then displays the accuracy, confusion matrix, and classification report for the resulting classification.

In [7]:
def scoreNB(label, n_splits=10):
    """Cross-validate a Gaussian Naive Bayes classifier for one label column.

    Reads the notebook-level ``data`` frame and ``features`` column list,
    runs an unshuffled k-fold cross validation, and prints the mean fold
    score (labelled "Accuracy"), the confusion matrix, and the
    classification report built from the out-of-fold predictions.

    Parameters
    ----------
    label : str
        Name of the column in ``data`` to predict.
    n_splits : int, default 10
        Number of cross-validation folds.

    Returns
    -------
    None
        All results are printed rather than returned.
    """
    # separate the features from the class label; the target is selected as
    # a 1-D array (n_samples,), the shape scikit-learn expects for y, so no
    # DataConversionWarning is raised for a column vector
    X = data.loc[:, features].values
    y = data.loc[:, label].values

    # initialize classifier; it is deliberately not fitted on the full data
    # set here because the cross-validation helpers fit their own copy of
    # the estimator on each training fold
    clf = GaussianNB()

    # k-fold cross validation over contiguous, unshuffled folds
    k_fold = KFold(n_splits=n_splits, shuffle=False, random_state=None)

    # display accuracy (mean of the per-fold scores)
    print('Accuracy:', cross_val_score(clf, X, y, cv=k_fold).mean())

    # out-of-fold prediction for every sample
    y_pred = cross_val_predict(clf, X, y, cv=k_fold)

    # print confusion matrix
    conf_mat = confusion_matrix(y, y_pred)
    print('Confusion Matrix:\n', conf_mat)

    # display classification report
    print(classification_report(y, y_pred))

In [8]:
# run NB on all labels to see which works best
for l in labels:
    print('\n' + l + '\n')
    scoreNB(l)


Gender

Accuracy: 0.48278735222902724
Confusion Matrix:
 [[378  74  45]
 [ 15 124   1]
 [326 152  71]]
              precision    recall  f1-score   support

          -1       0.53      0.76      0.62       497
           0       0.35      0.89      0.51       140
           1       0.61      0.13      0.21       549

    accuracy                           0.48      1186
   macro avg       0.50      0.59      0.45      1186
weighted avg       0.54      0.48      0.42      1186


Age

Accuracy: 0.2925651616578835
Confusion Matrix:
 [[124  10   0   6   0]
 [ 38 105   1  67   7]
 [ 61 115   1  89   2]
 [ 51 129   0  90  21]
 [ 76  66   2  98  27]]
              precision    recall  f1-score   support

           0       0.35      0.89      0.51       140
           1       0.25      0.48      0.33       218
           2       0.25      0.00      0.01       268
           3       0.26      0.31      0.28       291
           4       0.47      0.10      0.17       269

    accuracy       