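Here we implement a neural network model (scikit-learn's MLPClassifier) to predict a chosen survey label, such as "Fan of Star Wars", from the remaining survey responses.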

In [335]:
import pandas as pd

In [336]:
# Absolute Windows paths to the survey CSV files (they appear to point into an
# Ubuntu-on-Windows root file system).
# NOTE(review): these paths only resolve on the original author's machine;
# consider a path relative to the notebook so the cell runs elsewhere.
filepath = r"C:\Users\User\AppData\Local\Packages\CanonicalGroupLimited.UbuntuonWindows_79rhkp1fndgsc\LocalState\rootfs\home\mitchel\StarWarsSurvey-ClassificationAnalysis\survey_numeric.csv"
# NOTE(review): alt_path is not used in any of the cells visible here -- confirm
# whether it is still needed.
alt_path = r"C:\Users\User\AppData\Local\Packages\CanonicalGroupLimited.UbuntuonWindows_79rhkp1fndgsc\LocalState\rootfs\home\mitchel\StarWarsSurvey-ClassificationAnalysis\survey_data.csv"

# load the survey responses into a DataFrame
data = pd.read_csv(filepath)
# only the last expression of a cell is displayed, so this preview is computed
# and then discarded
data.head(15)
# displayed as the cell output: (number of rows, number of columns)
data.shape

(1186, 37)

In [337]:
# Keep only respondents who have seen a Star Wars film (the indicator column is
# non-zero), then remove the location column and the indicator itself.
has_seen_film = data["Seen a Star Wars film"] != 0
data = data[has_seen_film].drop(
    columns=["Location (Census Region)", "Seen a Star Wars film"]
)

Here we split the data on the given label
This splits the data into features and labels. We first remove the rows in which the label value is zero: for the labels we test (Fan of Star Wars, Star Trek Fan, Gender, Age, Household Income, and Education), a value of zero represents an Unknown or Null response, which cannot be accurately predicted.

In [338]:
#print all column names
# (one name per line; these are the names that can be passed to splitData as
# the label)
for col in data.columns:
    print(col)

def splitData(label, frame=None):
    """Split the survey data into a feature frame and a label series.

    Rows whose ``label`` value equals 0 are left out, because 0 encodes an
    unknown / null response for the labels used in this notebook.

    The source frame is never modified: rows are selected with a boolean
    mask, which builds a new object. (The previous version aliased the global
    ``data`` and dropped rows in place, so every call permanently removed rows
    from the shared frame and later calls with other labels saw fewer rows.)

    Parameters
    ----------
    label : str
        Name of the column to predict.
    frame : pandas.DataFrame, optional
        Frame to split. Defaults to the notebook-level ``data`` frame.

    Returns
    -------
    data_X : pandas.DataFrame
        Every column except ``label``, restricted to the kept rows.
    data_Y : pandas.Series
        The ``label`` column, restricted to the same rows.
    """
    if frame is None:
        frame = data

    #keep only the rows with a known (non-zero) label value
    known = frame[frame[label] != 0]

    #split and return data
    data_X = known.loc[:, known.columns != label]
    data_Y = known[label]
    return data_X, data_Y

Fan of Star Wars
Seen The Phantom Menace
Seen Attack of the Clones
Seen Revenge of the Sith
Seen A New Hope
Seen The Empire Strikes Back
Seen Return of the Jedi
Rank for The Phantom Menace
Rank for Attack of the Clones
Rank for Revenge of the Sith
Rank for A New Hope
Rank for The Empire Strikes Back
Rank for Return of the Jedi
View of Han Solo
View of Luke Skywalker
View of Princess Leia Organa
View of Anakin Skywalker
View of Obi Wan Kenobi
View of Emperor Palpatine
View of Darth Vader
View of Lando Calrissian
View of Boba Fett
View of C-3P0
View of R2 D2
View of Jar Jar Binks
View of Padme Amidala
View of Yoda
Which character shot first?
Familiar with the Expanded Universe?
Fan of the Expanded Universe?
Star Trek Fan
Gender
Age
Household Income
Education


In [339]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Build the pipeline (feature scaling followed by the MLP classifier) and the
# hyper-parameter grid that is later handed to GridSearchCV.

# Fixed seed for the classifier's weight initialisation, so the reported scores
# and the selected hyper-parameters are the same on every run.
RANDOM_STATE = 42

scaler = StandardScaler()
MLPC = MLPClassifier(solver='lbfgs', random_state=RANDOM_STATE)

pipe = Pipeline([("Scaler", scaler), ("MLPC", MLPC)])

# candidate sizes for a single hidden layer: (100,), (102,), ..., (110,)
hidden_layer_list = [(size,) for size in range(100, 111, 2)]

# parameter grid for GridSearchCV; the "MLPC__" prefix routes each entry to the
# pipeline step named "MLPC"
param_grid = {'MLPC__hidden_layer_sizes': hidden_layer_list,
              'MLPC__activation': ['logistic', 'tanh', 'relu']
              }


In [340]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

# labels to evaluate; add further column names to test other targets
labels = ['Fan of Star Wars']

MLPSearch = GridSearchCV(pipe,
                         param_grid,
                         scoring='accuracy',
                         cv=5)

# Iterate over `labels` so that `label` is always defined. Previously `label`
# was used without ever being assigned, which raises NameError on a fresh
# kernel.
for label in labels:
    print(label)

    # features and target for this label
    data_X, data_Y = splitData(label)

    # accuracy of the whole grid search, estimated with an outer 5-fold
    # cross-validation around it
    acc = cross_val_score(MLPSearch, data_X, data_Y, cv=5)
    print("Average Accuracy: {:.2f}%\n".format(acc.mean()*100))

    # refit the search on all rows and report the selected hyper-parameters
    MLPSearch.fit(data_X, data_Y)
    print(MLPSearch.best_params_)



Fan of Star Wars




Average Accuracy: 80.37%

{'MLPC__activation': 'tanh', 'MLPC__hidden_layer_sizes': (106,)}


In [347]:
# two hand-written sample respondents (one value per feature column)
record = [[1,1,1,1,1,1,5,6,4,2,3,1,3,4,5,5,4,3,2,0,3,4,4,1,5,4,0,1,1,-1,1,1,5,3],
         [1,1,1,1,1,1,5,6,4,2,3,1,5,4,4,3,4,1,4,3,0,5,5,1,4,5,0,-1,0,0,-1,1,4,3]]

print(MLPSearch.predict(record))

# weight matrices of the best classifier found by the grid search
weights = MLPSearch.best_estimator_.named_steps["MLPC"].coefs_

# for every row of the first weight matrix, add up the absolute weights
sum_of_weights = [sum(abs(value) for value in row) for row in weights[0]]

print(sum_of_weights)

# rows whose total absolute weight is below the average total
avg = sum(sum_of_weights) / len(sum_of_weights)
[total for total in sum_of_weights if total < avg]


[1 1]
[50.562058208264816, 44.72362162780554, 43.62877518340866, 46.11857313531044, 50.14270090021925, 47.58138030538523, 42.60847662422869, 37.313626685821816, 49.91936864529786, 40.74035261513271, 38.16239733029236, 39.636170224474, 38.86862282669871, 47.52425815001732, 52.17535691897222, 46.1198093790592, 51.376813784762476, 43.39572992291579, 54.85937452927196, 45.54472747063712, 46.6288376907021, 45.701958874319146, 37.65999945627548, 47.055389303129914, 45.19983801860467, 52.207774650612414, 44.9780616009885, 43.219343959890736, 46.7732861504657, 53.84073748212769, 45.44753320529329, 51.80848877305247, 47.766132746701835, 41.5104708137502]


[44.72362162780554,
 43.62877518340866,
 42.60847662422869,
 37.313626685821816,
 40.74035261513271,
 38.16239733029236,
 39.636170224474,
 38.86862282669871,
 43.39572992291579,
 45.54472747063712,
 45.701958874319146,
 37.65999945627548,
 45.19983801860467,
 44.9780616009885,
 43.219343959890736,
 45.44753320529329,
 41.5104708137502]