In [143]:
# Importing libraries
# we don't like warnings; you can comment the following 2 lines if you'd like to
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
from matplotlib import pyplot as plt
%config InlineBackend.figure_format = 'retina'

In [144]:
# Importing data
df_ks = pd.read_csv("../Data/Cleaned/joined_data.csv")
# NOTE: May need to clean data further

In [145]:
df_ks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242630 entries, 0 to 242629
Data columns (total 21 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   backers_count              242630 non-null  int64  
 1   blurb                      242630 non-null  object 
 2   blurb_length               242630 non-null  int64  
 3   spotlight                  242630 non-null  bool   
 4   staff_pick                 242630 non-null  bool   
 5   location_state             242630 non-null  object 
 6   location_displayable_name  242630 non-null  object 
 7   name                       242630 non-null  object 
 8   category                   242630 non-null  object 
 9   main_category              242630 non-null  object 
 10  currency                   242630 non-null  object 
 11  deadline                   242630 non-null  object 
 12  goal                       242630 non-null  float64
 13  launched                   24

In [146]:
# Drop the 'location_displayable_name' as it is redundant
df_ks = df_ks.drop(columns=['location_displayable_name'])

In [147]:
# Drop the backers as it is redundant
df_ks = df_ks.drop(columns=['backers'])

In [148]:
# Drop usd pledged as it is redudant
df_ks = df_ks.drop(columns=['usd pledged'])

In [149]:
# Drop currency as it is redudant
df_ks = df_ks.drop(columns=['currency'])

In [150]:
# Drop pledged as it is redudant
df_ks = df_ks.drop(columns=['pledged'])

In [151]:
# Drop goal as it is redudant
df_ks = df_ks.drop(columns=['goal'])

In [152]:
# Change category value: Children's Books --> childrens books
df_ks['category'] = df_ks['category'].replace(['Children\'s Books'], 'childrens books')

In [153]:
df_ks[df_ks['category']=='childrens books'].head()

Unnamed: 0,backers_count,blurb,blurb_length,spotlight,staff_pick,location_state,name,category,main_category,deadline,launched,state,country,usd_pledged_real,usd_goal_real
21,84,Astronauts onboard the space station are readi...,18,False,True,TX,Story Time From Space,childrens books,Publishing,2014-07-06,2014-05-07 19:44:28,0.0,US,5547.0,35000.0
122,4,Pronounced /erÉ™ pÄ« mÉ™/. Did someone say pie?,8,False,False,OR,Arie the Arapaima and His Marvelous Adventure ...,childrens books,Publishing,2017-06-07,2017-05-08 19:22:17,0.0,US,303.0,2900.0
153,374,Heartwarming tale of one familyâ€™s cherished ...,20,True,False,IL,Oliver the Ornament,childrens books,Publishing,2015-08-22,2015-07-13 14:15:45,1.0,US,52885.0,50000.0
190,98,"Teaching young kids about materials, starting ...",24,True,False,CA,A children's book about a little guy made of f...,childrens books,Publishing,2015-11-19,2015-10-20 17:35:43,1.0,US,10508.0,10000.0
245,26,It's bedtime but one of Hayley's favourite toy...,16,True,False,AB,Hayley's Favourite Toys,childrens books,Publishing,2017-10-20,2017-10-13 17:01:37,1.0,CA,646.71,520.25


In [154]:
df_ks.columns

Index(['backers_count', 'blurb', 'blurb_length', 'spotlight', 'staff_pick',
       'location_state', 'name', 'category', 'main_category', 'deadline',
       'launched', 'state', 'country', 'usd_pledged_real', 'usd_goal_real'],
      dtype='object')

In [155]:
# Trying out differnt Models off the shelf
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

In [156]:
# For Visualizing Decision Trees (sklearn)
#from sklearn.tree import DecisionTreeClassifier
import pydotplus #pip install pydotplus
from sklearn.tree import export_graphviz

# Let’s write an auxiliary function that will return grid for further visualization.

def tree_graph_to_png(tree, feature_names, png_file_to_save):
    tree_str = export_graphviz(tree, feature_names=feature_names, 
                                     filled=True, out_file=None)
    graph = pydotplus.graph_from_dot_data(tree_str)  
    graph.write_png(png_file_to_save)
    
    
def get_grid(data):
    x_min, x_max = data[:, 0].min() - 1, data[:, 0].max() + 1
    y_min, y_max = data[:, 1].min() - 1, data[:, 1].max() + 1
    return np.meshgrid(np.arange(x_min, x_max, 0.01), np.arange(y_min, y_max, 0.01))

In [157]:
# Initial model selection process
models = []

#models.append(('KNN', KNeighborsClassifier()))
models.append(('DT', DecisionTreeClassifier()))
#models.append(('GNB', GaussianNB()))
#models.append(("BNB", BernoulliNB()))

In [158]:
df_ks['state'].value_counts()

0.0    124115
1.0    118515
Name: state, dtype: int64

In [164]:
# Need to do Encode the categorical data
# Don't want to use LabelEncoder as we don't want to treat the nominal data as non-nominal
# Have to use One-Hot Encoding -- however, it does increase the #of features by a lot
# It may be worth removing some of the less popular types
df_ks_dummies = pd.get_dummies(df_ks, columns=['category'], prefix=['category'])
# Only look at some features
df_ks_dummies = df_ks_dummies.drop(columns=['backers_count','blurb','blurb_length','spotlight','staff_pick','location_state','name','main_category','deadline','launched','country','usd_pledged_real','usd_goal_real'])
# df_ks_dummies = df_ks_dummies.drop(columns=['backers_count','blurb','spotlight','staff_pick','location_state','name','main_category','deadline','launched','country','usd_pledged_real','usd_goal_real'])
df_ks_dummies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242630 entries, 0 to 242629
Columns: 160 entries, state to category_childrens books
dtypes: float64(1), uint8(159)
memory usage: 38.6 MB


In [165]:
# Features/Response
# NOTE: trying features I think are most relevant
feature_names = df_ks_dummies.drop(columns=['state']).columns
X = df_ks_dummies[feature_names]
y = df_ks_dummies.state

In [166]:
feature_names

Index(['category_3D Printing', 'category_Academic', 'category_Accessories',
       'category_Action', 'category_Animals', 'category_Animation',
       'category_Anthologies', 'category_Apparel', 'category_Apps',
       'category_Architecture',
       ...
       'category_Weaving', 'category_Web', 'category_Webcomics',
       'category_Webseries', 'category_Woodworking', 'category_Workshops',
       'category_World Music', 'category_Young Adult', 'category_Zines',
       'category_childrens books'],
      dtype='object', length=159)

In [167]:
# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = df_ks.state, random_state=42)

In [168]:
# Trying the decision tree
names = []
scores = []

for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    scores.append(accuracy_score(y_test, y_pred))
    names.append(name)

tr_split = pd.DataFrame({'Name': names, 'Score': scores})
print(tr_split)

  Name     Score
0   DT  0.716196


In [169]:
strat_k_fold = StratifiedKFold(n_splits=10, random_state=10, shuffle=True)
#strat_k_fold = StratifiedKFold(n_splits=10, random_state=10)

names = []
scores = []

for name, model in models:
    
    score = cross_val_score(model, X, y, cv=strat_k_fold, scoring='accuracy').mean()
    names.append(name)
    scores.append(score)

kf_cross_val = pd.DataFrame({'Name': names, 'Score': scores})
print(kf_cross_val)

  Name    Score
0   DT  0.71562


In [170]:
# Visualize the tree

names = []
scores = []

#clf_tree = DecisionTreeClassifier()
clf_tree = DecisionTreeClassifier(criterion='entropy',max_depth= 1, random_state=42)
clf_tree.fit(X_train, y_train)
y_pred = model.predict(X_test)
scores.append(accuracy_score(y_test, y_pred))
names.append(name)

tr_split = pd.DataFrame({'Name': names, 'Score': scores})
print(tr_split)

  Name     Score
0   DT  0.716196


In [171]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy score: {}'.format(accuracy_score(y_test, y_pred)))
print('Precision score: {}'.format(precision_score(y_test, y_pred)))
print('Recall score: {}'.format(recall_score(y_test, y_pred)))
print('F1 score: {}'.format(f1_score(y_test, y_pred)))

Accuracy score: 0.7161957202677306
Precision score: 0.7496581127825598
Recall score: 0.6290458672246785
F1 score: 0.6840762694756932


In [172]:
clf_tree.get_depth()

1

In [None]:
tree_graph_to_png(clf_tree, feature_names=['Age'], 
                 png_file_to_save='topic3_decision_tree2.png')

In [None]:
# Can use this to find decision path of different inputs:
# clf_tree.decision_path(X[, check_input])