### Notes:
* Decision trees can perform poorly if there are too many irrelevant values. Feature selection can help to improve accuracy by eliminating these featues. 

In [2]:
# Importing libraries
# we don't like warnings; you can comment the following 2 lines if you'd like to
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
from matplotlib import pyplot as plt
%config InlineBackend.figure_format = 'retina'

In [3]:
# Importing data
df_ks = pd.read_csv("../Data/Cleaned/joined_data.csv")

In [13]:
df_ks.head()

Unnamed: 0,backers_count,blurb_length,spotlight,staff_pick,location_state,main_category,state,country,usd_pledged_real,usd_goal_real,...,category_Weaving,category_Web,category_Webcomics,category_Webseries,category_Woodworking,category_Workshops,category_World Music,category_Young Adult,category_Zines,category_childrens books
0,4,24,False,False,OH,Music,0.0,US,62.0,15000.0,...,0,0,0,0,0,0,1,0,0,0
1,11,24,True,False,Scotland,Art,1.0,GB,266.65,231.2,...,0,0,0,0,0,0,0,0,0,0
2,18,24,False,False,IL,Food,0.0,US,776.0,20000.0,...,0,0,0,0,0,0,0,0,0,0
3,42,23,False,True,BC,Fashion,0.0,CA,2144.39,13795.22,...,0,0,0,0,0,0,0,0,0,0
4,68,13,True,False,WI,Food,1.0,US,3239.0,2500.0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Change category value: Children's Books --> childrens books
df_ks['category'] = df_ks['category'].replace(['Children\'s Books'], 'childrens books')

In [6]:
df_ks[df_ks['category']=='childrens books'].head()

Unnamed: 0,backers_count,blurb,blurb_length,spotlight,staff_pick,location_state,name,category,main_category,state,...,usd_pledged_real,usd_goal_real,launch_hour,launch_day,launch_month,launch_year,deadline_day,deadline_month,deadline_year,duration_days
21,84,Astronauts onboard the space station are readi...,18,False,True,TX,Story Time From Space,childrens books,Publishing,0.0,...,5547.0,35000.0,19,7,5,2014,6,7,2014,60
122,4,Pronounced /erÉ™ pÄ« mÉ™/. Did someone say pie?,8,False,False,OR,Arie the Arapaima and His Marvelous Adventure ...,childrens books,Publishing,0.0,...,303.0,2900.0,19,8,5,2017,7,6,2017,30
153,374,Heartwarming tale of one familyâ€™s cherished ...,20,True,False,IL,Oliver the Ornament,childrens books,Publishing,1.0,...,52885.0,50000.0,14,13,7,2015,22,8,2015,40
190,98,"Teaching young kids about materials, starting ...",24,True,False,CA,A children's book about a little guy made of f...,childrens books,Publishing,1.0,...,10508.0,10000.0,17,20,10,2015,19,11,2015,30
245,26,It's bedtime but one of Hayley's favourite toy...,16,True,False,AB,Hayley's Favourite Toys,childrens books,Publishing,1.0,...,646.71,520.25,17,13,10,2017,20,10,2017,7


In [7]:
df_ks['state'].value_counts()

0.0    124115
1.0    118515
Name: state, dtype: int64

# Decision Trees

In [8]:
# Trying out Model off the shelf
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score

In [9]:
# Initial model selection process
models = []
models.append(('DT', DecisionTreeClassifier()))

## Encoding Categorical Data

* Need to do Encode the categorical data
* Don't want to use LabelEncoder as we don't want to treat the nominal data as non-nominal
* Have to use One-Hot Encoding -- however, it does increase the #of features by a lot
* It may be worth removing some of the less popular types

In [10]:
# Encoding Category
df_ks = pd.get_dummies(df_ks, columns=['category'], prefix=['category'])

In [10]:
# Encoding Main_Category -- This is redundant, so remove if slow
df_ks = pd.get_dummies(df_ks, columns=['main_category'], prefix=['main_category'])

In [11]:
# Encoding location_state
df_ks = pd.get_dummies(df_ks, columns=['location_state'], prefix=['loc_state'])

In [12]:
# Encoding country
df_ks = pd.get_dummies(df_ks, columns=['country'], prefix=['country'])

In [11]:
# Remove Text-based Description
df_ks = df_ks.drop(columns=['blurb','name'])

## Baseline Decision Tree 1 -- On all Features (Including Trivial) 

In [14]:
# Looking at all features (except for text, and self-generated)
feature_names = df_ks.drop(columns=['state','duration_days']).columns
X = df_ks[feature_names]
y = df_ks.state

In [15]:
# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = df_ks.state, random_state=42)

In [16]:
# Trying the decision tree
names = []
scores = []

for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    scores.append(accuracy_score(y_test, y_pred))
    names.append(name)

tr_split = pd.DataFrame({'Name': names, 'Score': scores})
print(tr_split)

  Name     Score
0   DT  0.999192


### *Likely due to the categorical variables, I couldn't do a cross-validation run*

## Baseline Decision Tree 2 -- Hand Picked
Note we choose interesting and non-trivial 

In [17]:
# Dropping Un-interesting Features or generated
feature_names = df_ks.drop(columns=['state','backers_count','spotlight','staff_pick','usd_pledged_real','duration_days']).columns
X = df_ks[feature_names]
y = df_ks.state

In [18]:
# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = df_ks.state, random_state=42)

In [19]:
# Trying the decision tree
names = []
scores = []

for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    scores.append(accuracy_score(y_test, y_pred))
    names.append(name)

tr_split = pd.DataFrame({'Name': names, 'Score': scores})
print(tr_split)

  Name   Score
0   DT  0.6778


In [20]:
strat_k_fold = StratifiedKFold(n_splits=5, random_state=10, shuffle=True)
#strat_k_fold = StratifiedKFold(n_splits=10, random_state=10)

names = []
scores = []

for name, model in models:
    
    accuracy = cross_val_score(model, X, y, cv=strat_k_fold, scoring='accuracy').mean()
    
    names.append(name)
    scores.append(accuracy)

kf_cross_val = pd.DataFrame({'Name': names, 'Score': scores})
print(kf_cross_val)

  Name   Score
0   DT  0.6787


In [23]:
strat_k_fold = StratifiedKFold(n_splits=5, random_state=10, shuffle=True)
#strat_k_fold = StratifiedKFold(n_splits=10, random_state=10)

names = []
scores = []
scorings = ['accuracy','precision','recall','f1']

for name, model in models:
    
    score = cross_validate(model, X, y, cv=strat_k_fold, scoring=scorings)
    
    names.append(name)
    scores.append(score)

kf_cross_val = pd.DataFrame({'Name': names, 'Score': scores})
print(kf_cross_val)

  Name                                              Score
0   DT  {'fit_time': [32.337549448013306, 34.379485130...


In [45]:
scores[0]['test_accuracy'].mean()

0.6787742653422907

In [46]:
scores[0]['test_precision'].mean()

0.6721449251018595

In [47]:
scores[0]['test_recall'].mean()

0.668421718769776

In [48]:
scores[0]['test_f1'].mean()

0.6702747817464727

In [44]:
for key, value in scores[0].items():
    print(key, ' :) ', value)

fit_time  :)  [32.33754945 34.37948513 33.76241851 33.36511397 33.24326229]
score_time  :)  [0.49799705 0.51598883 0.41903496 0.41240621 0.51055026]
test_accuracy  :)  [0.67701851 0.67720397 0.67959444 0.68239707 0.67765734]
test_precision  :)  [0.66928072 0.67116638 0.67369542 0.67639675 0.67018537]
test_recall  :)  [0.66970426 0.66493693 0.66721512 0.67063241 0.66961988]
test_f1  :)  [0.66949242 0.66803713 0.67043961 0.67350225 0.6699025 ]


## Testing Top 10 Important Features
Excluding those not helpful to future project owners

1. count_7_days
2. usd_goal_real
3. optimism
4. launch_day
5. deadline_day
6. blurb_length
7. duration_days
8. launch month
9. deadline month
10. category

In [50]:
# Importing data
df = pd.read_csv("../Data/Cleaned/generated_data.csv")

In [51]:
# Encoding Category
df = pd.get_dummies(df, columns=['category'], prefix=['category'])

In [52]:
# Keeping only Top 10 Features listed above
feature_names = df.drop(columns=['backers_count', 'blurb', 'spotlight', 'staff_pick',
       'location_state', 'name', 'main_category', 'deadline',
       'launched', 'state', 'country', 'usd_pledged_real',
       'launch_hour', 'launch_year', 'deadline_year']).columns
X = df[feature_names]
y = df.state

In [53]:
# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = df.state, random_state=42)

In [54]:
# Trying the decision tree
names = []
scores = []

for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    scores.append(accuracy_score(y_test, y_pred))
    names.append(name)

tr_split = pd.DataFrame({'Name': names, 'Score': scores})
print(tr_split)

  Name     Score
0   DT  0.662518


In [55]:
strat_k_fold = StratifiedKFold(n_splits=5, random_state=10, shuffle=True)
#strat_k_fold = StratifiedKFold(n_splits=10, random_state=10)

names = []
scores = []
scorings = ['accuracy','precision','recall','f1']

for name, model in models:
    
    score = cross_validate(model, X, y, cv=strat_k_fold, scoring=scorings)
    
    names.append(name)
    scores.append(score)

kf_cross_val = pd.DataFrame({'Name': names, 'Score': scores})
print(kf_cross_val)

  Name                                              Score
0   DT  {'fit_time': [18.654775619506836, 19.836926221...


In [56]:
scores[0]['test_accuracy'].mean()

0.6628693896055722

In [57]:
scores[0]['test_precision'].mean()

0.6557163883358591

In [58]:
scores[0]['test_recall'].mean()

0.652305615322955

In [59]:
scores[0]['test_f1'].mean()

0.654003251437925

In [60]:
for key, value in scores[0].items():
    print(key, ' :) ', value)

fit_time  :)  [18.65477562 19.83692622 17.92177725 17.70283413 19.82046676]
score_time  :)  [0.20183778 0.24835968 0.18318629 0.16023898 0.20871282]
test_accuracy  :)  [0.66069736 0.66022339 0.66488068 0.66418003 0.66436549]
test_precision  :)  [0.65261027 0.65383139 0.65738821 0.65668232 0.65806974]
test_recall  :)  [0.65291313 0.64688014 0.65561321 0.65485382 0.65126777]
test_f1  :)  [0.65276167 0.65033719 0.65649951 0.65576679 0.65465109]
