### Notes:
* Decision trees can perform poorly if there are too many irrelevant values. Feature selection can help to improve accuracy by eliminating these featues. 

In [122]:
# Importing libraries
# we don't like warnings; you can comment the following 2 lines if you'd like to
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()
from matplotlib import pyplot as plt
%config InlineBackend.figure_format = 'retina'

from sklearn.metrics import precision_score, \
    recall_score, confusion_matrix, classification_report, \
    accuracy_score, f1_score

In [2]:
# Importing data
df_ks = pd.read_csv("../Data/Cleaned/joined_data.csv")

In [3]:
df_ks.head()

Unnamed: 0,backers_count,blurb,blurb_length,spotlight,staff_pick,location_state,name,category,main_category,deadline,...,usd_goal_real,hours_since_last_project,launch_hour,launch_day,launch_month,launch_year,deadline_day,deadline_month,deadline_year,duration_days
0,4,Raising money to help my grandmother recover f...,24,False,False,OH,Grandma's are Life,World Music,Music,2016-11-18,...,15000.0,66.078889,15,19,10,2016,18,11,2016,30
1,11,My work is performance based but I branch out ...,24,True,False,Scotland,Meta,Performance Art,Art,2015-05-06,...,231.2,29.5575,0,8,4,2015,6,5,2015,28
2,18,A sanctuary for humans and felines alike! Come...,24,False,False,IL,Puss N' Books: A relaxing cat cafe and bookstore.,Spaces,Food,2015-11-26,...,20000.0,113.248611,16,27,10,2015,26,11,2015,30
3,42,Taste Makers is a socially conscious brand tha...,23,False,True,BC,TASTE MAKERS BY TRISH P,Ready-to-wear,Fashion,2015-07-30,...,13795.22,2.419444,20,15,6,2015,30,7,2015,45
4,68,"The BEST beef sticks, beef jerky and signature...",13,True,False,WI,The Meat Candy Experience,Small Batch,Food,2016-07-01,...,2500.0,3.911389,0,17,5,2016,1,7,2016,45


In [4]:
# Change category value: Children's Books --> childrens books
df_ks['category'] = df_ks['category'].replace(['Children\'s Books'], 'childrens books')

In [6]:
df_ks[df_ks['category']=='childrens books'].head()

Unnamed: 0,backers_count,blurb,blurb_length,spotlight,staff_pick,location_state,name,category,main_category,state,...,usd_pledged_real,usd_goal_real,launch_hour,launch_day,launch_month,launch_year,deadline_day,deadline_month,deadline_year,duration_days
21,84,Astronauts onboard the space station are readi...,18,False,True,TX,Story Time From Space,childrens books,Publishing,0.0,...,5547.0,35000.0,19,7,5,2014,6,7,2014,60
122,4,Pronounced /erÉ™ pÄ« mÉ™/. Did someone say pie?,8,False,False,OR,Arie the Arapaima and His Marvelous Adventure ...,childrens books,Publishing,0.0,...,303.0,2900.0,19,8,5,2017,7,6,2017,30
153,374,Heartwarming tale of one familyâ€™s cherished ...,20,True,False,IL,Oliver the Ornament,childrens books,Publishing,1.0,...,52885.0,50000.0,14,13,7,2015,22,8,2015,40
190,98,"Teaching young kids about materials, starting ...",24,True,False,CA,A children's book about a little guy made of f...,childrens books,Publishing,1.0,...,10508.0,10000.0,17,20,10,2015,19,11,2015,30
245,26,It's bedtime but one of Hayley's favourite toy...,16,True,False,AB,Hayley's Favourite Toys,childrens books,Publishing,1.0,...,646.71,520.25,17,13,10,2017,20,10,2017,7


In [5]:
df_ks['state'].value_counts()

0.0    124115
1.0    118515
Name: state, dtype: int64

# Decision Trees

In [6]:
# Trying out Model off the shelf
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score

In [34]:
# Initial model selection process
models = []
models.append(('DT', DecisionTreeClassifier()))

## Encoding Categorical Data

* Need to do Encode the categorical data
* Don't want to use LabelEncoder as we don't want to treat the nominal data as non-nominal
* Have to use One-Hot Encoding -- however, it does increase the #of features by a lot
* It may be worth removing some of the less popular types

In [8]:
# Encoding Category
df_ks = pd.get_dummies(df_ks, columns=['category'], prefix=['category'])

In [9]:
# Encoding Main_Category -- This is redundant, so remove if slow
df_ks = pd.get_dummies(df_ks, columns=['main_category'], prefix=['main_category'])

In [10]:
# Encoding location_state
df_ks = pd.get_dummies(df_ks, columns=['location_state'], prefix=['loc_state'])

In [11]:
# Encoding country
df_ks = pd.get_dummies(df_ks, columns=['country'], prefix=['country'])

In [12]:
# Remove Text-based Description
df_ks = df_ks.drop(columns=['blurb','name'])

## Baseline Decision Tree 1 -- On all Features (Including Trivial) 

In [14]:
# Looking at all features (except for text, and self-generated)
feature_names = df_ks.drop(columns=['state','duration_days']).columns
X = df_ks[feature_names]
y = df_ks.state

In [15]:
# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = df_ks.state, random_state=42)

In [16]:
# Trying the decision tree
names = []
scores = []

for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    scores.append(accuracy_score(y_test, y_pred))
    names.append(name)

tr_split = pd.DataFrame({'Name': names, 'Score': scores})
print(tr_split)

  Name     Score
0   DT  0.999192


### *Likely due to the categorical variables, I couldn't do a cross-validation run*

## Baseline Decision Tree 2 -- Hand Picked
Note we choose interesting and non-trivial 

In [17]:
# Dropping Un-interesting Features or generated
feature_names = df_ks.drop(columns=['state','backers_count','spotlight','staff_pick','usd_pledged_real','duration_days','launched','deadline']).columns
X = df_ks[feature_names]
y = df_ks.state

In [18]:
# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = df_ks.state, random_state=42)

In [124]:
# Trying the decision tree
names = []
scores = []

for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    scores.append(accuracy_score(y_test, y_pred))
    print(accuracy_score(y_test, y_pred))
    print(precision_score(y_test, y_pred))
    print(recall_score(y_test, y_pred))
    print(f1_score(y_test, y_pred))
    names.append(name)

tr_split = pd.DataFrame({'Name': names, 'Score': scores})
print(tr_split)

0.6458505061162584
0.6358875137605497
0.6433561713186405
0.6396000402644029
  Name     Score
0   DT  0.645851


In [36]:
strat_k_fold = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
#strat_k_fold = StratifiedKFold(n_splits=10, random_state=10)

names = []
scores = []

for name, model in models:
    
    accuracy = cross_val_score(model, X, y, cv=strat_k_fold, scoring='accuracy').mean()
    
    names.append(name)
    scores.append(accuracy)

kf_cross_val = pd.DataFrame({'Name': names, 'Score': scores})
print(kf_cross_val)

  Name     Score
0   DT  0.677286


In [37]:
strat_k_fold = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
#strat_k_fold = StratifiedKFold(n_splits=10, random_state=10)

names = []
scores = []
scorings = ['accuracy','precision','recall','f1']

for name, model in models:
    
    score = cross_validate(model, X, y, cv=strat_k_fold, scoring=scorings)
    
    names.append(name)
    scores.append(score)

kf_cross_val = pd.DataFrame({'Name': names, 'Score': scores})
print(kf_cross_val)

  Name                                              Score
0   DT  {'fit_time': [37.424882888793945, 37.812394380...


In [38]:
scores[0]['test_accuracy'].mean()

0.6777644973828463

In [39]:
scores[0]['test_precision'].mean()

0.6710316026686652

In [40]:
scores[0]['test_recall'].mean()

0.6675863814706999

In [41]:
scores[0]['test_f1'].mean()

0.6693032035805991

In [44]:
for key, value in scores[0].items():
    print(key, ' :) ', value)

fit_time  :)  [32.33754945 34.37948513 33.76241851 33.36511397 33.24326229]
score_time  :)  [0.49799705 0.51598883 0.41903496 0.41240621 0.51055026]
test_accuracy  :)  [0.67701851 0.67720397 0.67959444 0.68239707 0.67765734]
test_precision  :)  [0.66928072 0.67116638 0.67369542 0.67639675 0.67018537]
test_recall  :)  [0.66970426 0.66493693 0.66721512 0.67063241 0.66961988]
test_f1  :)  [0.66949242 0.66803713 0.67043961 0.67350225 0.6699025 ]


## Testing Top N Important Features

1. count_7_days
2. usd_goal_real
3. optimism
4. launch_day
5. deadline_day
6. blurb_length
7. duration_days
8. launch month
9. deadline month
10. hours_since_last_project
11. categories

In [71]:
# Importing data
df = pd.read_csv("../Data/Cleaned/latest_data.csv")

In [72]:
cleanup_nums = {"country": {"NonUS": 0, "US": 1}}
df.replace(cleanup_nums, inplace=True)

In [51]:
# Encoding Category -- already encoded in latest data
# df = pd.get_dummies(df, columns=['category'], prefix=['category'])

In [112]:
# Keeping Top 10 Features
#feature_names = ['usd_goal_real', 'count_7_days', 'optimism', 'hours_since_last_project', 'duration_days', 'launch_day', 'deadline_day', 'blurb_length', 'launch_month', 'deadline_month']
                 
# Keeping only Top 20 Features listed above
feature_names = ['usd_goal_real', 'count_7_days', 'optimism', 'hours_since_last_project', 'duration_days', 'launch_day', 'deadline_day', 'blurb_length', 'launch_month', 'deadline_month', 'launch_year', 'deadline_year', 'category_Tabletop Games', 'category_Shorts', 'category_Documentary', 'category_Product Design', 'category_Fiction', 'category_Food', 'country', 'category_Apps']

# Top 30 features
feature_names = ['usd_goal_real', 'count_7_days', 'optimism', 'hours_since_last_project', 'duration_days', 'launch_day', 'deadline_day', 'blurb_length', 'launch_month', 'deadline_month', 'launch_year', 'deadline_year', 'category_Tabletop Games', 'category_Shorts', 'category_Documentary', 'category_Product Design', 'category_Fiction', 'category_Food', 'country', 'category_Apps', 'category_Theater', 'location_state_NY', 'category_Video Games', 'category_Comics', 'category_Web', 'category_Nonfiction', 'category_Fashion', 'category_Hip-Hop', 'category_Photography', "category_Children's Books"]

In [113]:
X = df[feature_names]
y = df.state

In [114]:
# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = df.state, random_state=42)

In [115]:
# Trying the decision tree
names = []
scores = []

for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    scores.append(accuracy_score(y_test, y_pred))
    names.append(name)

tr_split = pd.DataFrame({'Name': names, 'Score': scores})
print(tr_split)

  Name     Score
0   DT  0.647417


In [116]:
strat_k_fold = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
#strat_k_fold = StratifiedKFold(n_splits=10, random_state=10)

names = []
scores = []
scorings = ['accuracy','precision','recall','f1']

for name, model in models:
    
    score = cross_validate(model, X, y, cv=strat_k_fold, scoring=scorings)
    
    names.append(name)
    scores.append(score)

kf_cross_val = pd.DataFrame({'Name': names, 'Score': scores})
print(kf_cross_val)

  Name                                              Score
0   DT  {'fit_time': [5.612142562866211, 6.59948778152...


In [117]:
scores[0]['test_accuracy'].mean()

0.6434694802786136

In [118]:
scores[0]['test_precision'].mean()

0.6338383852904752

In [119]:
scores[0]['test_recall'].mean()

0.6395899253259081

In [120]:
scores[0]['test_f1'].mean()

0.6366991192437561