In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score


### Build dataset and trim features

In [2]:
# all of our usable dataframes
df = pd.read_csv('data/master_df.csv')
nlp_data = df[['blurb', 'slug', 'country']]  #also includes slug from category and some info from location

In [3]:
# read category data for name and parent name
df_category = pd.read_csv('data/category.csv')
df_category = df_category[['name', 'parent_name']]

In [4]:
val_start_date = '01/02/2021'
val_end_date = '07/08/2021'

In [6]:
datetime_index = pd.to_datetime(df['created_at'], format = 's')
df = df.set_index(datetime_index)
df.head()

ValueError: time data '1609376406' does not match format 's' (match)

In [5]:
# drop columns deemed unessecary
model_data = df.drop(columns = ['staff_pick','converted_pledged_amount', 'spotlight', 'backers_count','usd_pledged','is_starrable','current_currency','static_usd_rate','usd_exchange_rate','usd_type','id','name', 'slug', 'category', 'creator', 'location', 'photo', 'profile', 'urls', 'country_displayable_name', 'currency_symbol', 'currency_trailing_code', 'disable_communication', 'source_url', 'currency', 'pledged', 'blurb'])

# create goal_usd column so that all goal amounts are in the same units, drop fx_rate and goals afterward
model_data['goal_usd'] = model_data['fx_rate']*model_data['goal']
model_data = model_data.drop(columns=['fx_rate', 'goal'])

# combine category data and all other date
model_data = pd.concat([model_data, df_category], axis=1, join='inner')

# create percentage funded column as another metric
# model_data['percentage_funded'] = model_data.converted_pledged_amount/model_data.goal_usd*100

# create total days active column as another metric
model_data['total_days_active'] = (model_data.deadline-model_data.launched_at)*0.00001157
model_data['launch_time'] = (model_data.launched_at-model_data.created_at)*0.00001157
model_data = model_data.drop(columns=['state_changed_at', 'deadline', 'launched_at', 'created_at'])
model_data

Unnamed: 0,country,created_at,deadline,launched_at,state,state_changed_at,goal_usd,name,parent_name,total_days_active,launch_time
0,US,1609376406,1611968831,1609545583,successful,1611968831,5000.000000,Cookbooks,Food,28.036979,1.957378
1,HK,1606278560,1614096106,1608912106,successful,1614096106,44624.926500,Cookbooks,Food,59.978880,30.470127
2,ES,1606477096,1607900340,1606751262,successful,1607900340,54.095050,Cookbooks,Food,13.294832,3.172101
3,US,1604500905,1610082068,1606194068,successful,1610082068,65000.000000,Cookbooks,Food,44.984160,19.589896
4,GB,1605454727,1608573895,1605981895,successful,1608573895,13823.881138,Cookbooks,Food,29.989440,6.099334
...,...,...,...,...,...,...,...,...,...,...,...
30987,US,1632678330,1634318870,1633022870,failed,1634318870,10000.000000,Software,Technology,14.994720,3.986328
30988,CA,1632726042,1638144718,1632957118,failed,1638144719,39722.546000,Software,Technology,60.020532,2.673549
30989,DK,1631517750,1635408514,1632816514,failed,1635408514,145430.220000,Software,Technology,29.989440,15.026699
30990,US,1429554526,1432913659,1430321659,failed,1432913660,35000.000000,Plays,Theater,29.989440,8.875729


In [6]:
model_data.isna().sum().sum()

1374

In [7]:
# drop rows with na values
model_data = model_data.dropna()

# drop rows that have state 'live' or 'cancelled'
# model_data = model_data.drop(model_data[model_data['state'] == 'live'].index, axis=0)
# model_data = model_data.drop(model_data[model_data['state'] == 'canceled'].index, axis=0)
# model_data = model_data.reset_index(drop=True)
# model_data.head()

In [8]:
# split into X and y variables
X = model_data.drop(columns=['state'])
y = model_data.state.to_frame()

In [9]:
# generate 70% train/test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

### Encoding of non neumerical features

In [10]:
# init encoder for x values
encoder = LabelEncoder()

# encode boolean and string type columns
encoding_columns = (X.select_dtypes(include=['bool', 'object'])).columns
for column in encoding_columns:
    X_train[column] = encoder.fit_transform(X_train[column])
    X_test[column] = encoder.transform(X_test[column])

In [11]:
# new encoder for y variable (to make sure that we can reverse encoding)
y_encoder = LabelEncoder()
y_train['state'] = y_encoder.fit_transform(y_train['state'])
y_test['state'] = y_encoder.transform(y_test['state'])

### Scale values using standard scaler

In [12]:
# init standard scaler
scaler = MinMaxScaler()

# scale Train and testing set, columns=X.columns to keep column headers.
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X.columns)
X_train_scaled.head()

Unnamed: 0,country,created_at,deadline,launched_at,state_changed_at,goal_usd,name,parent_name,total_days_active,launch_time
0,0.541667,0.672996,0.671161,0.671142,0.671161,0.000155,0.947368,0.555556,0.322222,0.000149
1,0.375,0.686221,0.697782,0.696502,0.697782,0.00014,0.210526,0.444444,0.366667,0.01631
2,0.583333,0.933999,0.954436,0.946794,0.954436,5.6e-05,0.631579,0.222222,0.490339,0.020565
3,0.375,0.917635,0.922868,0.915186,0.922868,0.003074,0.552632,0.777778,0.518365,0.000267
4,1.0,0.741394,0.74291,0.74118,0.74291,0.000357,0.605263,0.555556,0.353246,0.00258


### Build Random forest classifier

In [13]:
# init and fit RF classifier
rf_classifier = RandomForestClassifier(oob_score=True)
rf_classifier.fit(X_train_scaled, y_train.state)

RandomForestClassifier(oob_score=True)

In [14]:
# perform predictions on testing data
predictions  = rf_classifier.predict(X_test_scaled)

In [15]:
# print classification report
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       0.76      0.74      0.75      2777
           1       0.87      0.88      0.88      5601

    accuracy                           0.84      8378
   macro avg       0.81      0.81      0.81      8378
weighted avg       0.83      0.84      0.83      8378



In [16]:
# view confusion matrix
cm = confusion_matrix(y_test, predictions)
cm

array([[2065,  712],
       [ 670, 4931]])

In [17]:
rf_classifier.oob_score_

0.8286781256394516

In [18]:
importances = rf_classifier.feature_importances_
zipper = zip(X.columns, importances)
for col, item in zipper:
    print (col, item)

country 0.02724469002585024
created_at 0.08090729457941205
deadline 0.08977121915057301
launched_at 0.08934406952345399
state_changed_at 0.0855430230919274
goal_usd 0.12783769440693604
name 0.20786476641640358
parent_name 0.07607960665002853
total_days_active 0.0827578645263805
launch_time 0.13264977162903477


In [19]:
model_data

Unnamed: 0,country,created_at,deadline,launched_at,state,state_changed_at,goal_usd,name,parent_name,total_days_active,launch_time
0,US,1609376406,1611968831,1609545583,successful,1611968831,5000.000000,Cookbooks,Food,28.036979,1.957378
1,HK,1606278560,1614096106,1608912106,successful,1614096106,44624.926500,Cookbooks,Food,59.978880,30.470127
2,ES,1606477096,1607900340,1606751262,successful,1607900340,54.095050,Cookbooks,Food,13.294832,3.172101
3,US,1604500905,1610082068,1606194068,successful,1610082068,65000.000000,Cookbooks,Food,44.984160,19.589896
4,GB,1605454727,1608573895,1605981895,successful,1608573895,13823.881138,Cookbooks,Food,29.989440,6.099334
...,...,...,...,...,...,...,...,...,...,...,...
27921,US,1632678330,1634318870,1633022870,failed,1634318870,10000.000000,Software,Technology,14.994720,3.986328
27922,CA,1632726042,1638144718,1632957118,failed,1638144719,39722.546000,Software,Technology,60.020532,2.673549
27923,DK,1631517750,1635408514,1632816514,failed,1635408514,145430.220000,Software,Technology,29.989440,15.026699
27924,US,1429554526,1432913659,1430321659,failed,1432913660,35000.000000,Plays,Theater,29.989440,8.875729
