In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score


### Build dataset and trim features

In [2]:
# all of our usable dataframes
df = pd.read_csv('data/master_df.csv')
nlp_data = df[['blurb', 'slug', 'country']]  #also includes slug from category and some info from location

In [3]:
# read category data for name and parent name
df_category = pd.read_csv('data/category.csv')
df_category = df_category[['name', 'parent_name']]

In [4]:
# drop columns deemed unessecary
model_data = df.drop(columns = ['backers_count','usd_pledged','is_starrable','current_currency','static_usd_rate','usd_exchange_rate','usd_type','id','name', 'slug', 'category', 'creator', 'location', 'photo', 'profile', 'urls', 'country_displayable_name', 'currency_symbol', 'currency_trailing_code', 'disable_communication', 'source_url', 'currency', 'pledged', 'blurb'])

# create goal_usd column so that all goal amounts are in the same units, drop fx_rate and goals afterward
model_data['goal_usd'] = model_data['fx_rate']*model_data['goal']
model_data = model_data.drop(columns=['fx_rate', 'goal'])

# combine category data and all other date
model_data = pd.concat([model_data, df_category], axis=1, join='inner')

# create percentage funded column as another metric
model_data['percentage_funded'] = model_data.converted_pledged_amount/model_data.goal_usd*100

# create total days active column as another metric
model_data['total_days_active'] = (model_data.deadline-model_data.created_at)*0.00001157

model_data

Unnamed: 0,converted_pledged_amount,country,created_at,deadline,launched_at,spotlight,staff_pick,state,state_changed_at,goal_usd,name,parent_name,percentage_funded,total_days_active
0,5034,US,1609376406,1611968831,1609545583,True,False,successful,1611968831,5000.000000,Cookbooks,Food,100.680000,29.994357
1,48365,HK,1606278560,1614096106,1608912106,True,True,successful,1614096106,44624.926500,Cookbooks,Food,108.381131,90.449007
2,98,ES,1606477096,1607900340,1606751262,True,False,successful,1607900340,54.095050,Cookbooks,Food,181.162603,16.466933
3,127765,US,1604500905,1610082068,1606194068,True,True,successful,1610082068,65000.000000,Cookbooks,Food,196.561538,64.574056
4,14574,GB,1605454727,1608573895,1605981895,True,True,successful,1608573895,13823.881138,Cookbooks,Food,105.426254,36.088774
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30987,501,US,1632678330,1634318870,1633022870,False,False,failed,1634318870,10000.000000,Software,Technology,5.010000,18.981048
30988,1,CA,1632726042,1638144718,1632957118,False,False,failed,1638144719,39722.546000,Software,Technology,0.002517,62.694081
30989,0,DK,1631517750,1635408514,1632816514,False,False,failed,1635408514,145430.220000,Software,Technology,0.000000,45.016139
30990,70,US,1429554526,1432913659,1430321659,False,False,failed,1432913660,35000.000000,Plays,Theater,0.200000,38.865169


In [5]:
model_data.isna().sum().sum()

1374

In [6]:
# drop rows with na values
model_data = model_data.dropna()

# # drop rows that have state 'live' or 'cancelled'
# model_data = model_data.drop(model_data[model_data['state'] == 'live'].index, axis=0)
# model_data = model_data.drop(model_data[model_data['state'] == 'canceled'].index, axis=0)
# model_data = model_data.reset_index(drop=True)
# model_data.head()

In [7]:
# split into X and y variables
X = model_data.drop(columns=['state'])
y = model_data.state.to_frame()

In [8]:
# generate 70% train/test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

### Encoding of non neumerical features

In [9]:
# init encoder for x values
encoder = LabelEncoder()

# encode boolean and string type columns
encoding_columns = (X.select_dtypes(include=['bool', 'object'])).columns
for column in encoding_columns:
    X_train[column] = encoder.fit_transform(X_train[column])
    X_test[column] = encoder.transform(X_test[column])

In [10]:
# new encoder for y variable (to make sure that we can reverse encoding)
y_encoder = LabelEncoder()
y_train['state'] = y_encoder.fit_transform(y_train['state'])
y_test['state'] = y_encoder.transform(y_test['state'])

### Scale values using standard scaler

In [11]:
# init standard scaler
scaler = MinMaxScaler()

# scale Train and testing set, columns=X.columns to keep column headers.
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X.columns)
X_train_scaled.head()

Unnamed: 0,converted_pledged_amount,country,created_at,deadline,launched_at,spotlight,staff_pick,state_changed_at,goal_usd,name,parent_name,percentage_funded,total_days_active
0,2.936235e-05,0.375,0.866849,0.957796,0.963997,1.0,1.0,0.970015,7e-06,0.384615,0.111111,0.000158,0.146986
1,5.987428e-07,1.0,0.624222,0.653813,0.662349,0.0,0.0,0.662153,1e-06,0.512821,0.555556,1.4e-05,0.058515
2,0.0001779464,1.0,0.967247,0.982792,0.989079,1.0,0.0,0.995329,5e-05,0.641026,0.222222,0.000125,0.039882
3,3.144597e-05,1.0,0.705262,0.708593,0.713156,0.0,1.0,0.716293,0.0001,0.025641,0.555556,1.1e-05,0.021403
4,0.0003850395,0.708333,0.95239,0.954708,0.958791,1.0,0.0,0.966888,0.00015,0.641026,0.222222,9e-05,0.020963


### Build Random forest classifier

In [12]:
# init and fit RF classifier
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train_scaled, y_train.state)

RandomForestClassifier()

In [13]:
# perform predictions on testing data
predictions  = rf_classifier.predict(X_test_scaled)

In [14]:
# print classification report
print(classification_report(y_test, predictions))


              precision    recall  f1-score   support

           0       0.78      0.10      0.18       278
           1       0.92      1.00      0.96      2804
           2       0.99      1.00      1.00       226
           3       1.00      1.00      1.00      5585

    accuracy                           0.97      8893
   macro avg       0.92      0.77      0.78      8893
weighted avg       0.97      0.97      0.96      8893



In [15]:
# view confusion matrix
cm = confusion_matrix(y_test, predictions)
cm

array([[  28,  248,    2,    0],
       [   8, 2796,    0,    0],
       [   0,    0,  226,    0],
       [   0,    0,    0, 5585]])