In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sqlalchemy import create_engine
from dotenv import load_dotenv
import os

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
import xgboost as xgb

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_val_score

from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

# Find the dataset @ (CSV files 2022-04-21, https://webrobots.io/kickstarter-datasets/)
# download link: https://s3.amazonaws.com/weruns/forfun/Kickstarter/Kickstarter_2022-04-21T03_20_08_060Z.zip

In [2]:
# get db connection
load_dotenv()
db_connection = os.getenv("KICKSTARTER_DB_URL")

# init database engine
engine = create_engine(db_connection)

In [3]:
# Query the database for data to build models.
query = """
        SELECT  state, 
                country, 
                fx_rate, 
                goal, 
                created_at, 
                launched_at, 
                state_changed_at, 
                deadline, 
                category.name, 
                parent_name 
        FROM kickstarters
        INNER JOIN category 
        ON kickstarters.id=category.kickstarter_id
        WHERE state='failed' OR state='successful';
        """
        #  	
model_data = pd.read_sql(query, engine)
model_data.head()

Unnamed: 0,state,country,fx_rate,goal,created_at,launched_at,state_changed_at,deadline,name,parent_name
0,successful,US,1.0,5000.0,1609376406,1609545583,1611968831,1611968831,Cookbooks,Food
1,successful,HK,0.1275,350000.0,1606278560,1608912106,1614096106,1614096106,Cookbooks,Food
2,successful,ES,1.081901,50.0,1606477096,1606751262,1607900340,1607900340,Cookbooks,Food
3,successful,US,1.0,65000.0,1604500905,1606194068,1610082068,1610082068,Cookbooks,Food
4,successful,GB,1.30414,10600.0,1605454727,1605981895,1608573895,1608573895,Cookbooks,Food


In [4]:
# create goal_usd column so that all goal amounts are in the same units, drop fx_rate and goals afterward
model_data['goal_usd'] = model_data['fx_rate']*model_data['goal']
model_data = model_data.drop(columns=['fx_rate', 'goal'])

# create total days active column as another metric
model_data['total_days_active'] = (model_data.deadline-model_data.launched_at)*0.00001157
model_data['launch_time'] = (model_data.launched_at-model_data.created_at)*0.00001157
model_data = model_data.drop(columns=['state_changed_at', 'deadline', 'launched_at'])

# Drop NA's (before pipeline, maybe later version could impute missing values)
model_data = model_data.dropna()

model_data.head()

Unnamed: 0,state,country,created_at,name,parent_name,goal_usd,total_days_active,launch_time
0,successful,US,1609376406,Cookbooks,Food,5000.0,28.036979,1.957378
1,successful,HK,1606278560,Cookbooks,Food,44624.9265,59.97888,30.470127
2,successful,ES,1606477096,Cookbooks,Food,54.09505,13.294832,3.172101
3,successful,US,1604500905,Cookbooks,Food,65000.0,44.98416,19.589896
4,successful,GB,1605454727,Cookbooks,Food,13823.881138,29.98944,6.099334


In [6]:
# split into X and y variables
X = model_data.drop(columns=['state'])
y = model_data.state

# generate 70% train/test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 1)

## Begin building preprocessing pipeline

In [8]:
# encoding transformer
encoding_columns = list((X.select_dtypes(include=['bool', 'object'])).columns)
scaling_columns = list((X.select_dtypes(include=['int', 'float'])).columns)

column_transformer = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), encoding_columns),
    (StandardScaler(), scaling_columns),
    remainder='drop'
    )

In [9]:
# init and fit RF classifier
rf_classifier = RandomForestClassifier()
rf_pipe = make_pipeline(column_transformer, rf_classifier)
rf_pipe.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['country', 'name',
                                                   'parent_name']),
                                                 ('standardscaler',
                                                  StandardScaler(),
                                                  ['created_at', 'goal_usd',
                                                   'total_days_active',
                                                   'launch_time'])])),
                ('randomforestclassifier', RandomForestClassifier())])

In [10]:
cross_val_score(rf_pipe, X_train, y_train, cv=5, scoring='accuracy').mean()

0.8287403021093105

In [18]:
# perform predictions on testing and validation data
rf_predictions  = rf_pipe.predict(X_test)
rf_predictions

array(['successful', 'successful', 'successful', ..., 'successful',
       'failed', 'successful'], dtype=object)

In [19]:
# classification reports for the test and validation set
rf_report = classification_report(y_test, rf_predictions)
print (rf_report)

              precision    recall  f1-score   support

      failed       0.75      0.75      0.75      2718
  successful       0.88      0.88      0.88      5448

    accuracy                           0.83      8166
   macro avg       0.81      0.81      0.81      8166
weighted avg       0.83      0.83      0.83      8166



In [20]:
# view confusion matrix (0=failed, 1=success)
rf_cm = pd.DataFrame(confusion_matrix(y_test, rf_predictions), index=['Failed', 'Successful'], columns=['Failed', 'Successful'])
rf_cm

Unnamed: 0,Failed,Successful
Failed,2045,673
Successful,676,4772
