In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sqlalchemy import create_engine
from dotenv import load_dotenv
import os

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

# Find the dataset @ (CSV files 2022-04-21, https://webrobots.io/kickstarter-datasets/)
# download link: https://s3.amazonaws.com/weruns/forfun/Kickstarter/Kickstarter_2022-04-21T03_20_08_060Z.zip

In [2]:
# get db connection
load_dotenv()
db_connection = os.getenv("KICKSTARTER_DB_URL")

# init database engine
engine = create_engine(db_connection)

In [3]:
# Query the database for data to build models.
query = """
        SELECT  kickstarters.state, 
                country, 
                fx_rate, 
                goal, 
                created_at, 
                launched_at, 
                kickstarters.state_changed_at, 
                deadline, 
                parent_name,
                category.name,
                profile.state as profile_state 
        FROM kickstarters
        INNER JOIN category
        ON kickstarters.id=category.kickstarter_id
        INNER JOIN profile 
        ON kickstarters.id=profile.kickstarter_id
        WHERE kickstarters.state='failed' OR kickstarters.state='successful';
        """
        #  	
model_data = pd.read_sql(query, engine)
model_data.head()

Unnamed: 0,state,country,fx_rate,goal,created_at,launched_at,state_changed_at,deadline,parent_name,name,profile_state
0,successful,US,1.0,5000.0,1609376406,1609545583,1611968831,1611968831,Food,Cookbooks,inactive
1,successful,HK,0.1275,350000.0,1606278560,1608912106,1614096106,1614096106,Food,Cookbooks,active
2,successful,ES,1.081901,50.0,1606477096,1606751262,1607900340,1607900340,Food,Cookbooks,inactive
3,successful,US,1.0,65000.0,1604500905,1606194068,1610082068,1610082068,Food,Cookbooks,active
4,successful,GB,1.30414,10600.0,1605454727,1605981895,1608573895,1608573895,Food,Cookbooks,inactive


In [4]:
# create goal_usd column so that all goal amounts are in the same units, drop fx_rate and goals afterward
model_data['goal_usd'] = model_data['fx_rate']*model_data['goal']
model_data = model_data.drop(columns=['fx_rate', 'goal'])

# create total days active column as another metric
model_data['total_days_active'] = (model_data.deadline-model_data.launched_at)*0.00001157
model_data['launch_time'] = (model_data.launched_at-model_data.created_at)*0.00001157
model_data = model_data.drop(columns=['state_changed_at', 'deadline', 'launched_at'])

# Drop NA's (before pipeline, maybe later version could impute missing values)
model_data = model_data.dropna()

model_data.head()

Unnamed: 0,state,country,created_at,parent_name,name,profile_state,goal_usd,total_days_active,launch_time
0,successful,US,1609376406,Food,Cookbooks,inactive,5000.0,28.036979,1.957378
1,successful,HK,1606278560,Food,Cookbooks,active,44624.9265,59.97888,30.470127
2,successful,ES,1606477096,Food,Cookbooks,inactive,54.09505,13.294832,3.172101
3,successful,US,1604500905,Food,Cookbooks,active,65000.0,44.98416,19.589896
4,successful,GB,1605454727,Food,Cookbooks,inactive,13823.881138,29.98944,6.099334


In [5]:
# split into X and y variables
X = model_data.drop(columns=['state'])
y = model_data.state

# generate 70% train/test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 1)

## Begin building pipeline

In [6]:
# encoding transformer
encoding_columns = list((X.select_dtypes(include=['bool', 'object'])).columns)
scaling_columns = list((X.select_dtypes(include=['int', 'float'])).columns)

column_transformer = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), encoding_columns),
    (StandardScaler(), scaling_columns),
    remainder='drop'
    )

In [7]:
# init and fit RF classifier
rf_classifier = RandomForestClassifier()
rf_pipe = make_pipeline(column_transformer, rf_classifier)
rf_pipe.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['country', 'parent_name',
                                                   'name', 'profile_state']),
                                                 ('standardscaler',
                                                  StandardScaler(),
                                                  ['created_at', 'goal_usd',
                                                   'total_days_active',
                                                   'launch_time'])])),
                ('randomforestclassifier', RandomForestClassifier())])

In [8]:
cross_val_score(rf_pipe, X_train, y_train, cv=5, scoring='accuracy').mean()

0.8499153020892152

In [9]:
# perform predictions on testing and validation data
rf_predictions  = rf_pipe.predict(X_test)
rf_predictions

array(['successful', 'successful', 'successful', ..., 'successful',
       'failed', 'successful'], dtype=object)

In [10]:
# classification reports for the test and validation set
rf_report = classification_report(y_test, rf_predictions)
print (rf_report)

              precision    recall  f1-score   support

      failed       0.79      0.84      0.81      2772
  successful       0.90      0.87      0.89      4818

    accuracy                           0.86      7590
   macro avg       0.85      0.85      0.85      7590
weighted avg       0.86      0.86      0.86      7590



In [11]:
# view confusion matrix (0=failed, 1=success)
rf_cm = pd.DataFrame(confusion_matrix(y_test, rf_predictions), index=['Failed', 'Successful'], columns=['Failed', 'Successful'])
rf_cm

Unnamed: 0,Failed,Successful
Failed,2333,439
Successful,636,4182
