In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

from sqlalchemy import create_engine
from dotenv import load_dotenv
import os

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline

# Find the dataset @ (CSV files 2022-04-21, https://webrobots.io/kickstarter-datasets/)
# download link: https://s3.amazonaws.com/weruns/forfun/Kickstarter/Kickstarter_2022-04-21T03_20_08_060Z.zip

In [2]:
# get db connection
load_dotenv()
db_connection = os.getenv("KICKSTARTER_DB_URL")

# init database engine
engine = create_engine(db_connection)

In [3]:
# Query the database for data to build models.
query = """
        SELECT  kickstarters.state, 
                country, 
                fx_rate, 
                goal, 
                created_at, 
                launched_at, 
                kickstarters.state_changed_at, 
                deadline, 
                parent_name,
                category.name
        FROM kickstarters
        INNER JOIN category
        ON kickstarters.id=category.kickstarter_id
        WHERE state='failed' OR state='successful';
        """
        #  	
model_data = pd.read_sql(query, engine)
model_data.head()

Unnamed: 0,state,country,fx_rate,goal,created_at,launched_at,state_changed_at,deadline,parent_name,name
0,successful,US,1.0,5000.0,1609376406,1609545583,1611968831,1611968831,Food,Cookbooks
1,successful,HK,0.1275,350000.0,1606278560,1608912106,1614096106,1614096106,Food,Cookbooks
2,successful,ES,1.081901,50.0,1606477096,1606751262,1607900340,1607900340,Food,Cookbooks
3,successful,US,1.0,65000.0,1604500905,1606194068,1610082068,1610082068,Food,Cookbooks
4,successful,GB,1.30414,10600.0,1605454727,1605981895,1608573895,1608573895,Food,Cookbooks


In [4]:
# create goal_usd column so that all goal amounts are in the same units, drop fx_rate and goals afterward
model_data['goal_usd'] = model_data['fx_rate']*model_data['goal']
model_data = model_data.drop(columns=['fx_rate', 'goal'])

# create total days active column as another metric
model_data['total_days_active'] = (model_data.deadline-model_data.launched_at)*0.00001157
model_data['launch_time'] = (model_data.launched_at-model_data.created_at)*0.00001157
model_data = model_data.drop(columns=['state_changed_at', 'deadline', 'launched_at', 'created_at'])

# Drop NA's (before pipeline, maybe later version could impute missing values)
model_data = model_data.dropna()

model_data.head()

Unnamed: 0,state,country,parent_name,name,goal_usd,total_days_active,launch_time
0,successful,US,Food,Cookbooks,5000.0,28.036979,1.957378
1,successful,HK,Food,Cookbooks,44624.9265,59.97888,30.470127
2,successful,ES,Food,Cookbooks,54.09505,13.294832,3.172101
3,successful,US,Food,Cookbooks,65000.0,44.98416,19.589896
4,successful,GB,Food,Cookbooks,13823.881138,29.98944,6.099334


In [5]:
# split into X and y variables
X = model_data.drop(columns=['state'])
y = model_data.state

# generate 70% train/test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state = 1)

## Begin building pipeline

In [6]:
# encoding transformer
encoding_columns = list((X.select_dtypes(include=['bool', 'object'])).columns)
scaling_columns = list((X.select_dtypes(include=['int', 'float'])).columns)

column_transformer = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), encoding_columns),
    (StandardScaler(), scaling_columns),
    remainder='drop'
    )

### Random Forest pipeline

In [7]:
# init and fit RF classifier
rf_classifier = RandomForestClassifier()
rf_pipe = make_pipeline(column_transformer, rf_classifier)
rf_pipe.fit(X_train, y_train)

# perform predictions on testing and validation data
rf_predictions  = rf_pipe.predict(X_test)

# classification reports for the test and validation set
rf_report = classification_report(y_test, rf_predictions)
print (rf_report)

              precision    recall  f1-score   support

      failed       0.71      0.69      0.70      2759
  successful       0.84      0.86      0.85      5407

    accuracy                           0.80      8166
   macro avg       0.78      0.77      0.78      8166
weighted avg       0.80      0.80      0.80      8166



In [8]:
# pickle the model
pickle.dump(rf_pipe, open('rf_pipe.sav', 'wb'))

### SVM pipeline

In [9]:
# init and fit RF classifier
svm_classifier = SVC(kernel='linear', max_iter=500)
svm_pipe = make_pipeline(column_transformer, svm_classifier)
svm_pipe.fit(X_train, y_train)

# perform predictions on testing and validation data
svm_predictions  = svm_pipe.predict(X_test)

# classification reports for the test and validation set
svm_report = classification_report(y_test, svm_predictions)
print (svm_report)



              precision    recall  f1-score   support

      failed       0.83      0.02      0.03      2759
  successful       0.67      1.00      0.80      5407

    accuracy                           0.67      8166
   macro avg       0.75      0.51      0.42      8166
weighted avg       0.72      0.67      0.54      8166



In [10]:
# pickle the model
pickle.dump(svm_pipe, open('svm_pipe.sav', 'wb'))

### Neural network pipeline

In [11]:
# init and fit RF classifier
nn_classifier = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(4, 2), random_state=1)
nn_pipe = make_pipeline(column_transformer, nn_classifier)
nn_pipe.fit(X_train, y_train)

# perform predictions on testing and validation data
nn_predictions  = nn_pipe.predict(X_test)

# classification reports for the test and validation set
nn_report = classification_report(y_test, nn_predictions)
print (nn_report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


              precision    recall  f1-score   support

      failed       0.72      0.66      0.69      2759
  successful       0.83      0.87      0.85      5407

    accuracy                           0.80      8166
   macro avg       0.77      0.76      0.77      8166
weighted avg       0.79      0.80      0.79      8166



In [12]:
# pickle the model
pickle.dump(nn_pipe, open('nn_pipe.sav', 'wb'))