In [None]:
import pandas as pd
import numpy as np
import os.path
from pathlib import Path
import glob
import json
import datetime
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc

from sklearn.svm import SVC


from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification

import xgboost as xgb

# Find the dataset @ (CSV files 2022-04-21, https://webrobots.io/kickstarter-datasets/)
# download link: https://s3.amazonaws.com/weruns/forfun/Kickstarter/Kickstarter_2022-04-21T03_20_08_060Z.zip

### Create master Data Frame
Before running this code, create a folder called 'raw_data' inside the repo folder and place the extracted CSV files from the download link in it.

The code will create a new directory called 'data' to store the processed data.

In [None]:
# Create a folder inside the current directory to hold the processed data.
# Only the "already there" case is treated as benign; any other failure
# (e.g. a permission error, or a regular file named 'data') now surfaces
# instead of being reported as 'directory already exists' by a bare except.
data_dir = Path('data')
if data_dir.is_dir():
    print ('directory already exists')
else:
    data_dir.mkdir(parents=True, exist_ok=True)
    print ('creating data directory')

In [None]:
# paths to every csv file in the raw_data folder; sorted because glob returns
# matches in arbitrary order and the row order of master_df feeds the seeded
# train/test splits further down
all_paths = sorted(glob.glob('raw_data/*.csv'))

# fail early with an actionable message instead of pd.concat's
# "No objects to concatenate"
if not all_paths:
    raise FileNotFoundError("no CSV files found in 'raw_data/' - extract the downloaded archive there first")

# read each csv into its own df
list_of_df = [pd.read_csv(filename, index_col=None, header=0) for filename in all_paths]

# concat all df's into one df with a fresh 0..n-1 index
master_df = pd.concat(list_of_df, axis=0, ignore_index=True)

# save df as master csv
master_df.to_csv('data/master_df.csv', index=False)

### Unpack columns containing JSON objects
The code below takes all columns that represent dictionaries and saves them as individual csv files.

In [None]:
def make_json(string):
    '''
    converts the string representation of a json object into a python dict.

    The string is first parsed as-is, so valid JSON that contains apostrophes
    (e.g. "Children's Books") is kept intact. Only if that fails are single
    quotes swapped for double quotes, to accept dict-style strings such as
    "{'a': 1}".

    returns np.nan if the value is not a string or cannot be parsed either way.
    '''
    if not isinstance(string, str):
        # e.g. a missing value (float NaN) in the column
        return np.nan

    for candidate in (string, string.replace("'", "\"")):
        try:
            return json.loads(candidate)
        except ValueError:
            # json.JSONDecodeError is a ValueError; try the next candidate
            continue

    # the string is still not JSON compatible
    return np.nan

In [None]:
def unpack(data_frame, column):
    '''
    expands a column of dict-like strings into a DataFrame.

    Every value of data_frame[column] is parsed with make_json and the parsed
    values are passed to pd.json_normalize, whose result is returned.
    '''
    parsed = data_frame[column].apply(make_json)

    # (a future version should carry the 'id' column along so the result can be
    # referenced when performing merge/concat operations)
    return pd.json_normalize(parsed)

In [None]:
# columns of master_df whose values are dicts stored as strings
unpack_list = ['category', 'creator', 'location', 'photo', 'profile', 'urls']

# expand each of those columns and write the result to data/<column>.csv
for value in unpack_list:
    out_path = f'data/{value}.csv'
    frame = unpack(master_df, value)
    frame.to_csv(out_path, index=False)

In [None]:
# load the unpacked category table, keeping only the category name and the
# name of its parent category
df_category = pd.read_csv('data/category.csv').loc[:, ['name', 'parent_name']]

In [None]:
# timestamps are epoch seconds, so dividing by this converts a difference to
# days (replaces the truncated factor 0.00001157)
SECONDS_PER_DAY = 86400

# drop columns deemed unnecessary
dropped_columns = [
    'staff_pick', 'converted_pledged_amount', 'spotlight', 'backers_count',
    'usd_pledged', 'is_starrable', 'current_currency', 'static_usd_rate',
    'usd_exchange_rate', 'usd_type', 'id', 'name', 'slug', 'category',
    'creator', 'location', 'photo', 'profile', 'urls',
    'country_displayable_name', 'currency_symbol', 'currency_trailing_code',
    'disable_communication', 'source_url', 'currency', 'pledged', 'blurb',
]
model_data = master_df.drop(columns=dropped_columns)

# create goal_usd column so that all goal amounts are in the same units, drop fx_rate and goal afterward
model_data['goal_usd'] = model_data['fx_rate'] * model_data['goal']
model_data = model_data.drop(columns=['fx_rate', 'goal'])

# combine category data and all other data (aligned on the row index)
model_data = pd.concat([model_data, df_category], axis=1, join='inner')

# total_days_active: days between launch and deadline
# launch_time: days between creation and launch
model_data['total_days_active'] = (model_data.deadline - model_data.launched_at) / SECONDS_PER_DAY
model_data['launch_time'] = (model_data.launched_at - model_data.created_at) / SECONDS_PER_DAY
model_data = model_data.drop(columns=['state_changed_at', 'deadline', 'launched_at'])
model_data

In [None]:
# set the still-running projects aside, then remove every row whose state is
# 'live' or 'canceled' and renumber the remaining rows
live_projects = model_data[model_data['state'] == 'live']
excluded_rows = model_data.index[model_data['state'].isin(['live', 'canceled'])]
model_data = model_data.drop(index=excluded_rows).reset_index(drop=True)
model_data.head()

### Encoding of non-numerical features

In [None]:
# dedicated encoder for the y variable, kept separate so the encoding can be
# reversed with inverse_transform later
y_encoder = LabelEncoder()
model_data['state'] = y_encoder.fit_transform(model_data['state'])

# shared encoder for the x values; it is re-fitted on every column below
encoder = LabelEncoder()

# integer-encode all remaining boolean and string type columns
encoding_columns = model_data.select_dtypes(include=['bool', 'object']).columns
for column in encoding_columns:
    encoded_values = encoder.fit_transform(model_data[column])
    model_data[column] = encoded_values

### Build Validation and training datasets.

In [None]:
# Turn the 'created_at' epoch-seconds timestamp into a 'YYYY/MM/DD' string column
created_on = pd.to_datetime(model_data['created_at'], unit='s')
model_data['Date Created'] = created_on.dt.strftime('%Y/%m/%d')
model_data

In [None]:
# Discard every row that has a missing value in any column
model_data = model_data.dropna(axis=0, how='any')
model_data

In [None]:
# Validation dataset: rows created inside the window below (both ends included).
# The bounds use the same 'YYYY/MM/DD' string format as 'Date Created'.
val_start_date = '2021/02/01'
val_end_date = '2021/08/07'
val_mask = model_data['Date Created'].between(val_start_date, val_end_date)
val_dataset = model_data[val_mask]
val_dataset

In [None]:
# Train/test dataset: every row outside the validation window, without the
# two date columns
train_test_dataset = model_data.loc[~val_mask].drop(columns=['Date Created', 'created_at'])
train_test_dataset

### Train/test splits for Test and Validation models.

In [None]:
# target is the encoded 'state' column; features are all other columns
y = train_test_dataset['state']
X = train_test_dataset.drop(columns='state')

# stratified split: 70% train / 30% test
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)
X_train.shape

In [None]:
# Validation target is the encoded 'state' column; features are all other
# columns except the two date columns
y_val = val_dataset['state']
X_val = val_dataset.drop(columns=['state', 'Date Created', 'created_at'])

# stratified split of the validation data: 70% train / 30% test
X_train_val, X_test_val, y_train_val, y_test_val = train_test_split(
    X_val,
    y_val,
    test_size=0.3,
    random_state=1,
    stratify=y_val,
)
X_train_val.shape

### Scale values using standard scaler

In [None]:
# init one standard scaler per split; each is fitted only on the training part
# of its own split. Previously a single scaler was re-fitted on the validation
# split, so after this cell `scaler` no longer matched the train/test data.
scaler = StandardScaler()
val_scaler = StandardScaler()

# scale train and testing set; column headers are taken from the frames being
# scaled so the cell does not depend on `X` still holding the feature frame
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# same for the validation split, using its own scaler
X_val_train_scaled = pd.DataFrame(val_scaler.fit_transform(X_train_val), columns=X_train_val.columns)
X_val_test_scaled = pd.DataFrame(val_scaler.transform(X_test_val), columns=X_test_val.columns)
X_train_scaled.head()

---

# Random Forest Classifier

In [None]:
# init and fit RF classifier; the forest is randomised, so it is seeded with
# the same random_state=1 used for the splits to make reruns reproducible
rf_classifier = RandomForestClassifier(random_state=1)
rf_classifier.fit(X_train_scaled, y_train)

# init new RF classifier for validation set (seeded the same way)
rf_validation_classifier = RandomForestClassifier(random_state=1)
rf_validation_classifier.fit(X_val_train_scaled, y_train_val)

In [None]:
# class predictions of each forest on the held-out part of its own split
validation_predictions = rf_validation_classifier.predict(X_val_test_scaled)
predictions = rf_classifier.predict(X_test_scaled)

In [None]:
# map encoded class ids back to the original state names

# train/test model
y_test_decoded = y_encoder.inverse_transform(y_test)
predictions_decoded = y_encoder.inverse_transform(predictions)

# validation model (its held-out labels are y_test_val)
y_val_decoded = y_encoder.inverse_transform(y_test_val)
validation_predictions_decoded = y_encoder.inverse_transform(validation_predictions)

In [None]:
# feature importances of the train/test random forest, one row per feature
importances = pd.DataFrame(
    {
        'Feature': X.columns,
        'Importance value': rf_classifier.feature_importances_,
    }
).set_index('Feature')
importances

In [None]:
# per-class precision/recall/F1 for the test model and the validation model
test_report = classification_report(y_test_decoded, predictions_decoded)
val_report = classification_report(y_val_decoded, validation_predictions_decoded)

# both reports under their own heading, printed in one go
report_string = (
    f'Random Forest test report\n-----------------------------------------------------\n{test_report}\n'
    f'Random Forest validation report\n-----------------------------------------------------\n{val_report}'
)
print(report_string)

In [None]:
# view confusion matrix (rows = actual state, columns = predicted state).
# The labels come from the fitted target encoder instead of a hard-coded list:
# 'live' and 'canceled' rows were dropped before encoding, so a fixed 4x4
# labelling does not match the matrix that is actually produced.
class_labels = list(y_encoder.classes_)
cm = pd.DataFrame(
    confusion_matrix(y_test_decoded, predictions_decoded, labels=class_labels),
    index=class_labels,
    columns=class_labels,
)
val_cm = pd.DataFrame(
    confusion_matrix(y_val_decoded, validation_predictions_decoded, labels=class_labels),
    index=class_labels,
    columns=class_labels,
)
cm

In [None]:
# ROC curves are built from the predicted probability of the second class
# rather than from hard 0/1 predictions, which collapse the curve to a single
# operating point. NOTE: roc_curve only supports binary targets, so this
# assumes two classes remain in 'state'.

# Calculate the ROC curve and AUC for the testing set
test_scores = rf_classifier.predict_proba(X_test_scaled)[:, 1]
fpr_test, tpr_test, thresholds_test = roc_curve(y_test, test_scores)
auc_test = round(auc(fpr_test, tpr_test), 3)

# Calculate the ROC curve and AUC for the validation set. The labels must be
# y_test_val (the held-out 30% that was predicted on), not the full y_val,
# whose length does not match the predictions.
val_scores = rf_validation_classifier.predict_proba(X_val_test_scaled)[:, 1]
fpr_val, tpr_val, thresholds_val = roc_curve(y_test_val, val_scores)
auc_val = round(auc(fpr_val, tpr_val), 3)

# Create a DataFrame with the fpr and tpr results
roc_df_test = pd.DataFrame({"FPR Test": fpr_test, "TPR Test": tpr_test})
roc_df_val = pd.DataFrame({"FPR Val": fpr_val, "TPR Val": tpr_val})

In [None]:
# ROC curve of the test set, with its AUC in the title
roc_df_test.plot(
    x="FPR Test",
    y="TPR Test",
    title=f"Test ROC Curve (AUC={auc_test})",
    figsize=(10, 6),
)

In [None]:
# ROC curve of the validation set, with its AUC in the title
roc_df_val.plot(
    x="FPR Val",
    y="TPR Val",
    title=f"Validation ROC Curve (AUC={auc_val})",
    figsize=(10, 6),
)

---

# SVM Classifier

In [None]:
# support vector classifier with a linear kernel; the solver is capped at 50 iterations
classifier = SVC(max_iter=50, kernel='linear')
classifier

In [None]:
# fit the SVM on the (unscaled) training features
# NOTE(review): the scaled frames are not used here - confirm this is intended
classifier.fit(X=X_train, y=y_train)

In [None]:
# Mean accuracy on the training data and on the validation data
# (the second line is labelled "Testing" but is computed on X_val / y_val)
train_score = classifier.score(X_train, y_train)
val_score = classifier.score(X_val, y_val)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {val_score}")

In [None]:
# Predict on the validation features and put predictions next to the actual labels
predictions = classifier.predict(X_val)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_val})
results = results.reset_index(drop=True)
results.tail()

In [None]:
# Display the confusion matrix.
# Row/column labels are built from the classes that actually occur in y_val
# and predictions (the same set confusion_matrix uses by default), so the
# frame always matches the matrix shape instead of assuming four classes.
class_ids = np.union1d(y_val, predictions)
cm = confusion_matrix(y_val, predictions, labels=class_ids)
cm_df = pd.DataFrame(
    cm,
    index=[f'Actual {class_id}' for class_id in class_ids],
    columns=[f'Predicted {class_id}' for class_id in class_ids],
)
# Displaying results
print("Confusion Matrix")
display(cm_df)

In [None]:
# Per-class precision, recall and F1 of the SVM on the validation data
svm_report = classification_report(y_true=y_val, y_pred=predictions)
print(svm_report)

---
# SKLearn Neural Network Classifier

In [None]:
# init Neural Network classifier (it is fitted on the training data in the
# next cell).
# The toy fit on [[0., 0.], [1., 1.]] was removed: it overwrote the notebook's
# X and y, which earlier cells read (e.g. X.columns), and the classifier is
# re-fitted from scratch in the next cell.
nn_clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                       hidden_layer_sizes=(4, 2), random_state=1)
nn_clf

In [None]:
# Train the neural network on the (unscaled) training features
nn_clf.fit(X=X_train, y=y_train)

In [None]:
# Mean accuracy on the training data and on the validation data
# (the second line is labelled "Testing" but is computed on X_val / y_val)
nn_train_score = nn_clf.score(X_train, y_train)
nn_val_score = nn_clf.score(X_val, y_val)
print(f"Training Data Score: {nn_train_score}")
print(f"Testing Data Score: {nn_val_score}")

In [None]:
# Predict on the validation features and put predictions next to the actual labels
predictions = nn_clf.predict(X_val)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_val})
results = results.reset_index(drop=True)
results.tail()

In [None]:
# Display the confusion matrix.
# Row/column labels are built from the classes that actually occur in y_val
# and predictions (the same set confusion_matrix uses by default), so the
# frame always matches the matrix shape instead of assuming four classes.
class_ids = np.union1d(y_val, predictions)
cm = confusion_matrix(y_val, predictions, labels=class_ids)
cm_df = pd.DataFrame(
    cm,
    index=[f'Actual {class_id}' for class_id in class_ids],
    columns=[f'Predicted {class_id}' for class_id in class_ids],
)
# Displaying results
print("Confusion Matrix")
display(cm_df)

---
 # XGBoost Classifier

In [None]:
# init datasets as XGB matrices.
# Uses the scaled train/test frames and encoded labels built earlier in this
# notebook; the previous X_train_resampled / y_train_resampled / `.bins` names
# are not defined anywhere in the visible notebook.
# NOTE(review): if a resampling step was intended, add it before this cell.
xgb_train = xgb.DMatrix(np.array(X_train_scaled), label=np.array(y_train))
xgb_test = xgb.DMatrix(np.array(X_test_scaled), label=np.array(y_test))

# specify parameters via map
param = {
    # multiclass objective so predict() returns class ids; num_class is only
    # meaningful together with a multi:* objective
    'objective': 'multi:softmax',
    # number of target classes known to the label encoder (was hard-coded to 8)
    'num_class': len(y_encoder.classes_),
    'eta': 1,
    'gamma': 0,
    # 'max_depth' was listed twice (5, then 6); 6 was the value that took effect
    'max_depth': 6,
    'min_child_weight': 10,
    'num_parallel_tree': 1,
}
num_round = 5

# fit XGB classifier
bst = xgb.train(param, xgb_train, num_round)

In [None]:
# run the trained booster on the test matrix
predictions = bst.predict(data=xgb_test)