In [19]:
import pandas as pd
import numpy as np
import os.path
from pathlib import Path
import glob
import json

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# Find the dataset @ (CSV files 2022-04-21, https://webrobots.io/kickstarter-datasets/)
# download link: https://s3.amazonaws.com/weruns/forfun/Kickstarter/Kickstarter_2022-04-21T03_20_08_060Z.zip

### Create master Data Frame
before running this code, create a folder inside the repo folder called 'raw_data' contiaining the exctracted CSV files from the download link.

The code will create a new directory called 'data' to store the processed data.

In [20]:
# create folder inside the current directory to hold the processed data
try:
    Path('data').mkdir(parents=True, exist_ok=False)
    print ('creating data directory')
except:
    print ('directory already exists')
    pass

directory already exists


In [21]:
# path to each file in the all_data folder
all_paths = glob.glob('raw_data/*.csv')

# list to append df's to
list_of_df = []

# loop through all paths and append each csv as a df
for filename in all_paths:
    df = pd.read_csv(filename, index_col=None, header=0)
    list_of_df.append(df)

# concat all df's into one df
master_df = pd.concat(list_of_df, axis=0, ignore_index=True)

# save df as master csv
master_df.to_csv('data/master_df.csv', index=False)

### Unpack columns containing JSON objects
The code below takes all columns that represent dictionaries and saves them as individual csv files.

In [22]:
def make_json(string):
    '''
    converts the string representation of a json object into a python dict.
    returns np.nan if the string contains commas.
    '''
    try:
        # replace single quotations to make the string represent a JSON object
        json_acceptable_string = string.replace("'", "\"")
        return(json.loads(json_acceptable_string))
    except:
        # if the string is still not JSON compatible return np.nan
        return np.nan

In [23]:
def unpack(data_frame, column):
    '''
    unpacks dict in column to dataframe.
    '''
    # convert all strings into JSON objects
    unpacked = data_frame[column].apply(make_json).to_frame()

    # unpack JSON into DataFrame
    return pd.json_normalize(unpacked[column])

In [24]:
# define which columns are represented as dicts
unpack_list = ['category', 'creator', 'location', 'photo', 'profile', 'urls']

# init empty df 
unpacked_df = pd.DataFrame()

# unpack each column of dicts, save each as their own csv
for value in unpack_list:
    frame = unpack(master_df, value)
    frame.to_csv(f'data/{value}.csv', index=False)  

In [25]:
# read category data for name and parent name
df_category = pd.read_csv('data/category.csv')
df_category = df_category[['name', 'parent_name']]

In [26]:
# drop columns deemed unessecary
model_data = master_df.drop(columns = ['backers_count','usd_pledged','is_starrable','current_currency','static_usd_rate','usd_exchange_rate','usd_type','id','name', 'slug', 'category', 'creator', 'location', 'photo', 'profile', 'urls', 'country_displayable_name', 'currency_symbol', 'currency_trailing_code', 'disable_communication', 'source_url', 'currency', 'pledged', 'blurb'])

# create goal_usd column so that all goal amounts are in the same units, drop fx_rate and goals afterward
model_data['goal_usd'] = model_data['fx_rate']*model_data['goal']
model_data = model_data.drop(columns=['fx_rate', 'goal'])

# combine category data and all other date
model_data = pd.concat([model_data, df_category], axis=1, join='inner')

# create percentage funded column as another metric
model_data['percentage_funded'] = model_data.converted_pledged_amount/model_data.goal_usd*100

# create total days active column as another metric
model_data['total_days_active'] = (model_data.deadline-model_data.created_at)*0.00001157
model_data['launch_time'] = (model_data.launched_at-model_data.created_at)*0.00001157
model_data = model_data.drop(columns=['state_changed_at', 'deadline', 'launched_at'])
model_data

Unnamed: 0,converted_pledged_amount,country,created_at,spotlight,staff_pick,state,goal_usd,name,parent_name,percentage_funded,total_days_active,launch_time
0,5034,US,1609376406,True,False,successful,5000.000000,Cookbooks,Food,100.680000,29.994357,1.957378
1,48365,HK,1606278560,True,True,successful,44624.926500,Cookbooks,Food,108.381131,90.449007,30.470127
2,98,ES,1606477096,True,False,successful,54.095050,Cookbooks,Food,181.162603,16.466933,3.172101
3,127765,US,1604500905,True,True,successful,65000.000000,Cookbooks,Food,196.561538,64.574056,19.589896
4,14574,GB,1605454727,True,True,successful,13823.881138,Cookbooks,Food,105.426254,36.088774,6.099334
...,...,...,...,...,...,...,...,...,...,...,...,...
30987,3395,US,1434905504,True,False,successful,3000.000000,Fiction,Publishing,113.166667,104.846831,74.857391
30988,21112,US,1440643700,True,False,successful,12000.000000,Fiction,Publishing,175.933333,38.111337,8.121897
30989,2000,US,1441111356,True,False,successful,2000.000000,Fiction,Publishing,100.000000,60.256653,0.277773
30990,3125,US,1436479764,True,False,successful,3000.000000,Fiction,Publishing,104.166667,83.577804,53.588364


In [30]:
model_data['state'].unique()

array(['successful', 'live', 'failed', 'canceled'], dtype=object)

In [32]:
filter = (model_data['state'] == 'successful')
suc_count = model_data[filter]
suc_count


Unnamed: 0,converted_pledged_amount,country,created_at,spotlight,staff_pick,state,goal_usd,name,parent_name,percentage_funded,total_days_active,launch_time
0,5034,US,1609376406,True,False,successful,5000.000000,Cookbooks,Food,100.680000,29.994357,1.957378
1,48365,HK,1606278560,True,True,successful,44624.926500,Cookbooks,Food,108.381131,90.449007,30.470127
2,98,ES,1606477096,True,False,successful,54.095050,Cookbooks,Food,181.162603,16.466933,3.172101
3,127765,US,1604500905,True,True,successful,65000.000000,Cookbooks,Food,196.561538,64.574056,19.589896
4,14574,GB,1605454727,True,True,successful,13823.881138,Cookbooks,Food,105.426254,36.088774,6.099334
...,...,...,...,...,...,...,...,...,...,...,...,...
30987,3395,US,1434905504,True,False,successful,3000.000000,Fiction,Publishing,113.166667,104.846831,74.857391
30988,21112,US,1440643700,True,False,successful,12000.000000,Fiction,Publishing,175.933333,38.111337,8.121897
30989,2000,US,1441111356,True,False,successful,2000.000000,Fiction,Publishing,100.000000,60.256653,0.277773
30990,3125,US,1436479764,True,False,successful,3000.000000,Fiction,Publishing,104.166667,83.577804,53.588364


In [33]:
filter = (model_data['state'] == 'failed')
failed_count = model_data[filter]
failed_count

Unnamed: 0,converted_pledged_amount,country,created_at,spotlight,staff_pick,state,goal_usd,name,parent_name,percentage_funded,total_days_active,launch_time
72,189,CA,1413576209,False,False,failed,3177.80368,Community Gardens,Food,5.947504,33.279832,3.248740
73,312,US,1413372787,False,False,failed,50000.00000,Community Gardens,Food,0.624000,33.506870,5.356598
74,629,US,1405465675,False,True,failed,35000.00000,Community Gardens,Food,1.797143,124.169633,94.138541
75,3855,US,1406151525,False,False,failed,5000.00000,Community Gardens,Food,77.100000,115.265836,83.624929
76,17,AU,1412089227,False,False,failed,1482.56870,Community Gardens,Food,1.146658,51.121420,6.095608
...,...,...,...,...,...,...,...,...,...,...,...,...
30951,26,DK,1560246641,False,False,failed,3635.75550,Print,Journalism,0.715120,50.217213,20.227773
30952,12,FR,1558984155,False,False,failed,21638.01980,Web,Journalism,0.055458,94.773422,34.794542
30953,148,US,1525030776,False,False,failed,4500.00000,Journalism,,3.288889,470.915013,424.932697
30954,52,US,1561361890,False,False,failed,1000.00000,Web,Journalism,5.200000,33.382863,3.393423


In [37]:
filter = (model_data['state'] == 'canceled')
canceled_count = model_data[filter]
canceled_count

Unnamed: 0,converted_pledged_amount,country,created_at,spotlight,staff_pick,state,goal_usd,name,parent_name,percentage_funded,total_days_active,launch_time
684,226,US,1547604548,False,False,canceled,10000.000000,Literary Spaces,Publishing,2.260000,63.984321,6.663776
685,1,US,1529520578,False,False,canceled,2500.000000,Literary Spaces,Publishing,0.040000,31.095810,1.106370
686,4,SG,1518364229,False,False,canceled,73.259107,Literary Spaces,Publishing,5.460072,30.988752,2.449762
687,600,US,1517355605,False,False,canceled,60000.000000,Literary Spaces,Publishing,1.000000,32.109816,2.120376
688,0,US,1513713218,False,False,canceled,5000.000000,Literary Spaces,Publishing,0.000000,30.078772,0.089332
...,...,...,...,...,...,...,...,...,...,...,...,...
28155,755,US,1401403239,False,False,canceled,5000.000000,Plays,Theater,15.100000,63.157865,8.888722
28156,50,GB,1389870654,False,False,canceled,6520.698650,Plays,Theater,0.766789,176.347001,137.407831
28157,103,US,1400796548,False,False,canceled,3000.000000,Plays,Theater,3.433333,34.813748,4.824308
28158,305,US,1382144981,False,False,canceled,250000.000000,Plays,Theater,0.122000,269.985476,210.006596


In [38]:
filter = (model_data['state'] == 'live')
live_count = model_data[filter]
live_count

Unnamed: 0,converted_pledged_amount,country,created_at,spotlight,staff_pick,state,goal_usd,name,parent_name,percentage_funded,total_days_active,launch_time
24,25199,US,1642591824,False,True,live,10000.00000,Product Design,Design,251.990000,93.042099,48.099591
25,21318,JP,1633395153,False,False,live,2337.28200,Design,,912.085063,199.517558,154.364985
26,17790,SG,1644901751,False,False,live,1465.18214,Product Design,Design,1214.183514,77.424693,20.444896
27,9782,JP,1639977920,False,False,live,2337.28200,Product Design,Design,418.520315,122.355295,77.202699
28,18151,JP,1636003615,False,False,live,1558.18800,Design,,1164.878692,168.338004,123.184436
...,...,...,...,...,...,...,...,...,...,...,...,...
24630,138,GB,1645185417,False,False,live,26082.79460,Farms,Food,0.529084,66.881392,6.944164
24631,216,IT,1643289716,False,False,live,8655.20792,Drinks,Food,2.495607,85.718127,26.780547
24632,14181,US,1642709137,False,False,live,5000.00000,Small Batch,Food,283.620000,91.853142,31.915914
24633,1746,US,1644029991,False,False,live,175000.00000,Restaurants,Food,0.997714,76.557394,16.620166


In [None]:
# init encoder for x values
encoder = LabelEncoder()

# encode boolean and string type columns
encoding_columns = (model_data.select_dtypes(include=['bool', 'object'])).columns
for column in encoding_columns:
    model_data[column] = encoder.fit_transform(model_data[column])

In [None]:
model_data

In [None]:
# Convert 'created at' to datetime format
import datetime
model_data['Date Created'] = pd.to_datetime(model_data['created_at'], unit='s')
model_data['Date Created'] = model_data['Date Created'].dt.strftime('%Y/%m/%d')
model_data

In [None]:
# Drop NA's
model_data = model_data.dropna()
model_data

In [None]:
# Create validation dataset
val_start_date = '2021/02/01'
val_end_date = '2021/08/07'
val_mask = (model_data['Date Created'] >= val_start_date) & (model_data['Date Created'] <= val_end_date)
val_dataset = model_data[val_mask]
val_dataset

In [None]:
# Split validation dataset into X and y variables
X = val_dataset.drop(columns=['state','Date Created'])
y = val_dataset.state

In [None]:
# generate 70% train/validation split 
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3,
                                                     random_state = 1,
                                                     stratify = y)

In [None]:
# Instantiate a linear SVM model
from sklearn.svm import SVC
classifier = SVC(kernel='linear',max_iter=500000)
classifier

In [None]:
# Fit the data
classifier.fit(X_train, y_train)

In [None]:
# Score the accuracy
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_val, y_val)}")

In [None]:
# Make predictions using the test data
predictions = classifier.predict(X_val)
results = pd.DataFrame({
    "Prediction": predictions, 
    "Actual": y_val
}).reset_index(drop=True)
results.tail()

In [None]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_val, predictions)
cm_df = pd.DataFrame(
cm, index=['Actual 0', 'Actual 1', 'Actual 2', 'Actual 3'], columns=['Predicted 0', 'Predicted 1', 'Predicted 2', 'Predicted 3']
)
# Displaying results
print("Confusion Matrix")
display(cm_df)

In [None]:
# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_val, predictions))

In [None]:
# Create train/test dataset
train_test_dataset = model_data.loc[val_mask == False]
train_test_dataset = train_test_dataset.drop(['Date Created'], axis=1)
train_test_dataset

In [None]:
# split into X and y variables
X = train_test_dataset.drop(columns=['state'])
y = train_test_dataset.state#.to_frame()

In [None]:
# generate 70% train/test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                       random_state = 1,
                                                       stratify = y)

In [None]:
# init standard scaler
scaler = StandardScaler()

# scale Train and testing set, columns=X.columns to keep column headers.
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X.columns)
X_train_scaled.head()

In [None]:
# Instantiate a linear SVM model
from sklearn.svm import SVC
classifier = SVC(kernel='linear',max_iter=500000)
classifier

In [None]:
# Fit the data
classifier.fit(X_train, y_train)

In [None]:
# Score the accuracy
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

In [None]:
# Make predictions using the test data
predictions = classifier.predict(X_test)
results = pd.DataFrame({
    "Prediction": predictions, 
    "Actual": y_test
}).reset_index(drop=True)
results.tail()

In [None]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
cm, index=['Actual 0', 'Actual 1', 'Actual 2', 'Actual 3'], columns=['Predicted 0', 'Predicted 1', 'Predicted 2', 'Predicted 3']
)
# Displaying results
print("Confusion Matrix")
display(cm_df)

In [None]:
# Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))