In [1]:
import pandas as pd
import numpy as np
import os.path
from pathlib import Path
import glob
import json

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# Find the dataset @ (CSV files 2022-04-21, https://webrobots.io/kickstarter-datasets/)
# download link: https://s3.amazonaws.com/weruns/forfun/Kickstarter/Kickstarter_2022-04-21T03_20_08_060Z.zip

### Create master Data Frame
before running this code, create a folder inside the repo folder called 'raw_data' contiaining the exctracted CSV files from the download link.

The code will create a new directory called 'data' to store the processed data.

In [2]:
# create folder inside the current directory to hold the processed data
try:
    Path('data').mkdir(parents=True, exist_ok=False)
    print ('creating data directory')
except:
    print ('directory already exists')
    pass

directory already exists


In [3]:
# path to each file in the all_data folder
all_paths = glob.glob('raw_data/*.csv')

# list to append df's to
list_of_df = []

# loop through all paths and append each csv as a df
for filename in all_paths:
    df = pd.read_csv(filename, index_col=None, header=0)
    list_of_df.append(df)

# concat all df's into one df
master_df = pd.concat(list_of_df, axis=0, ignore_index=True)

# save df as master csv
master_df.to_csv('data/master_df.csv', index=False)

### Unpack columns containing JSON objects
The code below takes all columns that represent dictionaries and saves them as individual csv files.

In [4]:
def make_json(string):
    '''
    converts the string representation of a json object into a python dict.
    returns np.nan if the string contains commas.
    '''
    try:
        # replace single quotations to make the string represent a JSON object
        json_acceptable_string = string.replace("'", "\"")
        return(json.loads(json_acceptable_string))
    except:
        # if the string is still not JSON compatible return np.nan
        return np.nan

In [5]:
def unpack(data_frame, column):
    '''
    unpacks dict in column to dataframe.
    '''
    # convert all strings into JSON objects
    unpacked = data_frame[column].apply(make_json).to_frame()

    # unpack JSON into DataFrame
    return pd.json_normalize(unpacked[column])

In [6]:
# define which columns are represented as dicts
unpack_list = ['category', 'creator', 'location', 'photo', 'profile', 'urls']

# init empty df 
unpacked_df = pd.DataFrame()

# unpack each column of dicts, save each as their own csv
for value in unpack_list:
    frame = unpack(master_df, value)
    frame.to_csv(f'data/{value}.csv', index=False)  

In [7]:
# read category data for name and parent name
df_category = pd.read_csv('data/category.csv')
df_category = df_category[['name', 'parent_name']]

In [8]:
# drop columns deemed unessecary
model_data = master_df.drop(columns = ['backers_count','usd_pledged','is_starrable','current_currency','static_usd_rate','usd_exchange_rate','usd_type','id','name', 'slug', 'category', 'creator', 'location', 'photo', 'profile', 'urls', 'country_displayable_name', 'currency_symbol', 'currency_trailing_code', 'disable_communication', 'source_url', 'currency', 'pledged', 'blurb'])

# create goal_usd column so that all goal amounts are in the same units, drop fx_rate and goals afterward
model_data['goal_usd'] = model_data['fx_rate']*model_data['goal']
model_data = model_data.drop(columns=['fx_rate', 'goal'])

# combine category data and all other date
model_data = pd.concat([model_data, df_category], axis=1, join='inner')

# create percentage funded column as another metric
model_data['percentage_funded'] = model_data.converted_pledged_amount/model_data.goal_usd*100

# create total days active column as another metric
model_data['total_days_active'] = (model_data.deadline-model_data.created_at)*0.00001157

model_data

Unnamed: 0,converted_pledged_amount,country,created_at,deadline,launched_at,spotlight,staff_pick,state,state_changed_at,goal_usd,name,parent_name,percentage_funded,total_days_active
0,5034,US,1609376406,1611968831,1609545583,True,False,successful,1611968831,5000.000000,Cookbooks,Food,100.680000,29.994357
1,48365,HK,1606278560,1614096106,1608912106,True,True,successful,1614096106,44624.926500,Cookbooks,Food,108.381131,90.449007
2,98,ES,1606477096,1607900340,1606751262,True,False,successful,1607900340,54.095050,Cookbooks,Food,181.162603,16.466933
3,127765,US,1604500905,1610082068,1606194068,True,True,successful,1610082068,65000.000000,Cookbooks,Food,196.561538,64.574056
4,14574,GB,1605454727,1608573895,1605981895,True,True,successful,1608573895,13823.881138,Cookbooks,Food,105.426254,36.088774
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30987,501,US,1632678330,1634318870,1633022870,False,False,failed,1634318870,10000.000000,Software,Technology,5.010000,18.981048
30988,1,CA,1632726042,1638144718,1632957118,False,False,failed,1638144719,39722.546000,Software,Technology,0.002517,62.694081
30989,0,DK,1631517750,1635408514,1632816514,False,False,failed,1635408514,145430.220000,Software,Technology,0.000000,45.016139
30990,70,US,1429554526,1432913659,1430321659,False,False,failed,1432913660,35000.000000,Plays,Theater,0.200000,38.865169


In [9]:
model_data = model_data.dropna()

In [10]:
# split into X and y variables
X = model_data.drop(columns=['state'])
y = model_data.state.to_frame()

In [11]:
# generate 70% train/test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [12]:
# init encoder for x values
encoder = LabelEncoder()

# encode boolean and string type columns
encoding_columns = (X.select_dtypes(include=['bool', 'object'])).columns
for column in encoding_columns:
    X_train[column] = encoder.fit_transform(X_train[column])
    X_test[column] = encoder.transform(X_test[column])

In [13]:
# new encoder for y variable (to make sure that we can reverse encoding)
y_encoder = LabelEncoder()
y_train['state'] = y_encoder.fit_transform(y_train['state'])
y_test['state'] = y_encoder.transform(y_test['state'])

In [14]:
# init standard scaler
scaler = StandardScaler()

# scale Train and testing set, columns=X.columns to keep column headers.
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X.columns)
X_train_scaled.head()

Unnamed: 0,converted_pledged_amount,country,created_at,deadline,launched_at,spotlight,staff_pick,state_changed_at,goal_usd,name,parent_name,percentage_funded,total_days_active
0,1.350203,0.640519,-0.726412,-0.229513,-0.226096,0.773781,2.19333,-0.228084,0.078701,-1.202777,-0.349168,-0.000552,3.102612
1,-0.041114,0.640519,-1.51472,-1.57619,-1.566122,0.773781,2.19333,-1.576563,-0.06648,-0.276991,0.809205,-0.014866,-0.396204
2,-0.049736,0.640519,0.840073,0.781327,0.785422,-1.292356,-0.455928,0.784109,-0.06648,1.01911,1.195329,-0.017471,-0.360987
3,-0.018276,-0.261861,1.201581,1.269381,1.253784,0.773781,-0.455928,1.272817,-0.046351,0.463638,-1.121416,-0.015276,0.433382
4,-0.049743,0.640519,-0.690516,-0.747387,-0.744317,-1.292356,-0.455928,-0.74665,-0.042284,1.574581,0.036956,-0.017473,-0.361069


## Begin Modelling here
