In [1]:
# Import needed modules
import ast

import numpy as np
import pandas as pd

# Turn off the pandas chained-assignment warning (SettingWithCopyWarning)
pd.options.mode.chained_assignment = None

In [2]:
# Read the 56 CSV parts (file numbers 000 to 055 under data/) and stack them
# row-wise into a single DataFrame with a fresh 0..n-1 index
csv_parts = [pd.read_csv(f'data/Kickstarter0{str(i).zfill(2)}.csv') for i in range(56)]
df = pd.concat(csv_parts, ignore_index=True)

In [3]:
# Overview of the combined raw data: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209222 entries, 0 to 209221
Data columns (total 37 columns):
backers_count               209222 non-null int64
blurb                       209214 non-null object
category                    209222 non-null object
converted_pledged_amount    209222 non-null int64
country                     209222 non-null object
created_at                  209222 non-null int64
creator                     209222 non-null object
currency                    209222 non-null object
currency_symbol             209222 non-null object
currency_trailing_code      209222 non-null bool
current_currency            209222 non-null object
deadline                    209222 non-null int64
disable_communication       209222 non-null bool
friends                     300 non-null object
fx_rate                     209222 non-null float64
goal                        209222 non-null float64
id                          209222 non-null int64
is_backing                  300 

## Features that are going to be dropped:

- permissions, is_backing, is_starred, friends:   
    -> only 300 values and some of them are only NaNs     
- slug, source_url, urls:  
    -> same information can be found in category and name
- creator, id, profile:  
    -> information about the creator is useless for us
- currency_symbol, currency_trailing_code, current_currency, usd_type, static_usd_rate:   
    -> redundant information
- disable_communication, is_starrable, photo, location, pledged, usd_pledged, spotlight:  
    -> redundant information

In [4]:
# Drop features that are (almost) empty, redundant, or not useful for us
dropped_columns = [
    # only about 300 non-null values
    'permissions', 'is_backing', 'is_starred', 'friends',
    # same information can be found in category and name
    'slug', 'source_url', 'urls',
    # information about the creator
    'creator', 'id', 'profile',
    # redundant currency / pledge information
    'currency_symbol', 'currency_trailing_code', 'current_currency',
    'usd_type', 'static_usd_rate', 'usd_pledged', 'pledged',
    # remaining redundant columns
    'disable_communication', 'is_starrable', 'photo', 'location', 'spotlight',
]
df = df.drop(columns=dropped_columns)

# Drop all projects that are anything but successful or failed/canceled.
# isin() is used instead of a hand-written query string: a misplaced quote in
# such a string silently compares `state` against one long literal and throws
# away every failed and canceled project.
# NOTE(review): saved cell outputs may predate this filter - re-run the
# notebook to refresh the row counts.
final_states = ['successful', 'failed', 'canceled']
df = df[df['state'].isin(final_states)].reset_index(drop=True)

In [5]:
# Re-check columns, non-null counts and dtypes after dropping features and filtering by state
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192664 entries, 0 to 192663
Data columns (total 15 columns):
backers_count               192664 non-null int64
blurb                       192662 non-null object
category                    192664 non-null object
converted_pledged_amount    192664 non-null int64
country                     192664 non-null object
created_at                  192664 non-null int64
currency                    192664 non-null object
deadline                    192664 non-null int64
fx_rate                     192664 non-null float64
goal                        192664 non-null float64
launched_at                 192664 non-null int64
name                        192664 non-null object
staff_pick                  192664 non-null bool
state                       192664 non-null object
state_changed_at            192664 non-null int64
dtypes: bool(1), float64(2), int64(6), object(6)
memory usage: 20.8+ MB


## Reformat and extract data

- Change time format  
- Extract the category in which a project was posted  
- Create new variables for how long a project ran etc.  
- Convert goal amount to usd  
- Change class type data to 0s and 1s  
- Drop NaNs
- Create variables for blurb and name length, since plain text is useless to us

In [6]:
# Reformat the category, so we can access the genre in which the project was posted.
# Each raw value is the text of a dict literal; we keep the part of its 'slug'
# entry that comes before the first '/'.
# ast.literal_eval only accepts literals (dicts, strings, numbers, ...), so unlike
# eval() it cannot execute code that happens to be embedded in the CSV files.
df['category'] = df['category'].map(
    lambda raw: ast.literal_eval(raw)['slug'].partition('/')[0]
)

In [7]:
# Format deadline, created_at, launched_at and state_changed_at:
# each column holds timestamps in seconds and is replaced by the matching calendar date
for ts_col in ('deadline', 'created_at', 'launched_at', 'state_changed_at'):
    df[ts_col] = pd.to_datetime(df[ts_col], unit='s').dt.date

In [8]:
# Create new features (days_until_launch, days_until_success, days_total,
# days_diff_total_success).
# The date columns hold plain `date` objects (object dtype). Convert them to
# datetime64 before subtracting so the results are guaranteed to be timedelta64
# and the later `.dt.days` access does not depend on pandas inferring a
# timedelta dtype from an object-dtype subtraction.
# NOTE(review): newer pandas versions appear to keep object dtype for such
# subtractions - confirm against the pandas version in use.
dates = df[['created_at', 'launched_at', 'state_changed_at', 'deadline']].apply(pd.to_datetime)

df['days_until_launch'] = dates['launched_at'] - dates['created_at']
df['days_until_success'] = dates['state_changed_at'] - dates['launched_at']
df['days_total'] = dates['deadline'] - dates['launched_at']
df['days_diff_total_success'] = df['days_total'] - df['days_until_success']

In [9]:
# Turn the timedelta features into whole-day counts stored as int16
duration_cols = ['days_total', 'days_diff_total_success', 'days_until_launch', 'days_until_success']
for duration_col in duration_cols:
    df[duration_col] = df[duration_col].dt.days.astype('int16')

# Optional check for negative values of days_diff_total_success
#df.query("days_diff_total_success < 0").days_diff_total_success.describe()

In [10]:
# New column with the goal expressed in USD: the goal is given in the project's
# native currency, so it is multiplied element-wise by the conversion rate fx_rate
df['converted_goal_amount'] = df['goal'].mul(df['fx_rate'])

In [11]:
# Convert state and staff_pick to 0s and 1s.
# The mapping is applied to the `state` column only. A frame-wide replace would
# also rewrite any blurb, name or category that happens to equal one of these
# words, turning that text cell into an integer.
# Any state not listed here becomes NaN.
state_labels = {'canceled': 0, 'failed': 0, 'successful': 1}
df['state'] = df['state'].map(state_labels)

# staff_pick is a boolean column; store it as 0/1 integers
df['staff_pick'] = df['staff_pick'].astype('int16')

In [12]:
# Remove every row that still has a missing value in any column
df = df.dropna()

In [13]:
# Replace the free-text columns by their lengths (blurb_length, name_length),
# then remove the text columns together with the goal/fx_rate inputs that
# converted_goal_amount was computed from
df['blurb_length'] = df['blurb'].map(len)
df['name_length'] = df['name'].map(len)
df = df.drop(columns=['blurb', 'name', 'goal', 'fx_rate'])

In [14]:
# Store the converted goal as int64 (the cast discards the fractional part)
df['converted_goal_amount'] = df['converted_goal_amount'].astype('int64')

## Export data

- Export as one whole dataset  
- Export after split into train and test

In [15]:
# Save dataframe as a whole
# NOTE(review): no index=False here, so the DataFrame index is written as an
# extra first column, unlike the train/test exports at the end of the notebook.
# Confirm this is intended.
df.to_csv('KickstarterData_full.csv')

In [16]:
# Hold out 25% of the rows as an untouched validation set for later.
# The split is stratified on the target and seeded so it is reproducible.
from sklearn.model_selection import train_test_split

y = df['state']
X = df.drop(columns='state')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

In [17]:
# Re-attach the target column to each feature split and renumber the rows from 0
Trainset = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
Testset = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)

In [18]:
# Write the train and test sets to CSV without the index column
for split_frame, csv_path in ((Trainset, 'Kickstarter_Train.csv'),
                              (Testset, 'Kickstarter_Test.csv')):
    split_frame.to_csv(csv_path, index=False)