In [1]:
import ast

import numpy as np
import pandas as pd

In [2]:
# Read the 56 numbered CSV parts (data/Kickstarter000.csv ... data/Kickstarter055.csv)
# and stack them row-wise into one frame with a fresh 0..n-1 index.
frames = [
    pd.read_csv(f'data/Kickstarter0{str(i).zfill(2)}.csv')
    for i in range(56)
]
df = pd.concat(frames, axis=0, ignore_index=True)

In [3]:
# Summarize the concatenated raw frame: per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209222 entries, 0 to 209221
Data columns (total 37 columns):
backers_count               209222 non-null int64
blurb                       209214 non-null object
category                    209222 non-null object
converted_pledged_amount    209222 non-null int64
country                     209222 non-null object
created_at                  209222 non-null int64
creator                     209222 non-null object
currency                    209222 non-null object
currency_symbol             209222 non-null object
currency_trailing_code      209222 non-null bool
current_currency            209222 non-null object
deadline                    209222 non-null int64
disable_communication       209222 non-null bool
friends                     300 non-null object
fx_rate                     209222 non-null float64
goal                        209222 non-null float64
id                          209222 non-null int64
is_backing                  300 

### Drops:
    - permissions, is_backing, is_starred, friends -> only 300 non-null values
    - slug, source_url, urls -> carry the same information as category and name
    - creator
    - currency_symbol, currency_trailing_code, current_currency
    - disable_communication
    - fx_rate
    - id
    - is_starrable
    - photo
    - location
    - pledged
    - profile
    - spotlight
    - static_usd_rate
    - usd_type

In [4]:
# Columns that are not carried forward: mostly-empty fields, fields that
# duplicate category/name, and metadata that is not used afterwards.
DROP_COLUMNS = [
    'permissions', 'slug', 'source_url', 'urls', 'creator',
    'currency_symbol', 'currency_trailing_code', 'current_currency',
    'disable_communication', 'fx_rate', 'id', 'is_starrable', 'photo',
    'location', 'pledged', 'profile', 'spotlight', 'static_usd_rate',
    'usd_type', 'is_backing', 'is_starred', 'friends',
]

# Build the cleaned frame in a single chain and rebind `df` only once the
# whole chain has succeeded. Mixing inplace=True with reassignment could
# leave `df` half-modified (columns dropped, rows unfiltered) if a later
# step raised.
df = (
    df.drop(columns=DROP_COLUMNS)
      # Keep only campaigns with a final outcome of successful or failed.
      .query("state == 'successful' or state == 'failed'")
      # Renumber rows 0..n-1 after filtering.
      .reset_index(drop=True)
)

In [5]:
# Re-inspect the frame after dropping columns and filtering on state.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192664 entries, 0 to 192663
Data columns (total 15 columns):
backers_count               192664 non-null int64
blurb                       192662 non-null object
category                    192664 non-null object
converted_pledged_amount    192664 non-null int64
country                     192664 non-null object
created_at                  192664 non-null int64
currency                    192664 non-null object
deadline                    192664 non-null int64
goal                        192664 non-null float64
launched_at                 192664 non-null int64
name                        192664 non-null object
staff_pick                  192664 non-null bool
state                       192664 non-null object
state_changed_at            192664 non-null int64
usd_pledged                 192664 non-null float64
dtypes: bool(1), float64(2), int64(6), object(6)
memory usage: 20.8+ MB


In [6]:
def _main_category(raw):
    """Return the top-level category of one serialized category record.

    `raw` is the string form of a dict that has a 'slug' entry such as
    'parent/child'; the part before the first '/' is returned (the whole
    slug when it contains no '/').

    ast.literal_eval is used instead of eval(): it only accepts literal
    syntax, so text coming from the CSV files cannot execute code.
    Raises ValueError/SyntaxError if `raw` is not a valid literal.
    """
    record = ast.literal_eval(raw)
    return record['slug'].partition('/')[0]


# Replace the serialized category record with its top-level category name.
df['category'] = df['category'].map(_main_category)

In [7]:
# Convert the integer second-resolution timestamps in deadline, created_at,
# launched_at and state_changed_at into plain calendar dates.
for col in ('deadline', 'created_at', 'launched_at', 'state_changed_at'):
    df[col] = pd.to_datetime(df[col], unit='s').dt.date

In [8]:
# Create the duration columns (as timedeltas):
#   days_until_launch       = launched_at - created_at
#   days_until_success      = state_changed_at - launched_at
#   days_total              = deadline - launched_at
#   days_diff_total_success = days_total - days_until_success
#
# Whole-column arithmetic replaces the row-by-row df.apply(..., axis=1)
# calls: it gives the same values without a Python-level call per row,
# and it also works when the frame is empty.
# The date columns are normalized with pd.to_datetime first so that the
# subtraction yields timedelta columns whether the inputs hold
# datetime.date objects or datetime64 values.
dates = df[['created_at', 'launched_at', 'state_changed_at', 'deadline']].apply(pd.to_datetime)

df['days_until_launch'] = dates['launched_at'] - dates['created_at']
df['days_until_success'] = dates['state_changed_at'] - dates['launched_at']
df['days_total'] = dates['deadline'] - dates['launched_at']
df['days_diff_total_success'] = df['days_total'] - df['days_until_success']

In [9]:
# Reduce each timedelta column to its whole-day count, stored as int16.
for col in ('days_total', 'days_diff_total_success',
            'days_until_launch', 'days_until_success'):
    df[col] = df[col].dt.days.astype('int16')

# Optional sanity check for negative values of days_diff_total_success:
# df.query("days_diff_total_success < 0").days_diff_total_success.describe()

In [10]:
# Write the cleaned frame (features plus the state column) to a CSV file
# at a path relative to the working directory.
df.to_csv('KickstarterData_full.csv')

In [11]:
from sklearn.model_selection import train_test_split

# Separate the target (state) from the remaining columns, then hold out 25%
# of the rows for testing; random_state is fixed so the split is repeatable.
y = df['state']
X = df.drop(columns='state')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [12]:
# Put each target column back next to its feature rows and renumber the
# rows from 0 in both partitions.
Trainset = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
Testset = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)

In [13]:
# Write the train and test partitions to their CSV files.
for path, frame in (('Kickstarter_Train.csv', Trainset),
                    ('Kickstarter_Test.csv', Testset)):
    frame.to_csv(path)