# Kickstarter Data

Obtained from:

https://webrobots.io/kickstarter-datasets/

- Data as at Nov 2018 (since 2015)

## Data Preparation

The aggregation of the data files is handled separately, in the script *data_aggregation.py*.

## Data Preprocessing

In [49]:
import pandas as pd
import numpy as np
from datetime import datetime

# Path to the aggregated Kickstarter CSV (presumably written by data_aggregation.py).
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
DATA_PATH = '/Volumes/TysonSSD/kickstarter_data/df.csv'

df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,backers_count,blurb,category,converted_pledged_amount,country,country_displayable_name,created_at,creator,currency,currency_symbol,...,slug,source_url,spotlight,staff_pick,state,state_changed_at,static_usd_rate,urls,usd_pledged,usd_type
0,1,Un cortometraje de protesta contra la violenci...,"{""id"":32,""name"":""Shorts"",""slug"":""film & video/...",0,MX,Mexico,1569216038,"{""id"":234859980,""name"":""Sebastian Najera"",""slu...",MXN,$,...,juarez-0,https://www.kickstarter.com/discover/categorie...,False,False,live,1569353446,0.051476,"{""web"":{""project"":""https://www.kickstarter.com...",0.967743,domestic
1,378,2005 sändes första avsnittet av den legendaris...,"{""id"":292,""name"":""Comedy"",""slug"":""film & video...",55169,SE,Sweden,1479480632,"{""id"":1181966419,""name"":""New Stories"",""is_regi...",SEK,kr,...,efter-rallarsving,https://www.kickstarter.com/discover/categorie...,True,False,successful,1482577200,0.107792,"{""web"":{""project"":""https://www.kickstarter.com...",54750.81596,domestic
2,314,A historical fantasy EPIC set in 15-century Af...,"{""id"":29,""name"":""Animation"",""slug"":""film & vid...",20555,US,the United States,1531880676,"{""id"":44069577,""name"":""Roye Okupe"",""is_registe...",USD,$,...,malika-animated-series-episode-one-pilot,https://www.kickstarter.com/discover/categorie...,True,False,successful,1535162340,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",20555.0,domestic
3,1,"Hip Hop album titled ""Deep Off In This Game""","{""id"":39,""name"":""Hip-Hop"",""slug"":""music/hip-ho...",1,US,the United States,1565206561,"{""id"":457047149,""name"":""Jose Ramirez"",""slug"":""...",USD,$,...,ktr-presents-deep-off-in-this-game,https://www.kickstarter.com/discover/categorie...,False,False,live,1569025922,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",1.0,international
4,7,"A set of 7 enamel pins inspired by the best, p...","{""id"":262,""name"":""Accessories"",""slug"":""fashion...",129,US,the United States,1542332380,"{""id"":174204392,""name"":""Eren Simpson"",""slug"":""...",USD,$,...,the-office-b-team-enamel-pins,https://www.kickstarter.com/discover/categorie...,True,False,successful,1543695202,1.0,"{""web"":{""project"":""https://www.kickstarter.com...",129.0,domestic


In [76]:
# Keep only campaigns whose state is 'successful', 'failed' or 'live'.
# .copy() hands the later cells an independent frame to add columns to,
# rather than a slice of the unfiltered frame (avoids SettingWithCopyWarning).
KEPT_STATES = ['successful', 'failed', 'live']
df = df[df['state'].isin(KEPT_STATES)].copy()
df.shape

(195899, 48)

In [50]:
# Convert the timestamp columns (stored as seconds since the epoch) to datetimes.
for ts_col in ('created_at', 'deadline', 'launched_at', 'state_changed_at'):
    df[ts_col] = pd.to_datetime(df[ts_col], unit='s')

In [51]:
# name and slug from json strings
import json
# Decode every category JSON string once and reuse the parsed dicts for both
# derived columns (previously each string was parsed twice).
category_info = df['category'].apply(json.loads)

# 'name' becomes the lower-cased category name; 'slug' becomes the part of the
# category slug before the first "/".
# NOTE: this replaces the contents of the 'slug' column that came with the raw data.
df['name'] = category_info.apply(lambda cat: cat['name'].lower())
df['slug'] = category_info.apply(lambda cat: cat['slug'].split("/")[0])

In [52]:
# Columns that are not used by the rest of the notebook.
UNWANTED_COLUMNS = [
    'creator',
    'category',
    'location',
    'photo',
    'profile',
    'source_url',
    'urls',
    'permissions',
    'converted_pledged_amount',
]
df = df.drop(columns=UNWANTED_COLUMNS)

## Feature Engineering

In [56]:
# Number of words in the blurb.
# Splitting on runs of whitespace avoids counting the empty strings that
# split(' ') produces between consecutive spaces, and a missing blurb now
# counts as 0 words instead of 1 (str(NaN) used to become the word 'nan').
df['blurb_len'] = df['blurb'].fillna('').astype(str).str.split().str.len()

# One 0/1 indicator column per campaign state kept in the data.
for flag_col, state_value in (
    ('is_successful', 'successful'),
    ('is_failed', 'failed'),
    ('is_live', 'live'),
):
    df[flag_col] = (df['state'] == state_value).astype(int)

In [61]:
# Year / month / day of each timestamp column, read through the vectorised
# .dt accessor instead of a per-row Python lambda.
# Maps the prefix of the new columns to the datetime column they are taken from.
DATE_SOURCES = {
    'create': 'created_at',
    'launch': 'launched_at',
    'deadline': 'deadline',
    'state_change': 'state_changed_at',
}
for prefix, source_col in DATE_SOURCES.items():
    for part in ('year', 'month', 'day'):
        # e.g. df['create_year'] = df['created_at'].dt.year
        df[prefix + '_' + part] = getattr(df[source_col].dt, part)

Unnamed: 0,backers_count,blurb,converted_pledged_amount,country,country_displayable_name,created_at,currency,currency_symbol,currency_trailing_code,current_currency,...,create_day,launch_year,launch_month,launch_day,deadline_year,deadline_month,deadline_day,state_change_year,state_change_month,state_change_day
0,1,Un cortometraje de protesta contra la violenci...,0,MX,Mexico,2019-09-23 05:20:38,MXN,$,True,USD,...,23,2019,9,24,2019,11,23,2019,9,24
1,378,2005 sändes första avsnittet av den legendaris...,55169,SE,Sweden,2016-11-18 14:50:32,SEK,kr,True,USD,...,18,2016,11,21,2016,12,24,2016,12,24
2,314,A historical fantasy EPIC set in 15-century Af...,20555,US,the United States,2018-07-18 02:24:36,USD,$,True,USD,...,18,2018,7,24,2018,8,25,2018,8,25
3,1,"Hip Hop album titled ""Deep Off In This Game""",1,US,the United States,2019-08-07 19:36:01,USD,$,True,USD,...,7,2019,9,21,2019,11,15,2019,9,21
4,7,"A set of 7 enamel pins inspired by the best, p...",129,US,the United States,2018-11-16 01:39:40,USD,$,True,USD,...,16,2018,11,16,2018,12,1,2018,12,1


In [82]:
# Campaign length: deadline minus launch, reduced to the integer day count.
campaign_length = df['deadline'] - df['launched_at']
df['backing_days'] = campaign_length.apply(lambda delta: delta.days)

# 0/1 versions of the spotlight / staff_pick columns.
df['is_spotlight'] = (df['spotlight'] == True).astype(int)
df['is_staff_pick'] = (df['staff_pick'] == True).astype(int)

# Ratios between pledged amount, backer count and campaign length.
df['backers_per_day'] = df['backers_count'].div(df['backing_days'])
df['pledged_per_backer'] = df['usd_pledged'].div(df['backers_count'])
df['pledged_per_day'] = df['usd_pledged'].div(df['backing_days'])

df.head()

Unnamed: 0,backers_count,blurb,country,country_displayable_name,created_at,currency,currency_symbol,currency_trailing_code,current_currency,deadline,...,deadline_day,state_change_year,state_change_month,state_change_day,backing_days,is_spotlight,is_staff_pick,backers_per_day,pledged_per_backer,pledged_per_day
0,1,Un cortometraje de protesta contra la violenci...,MX,Mexico,2019-09-23 05:20:38,MXN,$,True,USD,2019-11-23 20:30:45,...,23,2019,9,24,60,0,0,0.016667,0.967743,0.016129
1,378,2005 sändes första avsnittet av den legendaris...,SE,Sweden,2016-11-18 14:50:32,SEK,kr,True,USD,2016-12-24 11:00:00,...,24,2016,12,24,32,1,0,11.8125,144.843428,1710.962999
2,314,A historical fantasy EPIC set in 15-century Af...,US,the United States,2018-07-18 02:24:36,USD,$,True,USD,2018-08-25 01:59:00,...,25,2018,8,25,31,1,0,10.129032,65.461783,663.064516
3,1,"Hip Hop album titled ""Deep Off In This Game""",US,the United States,2019-08-07 19:36:01,USD,$,True,USD,2019-11-15 20:40:00,...,15,2019,9,21,55,0,0,0.018182,1.0,0.018182
4,7,"A set of 7 enamel pins inspired by the best, p...",US,the United States,2018-11-16 01:39:40,USD,$,True,USD,2018-12-01 20:13:22,...,1,2018,12,1,15,1,0,0.466667,18.428571,8.6


In [77]:
# Save the first 100 rows as a small example file, without the row index.
example_rows = df.head(100)
example_rows.to_csv('example_df.csv', index=False)

In [84]:
# Distinct values found in the 'name' column.
set(df['name'])

{'3d printing',
 'academic',
 'accessories',
 'action',
 'animals',
 'animation',
 'anthologies',
 'apparel',
 'apps',
 'architecture',
 'art',
 'art books',
 'audio',
 'bacon',
 'blues',
 'calendars',
 'camera equipment',
 'candles',
 'ceramics',
 "children's books",
 'childrenswear',
 'chiptune',
 'civic design',
 'classical music',
 'comedy',
 'comic books',
 'comics',
 'community gardens',
 'conceptual art',
 'cookbooks',
 'country & folk',
 'couture',
 'crafts',
 'crochet',
 'dance',
 'design',
 'digital art',
 'diy',
 'diy electronics',
 'documentary',
 'drama',
 'drinks',
 'electronic music',
 'embroidery',
 'events',
 'experimental',
 'fabrication tools',
 'faith',
 'family',
 'fantasy',
 "farmer's markets",
 'farms',
 'fashion',
 'festivals',
 'fiction',
 'film & video',
 'fine art',
 'flight',
 'food trucks',
 'footwear',
 'gadgets',
 'games',
 'gaming hardware',
 'glass',
 'graphic design',
 'graphic novels',
 'hardware',
 'hip-hop',
 'horror',
 'illustration',
 'immersive',

In [67]:
# backing_days already holds integer day counts, so show the first value
# directly; calling .days on it would raise AttributeError.
df['backing_days'].iloc[0]

60