In [1]:
import pandas as pd
from datetime import datetime

In [2]:
def convert_time(df, column):
    col_lst = [datetime.fromtimestamp(i) for i in df[column]]
    df[column] = pd.Series(col_lst)

In [3]:
file = 'data/Kickstarter_2019-10-17T03_20_19_421Z.json'
df0 = pd.read_json(file, lines=True)

In [4]:
df = pd.DataFrame(list(df0.data))

In [5]:
df.columns

Index(['backers_count', 'blurb', 'category', 'converted_pledged_amount',
       'country', 'country_displayable_name', 'created_at', 'creator',
       'currency', 'currency_symbol', 'currency_trailing_code',
       'current_currency', 'deadline', 'disable_communication', 'friends',
       'fx_rate', 'goal', 'id', 'is_backing', 'is_starrable', 'is_starred',
       'launched_at', 'location', 'name', 'permissions', 'photo', 'pledged',
       'profile', 'slug', 'source_url', 'spotlight', 'staff_pick', 'state',
       'state_changed_at', 'static_usd_rate', 'urls', 'usd_pledged',
       'usd_type'],
      dtype='object')

In [6]:
df.shape

(205227, 38)

In [13]:
df.category[0]

{'id': 270,
 'name': 'Gaming Hardware',
 'slug': 'games/gaming hardware',
 'position': 1,
 'parent_id': 12,
 'color': 51627,
 'urls': {'web': {'discover': 'http://www.kickstarter.com/discover/categories/games/gaming%20hardware'}}}

In [7]:
df['category'][0]['slug']

'games/gaming hardware'

In [8]:
main_cat_lst = [i['slug'].split('/')[0] for i in df['category']]
df['main_cat'] = main_cat_lst
df.main_cat.head()

0    games
1    music
2    music
3      art
4    music
Name: main_cat, dtype: object

In [9]:
sub_cat_lst = [i['name'] for i in df.category]
df['sub_cat'] = sub_cat_lst
df.sub_cat.head()

0    Gaming Hardware
1              Metal
2               Jazz
3        Mixed Media
4               Jazz
Name: sub_cat, dtype: object

In [10]:
convert_time(df, 'created_at')
convert_time(df, 'launched_at')
convert_time(df, 'deadline')
convert_time(df, 'state_changed_at')

In [11]:
df.created_at.head()

0   2017-04-11 01:28:29
1   2013-05-23 12:23:41
2   2013-08-16 08:13:59
3   2018-07-10 13:38:49
4   2013-04-28 20:31:00
Name: created_at, dtype: datetime64[ns]

In [12]:
df.loc[:5, ['name', 'main_cat', 'pledged', 'goal', 'converted_pledged_amount', 'usd_pledged', 'currency', 'current_currency', 'usd_type', 'fx_rate', 'static_usd_rate']]

Unnamed: 0,name,main_cat,pledged,goal,converted_pledged_amount,usd_pledged,currency,current_currency,usd_type,fx_rate,static_usd_rate
0,Playable Certification (Canceled),games,24989.0,375000.0,2963,2963.98652185,SEK,USD,domestic,0.101884,0.118612
1,Deadiron's Next Album!,music,3485.0,1000.0,3485,3485.0,USD,USD,domestic,1.0,1.0
2,Lindsey Holland Rick Holland Christmas Project...,music,3575.0,3500.0,3575,3575.0,USD,USD,international,1.0,1.0
3,The Belly of the Whale is the Belly of You,art,5523.0,5500.0,5523,5523.0,USD,USD,domestic,1.0,1.0
4,"Quartet Style... Recording my jazz vocal CD ""B...",music,100.0,9000.0,100,100.0,USD,USD,international,1.0,1.0
5,GETTING BY Language: Spanish 1,publishing,50.0,15000.0,50,50.0,USD,USD,domestic,1.0,1.0


In [13]:
df[df.current_currency != 'USD'].loc[:, ['name', 'main_cat', 'pledged', 'goal', 'converted_pledged_amount', 'usd_pledged', 'currency', 'current_currency', 'usd_type', 'fx_rate', 'static_usd_rate']].head()

Unnamed: 0,name,main_cat,pledged,goal,converted_pledged_amount,usd_pledged,currency,current_currency,usd_type,fx_rate,static_usd_rate
1867,Project Space Boost,games,1.0,500.0,1,1.0,USD,CAD,,1.320395,1.0
1995,Surprise Senpai - An Adult Visual Novel/RPG/Da...,games,1611.0,2500.0,2356,1772.4819681,EUR,CAD,,1.462672,1.100237
2162,The Beast Within,games,1.0,1506.0,1,1.0,USD,CAD,,1.320395,1.0
2373,ilumee: 4K LED Projector with Ultimate Short T...,technology,35683.0,50000.0,32212,35683.0,USD,EUR,,0.902728,1.0
2688,Amazing Original Vector Logo Design and Branding,design,791.0,100.0,714,791.0,USD,EUR,,0.902728,1.0


In [14]:
df.columns

Index(['backers_count', 'blurb', 'category', 'converted_pledged_amount',
       'country', 'country_displayable_name', 'created_at', 'creator',
       'currency', 'currency_symbol', 'currency_trailing_code',
       'current_currency', 'deadline', 'disable_communication', 'friends',
       'fx_rate', 'goal', 'id', 'is_backing', 'is_starrable', 'is_starred',
       'launched_at', 'location', 'name', 'permissions', 'photo', 'pledged',
       'profile', 'slug', 'source_url', 'spotlight', 'staff_pick', 'state',
       'state_changed_at', 'static_usd_rate', 'urls', 'usd_pledged',
       'usd_type', 'main_cat', 'sub_cat'],
      dtype='object')

In [15]:
%%time
loc_name_lst = [None]*len(df)
state_lst = [None]*len(df)
for index, row in df.iterrows():
    if type(row['location']) == dict:
        loc_name_lst[index] = row['location']['name']
        state_lst[index] = row['location']['state']

Wall time: 20.4 s


In [16]:
df['loc_name'] = loc_name_lst
df['loc_state'] = state_lst

In [17]:
df.loc[:10, ['loc_name', 'state', 'country', 'country_displayable_name']]

Unnamed: 0,loc_name,state,country,country_displayable_name
0,Uppsala,canceled,SE,Sweden
1,Cleveland Heights,successful,US,the United States
2,Buffalo,successful,US,the United States
3,Manhattan,successful,US,the United States
4,Hollywood,failed,US,the United States
5,Los Angeles,failed,US,the United States
6,De Land,successful,US,the United States
7,Indianapolis,live,US,the United States
8,Grand Rapids,successful,US,the United States
9,Miami,failed,US,the United States


In [18]:
df.columns

Index(['backers_count', 'blurb', 'category', 'converted_pledged_amount',
       'country', 'country_displayable_name', 'created_at', 'creator',
       'currency', 'currency_symbol', 'currency_trailing_code',
       'current_currency', 'deadline', 'disable_communication', 'friends',
       'fx_rate', 'goal', 'id', 'is_backing', 'is_starrable', 'is_starred',
       'launched_at', 'location', 'name', 'permissions', 'photo', 'pledged',
       'profile', 'slug', 'source_url', 'spotlight', 'staff_pick', 'state',
       'state_changed_at', 'static_usd_rate', 'urls', 'usd_pledged',
       'usd_type', 'main_cat', 'sub_cat', 'loc_name', 'loc_state'],
      dtype='object')

In [19]:
df.creator[0]

{'id': 1984360892,
 'name': 'Felipe Garcia',
 'is_registered': None,
 'chosen_currency': None,
 'is_superbacker': None,
 'avatar': {'thumb': 'https://ksr-ugc.imgix.net/assets/017/344/415/2854e64bf720927babc73e4d686186cb_original.JPG?ixlib=rb-2.1.0&w=40&h=40&fit=crop&v=1498991353&auto=format&frame=1&q=92&s=b0b7954efdeb54319b1e4faa9b703dcf',
  'small': 'https://ksr-ugc.imgix.net/assets/017/344/415/2854e64bf720927babc73e4d686186cb_original.JPG?ixlib=rb-2.1.0&w=160&h=160&fit=crop&v=1498991353&auto=format&frame=1&q=92&s=b20d7bb4f7e472f70e2ba23ca346a4d7',
  'medium': 'https://ksr-ugc.imgix.net/assets/017/344/415/2854e64bf720927babc73e4d686186cb_original.JPG?ixlib=rb-2.1.0&w=160&h=160&fit=crop&v=1498991353&auto=format&frame=1&q=92&s=b20d7bb4f7e472f70e2ba23ca346a4d7'},
 'urls': {'web': {'user': 'https://www.kickstarter.com/profile/1984360892'},
  'api': {'user': 'https://api.kickstarter.com/v1/users/1984360892?signature=1571370022.6e2fbc9ffaad29f35e678667bab789d0d6859abd'}}}

In [20]:
%%time
creator_id_lst = [None]*len(df)
creator_name_lst = [None]*len(df)
for index, row in df.iterrows():
    if type(row['creator']) == dict:
        creator_id_lst[index] = row['creator']['id']
        creator_name_lst[index] = row['creator']['name']

Wall time: 19.4 s


In [21]:
df['creator_id'] = creator_id_lst
df['creator_name'] = creator_name_lst

In [22]:
df.creator_name.head()

0              Felipe Garcia
1                   Deadiron
2               Rick Holland
3    Julia "Jelly"  Morrison
4            John Eric Booth
Name: creator_name, dtype: object

In [18]:
df.blurb[3]

'Whales are washing up around the world with deadly tummy aches. The culprit? Our plastic waste.  This is not OK for me.  Is it OKFORU?'

In [28]:
df.slug[2]

'lindsey-holland-rick-holland-christmas-project-voc'

In [23]:
df['blurb_len'] = [len(i) for i in df.blurb]

In [24]:
drop_list = ['converted_pledged_amount', 'currency_trailing_code', 'current_currency', 'fx_rate', 
             'blurb', 'category', 'friends', 'is_backing', 'permissions', 'source_url',
             'urls', 'photo', 'profile', 'creator', 'location', 'usd_type']
keep_col = [i for i in df.columns if i not in drop_list]
keep_col

['backers_count',
 'country',
 'country_displayable_name',
 'created_at',
 'currency',
 'currency_symbol',
 'deadline',
 'disable_communication',
 'goal',
 'id',
 'is_starrable',
 'launched_at',
 'name',
 'pledged',
 'slug',
 'spotlight',
 'staff_pick',
 'state',
 'state_changed_at',
 'static_usd_rate',
 'usd_pledged',
 'main_cat',
 'sub_cat',
 'loc_name',
 'loc_state',
 'creator_id',
 'creator_name',
 'blurb_len']

In [25]:
df.to_json('data/cleaned_full_dataset.json')

In [26]:
df.to_csv('data/cleaned_useful_dataset.csv', header=True, columns=keep_col)

In [27]:
df2 = pd.read_csv('data/cleaned_useful_dataset.csv', index_col=0)

In [28]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 205227 entries, 0 to 205226
Data columns (total 28 columns):
backers_count               205227 non-null int64
country                     205227 non-null object
country_displayable_name    205227 non-null object
created_at                  205227 non-null object
currency                    205227 non-null object
currency_symbol             205227 non-null object
deadline                    205227 non-null object
disable_communication       205227 non-null bool
goal                        205227 non-null float64
id                          205227 non-null int64
is_starrable                205227 non-null bool
launched_at                 205227 non-null object
name                        205227 non-null object
pledged                     205227 non-null float64
slug                        205227 non-null object
spotlight                   205227 non-null bool
staff_pick                  205227 non-null bool
state                       20

In [29]:
df2.head()

Unnamed: 0,backers_count,country,country_displayable_name,created_at,currency,currency_symbol,deadline,disable_communication,goal,id,...,state_changed_at,static_usd_rate,usd_pledged,main_cat,sub_cat,loc_name,loc_state,creator_id,creator_name,blurb_len
0,31,SE,Sweden,2017-04-11 01:28:29,SEK,kr,2017-12-24 05:00:00,False,375000.0,1520616480,...,2017-12-19 08:47:25,0.118612,2963.986522,games,Gaming Hardware,Uppsala,Uppsala,1984360892,Felipe Garcia,129
1,39,US,the United States,2013-05-23 12:23:41,USD,$,2013-08-31 20:59:00,False,1000.0,1580890627,...,2013-08-31 20:59:02,1.0,3485.0,music,Metal,Cleveland Heights,OH,1340525642,Deadiron,130
2,42,US,the United States,2013-08-16 08:13:59,USD,$,2013-10-22 19:58:57,False,3500.0,1826214726,...,2013-10-22 19:58:58,1.0,3575.0,music,Jazz,Buffalo,NY,1679924770,Rick Holland,59
3,21,US,the United States,2018-07-10 13:38:49,USD,$,2018-09-03 14:00:00,False,5500.0,1314966709,...,2018-09-03 14:00:00,1.0,5523.0,art,Mixed Media,Manhattan,NY,673151337,"Julia ""Jelly"" Morrison",134
4,1,US,the United States,2013-04-28 20:31:00,USD,$,2013-07-05 14:43:05,False,9000.0,1694208217,...,2013-07-05 14:43:07,1.0,100.0,music,Jazz,Hollywood,CA,2018121664,John Eric Booth,121


In [29]:
df2.loc['n'] = 1

NameError: name 'df2' is not defined

In [31]:
df2.slug.head()

0                               playable-certification
1                                 deadirons-next-album
2    lindsey-holland-rick-holland-christmas-project...
3           the-belly-of-the-whale-is-the-belly-of-you
4    quartet-style-recording-my-jazz-vocal-cd-back-...
Name: slug, dtype: object