In [1]:
import requests
from bs4 import BeautifulSoup
import io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from zipfile import ZipFile
import json
%matplotlib inline

In [2]:
# Load the zipped Kickstarter CSV extract into a DataFrame.
# NOTE(review): the trailing notebook output looks like the tail of a pandas
# DtypeWarning (mixed-type columns) -- consider passing explicit dtypes.
df = pd.read_csv('../data/kickstarter_sub.zip')

# Discard the 'Unnamed: 0' column (presumably a row index written out by an
# earlier to_csv call -- TODO confirm).
df.drop('Unnamed: 0', axis=1, inplace=True)

# Preview the first rows.
df.head()

def handle_category(category):
    """Unpack one JSON-encoded ``category`` cell into flat fields.

    Parameters
    ----------
    category : str
        JSON text of an object that has ``id`` and ``slug`` keys. The
        ``urls``, ``parent_id`` and ``position`` keys are optional.

    Returns
    -------
    tuple
        ``(id, category, sub_category, discover_url, parent_id, position)``.
        ``category`` and ``sub_category`` are the first and second
        ``/``-separated pieces of ``slug``; ``sub_category`` is ``None`` when
        the slug has a single piece, and both are ``None`` when the slug is
        null. Optional keys that are absent come back as ``None``.

    Raises
    ------
    KeyError
        If ``id`` or ``slug`` is missing, or ``urls`` is present without the
        nested ``web`` / ``discover`` keys.
    """
    parsed = json.loads(category)
    category_id = parsed['id']

    slug = parsed['slug']
    if slug is None:
        category_val, sub_category = None, None
    else:
        pieces = slug.split('/')
        category_val = pieces[0]
        sub_category = pieces[1] if len(pieces) > 1 else None

    # Only the presence of 'urls' is checked; the nested keys are required.
    if 'urls' in parsed:
        sub_category_url = parsed['urls']['web']['discover']
    else:
        sub_category_url = None

    return (category_id, category_val, sub_category, sub_category_url,
            parsed.get('parent_id'), parsed.get('position'))


# Expand the JSON ``category`` column into six flat columns, one per value
# returned by handle_category.
(
    df['category_id'],
    df['category_val'],
    df['sub_category'],
    df['sub_category_url'],
    df['category_parent_id'],
    df['category_position'],
) = zip(*df['category'].apply(handle_category))

# Remove the raw JSON column, then let the parsed top-level category take
# over the ``category`` name.
del df['category']
df.rename(columns={'category_val': 'category'}, inplace=True)

def handle_creator(string):
    """Slice the flat creator fields out of one raw ``creator`` cell.

    Each value is located by finding its quoted key and the quoted key that
    follows it, then slicing between the two with fixed offsets. The offsets
    assume compact JSON text (no spaces around ``:`` or ``,``) with the keys
    in the order ``id``, ``name``, ``is_registered``, ``chosen_currency``,
    ``is_superbacker``, ``avatar``, ``urls`` -> ``web`` -> ``user``, ``api``
    (TODO confirm against the data).

    The closing key of every slice is searched for in its quoted form and only
    past the start of the value being sliced, so ordinary text inside an
    earlier value (for example a creator name containing "api" or "avatar")
    cannot be mistaken for a key.

    NOTE(review): the cell is sliced as text rather than parsed with
    ``json.loads``, presumably because some cells are not valid JSON --
    confirm before switching to a parser.

    Parameters
    ----------
    string : str
        Raw text of one ``creator`` cell.

    Returns
    -------
    tuple
        ``(id_, name_, is_registered, chosen_currency, is_superbacker,
        profile_url)``. ``id_`` is an ``int``; the other items are the raw
        substrings exactly as they appear in the text, so quoted string
        values keep their surrounding double quotes. Any value whose text is
        ``null`` (case-insensitive) is returned as ``None``.

    Raises
    ------
    ValueError
        If the text sliced out for ``id`` is neither ``null`` nor an integer.
    """

    def handle_null(val):
        return None if str(val).lower() == 'null' else val

    def slice_value(key, skip, next_key, trim):
        # `skip` jumps from the key's opening quote to the first character of
        # its value; `trim` backs off from the next key's opening quote over
        # the separator(s) that follow the value.
        start = string.find(key) + skip
        stop = string.find(next_key, start) - trim
        return string[start:stop]

    raw_id = slice_value('"id"', 5, '"name"', 1)
    name_ = slice_value('"name"', 7, '"is_registered"', 1)
    is_registered = slice_value('"is_registered"', 16, '"chosen_currency"', 1)
    chosen_currency = slice_value('"chosen_currency"', 18, '"is_superbacker"', 1)
    is_superbacker = slice_value('"is_superbacker"', 17, '"avatar"', 1)
    # 22 == len('"urls":{"web":{"user":'); trim 2 steps back over '},'.
    profile_url = slice_value('"urls"', 22, '"api"', 2)

    # Test for null *before* converting, otherwise int('null') raises and the
    # null handling never gets a chance to run for the id.
    id_ = None if handle_null(raw_id.strip()) is None else int(raw_id)

    name_, is_registered, chosen_currency, is_superbacker, profile_url = map(
        handle_null,
        [name_, is_registered, chosen_currency, is_superbacker, profile_url],
    )

    return id_, name_, is_registered, chosen_currency, is_superbacker, profile_url
    

# Expand the raw ``creator`` column into six flat columns, one per value
# returned by handle_creator.
(
    df['creator_id'],
    df['creator_name'],
    df['is_creator_registered'],
    df['creator_chosen_currency'],
    df['is_creator_superbacker'],
    df['creator_profile_url'],
) = zip(*df['creator'].apply(handle_creator))

# The raw column is no longer needed once its fields are extracted.
df.drop('creator', axis=1, inplace=True)

def handle_location(location):
    """Unpack one JSON-encoded ``location`` cell into flat fields.

    Parameters
    ----------
    location : str or null
        JSON text of an object with ``id``, ``name``, ``type``, ``state`` and
        ``is_root`` keys, or a missing value (anything ``pd.isnull`` accepts).

    Returns
    -------
    tuple
        ``(id, name, type, state, is_root)`` read from the parsed object, or
        five ``np.nan`` values when the cell is missing.

    Raises
    ------
    KeyError
        If any of the five keys is absent from the parsed object.
    """
    if pd.isnull(location):
        return (np.nan,) * 5

    record = json.loads(location)
    wanted_keys = ('id', 'name', 'type', 'state', 'is_root')
    return tuple(record[key] for key in wanted_keys)
    

# Expand the JSON ``location`` column into five flat columns, one per value
# returned by handle_location.
(
    df['location_id'],
    df['location_name'],
    df['location_type'],
    df['location_state'],
    df['is_location_root'],
) = zip(*df['location'].apply(handle_location))

# The raw JSON column is no longer needed.
del df['location']

# drop photo
del df['photo']

def handle_url(url):
    """Pull the project and rewards links out of one JSON ``urls`` cell.

    Parameters
    ----------
    url : str or null
        JSON text of an object with a ``web`` mapping, or a missing value
        (anything ``pd.isnull`` accepts).

    Returns
    -------
    tuple
        ``(project_url, rewards_url)`` taken from the ``web`` mapping, with
        ``None`` for whichever of the two keys is absent; two ``np.nan``
        values when the cell is missing.

    Raises
    ------
    KeyError
        If the parsed object has no ``web`` key.
    """
    if pd.isnull(url):
        return (np.nan, np.nan)

    web_links = json.loads(url)['web']
    return tuple(web_links.get(kind) for kind in ('project', 'rewards'))

# Split the JSON ``urls`` column into its project and rewards links.
(
    df['project_url'],
    df['rewards_url'],
) = zip(*df['urls'].apply(handle_url))

# The raw JSON column is no longer needed.
del df['urls']

def handle_profile(string):
    """Slice the flat profile fields out of one raw ``profile`` cell.

    Each value is taken from between its quoted key and the quoted key that
    follows it, using fixed offsets. The offsets assume compact JSON text (no
    spaces around ``:`` or ``,``) with ``id`` directly followed by
    ``project_id``, then ``state`` followed by ``state_changed_at``, and
    ``name`` followed by ``blurb`` (TODO confirm against the data).

    NOTE(review): the cell is sliced as text rather than parsed with
    ``json.loads``, presumably because some cells are not valid JSON --
    confirm before switching to a parser.

    Parameters
    ----------
    string : str or null
        Raw text of one ``profile`` cell, or a missing value (anything
        ``pd.isnull`` accepts).

    Returns
    -------
    tuple
        ``(id_, project_id, state_, name_)``. ``id_`` and ``project_id`` are
        ``int``; ``state_`` and ``name_`` are the raw substrings exactly as
        they appear in the text, so quoted string values keep their
        surrounding double quotes. Any value whose text is ``null``
        (case-insensitive) is returned as ``np.nan``. A missing cell yields
        four ``np.nan`` values.

    Raises
    ------
    ValueError
        If the text sliced out for ``id`` or ``project_id`` is neither
        ``null`` nor an integer.
    """
    if pd.isnull(string):
        return np.nan, np.nan, np.nan, np.nan

    def handle_null(val):
        return np.nan if str(val).lower() == 'null' else val

    def parse_int(raw):
        # Test for null *before* converting: int('null') would raise, which
        # made the null handling unreachable for the two id fields.
        return np.nan if raw.strip().lower() == 'null' else int(raw)

    id_ = parse_int(
        string[string.find('"id"')+5:string.find('"project_id"')-1])
    project_id = parse_int(
        string[string.find('"project_id"')+13:string.find('"state"')-1])
    state_ = handle_null(
        string[string.find('"state"')+8:string.find('"state_changed_at"')-1])
    name_ = handle_null(
        string[string.find('"name"')+7:string.find('"blurb"')-1])

    return id_, project_id, state_, name_

# Expand the raw ``profile`` column into four flat columns, one per value
# returned by handle_profile.
(
    df['profile_id'],
    df['project_id'],
    df['profile_state'],
    df['profile_name'],
) = zip(*df['profile'].apply(handle_profile))

# The raw column is no longer needed once its fields are extracted.
del df['profile']

# Remove the displayable country name column.
del df['country_displayable_name']

# Remove further columns that are not kept in the cleaned output; three of
# them are creator fields that were split out of the ``creator`` column.
df.drop(
    columns=[
        'friends',
        'is_backing',
        'permissions',
        'is_creator_registered',
        'creator_chosen_currency',
        'is_creator_superbacker',
        'is_starred',
    ],
    inplace=True,
)

# Interpret the four timestamp columns as seconds since the Unix epoch and
# convert them to pandas datetimes.
for timestamp_column in ('state_changed_at', 'created_at',
                         'launched_at', 'deadline'):
    df[timestamp_column] = pd.to_datetime(df[timestamp_column], unit='s')

## profile_name adds no value, so it is removed
del df['profile_name']


# Rewrite every missing entry (whatever pandas' isnull() flags) as np.nan,
# one column at a time.
for column_name in df.columns.tolist():
    missing_rows = df[column_name].isnull()
    df.loc[missing_rows, column_name] = np.nan

# Write the cleaned frame out as a zip-compressed CSV (the index is included).
df.to_csv('../data/cleaned_data.zip', compression='zip')

  interactivity=interactivity, compiler=compiler, result=result)
