In [1]:
import numpy as np
import json
import pandas as pd
import simplejson as json
import glob
import os
from pandas.io.json import json_normalize


In [2]:
# Builds the list of raw input files to read.
# NOTE(review): hard-coded absolute local path; adjust before running elsewhere.
path = 'C:/Users/Tom/Documents/Kickstarter/JSONs/'

# os.listdir returns names in arbitrary order; sorting keeps the numbering of
# the CSVs exported later stable from run to run.
list_files = sorted(os.listdir(path))

# os.path.join avoids the doubled separator that path + '/' + name produced
# (path already ends with '/').
files = [os.path.join(path, name) for name in list_files]


In [None]:
# Reads every raw file in chunks, flattens the nested JSON held in the 'data'
# column, keeps only the columns needed downstream, and writes one CSV per
# input file (file0.csv, file1.csv, ...) to the working directory before data
# cleaning. Can take about 15 minutes to run.
keep_columns = ['id', 'backers_count', 'blurb', 'category.name', 'category.slug',
                'country', 'currency', 'goal', 'launched_at', 'deadline',
                'location.displayable_name', 'location.country', 'location.state',
                'location.type', 'name', 'usd_pledged', 'slug', 'spotlight',
                'staff_pick', 'static_usd_rate', 'state']

for filenum, file in enumerate(files):
    reader = pd.read_json(file, lines=True, chunksize=4096)

    # Each chunk is normalized as soon as it is read, so the raw chunks are
    # never all held in memory at once.
    # reindex (instead of [[...]]) yields NaN for a column that a particular
    # chunk happens not to contain, rather than raising KeyError part-way
    # through a long run; later cells already fill missing location values.
    normalized = [json_normalize(chunk['data']).reindex(columns=keep_columns)
                  for chunk in reader]

    master = pd.concat(normalized, sort=False)
    master.to_csv('file' + str(filenum) + '.csv')


In [None]:
# The prior cell exported one CSV per input file (38 at the time of writing)
# into the working directory. This step reads them back, stacks them into a
# single DataFrame and removes duplicate rows.

# The exports are named file<N>.csv and live in the working directory, not in
# `path` (which holds the raw JSON files), so they are located with a glob.
list_csvs = sorted(glob.glob('file*.csv'))
if not list_csvs:
    raise FileNotFoundError(
        'No file*.csv exports found in the working directory; '
        'run the export cell first.'
    )

# index_col=0 restores the index column written by to_csv in the export cell.
frames = [pd.read_csv(csv_file, index_col=0) for csv_file in list_csvs]

# drop_duplicates compares column values only; the index is ignored.
master = pd.concat(frames).drop_duplicates()

In [None]:
# The rest of this code cleans the data in the CSV.
import datetime  # kept so the name stays available to the rest of the notebook

# Convert the Unix-epoch second counts to naive UTC datetimes in one vectorized
# call. Unlike a per-row datetime.utcfromtimestamp (deprecated since Python
# 3.12), this maps missing values to NaT and can be re-run on columns that are
# already datetimes.
for column in ['launched_at', 'deadline']:
    master[column] = pd.to_datetime(master[column], unit='s')


In [None]:
# Calendar components of the launch time.
# The .dt accessor is used rather than datetime.date.timetuple(x)[i]:
# date.timetuple always reports hours/minutes/seconds as 0, which made the
# 'hour' column 0 for every row.
launched = master['launched_at']
master['year'] = launched.dt.year
master['month'] = launched.dt.month
master['day'] = launched.dt.day
master['hour'] = launched.dt.hour

In [None]:
# Campaign duration in days, formatted with zero decimal places and stored as
# text (e.g. '30').
campaign_span = master['deadline'] - master['launched_at']
master['days_to_deadline'] = (campaign_span / np.timedelta64(1, 'D')).map('{:.0f}'.format)

In [None]:
# Feature engineering: express each goal in a constant currency by multiplying
# it by the row's static USD rate. No rounding is applied here.
master['goal_USD'] = master['goal'].mul(master['static_usd_rate'])

In [None]:
# Replace the dotted category columns with underscore names so they can be
# reached with . notation later. category_slug keeps only the text before the
# first '/'.
master = master.assign(
    category_name=master['category.name'],
    category_slug=master['category.slug'].map(lambda slug: slug.partition('/')[0]),
).drop(columns=['category.name', 'category.slug'])

In [None]:
# Changing object data types to str.
# The converted values must be assigned back: apply/astype return a new Series
# and leave the DataFrame untouched.
text_columns = ['id', 'blurb', 'country', 'currency', 'location.displayable_name',
                'location.country', 'location.state', 'location.type', 'name',
                'slug', 'state', 'category_name', 'category_slug']

for column in text_columns:
    values = master[column]
    # Missing entries stay NaN (instead of becoming the text 'nan') so the
    # fillna calls further down still recognise them.
    master[column] = values.astype(object).where(values.isna(), values.astype(str))

In [None]:
# Adding blurb length to model
def word_count(string):
    """Return how many whitespace-separated tokens ``string`` contains."""
    return len(string.split())

# A missing blurb counts as 0 words. Without the fillna, str(NaN) is the text
# 'nan' and would be counted as one word.
master['blurb_length'] = master['blurb'].fillna('').map(lambda x: word_count(str(x)))

In [None]:
# Checking for missing values
#master.isnull().sum()

In [None]:
# Cleans location names: copies the dotted location columns to underscore names
# (so . notation works later) and fills their gaps; missing blurbs become '-'.
# A missing location country falls back to the project's 'country' value.
# NOTE(review): 'location.country' is not in the drop list, so that dotted
# column stays in the frame alongside 'location_country' - confirm intended.
master = master.assign(
    location_type=master['location.type'].fillna('Miscellaneous'),
    location_country=master['location.country'].fillna(master['country']),
    location_state=master['location.state'].fillna('-'),
    blurb=master['blurb'].fillna('-'),
    location_displayable_name=master['location.displayable_name'].fillna('-'),
).drop(columns=['country', 'location.type', 'location.state', 'location.displayable_name'])

In [None]:
# Changes projects to a straight success / fail
def binary_state(x):
    """Classify one project row as 'successful' or 'failed'.

    ``x`` must expose ``usd_pledged`` and ``goal_USD`` attributes. Both are
    rounded to whole numbers before being compared; the project is
    'successful' when the rounded pledge reaches the rounded goal.
    """
    pledged = round(x.usd_pledged)
    target = round(x.goal_USD)
    return 'successful' if pledged >= target else 'failed'
# Label every row by passing it to binary_state (axis=1 hands over one row at a time).
master['binary_state'] = master.apply(binary_state, axis=1)

In [None]:
# At the end, the cleaned frame is written to a local file in the working
# directory. Loaded as kickstarter.csv
output_csv = 'kickstarter.csv'
master.to_csv(output_csv)