In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

%matplotlib inline
sns.set;

### Initial Scraping Data of the KickStarter

In [110]:
colnames=['Project_Name', ' Inventor', 'Number_of_Backer', 
          'Total_Pledged', 'Goal', 'Location', 'Category', 
          'Number_of_Pledge_Options', 'Pledge_Detail', 'Status','useless'] 
Ks = pd.read_csv('KickStarter_data_utf8.csv', header=None, names=colnames)
Ks.reset_index(inplace = True)

In [111]:
Ks = Ks[colnames].drop(['useless'], axis = 1)

In [112]:
Ks.head(2)

Unnamed: 0,Project_Name,Inventor,Number_of_Backer,Total_Pledged,Goal,Location,Category,Number_of_Pledge_Options,Pledge_Detail,Status
0,This is not a Kickstarter shirt,Yancey,532.0,8554.0,500,New York-NY,Fashion,2,"('413 backers', 'US$ 15'), ('113 backers', 'US...",success
1,Family Secrets. A new doc style cooking show,Janet,117.0,5465.0,5000,New York-NY,Food,7,"('10 backers', '$10'), ('42 backers', '$20'), ...",success


### Rescraped Faillinks from the Initial Scraped

In [113]:
Ks2 = pd.read_csv('KickStarter_Faillinks_Rescrape.csv', header = None, names = colnames)

In [114]:
Ks2 = Ks2[colnames].drop(['useless'], axis = 1)

In [115]:
Ks2.head(2)

Unnamed: 0,Project_Name,Inventor,Number_of_Backer,Total_Pledged,Goal,Location,Category,Number_of_Pledge_Options,Pledge_Detail,Status
0,New & original design ~ London bus full size s...,Adian,2.0,6,69900.0,Shropshire-UK,Product Design,5,"('0 backers', '£1'), ('0 backers', '£20'), ('0...",live
1,The Oak Tree,Richard,2.0,750,16000.0,Marbella-Spain,Restaurants,4,"('0 backers', '€500'), ('1 backer', '€500'), (...",live


### Concat two datasets

In [116]:
frame = [Ks, Ks2]
df = pd.concat(frame)

In [117]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34655 entries, 0 to 3795
Data columns (total 10 columns):
Project_Name                34655 non-null object
 Inventor                   33061 non-null object
Number_of_Backer            34644 non-null float64
Total_Pledged               34655 non-null object
Goal                        34655 non-null object
Location                    34655 non-null object
Category                    34655 non-null object
Number_of_Pledge_Options    34655 non-null object
Pledge_Detail               34638 non-null object
Status                      34655 non-null object
dtypes: float64(1), object(9)
memory usage: 2.9+ MB


In [118]:
df = df.drop_duplicates(keep=False)

In [119]:
df.info()
#looks like there was 117 duplicate projects, made a decision to drop them.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34538 entries, 0 to 3795
Data columns (total 10 columns):
Project_Name                34538 non-null object
 Inventor                   32948 non-null object
Number_of_Backer            34527 non-null float64
Total_Pledged               34538 non-null object
Goal                        34538 non-null object
Location                    34538 non-null object
Category                    34538 non-null object
Number_of_Pledge_Options    34538 non-null object
Pledge_Detail               34521 non-null object
Status                      34538 non-null object
dtypes: float64(1), object(9)
memory usage: 2.9+ MB


In [120]:
df = df.dropna()

In [121]:
df.info()
#also going to drop all nan rows

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32920 entries, 0 to 3795
Data columns (total 10 columns):
Project_Name                32920 non-null object
 Inventor                   32920 non-null object
Number_of_Backer            32920 non-null float64
Total_Pledged               32920 non-null object
Goal                        32920 non-null object
Location                    32920 non-null object
Category                    32920 non-null object
Number_of_Pledge_Options    32920 non-null object
Pledge_Detail               32920 non-null object
Status                      32920 non-null object
dtypes: float64(1), object(9)
memory usage: 2.8+ MB


In [144]:
#df.loc[df.Total_Pledged == '¥1043200'] = df.loc[df.Total_Pledged == '¥1043200'].Total_Pledged.tolist()[0].lstrip('¥')

In [146]:
#all columns were scrapped and wrap into a string in order to avoid errors
#now we need to change some of the columns into numbers so that we can use for fitting.
df[['Number_of_Backer', 
     'Total_Pledged', 
     'Goal', 
     'Number_of_Pledge_Options']] = df[['Number_of_Backer', 
                                        'Total_Pledged', 
                                        'Goal', 
                                        'Number_of_Pledge_Options']].astype(float).astype(int)

Unnamed: 0,Project_Name,Inventor,Number_of_Backer,Total_Pledged,Goal,Location,Category,Number_of_Pledge_Options,Pledge_Detail,Status
0,This is not a Kickstarter shirt,Yancey,532,8554,500,New York-NY,Fashion,2,"('413 backers', 'US$ 15'), ('113 backers', 'US...",success
1,Family Secrets. A new doc style cooking show,Janet,117,5465,5000,New York-NY,Food,7,"('10 backers', '$10'), ('42 backers', '$20'), ...",success
3,"Shire Suds waste-free, handmade soap",Sarah,11,620,200,Old Town-ME,Crafts,4,"('4 backers', '$10'), ('2 backers', '$25'), ('...",success
4,The Let's Make a Bunch of Logos Project Part 3,Brandon,475,26243,25,Las Vegas-NV,Graphic Design,4,"('274 backers', '$25'), ('126 backers', '$50')...",success
5,9000 Wolves Short Film Project,Jamie,12,558,300,Reno-NV,Shorts,6,"('2 backers', '$1'), ('0 backers', '$10'), ('6...",success
6,Judas Kiss: A new gay film about second chance...,J.T.,85,10075,5000,Seattle-WA,Film & Video,11,"('6 backers', 'US$ 5'), ('12 backers', 'US$ 20...",success
8,The Urban Bounty: Growing Your Own Fresh Produ...,Green,149,8225,7500,Somerville-MA,Food,9,"('15 backers', '$15'), ('13 backers', '$25'), ...",success
9,Watch The Gap Fund Raiser for Dancers and Coll...,Ellen,80,2636,2500,Austin-TX,Dance,6,"('23 backers', 'US$ 1'), ('27 backers', 'US$ 2...",success
10,(ab)Normality,Sixth,53,2693,2600,Everett-WA,Performances,8,"('3 backers', 'US$ 10'), ('4 backers', 'US$ 15...",success
11,TEDDY-1,joko,283,11868,8000,San Francisco-CA,Comics,9,"('14 backers', '$5'), ('178 backers', '$30'), ...",success


In [147]:
df.to_csv('KickStarter_Luther.csv', index = False)