# Kickstarter

What will make your project at Kickstarter successful?

## Import stuff

In [681]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns

plt.style.use('fivethirtyeight')

%matplotlib inline

## Load the data

In [682]:
data = pd.read_csv('./DSI_kickstarterscrape_dataset.csv')

## Look at the data

### Take an initial look at the data and especially the columns

In [683]:
data.head(3)

Unnamed: 0,project id,name,url,category,subcategory,location,status,goal,pledged,funded percentage,backers,funded date,levels,reward levels,updates,comments,duration
0,39409,WHILE THE TREES SLEEP,http://www.kickstarter.com/projects/emiliesaba...,Film & Video,Short Film,"Columbia, MO",successful,10500.0,11545.0,1.099524,66,"Fri, 19 Aug 2011 19:28:17 -0000",7,"$25,$50,$100,$250,$500,$1,000,$2,500",10,2,30.0
1,126581,Educational Online Trading Card Game,http://www.kickstarter.com/projects/972789543/...,Games,Board & Card Games,"Maplewood, NJ",failed,4000.0,20.0,0.005,2,"Mon, 02 Aug 2010 03:59:00 -0000",5,"$1,$5,$10,$25,$50",6,0,47.18
2,138119,STRUM,http://www.kickstarter.com/projects/185476022/...,Film & Video,Animation,"Los Angeles, CA",live,20000.0,56.0,0.0028,3,"Fri, 08 Jun 2012 00:00:31 -0000",10,"$1,$10,$25,$40,$50,$100,$250,$1,000,$1,337,$9,001",1,0,28.0


### How many rows and columns

In [684]:
data.shape

(45957, 17)

### Get more info about the data

In [685]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45957 entries, 0 to 45956
Data columns (total 17 columns):
project id           45957 non-null int64
name                 45957 non-null object
url                  45957 non-null object
category             45957 non-null object
subcategory          45957 non-null object
location             44635 non-null object
status               45957 non-null object
goal                 45957 non-null float64
pledged              45945 non-null float64
funded percentage    45957 non-null float64
backers              45957 non-null int64
funded date          45957 non-null object
levels               45957 non-null int64
reward levels        45898 non-null object
updates              45957 non-null int64
comments             45957 non-null int64
duration             45957 non-null float64
dtypes: float64(4), int64(5), object(8)
memory usage: 6.0+ MB


### Get more info about missing data

In [686]:
data.isnull().sum()

project id              0
name                    0
url                     0
category                0
subcategory             0
location             1322
status                  0
goal                    0
pledged                12
funded percentage       0
backers                 0
funded date             0
levels                  0
reward levels          59
updates                 0
comments                0
duration                0
dtype: int64

### Look for duplicates

In [687]:
data.duplicated().sum()

89

### Get statistical info about the data 
(might need to be redone after other things are fixed with the data). <BR />Here we can also see signs of outliers...will check this later.

In [688]:
data.describe()

Unnamed: 0,project id,goal,pledged,funded percentage,backers,levels,updates,comments,duration
count,45957.0,45957.0,45945.0,45957.0,45957.0,45957.0,45957.0,45957.0,45957.0
mean,1080800000.0,11942.71,4980.75,1.850129,69.973192,8.004939,4.08508,8.379529,39.995547
std,621805700.0,188758.3,56741.62,88.492706,688.628479,4.233907,6.43922,174.015737,17.414458
min,39409.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,543896200.0,1800.0,196.0,0.044,5.0,5.0,0.0,0.0,30.0
50%,1078345000.0,4000.0,1310.0,1.0,23.0,7.0,2.0,0.0,32.0
75%,1621596000.0,9862.0,4165.0,1.11564,59.0,10.0,6.0,3.0,48.39
max,2147460000.0,21474840.0,10266840.0,15066.0,87142.0,80.0,149.0,19311.0,91.96


## Work the data (EDA/Munging)

### Duplicates

#### Drop the 89 duplicated rows

In [689]:
data = data.drop_duplicates(subset=None, keep='first')

### Missing values

#### Drop rows with missing values in pledge column or reward levels column

In [690]:
data = data.dropna(axis=0, how='any', thresh=None, subset=['pledged', 'reward levels'], inplace=False)

#### Replace missing value in location column with X, X (city, state)

In [691]:
data = data.fillna('X, X')

### Check for outliers

In [692]:
data.describe()

Unnamed: 0,project id,goal,pledged,funded percentage,backers,levels,updates,comments,duration
count,45797.0,45797.0,45797.0,45797.0,45797.0,45797.0,45797.0,45797.0,45797.0
mean,1078608000.0,11935.5,4989.628,1.629066,69.556783,8.011988,4.086578,8.180055,39.980255
std,620674700.0,189065.9,56832.55,75.324487,688.658472,4.223183,6.441296,171.194596,17.402192
min,39409.0,0.5,0.0,0.0,0.0,1.0,0.0,0.0,1.0
25%,542478400.0,1800.0,200.0,0.044,5.0,5.0,0.0,0.0,30.0
50%,1076062000.0,4000.0,1315.0,1.0,23.0,7.0,2.0,0.0,31.98
75%,1618457000.0,9900.0,4169.0,1.115385,60.0,10.0,6.0,3.0,48.34
max,2147460000.0,21474840.0,10266840.0,15066.0,87142.0,80.0,149.0,19311.0,91.96


### Name column: Turn all letters to lower case

In [693]:
data2['name'] = data2['name'].str.lower()
data2.head(3)

Unnamed: 0,project id,name,url,category,subcategory,status,goal,pledged,funded percentage,backers,...,month_nr,date,city,state,canceled,failed,live,successful,suspended,global
0,39409,while the trees sleep,http://www.kickstarter.com/projects/emiliesaba...,Film_Video,Short Film,successful,10500.0,11545.0,1.099524,66,...,8,2011-08-19,Columbia,MO,0,0,0,1,0,US
1,126581,educational online trading card game,http://www.kickstarter.com/projects/972789543/...,Games,Board_Card Games,failed,4000.0,20.0,0.005,2,...,8,2010-08-02,Maplewood,NJ,0,1,0,0,0,US
2,138119,strum,http://www.kickstarter.com/projects/185476022/...,Film_Video,Animation,live,20000.0,56.0,0.0028,3,...,6,2012-06-08,Los Angeles,CA,0,0,1,0,0,US


### Funded date column: Split column into different columns

In [694]:
data['funded date'].head()

0    Fri, 19 Aug 2011 19:28:17 -0000
1    Mon, 02 Aug 2010 03:59:00 -0000
2    Fri, 08 Jun 2012 00:00:31 -0000
3    Sun, 08 Apr 2012 02:14:00 -0000
4    Wed, 01 Jun 2011 15:25:39 -0000
Name: funded date, dtype: object

### Work the date/time column

#### Make new columns (weekday, nr in mothn, month, year, time

In [695]:
new_date = data['funded date'].str.split(", ", n=1, expand=True)
data["weekday"]= new_date[0] 
data["date"]= new_date[1] 

In [696]:
new_date2 = data['date'].str.split(" ", n=1, expand=True)
data["nr_in_month"]= new_date2[0] 
data["date2"]= new_date2[1] 

In [697]:
new_date3 = data['date2'].str.split(" ", n=1, expand=True)
data["month"]= new_date3[0] 
data["date3"]= new_date3[1] 

In [698]:
new_date4 = data['date3'].str.split(" ", n=1, expand=True)
data["year"]= new_date4[0] 
data["time"]= new_date4[1] 

In [699]:
# remove columns not needed
data = data.drop(['date', 'date2', 'date3'], axis=1)

#### Make column for number of day in week (1 = monday, 2 = tuesday etc.)

In [700]:
data['weekday_nr'] = data['weekday'].replace({'Mon': 1, 'Tue': 2, 'Wed': 3, 'Thu': 4, 'Fri': 5, 'Sat': 6, 'Sun':7})

#### Make month column to number (1 = jan, 2 = feb etc.)

In [701]:
data['month_nr'] = data['month'].replace({'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 
                                          'Jun': '06', 'Jul':'07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 
                                          'Nov': '11', 'Dec': '12'})

#### Combind day_nr, month_nr and year to new date column

In [702]:
data['date'] = data['month_nr'] + '/' + data['nr_in_month'] + '/' + data['year']

data.head(3)

Unnamed: 0,project id,name,url,category,subcategory,location,status,goal,pledged,funded percentage,...,comments,duration,weekday,nr_in_month,month,year,time,weekday_nr,month_nr,date
0,39409,WHILE THE TREES SLEEP,http://www.kickstarter.com/projects/emiliesaba...,Film & Video,Short Film,"Columbia, MO",successful,10500.0,11545.0,1.099524,...,2,30.0,Fri,19,Aug,2011,19:28:17 -0000,5,8,08/19/2011
1,126581,Educational Online Trading Card Game,http://www.kickstarter.com/projects/972789543/...,Games,Board & Card Games,"Maplewood, NJ",failed,4000.0,20.0,0.005,...,0,47.18,Mon,2,Aug,2010,03:59:00 -0000,1,8,08/02/2010
2,138119,STRUM,http://www.kickstarter.com/projects/185476022/...,Film & Video,Animation,"Los Angeles, CA",live,20000.0,56.0,0.0028,...,0,28.0,Fri,8,Jun,2012,00:00:31 -0000,5,6,06/08/2012


#### Make date time format out of date column

In [703]:
data['date'] = pd.to_datetime(data['date'], format='%m/%d/%Y')

In [704]:
# turn to right format
data['nr_in_month'] = pd.to_numeric(data['nr_in_month'])
data['year'] = pd.to_numeric(data['year'])
data['month_nr'] = pd.to_numeric(data['month_nr'])

### Reward level column

In [705]:
# remove dollar signs from reward levels
data['reward levels'] = data['reward levels'].str.replace('$', '', regex=True)

In [706]:
# NOTICE! need to split into differnt cells if I'm using

### Location column

#### Split location column into 2 new columns (city and state) ...state can also be country

In [707]:
new_location = data['location'].str.split(", ", n=1, expand=True)
data["city"]= new_location[0] 
data["state"]= new_location[1] 

In [708]:
# remove column not needed
data = data.drop(['location'], axis=1)

### Status column

#### Look at project status on a aggregated level

In [709]:
data.groupby('status').size()

status
canceled         59
failed        18922
live           3904
successful    22908
suspended         4
dtype: int64

#### Look at project status on state lavel

In [710]:
data.groupby(['state', 'status']).size()

state                 status    
AK                    canceled        1
                      failed         54
                      live           13
                      successful     41
AL                    failed         88
                      live           19
                      successful     59
AR                    failed         54
                      live           13
                      successful     38
AZ                    canceled        3
                      failed        319
                      live           78
                      successful    246
Afghanistan           failed          5
                      live            2
                      successful      9
Argent                successful      4
Argentina             failed          7
                      live            3
                      successful     18
Armenia               failed          1
                      live            1
                      successful      3
Austral

#### Make dummies out of staus column

In [711]:
status_dummies = pd.get_dummies(data['status'])

In [712]:
# add dummies column to df
data2 = pd.concat([data, status_dummies], axis=1)
data2.head(3)

Unnamed: 0,project id,name,url,category,subcategory,status,goal,pledged,funded percentage,backers,...,weekday_nr,month_nr,date,city,state,canceled,failed,live,successful,suspended
0,39409,WHILE THE TREES SLEEP,http://www.kickstarter.com/projects/emiliesaba...,Film & Video,Short Film,successful,10500.0,11545.0,1.099524,66,...,5,8,2011-08-19,Columbia,MO,0,0,0,1,0
1,126581,Educational Online Trading Card Game,http://www.kickstarter.com/projects/972789543/...,Games,Board & Card Games,failed,4000.0,20.0,0.005,2,...,1,8,2010-08-02,Maplewood,NJ,0,1,0,0,0
2,138119,STRUM,http://www.kickstarter.com/projects/185476022/...,Film & Video,Animation,live,20000.0,56.0,0.0028,3,...,5,6,2012-06-08,Los Angeles,CA,0,0,1,0,0


### Make a new column stating if its an international or national project

In [713]:
data2.state.unique()

array(['MO', 'NJ', 'CA', 'MI', 'OR', 'TN', 'IL', 'X', 'NY', 'DC', 'NE',
       'ID', 'FL', 'TX', 'CO', 'ME', 'Taiwan', 'IN', 'OH', 'Norway', 'MA',
       'MN', 'PA', 'NC', 'WV', 'CT', 'Chile', 'MD', 'HI', 'VA', 'WA',
       'AZ', 'OK', 'NV', 'Haiti', 'GA', 'AL', 'UT', 'Canada', 'LA', 'SC',
       'Ecuador', 'WI', 'Jamaica', 'Argentina', 'Hong Kong', 'Germany',
       'NM', 'Guatemala', 'NH', 'IA', 'WY', 'Australia', 'RI', 'Sweden',
       'France', 'DE', 'South Africa', 'AK', 'Nepal', 'MT', 'KY', 'VT',
       'Kenya', 'Bosnia and Herzegovina', 'Iceland', 'Mexico', 'KS',
       'Hungary', 'Indonesia', 'China', 'SD', 'Cuba', 'Peru', 'Italy',
       'Netherlands', 'Singapore', 'Ethiopia', 'New Zealand',
       'United Kingdom', 'Austria', 'Turkey', 'AR', 'Mt', 'Congo',
       'Colombia', 'India', 'Mongolia', 'MS', 'Israel', 'Dominica',
       'Spain', 'Finland', 'Czech Republic', 'Japan',
       'Virgin Islands, U.S.', 'Lebanon', 'Armenia', 'Portugal', 'Qatar',
       'Morocco', 'Martiniq

#### Looking at unique values we first need to fix some problems with the column

In [714]:
# Look at the row with city and state in state column
data2.loc[data2['state'] == 'Middleburg, MD']

Unnamed: 0,project id,name,url,category,subcategory,status,goal,pledged,funded percentage,backers,...,weekday_nr,month_nr,date,city,state,canceled,failed,live,successful,suspended
34811,1637912974,Light Ball,http://www.kickstarter.com/projects/819627388/...,Technology,Technology,failed,19500.0,103.0,0.005282,3,...,6,3,2012-03-31,10,"Middleburg, MD",0,1,0,0,0


In [715]:
# Replace 10 with Middleburg
data2['city'] = data2['city'].replace('10', 'Middleburg')

In [716]:
# Make sure that city now is Middleburg
data2.loc[34811 , : ]

project id                                                  1637912974
name                                                        Light Ball
url                  http://www.kickstarter.com/projects/819627388/...
category                                                    Technology
subcategory                                                 Technology
status                                                          failed
goal                                                             19500
pledged                                                            103
funded percentage                                           0.00528205
backers                                                              3
funded date                            Sat, 31 Mar 2012 03:59:00 -0000
levels                                                               6
reward levels                               25,100,200,750,2,000,3,500
updates                                                              0
commen

In [717]:
# Fix the state column
data2['state'] = data2['state'].replace('Middleburg, MD', 'MD')

In [718]:
# Make sure that state now is MD
data2.loc[34811 , : ]

project id                                                  1637912974
name                                                        Light Ball
url                  http://www.kickstarter.com/projects/819627388/...
category                                                    Technology
subcategory                                                 Technology
status                                                          failed
goal                                                             19500
pledged                                                            103
funded percentage                                           0.00528205
backers                                                              3
funded date                            Sat, 31 Mar 2012 03:59:00 -0000
levels                                                               6
reward levels                               25,100,200,750,2,000,3,500
updates                                                              0
commen

#### Now we are going to make the new column (stating US, international or X)

In [719]:
data2['temp'] = data2['state'].str.len()

In [720]:
def label_global (row):
   if row['temp'] == 2 :
      return 'US'
   elif row['temp'] > 2 :
      return 'international'
   else:
      return 'X'

In [721]:
data2['global'] = data2.apply (lambda row: label_global(row), axis=1)

In [722]:
data2.head(12)

Unnamed: 0,project id,name,url,category,subcategory,status,goal,pledged,funded percentage,backers,...,date,city,state,canceled,failed,live,successful,suspended,temp,global
0,39409,WHILE THE TREES SLEEP,http://www.kickstarter.com/projects/emiliesaba...,Film & Video,Short Film,successful,10500.0,11545.0,1.099524,66,...,2011-08-19,Columbia,MO,0,0,0,1,0,2.0,US
1,126581,Educational Online Trading Card Game,http://www.kickstarter.com/projects/972789543/...,Games,Board & Card Games,failed,4000.0,20.0,0.005,2,...,2010-08-02,Maplewood,NJ,0,1,0,0,0,2.0,US
2,138119,STRUM,http://www.kickstarter.com/projects/185476022/...,Film & Video,Animation,live,20000.0,56.0,0.0028,3,...,2012-06-08,Los Angeles,CA,0,0,1,0,0,2.0,US
3,237090,GETTING OVER - One son's search to finally kno...,http://www.kickstarter.com/projects/charnick/g...,Film & Video,Documentary,successful,6000.0,6535.0,1.089167,100,...,2012-04-08,Los Angeles,CA,0,0,0,1,0,2.0,US
4,246101,The Launch of FlyeGrlRoyalty &quot;The New Nam...,http://www.kickstarter.com/projects/flyegrlroy...,Fashion,Fashion,failed,3500.0,0.0,0.0,0,...,2011-06-01,Novi,MI,0,1,0,0,0,2.0,US
5,316217,Dinner Party - a short film about friendship.....,http://www.kickstarter.com/projects/249354515/...,Film & Video,Short Film,successful,3500.0,3582.0,1.023331,39,...,2011-06-22,Portland,OR,0,0,0,1,0,2.0,US
6,325034,Mezzo,http://www.kickstarter.com/projects/geoffsaysh...,Film & Video,Short Film,failed,1000.0,280.0,0.28,8,...,2012-02-18,Collegedale,TN,0,1,0,0,0,2.0,US
7,407836,Help APORTA continue to make handwoven/knit ac...,http://www.kickstarter.com/projects/1078097864...,Fashion,Fashion,successful,2000.0,2180.0,1.09,46,...,2011-12-30,Chicago,IL,0,0,0,1,0,2.0,US
8,436325,Music - Comedy - Album!,http://www.kickstarter.com/projects/mattgriffo...,Music,Music,successful,1000.0,1125.0,1.125,30,...,2010-04-18,Chicago,IL,0,0,0,1,0,2.0,US
9,610918,The Apocalypse Calendar,http://www.kickstarter.com/projects/tqvinn/the...,Art,Illustration,successful,7500.0,9836.0,1.311527,255,...,2011-11-01,Chicago,IL,0,0,0,1,0,2.0,US


In [723]:
# Drop temp column
data2 = data2.drop(['temp'], axis=1)

#### Look at unique values for new column

In [724]:
data2['global'].value_counts()

US               42170
international     2321
X                 1306
Name: global, dtype: int64

### Fix problems in category and subcategory columns

In [725]:
data2['category'].dtypes

dtype('O')

In [726]:
data2['subcategory'].dtypes

dtype('O')

In [727]:
data2['category'].unique()

array(['Film & Video', 'Games', 'Fashion', 'Music', 'Art', 'Technology',
       'Dance', 'Publishing', 'Theater', 'Comics', 'Design',
       'Photography', 'Food', 'Film &amp; Video'], dtype=object)

In [728]:
data2['subcategory'].unique()

array(['Short Film', 'Board & Card Games', 'Animation', 'Documentary',
       'Fashion', 'Music', 'Illustration', 'Film &amp; Video',
       'Open Software', 'Indie Rock', 'Dance', 'Fiction', 'Nonfiction',
       'Theater', 'Games', 'Art Book', 'Country & Folk', 'Comics',
       'Webseries', 'Technology', 'Performance Art', 'Narrative Film',
       'Video Games', 'Product Design', 'Rock', 'Painting', 'Photography',
       'Conceptual Art', 'Jazz', 'Open Hardware', 'Classical Music',
       'Food', 'Art', 'Pop', 'Journalism', 'Poetry', 'Electronic Music',
       'World Music', 'Sculpture', 'Publishing', "Children's Book",
       'Public Art', 'Mixed Media', 'Graphic Design', 'Hip-Hop',
       'Periodical', 'Crafts', 'Design', 'Digital Art',
       'Board &amp; Card Games', 'Country &amp; Folk'], dtype=object)

In [729]:
data2 = data2.replace('Film &amp; Video', 'Film_Video')
data2 = data2.replace('Film & Video', 'Film_Video')
data2 = data2.replace('Country &amp; Folk', 'Country_Folk')
data2 = data2.replace('Children\'s Book', 'Childrens Book')
data2 = data2.replace('Board &amp; Card Games', 'Board_Card Games')
data2 = data2.replace('Country & Folk', 'Country_Folk')
data2 = data2.replace('Board & Card Games', 'Board_Card Games')

In [730]:
data2['category'].value_counts()

Film_Video     13531
Music          10882
Publishing      4752
Art             3976
Theater         2479
Design          1755
Games           1726
Photography     1506
Food            1431
Fashion         1135
Comics          1065
Technology       802
Dance            757
Name: category, dtype: int64

In [731]:
data2['subcategory'].value_counts()

Documentary         4004
Short Film          3939
Music               3232
Film_Video          2492
Theater             2479
Indie Rock          1934
Rock                1788
Narrative Film      1551
Photography         1506
Food                1431
Fashion             1135
Webseries           1121
Fiction             1095
Art                 1070
Comics              1065
Nonfiction          1056
Country_Folk        1056
Product Design      1046
Video Games          902
Dance                757
Pop                  745
Publishing           671
Childrens Book       651
Public Art           563
Board_Card Games     553
Painting             517
Performance Art      504
Classical Music      480
Hip-Hop              476
Jazz                 445
Mixed Media          442
Journalism           429
World Music          426
Animation            424
Technology           369
Sculpture            354
Art Book             338
Electronic Music     300
Periodical           290
Design               283


### Reward levels

In [746]:
data2['reward levels'].head(10)

0                      25,50,100,250,500,1,000,2,500
1                                       1,5,10,25,50
2            1,10,25,40,50,100,250,1,000,1,337,9,001
3    1,10,25,30,50,75,85,100,110,250,500,1,000,5,000
4                               10,25,50,100,150,250
5                          5,25,50,100,250,500,1,000
6                                     5,10,25,50,100
7                         10,20,50,100,250,500,1,000
8           5,8,10,15,20,30,50,100,120,250,500,1,000
9              1,20,35,50,60,100,110,500,1,000,1,500
Name: reward levels, dtype: object

In [745]:
# The reward levels are given as a string
data2['reward levels'].dtypes

dtype('O')

#### Min reward levels

In [769]:
# Get all the values before the first comma (min value)
data2['min reward level'] = [x[0] for x in data2['reward levels'].str.split(',', n=1)]

data2['min reward level'].head(10)

0    25
1     1
2     1
3     1
4    10
5     5
6     5
7    10
8     5
9     1
Name: min reward level, dtype: object

In [772]:
# the 10 most common min reward levels
data2['min reward level'].value_counts().head(10)

1      17147
5      11133
10      9746
25      2223
15      1250
20      1169
2        804
3        514
50       386
100      187
Name: min reward level, dtype: int64

#### Max reward levels

In [775]:
data2['reward levels'].str[-10:].head(10)

0    ,000,2,500
1    5,10,25,50
2    ,337,9,001
3    ,000,5,000
4    00,150,250
5    ,500,1,000
6    ,25,50,100
7    ,500,1,000
8    ,500,1,000
9    ,000,1,500
Name: reward levels, dtype: object

In [None]:
### THIS IS NOT FINISHED! 

### Levels

Just get a breif look

In [None]:
# Levels column is number of levels

In [739]:
data2['levels'].head()

0     7
1     5
2    10
3    13
4     6
Name: levels, dtype: int64

In [740]:
# max numbers of reward levels
data2['levels'].max()

80

In [741]:
# min numbers of reward levels
data2['levels'].min()

1

In [743]:
# average numbers of reward levels
data2['levels'].mean()

8.011987684782847

In [744]:
# median for numbers of reward levels
data2['levels'].median()

7.0

### Get correct order of df

In [733]:
for col in data2.columns: 
    print(col) 

project id
name
url
category
subcategory
status
goal
pledged
funded percentage
backers
funded date
levels
reward levels
updates
comments
duration
weekday
nr_in_month
month
year
time
weekday_nr
month_nr
date
city
state
canceled
failed
live
successful
suspended
global


In [734]:
data2 = data2[['project id', 'name', 'url', 'category', 'subcategory', 'city', 'state', 'global', 'status', 'canceled',
               'failed', 'live', 'successful', 'suspended', 'goal', 'pledged', 'funded percentage', 'backers',
               'funded date', 'levels', 'reward levels', 'min reward level', 'updates', 'comments', 'duration', 
               'weekday', 'weekday_nr', 'nr_in_month', 'month', 'month_nr', 'year', 'time', 'date']]

data2.head(3)

Unnamed: 0,project id,name,url,category,subcategory,city,state,global,status,canceled,...,comments,duration,weekday,weekday_nr,nr_in_month,month,month_nr,year,time,date
0,39409,WHILE THE TREES SLEEP,http://www.kickstarter.com/projects/emiliesaba...,Film_Video,Short Film,Columbia,MO,US,successful,0,...,2,30.0,Fri,5,19,Aug,8,2011,19:28:17 -0000,2011-08-19
1,126581,Educational Online Trading Card Game,http://www.kickstarter.com/projects/972789543/...,Games,Board_Card Games,Maplewood,NJ,US,failed,0,...,0,47.18,Mon,1,2,Aug,8,2010,03:59:00 -0000,2010-08-02
2,138119,STRUM,http://www.kickstarter.com/projects/185476022/...,Film_Video,Animation,Los Angeles,CA,US,live,0,...,0,28.0,Fri,5,8,Jun,6,2012,00:00:31 -0000,2012-06-08


In [735]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45797 entries, 0 to 45956
Data columns (total 32 columns):
project id           45797 non-null int64
name                 45797 non-null object
url                  45797 non-null object
category             45797 non-null object
subcategory          45797 non-null object
city                 45797 non-null object
state                45796 non-null object
global               45797 non-null object
status               45797 non-null object
canceled             45797 non-null int64
failed               45797 non-null int64
live                 45797 non-null int64
successful           45797 non-null int64
suspended            45797 non-null int64
goal                 45797 non-null float64
pledged              45797 non-null float64
funded percentage    45797 non-null float64
backers              45797 non-null int64
funded date          45797 non-null object
levels               45797 non-null int64
reward levels        45797 non-null 

## Read data to CSV

In [736]:
data2.to_csv('clean_data.csv', index=False)