In [1]:
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as mplt
%matplotlib inline
import seaborn as sns

In [5]:
# Load the training set; the relative path is resolved against the current
# working directory.
df = pd.read_csv(filepath_or_buffer="train.csv")

In [6]:
df.shape

(108129, 14)

In [7]:
df.head(3)

Unnamed: 0,project_id,name,desc,goal,keywords,disable_communication,country,currency,deadline,state_changed_at,created_at,launched_at,backers_count,final_status
0,kkst1451568084,drawing for dollars,I like drawing pictures. and then i color them...,20.0,drawing-for-dollars,False,US,USD,1241333999,1241334017,1240600507,1240602723,3,1
1,kkst1474482071,Sponsor Dereck Blackburn (Lostwars) Artist in ...,"I, Dereck Blackburn will be taking upon an inc...",300.0,sponsor-dereck-blackburn-lostwars-artist-in-re...,False,US,USD,1242429000,1242432018,1240960224,1240975592,2,0
2,kkst183622197,Mr. Squiggles,So I saw darkpony's successfully funded drawin...,30.0,mr-squiggles,False,US,USD,1243027560,1243027818,1242163613,1242164398,0,0


In [8]:
# The four timestamp columns hold integers; unit='s' tells pandas to read
# them as seconds since the Unix epoch and turn them into datetime columns.
for epoch_col in ('created_at', 'deadline', 'launched_at', 'state_changed_at'):
    df[epoch_col] = pd.to_datetime(df[epoch_col], unit='s')

In [9]:
# Each new column is the `.dt.days` component of (minuend - subtrahend).
# Note that launch_and_state_changed is launched_at minus state_changed_at.
day_gap_specs = (
    ('time_to_state_change', 'state_changed_at', 'created_at'),
    ('time_to_launch', 'launched_at', 'created_at'),
    ('launch_and_state_changed', 'launched_at', 'state_changed_at'),
    ('deadline_and_launched_at', 'deadline', 'launched_at'),
    ('deadline_and_state_changed_at', 'deadline', 'state_changed_at'),
)
for gap_col, minuend, subtrahend in day_gap_specs:
    df[gap_col] = (df[minuend] - df[subtrahend]).dt.days

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108129 entries, 0 to 108128
Data columns (total 19 columns):
project_id                       108129 non-null object
name                             108126 non-null object
desc                             108120 non-null object
goal                             108129 non-null float64
keywords                         108129 non-null object
disable_communication            108129 non-null bool
country                          108129 non-null object
currency                         108129 non-null object
deadline                         108129 non-null datetime64[ns]
state_changed_at                 108129 non-null datetime64[ns]
created_at                       108129 non-null datetime64[ns]
launched_at                      108129 non-null datetime64[ns]
backers_count                    108129 non-null int64
final_status                     108129 non-null int64
time_to_state_change             108129 non-null int64
time_to_launch       

In [11]:
# Lower-case the free-text columns. Missing name/desc values are replaced by
# the empty string first so that every row ends up holding a string.
for text_col in ('name', 'desc'):
    df[text_col] = df[text_col].fillna('').str.lower()
# keywords is lower-cased as-is, without a missing-value fill.
df['keywords'] = df['keywords'].str.lower()

In [12]:
# Flag (1/0) project names that contain a parenthesised state such as
# "(canceled)". The name column has already been lower-cased, so the pattern
# is written in lower case.
#
# The alternation is wrapped in a group so that the literal parentheses apply
# to every state. Without the group the pattern reads as
#   "\(canceled"  OR  "suspended"  OR  "failed"  OR  "successful\)"
# and also flags names that merely contain one of the bare words.
# The group is non-capturing because str.contains warns when a pattern has
# capture groups, and the raw string stops "\(" from being parsed as a
# (deprecated, invalid) string escape.
state_in_name_pattern = r'\((?:canceled|suspended|failed|successful)\)'
df['state_mentioned_in_name'] = (
    df['name']
    .str.contains(state_in_name_pattern, na=False)  # missing name -> not flagged
    .astype('int64')
)

In [13]:
# Recode the boolean flag as integers: True -> 1, False -> 0.
df['disable_communication'] = df['disable_communication'].astype('int64')

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108129 entries, 0 to 108128
Data columns (total 20 columns):
project_id                       108129 non-null object
name                             108129 non-null object
desc                             108129 non-null object
goal                             108129 non-null float64
keywords                         108129 non-null object
disable_communication            108129 non-null int64
country                          108129 non-null object
currency                         108129 non-null object
deadline                         108129 non-null datetime64[ns]
state_changed_at                 108129 non-null datetime64[ns]
created_at                       108129 non-null datetime64[ns]
launched_at                      108129 non-null datetime64[ns]
backers_count                    108129 non-null int64
final_status                     108129 non-null int64
time_to_state_change             108129 non-null int64
time_to_launch      

In [19]:
df.currency.unique()

array(['USD', 'GBP', 'CAD', 'AUD', 'NZD', 'EUR', 'SEK', 'NOK', 'DKK'],
      dtype=object)

In [16]:
# Multiplier per currency code; convert_curr multiplies a goal amount by the
# entry for the row's currency. USD maps to 1, so the factors read as
# "US dollars per unit" -- presumably a one-off snapshot of exchange rates;
# their source and date are not recorded here (TODO confirm).
currency_converter = {
    'USD': 1,      # US dollar
    'GBP': 1.38,   # British pound
    'CAD': 0.78,   # Canadian dollar
    'AUD': 0.78,   # Australian dollar
    'NZD': 0.72,   # New Zealand dollar
    'EUR': 1.23,   # Euro
    'SEK': 0.12,   # Swedish krona
    'NOK': 0.13,   # Norwegian krone
    'DKK': 0.17,   # Danish krone
    'CHF': 1.07,   # Swiss franc
    'HKD': 0.13,   # Hong Kong dollar
    'SGD': 0.76,   # Singapore dollar
    'MXN': 0.053,  # Mexican peso
}

In [17]:
def convert_curr(curr, val):
    """Scale ``val`` by the factor stored for ``curr`` in ``currency_converter``.

    Parameters
    ----------
    curr : currency code used as a key into ``currency_converter``.
    val : amount to rescale.

    Returns
    -------
    The product of the looked-up factor and ``val``.

    Raises
    ------
    KeyError
        If ``curr`` is not a key of ``currency_converter``.
    """
    rate = currency_converter[curr]
    return rate * val

In [18]:
# Rescale every goal by its currency's factor in one vectorised step instead
# of calling a Python function once per row through df.apply(axis=1).
goal_rates = df['currency'].map(currency_converter)
# map() yields NaN for codes that are missing from currency_converter. Fail
# loudly, as the per-row dictionary lookup did, rather than silently writing
# NaN into the converted column.
if goal_rates.isnull().any():
    raise KeyError(
        'no conversion rate for currency code(s): %s'
        % sorted(df.loc[goal_rates.isnull(), 'currency'].astype(str).unique())
    )
df['converted_curr'] = df['goal'] * goal_rates

In [20]:
# Country code -> numeric group id, built from a (group id, codes) table.
# The grouping appears to be regional (0: North America, 1: Europe,
# 2: SG/HK, 3: AU/NZ) -- the intent is not stated in the notebook.
country_groups = (
    (0, ('CA', 'US', 'MX')),
    (1, ('IE', 'NL', 'FR', 'BE', 'ES', 'DE', 'AT', 'LU', 'DK', 'CH',
         'IT', 'GB', 'NO', 'SE')),
    (2, ('SG', 'HK')),
    (3, ('AU', 'NZ')),
)
cnt_grp = {
    code: group_id
    for group_id, codes in country_groups
    for code in codes
}

In [22]:
# Label each row with its country's group as 'G<id>' (e.g. 'G0'); a country
# code that is absent from cnt_grp raises KeyError.
df['country_grp'] = df['country'].map(lambda code: 'G' + str(cnt_grp[code]))

In [23]:
# One indicator column per distinct country_grp label, appended to the right
# of the existing columns.
country_grp_dummies = pd.get_dummies(df['country_grp'])
df = pd.concat([df, country_grp_dummies], axis=1)

In [24]:
# Column names collected for removal; the statement that actually drops them
# is not visible in this part of the notebook.
columns_to_remove = (
    ['name']
    + ['created_at', 'launched_at', 'deadline', 'state_changed_at']
    + ['country', 'country_grp', 'currency']
)