In [1]:
# %load ../snippets/start.py
# system packages
import os, sys
import warnings
warnings.filterwarnings('ignore')

# basic wrangling
import numpy as np
import pandas as pd

# eda tools
import missingno as msno
import pandas_profiling
import pickle

In [2]:
# %load ../snippets/visual.py
# must go first
%matplotlib inline
%config InlineBackend.figure_format='retina'

# plotting
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_context("poster", font_scale=1.3)

# Update matplotlib defaults to something nicer.
# The colour cycle is configured through 'axes.prop_cycle': the older
# 'axes.color_cycle' key was deprecated and then removed from rcParams, so
# passing it to rcParams.update() fails with KeyError on current matplotlib.
mpl_update = {'font.size':16,
              'xtick.labelsize':14,
              'ytick.labelsize':14,
              'figure.figsize':[12.0,8.0],
              'axes.prop_cycle':mpl.cycler(color=['#0055A7', '#2C3E4F', '#26C5ED', '#00cc66', '#D34100', '#FF9700','#091D32']),
              'axes.labelsize':20,
              'axes.labelcolor':'#677385',
              'axes.titlesize':20,
              'lines.color':'#0055A7',
              'lines.linewidth':3,
              'text.color':'#677385'}
mpl.rcParams.update(mpl_update)

from subprocess import check_output
# Show which files are available in ../input by shelling out to `ls`
# (this relies on an `ls` executable being on the PATH).
print(check_output(["ls", "../input"]).decode("utf8"))

samplesubmission.csv
test.csv
train.csv



In [3]:
# Load the labelled training projects.
train_df = pd.read_csv("../input/train.csv")

In [4]:
# Load the unlabelled test projects.
test_df = pd.read_csv("../input/test.csv")

In [5]:
# Test set size as (rows, columns).
test_df.shape

(63465, 12)

In [6]:
# Preview the first test rows; the four *_at / deadline columns are raw integer timestamps here.
test_df.head()

Unnamed: 0,project_id,name,desc,goal,keywords,disable_communication,country,currency,deadline,state_changed_at,created_at,launched_at
0,kkst917493670,Bràthair.,"My first film, of many to come. Trying to purs...",7000.0,brathair,False,US,USD,1449619185,1449619185,1446002581,1446159585
1,kkst1664901914,THE SCREENWRITER,A young man that has earned his master's in sc...,35000.0,the-screenwriter,False,US,USD,1453435620,1453435620,1450297323,1450411620
2,kkst925125077,The Hornets Nest the Fairmont Heights Story,Film about a high school constructed for negro...,49500.0,the-hornets-nest-the-fairmont-heights-story,False,US,USD,1451780700,1451780700,1448581356,1448672128
3,kkst1427645275,BROTHERS Season 2 - Groundbreaking Transgender...,The acclaimed series about a group of transgen...,40000.0,brothers-season-2-groundbreaking-transgender-male,False,US,USD,1445021518,1445021530,1440966830,1442429518
4,kkst1714249266,Blackdom the movie,Blackdom's history offers a new narrative tha...,20000.0,blackdom-the-movie,False,US,USD,1462068840,1462068844,1455765276,1458334890


In [7]:
# Training set size as (rows, columns); it has two more columns than the test set.
train_df.shape

(108129, 14)

In [8]:
# Preview the first training rows, including the train-only backers_count and final_status columns.
train_df.head()

Unnamed: 0,project_id,name,desc,goal,keywords,disable_communication,country,currency,deadline,state_changed_at,created_at,launched_at,backers_count,final_status
0,kkst1451568084,drawing for dollars,I like drawing pictures. and then i color them...,20.0,drawing-for-dollars,False,US,USD,1241333999,1241334017,1240600507,1240602723,3,1
1,kkst1474482071,Sponsor Dereck Blackburn (Lostwars) Artist in ...,"I, Dereck Blackburn will be taking upon an inc...",300.0,sponsor-dereck-blackburn-lostwars-artist-in-re...,False,US,USD,1242429000,1242432018,1240960224,1240975592,2,0
2,kkst183622197,Mr. Squiggles,So I saw darkpony's successfully funded drawin...,30.0,mr-squiggles,False,US,USD,1243027560,1243027818,1242163613,1242164398,0,0
3,kkst597742710,Help me write my second novel.,Do your part to help out starving artists and ...,500.0,help-me-write-my-second-novel,False,US,USD,1243555740,1243556121,1240963795,1240966730,18,1
4,kkst1913131122,Support casting my sculpture in bronze,"I'm nearing completion on a sculpture, current...",2000.0,support-casting-my-sculpture-in-bronze,False,US,USD,1243769880,1243770317,1241177914,1241180541,1,0


In [9]:
# Distribution of the target label: the two classes are imbalanced
# (roughly one third of the rows have final_status == 1).
train_df['final_status'].value_counts()

0    73568
1    34561
Name: final_status, dtype: int64

In [10]:
# How often disable_communication is set; True is very rare in train.
train_df['disable_communication'].value_counts()

False    107806
True        323
Name: disable_communication, dtype: int64

In [11]:
# Projects per country code; US dominates and some codes are nearly absent.
train_df['country'].value_counts()

US    92033
GB     8758
CA     3736
AU     1880
NL      705
NZ      355
SE      240
DK      196
NO      114
IE      111
DE        1
Name: country, dtype: int64

In [12]:
# Projects per currency; there are fewer distinct currencies than countries,
# so several countries share one currency.
train_df['currency'].value_counts()

USD    92033
GBP     8758
CAD     3736
AUD     1880
EUR      817
NZD      355
SEK      240
DKK      196
NOK      114
Name: currency, dtype: int64

In [13]:
# Missing values per column in the test set.
test_df.isnull().sum()

project_id               0
name                     0
desc                     4
goal                     0
keywords                 0
disable_communication    0
country                  0
currency                 0
deadline                 0
state_changed_at         0
created_at               0
launched_at              0
dtype: int64

In [14]:
# Missing values per column in the training set.
train_df.isnull().sum()

project_id               0
name                     3
desc                     9
goal                     0
keywords                 0
disable_communication    0
country                  0
currency                 0
deadline                 0
state_changed_at         0
created_at               0
launched_at              0
backers_count            0
final_status             0
dtype: int64

In [15]:
# Discard every training row that has a missing value in any column.
# The surviving rows keep their original index labels, so the index has gaps.
train_df = train_df.dropna(axis=0, how='any')

In [16]:
# Test rows cannot be dropped, so fill the gaps instead.
# Missing text is filled with an empty string rather than the integer 0:
# putting an int into a string column gives mixed types, which breaks
# string-only consumers. Any other column still falls back to 0.
test_df = test_df.fillna({'name': '', 'desc': '', 'keywords': ''}).fillna(0)

In [17]:
# Column names available at prediction time.
test_df.columns

Index(['project_id', 'name', 'desc', 'goal', 'keywords',
       'disable_communication', 'country', 'currency', 'deadline',
       'state_changed_at', 'created_at', 'launched_at'],
      dtype='object')

In [18]:
# Column names in train; backers_count and final_status are the extras compared with test.
train_df.columns

Index(['project_id', 'name', 'desc', 'goal', 'keywords',
       'disable_communication', 'country', 'currency', 'deadline',
       'state_changed_at', 'created_at', 'launched_at', 'backers_count',
       'final_status'],
      dtype='object')

# Dataset Creation for Tree Based Models

In [19]:
# Categorical inputs of the training set.
c_features = train_df.loc[:, ['disable_communication','country', 'currency']]

In [20]:
# Categorical inputs of the test set (same three columns as for train).
c_features_test = test_df.loc[:, ['disable_communication','country', 'currency']]

In [21]:
# Numeric inputs of the training set.
# NOTE(review): backers_count is not a column of test_df, so a model fitted on
# it cannot score the test frame as-is -- confirm it is dropped before training.
n_features = train_df.loc[:, ['goal','backers_count']]

In [22]:
# Numeric inputs of the test set; only `goal` is available there.
n_features_test = test_df.loc[:, ['goal']]

In [23]:
# Timestamp columns of the training set.
# An explicit copy is taken because this frame is modified column by column
# afterwards; writing into a bare slice of train_df is chained assignment
# (SettingWithCopyWarning), which the global warnings filter only hides.
dt_features = train_df[['deadline','state_changed_at', 'created_at', 'launched_at']].copy()

In [24]:
# Timestamp columns of the test set.
# An explicit copy is taken because this frame is modified column by column
# afterwards; writing into a bare slice of test_df is chained assignment
# (SettingWithCopyWarning), which the global warnings filter only hides.
dt_features_test = test_df[['deadline','state_changed_at', 'created_at', 'launched_at']].copy()

In [25]:
# Free-text inputs of the training set.
text_features = train_df.loc[:, ['name', 'desc','keywords']]

In [26]:
# Free-text inputs of the test set.
text_features_test = test_df.loc[:, ['name', 'desc','keywords']]

In [27]:
from sklearn.preprocessing import LabelEncoder

In [28]:
# One encoder instance shared by every categorical column.
# NOTE(review): each fit_transform call re-fits it, so afterwards it presumably
# only holds the classes of the last column it was applied to -- confirm before
# using it for inverse_transform.
class_le = LabelEncoder()

In [29]:
# Integer-encode each categorical training column; the encoder is re-fitted per column.
tree_c_features = c_features.apply(lambda column: class_le.fit_transform(column))

In [30]:
# Encode train and test with the SAME integer codes.
# Re-fitting the encoder on the test frame alone derives the codes from the
# test set's own sorted categories, so one category gets different integers in
# the two frames (e.g. 'US' is 20 in a test-only fit but 10 in a train-only fit
# over the 11 training countries). Fit one encoder per column on the union of
# train and test values and transform both frames with it; tree_c_features is
# rebuilt here as well so both frames share the mapping.
_cat_encoders = {
    col: LabelEncoder().fit(pd.concat([c_features[col], c_features_test[col]]))
    for col in c_features.columns
}
tree_c_features = c_features.apply(lambda s: _cat_encoders[s.name].transform(s))
tree_c_features_test = c_features_test.apply(lambda s: _cat_encoders[s.name].transform(s))

In [31]:
# Parse the integer second-resolution `deadline` values into pandas timestamps (train).
dt_features = dt_features.assign(
    deadline=pd.to_datetime(dt_features['deadline'], unit='s')
)

In [32]:
# Parse the integer second-resolution `deadline` values into pandas timestamps (test).
dt_features_test = dt_features_test.assign(
    deadline=pd.to_datetime(dt_features_test['deadline'], unit='s')
)

In [33]:
# Parse the integer second-resolution `state_changed_at` values into pandas timestamps (train).
dt_features = dt_features.assign(
    state_changed_at=pd.to_datetime(dt_features['state_changed_at'], unit='s')
)

In [34]:
# Parse the integer second-resolution `state_changed_at` values into pandas timestamps (test).
dt_features_test = dt_features_test.assign(
    state_changed_at=pd.to_datetime(dt_features_test['state_changed_at'], unit='s')
)

In [35]:
# Parse the integer second-resolution `created_at` values into pandas timestamps (train).
dt_features = dt_features.assign(
    created_at=pd.to_datetime(dt_features['created_at'], unit='s')
)

In [36]:
# Parse the integer second-resolution `created_at` values into pandas timestamps (test).
dt_features_test = dt_features_test.assign(
    created_at=pd.to_datetime(dt_features_test['created_at'], unit='s')
)

In [37]:
# Parse the integer second-resolution `launched_at` values into pandas timestamps (train).
dt_features = dt_features.assign(
    launched_at=pd.to_datetime(dt_features['launched_at'], unit='s')
)

In [38]:
# Parse the integer second-resolution `launched_at` values into pandas timestamps (test).
dt_features_test = dt_features_test.assign(
    launched_at=pd.to_datetime(dt_features_test['launched_at'], unit='s')
)

In [39]:
# Start the test feature table: timestamps plus encoded categoricals, aligned on the row index.
final_df_test = dt_features_test.join(tree_c_features_test, how='left')

In [40]:
# Start the training feature table: timestamps plus encoded categoricals, aligned on the row index.
final_df = dt_features.join(tree_c_features, how='left')

In [41]:
# Append the numeric training columns, aligned on the row index.
final_df = final_df.join(n_features, how='left')

In [42]:
# Append the numeric test column(s), aligned on the row index.
final_df_test = final_df_test.join(n_features_test, how='left')

In [43]:
# Attach the target label as the last column of the training table (index-aligned).
final_df = final_df.assign(final_status=train_df['final_status'])

In [44]:
# Preview the assembled test features; show a bounded sample instead of
# handing the whole frame to the display machinery.
final_df_test.head(10)

Unnamed: 0,deadline,state_changed_at,created_at,launched_at,disable_communication,country,currency,goal
0,2015-12-08 23:59:45,2015-12-08 23:59:45,2015-10-28 03:23:01,2015-10-29 22:59:45,0,20,12,7000.0
1,2016-01-22 04:07:00,2016-01-22 04:07:00,2015-12-16 20:22:03,2015-12-18 04:07:00,0,20,12,35000.0
2,2016-01-03 00:25:00,2016-01-03 00:25:00,2015-11-26 23:42:36,2015-11-28 00:55:28,0,20,12,49500.0
3,2015-10-16 18:51:58,2015-10-16 18:52:10,2015-08-30 20:33:50,2015-09-16 18:51:58,0,20,12,40000.0
4,2016-05-01 02:14:00,2016-05-01 02:14:04,2016-02-18 03:14:36,2016-03-18 21:01:30,0,20,12,20000.0
5,2016-04-27 17:26:34,2016-04-27 17:26:34,2016-03-16 23:40:26,2016-03-28 17:26:34,0,20,12,10000.0
6,2015-12-14 10:03:43,2015-12-14 10:03:43,2015-11-08 14:56:59,2015-11-14 10:03:43,0,12,4,1000.0
7,2016-02-26 20:00:00,2016-02-26 20:00:01,2015-12-24 13:47:17,2015-12-28 22:03:04,0,9,5,300.0
8,2015-10-08 20:59:00,2015-10-08 20:59:01,2015-09-08 20:59:17,2015-09-08 22:16:04,0,20,12,6000.0
9,2015-10-06 15:10:22,2015-10-06 15:10:23,2015-08-06 18:40:15,2015-09-01 15:10:22,0,20,12,8000.0


In [45]:
import pickle

In [46]:
# Persist the tree-model training table. The context manager closes (and so
# flushes) the file; the bare open() call left the handle dangling.
with open('../proxy/dataset1', 'wb') as fh:
    pickle.dump(final_df, fh)

In [46]:
# Persist the tree-model test table. The context manager closes (and so
# flushes) the file; the bare open() call left the handle dangling.
with open('../proxy/dataset1_test', 'wb') as fh:
    pickle.dump(final_df_test, fh)

In [47]:
# Text columns of the training set.
# NOTE(review): this repeats the selection already assigned to text_features
# earlier in the notebook -- one of the two cells looks redundant.
text_features = train_df[['name', 'desc','keywords']]

In [48]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [49]:
# English stop-word list from NLTK, held as a set so cleanData can test membership quickly.
stops = set(stopwords.words("english"))
def cleanData(text, lowercase = False, remove_stops = False, stemming = False, stopword_set = None):
    """Normalise one piece of free text before vectorisation.

    Steps, in order: drop every character that is not an ASCII letter, a digit
    or whitespace; replace newlines with spaces; then optionally lowercase,
    remove stop words and Porter-stem each whitespace-separated token.

    Parameters
    ----------
    text : object
        Value to clean; it is converted with ``str()``. ``None`` and float NaN
        are treated as missing text and give ``""`` instead of the literal
        tokens ``"None"`` / ``"nan"``.
    lowercase : bool
        Lowercase every token (this also collapses runs of whitespace).
    remove_stops : bool
        Drop tokens found in the stop-word collection. Matching ignores case,
        so "The" is removed even when ``lowercase`` is False.
    stemming : bool
        Replace each token by its Porter stem.
    stopword_set : collection of str, optional
        Lowercase stop words to use instead of the module-level ``stops``.

    Returns
    -------
    str
        The cleaned text.
    """
    # Missing values: None, or NaN (the only float that differs from itself).
    if text is None or (isinstance(text, float) and text != text):
        return ""

    txt = str(text)
    txt = re.sub(r'[^A-Za-z0-9\s]',r'',txt)
    txt = re.sub(r'\n',r' ',txt)

    if lowercase:
        txt = " ".join([w.lower() for w in txt.split()])

    if remove_stops:
        stop_lookup = stops if stopword_set is None else stopword_set
        # Compare on the lowercased token: the stop list is lowercase, so a
        # case-sensitive test would let capitalised stop words through.
        txt = " ".join([w for w in txt.split() if w.lower() not in stop_lookup])

    if stemming:
        st = PorterStemmer()
        txt = " ".join([st.stem(w) for w in txt.split()])

    return txt

In [50]:
# Clean every training description: lowercase, strip stop words, then stem.
train_df['desc'] = train_df['desc'].apply(
    cleanData, lowercase=True, remove_stops=True, stemming=True
)

In [51]:
# initialise the functions - we'll create separate models for each type.
# Both vectorisers share one configuration: single-word tokens, vocabulary capped at 500 terms.
vectorizer_options = dict(analyzer='word', ngram_range=(1, 1), max_features=500)
countvec = CountVectorizer(**vectorizer_options)
tfidfvec = TfidfVectorizer(**vectorizer_options)

In [52]:
# Learn each vectoriser's vocabulary from the cleaned training descriptions and
# vectorise them in the same call: raw term counts first, TF-IDF weights second.
bagofwords = countvec.fit_transform(train_df['desc'])
tfidfdata = tfidfvec.fit_transform(train_df['desc'])

In [53]:
# Materialise the vectoriser outputs as DataFrames.
# The rows are given train_df's index: train_df lost rows to dropna(), so its
# index has gaps, and a default 0..n-1 index here would shift rows against
# train_df / final_df in any index-aligned join or concat. Positional
# consumers are unaffected because the row order is unchanged.
# toarray() yields a plain ndarray instead of the np.matrix that todense() returns.
bow_df = pd.DataFrame(bagofwords.toarray(), index=train_df.index)
tfidf_df = pd.DataFrame(tfidfdata.toarray(), index=train_df.index)

In [54]:
# Preview the bag-of-words matrix; show a bounded sample instead of handing
# the whole frame to the display machinery.
bow_df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
# Give the positional feature columns string names: 'col' followed by the original label.
bow_df = bow_df.rename(columns=lambda label: 'col' + str(label))
tfidf_df = tfidf_df.rename(columns=lambda label: 'col' + str(label))

In [56]:
# Persist the bag-of-words features. The context manager closes (and so
# flushes) the file; the bare open() call left the handle dangling.
with open('../proxy/dataset3_textb', 'wb') as fh:
    pickle.dump(bow_df, fh)

In [57]:
# Persist the TF-IDF features. The context manager closes (and so
# flushes) the file; the bare open() call left the handle dangling.
with open('../proxy/dataset3_texttf', 'wb') as fh:
    pickle.dump(tfidf_df, fh)

In [58]:
# Sanity-check the bag-of-words matrix size (rows, vocabulary terms).
# The name was misspelt as `bow_df_df`, which no cell defines, so the cell
# raised NameError on a fresh kernel.
bow_df.shape

(108119, 500)