### Introduction

This notebook is shared to help people learn how to approach a problem that involves text variables. Apart from creating new variables, you'll learn to extract ~650 text (count) features and use them to train an XGBoost model. This script scores ~0.70 on the public leaderboard.
For any questions, feel free to raise an issue.

In [5]:
# load libraries

import pandas as pd
import numpy as np
import re
import datetime
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb

# let displayed frames show up to 100 characters of each text cell
pd.options.display.max_colwidth = 100

In [6]:
# load data: both CSV files are read from one directory; point DATA_DIR at
# your local copy of the competition files before running
DATA_DIR = '/home/dar/challenge/Machine_Learning/datafiles'
train = pd.read_csv(DATA_DIR + '/train.csv')
test = pd.read_csv(DATA_DIR + '/test.csv')

In [7]:
# convert unix time format
unix_cols = ['deadline','state_changed_at','launched_at','created_at']


def _epoch_to_text(seconds):
    """Render an epoch-seconds value as 'YYYY-MM-DD HH:MM:SS' text.

    datetime.fromtimestamp interprets the value in the machine's local timezone.
    """
    moment = datetime.datetime.fromtimestamp(int(seconds))
    return moment.strftime('%Y-%m-%d %H:%M:%S')


# same conversion for every timestamp column of both frames
for frame in (train, test):
    for col in unix_cols:
        frame[col] = frame[col].apply(_epoch_to_text)

### Some features

In [8]:
# number of whitespace-separated tokens in the project name and description
for frame in (train, test):
    for source_col, count_col in (('name', 'name_count'), ('desc', 'desc_count')):
        frame[count_col] = frame[source_col].str.split().str.len()

In [9]:
cols_to_use = ['name','desc']
len_feats = ['name_len','desc_len']
count_feats = ['name_count','desc_count']

# character length of each text column; every value goes through str() first,
# so a non-string entry is measured by the length of its string form
for text_col, len_col in zip(cols_to_use, len_feats):
    for frame in (train, test):
        frame[len_col] = frame[text_col].apply(lambda value: len(str(value)))

In [10]:
# keyword features: character length of the keywords string and the number of
# '-'-separated parts it contains
for frame in (train, test):
    keyword_text = frame['keywords'].str
    frame['keywords_len'] = keyword_text.len()
    frame['keywords_count'] = keyword_text.split('-').str.len()

### Some more features

In [11]:
# converting string variables to datetime
unix_cols = ['deadline','state_changed_at','launched_at','created_at']


def _text_to_datetime(stamp):
    """Parse a 'YYYY-MM-DD HH:MM:SS' string into a datetime object."""
    return datetime.datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S')


# parse every timestamp column of both frames
for frame in (train, test):
    for col in unix_cols:
        frame[col] = frame[col].apply(_text_to_datetime)

In [12]:
# lists with the time difference, in whole seconds, between
#   time1: launched_at and created_at
#   time3: deadline and launched_at
# The subtraction is done on whole columns instead of looking rows up one at a
# time with .loc, which is much faster and does not require the frame's index
# labels to be 0..n-1. The results are still plain lists of ints.
time1 = np.round(
    (train['launched_at'] - train['created_at']).dt.total_seconds()
).astype(int).tolist()
time3 = np.round(
    (train['deadline'] - train['launched_at']).dt.total_seconds()
).astype(int).tolist()

In [13]:
# store the natural log of each duration as a training feature
for feature_name, seconds in (('time1', time1), ('time3', time3)):
    train[feature_name] = np.log(seconds)

In [14]:
# for test data: time difference, in whole seconds, between
#   time5: launched_at and created_at
#   time6: deadline and launched_at
# Computed on whole columns instead of a per-row .loc loop, which is much
# faster and does not require the frame's index labels to be 0..n-1.
# The results are still plain lists of ints.
time5 = np.round(
    (test['launched_at'] - test['created_at']).dt.total_seconds()
).astype(int).tolist()
time6 = np.round(
    (test['deadline'] - test['launched_at']).dt.total_seconds()
).astype(int).tolist()

In [15]:
# store the natural log of each duration under the same column names as in train
for feature_name, seconds in (('time1', time5), ('time3', time6)):
    test[feature_name] = np.log(seconds)

In [16]:
feat = ['disable_communication','country']

# integer-encode each listed column; the encoder is fitted on the train and test
# values together so both frames share a single value -> code mapping
for col in feat:
    encoder = LabelEncoder()
    encoder.fit(list(train[col].values) + list(test[col].values))
    for frame in (train, test):
        frame[col] = encoder.transform(list(frame[col]))

In [17]:
# replace goal with log(1 + goal) in both frames
for frame in (train, test):
    frame['goal'] = np.log1p(frame['goal'])

### Text Cleaning

In [18]:
# creating a full list of descriptions: train rows first, then test rows,
# with every entry converted to str
all_descriptions = train['desc'].tolist()
all_descriptions.extend(test['desc'].tolist())
kickdesc = pd.Series(all_descriptions).astype(str)

In [19]:
# Runs of non-word characters (punctuation, whitespace, tabs) or runs of digits.
# Written as a raw string so the backslash escapes reach the regex engine intact
# instead of relying on Python passing unknown string escapes through.
_DESC_NOISE = re.compile(r'\W+|\d+')


def desc_clean(word):
    """Clean one description string.

    Every run of non-word characters and every run of digits is replaced by a
    single space, then the text is lower-cased. Underscores count as word
    characters and are kept. Adjacent runs of different kinds (for example
    punctuation followed by digits) each produce their own space, so the result
    can contain consecutive spaces.

    word -- the text to clean (a string)
    Returns the cleaned, lower-cased string.
    """
    return _DESC_NOISE.sub(' ', word).lower()

# run the cleaner over every description
kickdesc = kickdesc.apply(desc_clean)

In [20]:
stop = set(stopwords.words('english'))
stemmer = SnowballStemmer(language='english')


def _normalise_tokens(text):
    """Drop stop words, stem what is left, keep stems longer than 2 characters,
    and join the survivors back into one space-separated string."""
    stems = [stemmer.stem(token) for token in text.split() if token not in stop]
    return ' '.join(stem for stem in stems if len(stem) > 2)


kickdesc = [_normalise_tokens(text) for text in kickdesc]

### Creating Count Features

In [45]:
# Upper bound on the number of count features built from the descriptions.
# The count matrix is converted to a dense array in a later cell, so memory use
# grows with (number of rows) x MAX_TEXT_FEATURES; lower this on a memory error.
# NOTE(review): earlier notes mention 650 features while the value in use is
# 2000 - confirm which one is intended.
MAX_TEXT_FEATURES = 2000
cv = CountVectorizer(max_features=MAX_TEXT_FEATURES)

In [46]:
# fit the vocabulary on all descriptions and materialise the counts as a dense
# ndarray (toarray) rather than the discouraged np.matrix returned by todense
alldesc = cv.fit_transform(kickdesc).toarray()

In [47]:
# data frame of the count features, columns named variable_0, variable_1, ...
combine = pd.DataFrame(alldesc).rename(columns=lambda idx: 'variable_'+ str(idx))

In [48]:
# split the text features back into their train and test parts: the first
# train.shape[0] rows belong to train, the remainder to test
n_train_rows = train.shape[0]

train_text = combine.iloc[:n_train_rows]
# renumber the test rows from 0
test_text = combine.iloc[n_train_rows:].reset_index(drop=True)

### Finalizing train and test data before merging

In [49]:
# hand-built (non-text) features that go into the model
cols_to_use = [
    'name_len', 'desc_len', 'keywords_len',        # character lengths
    'name_count', 'desc_count', 'keywords_count',  # token counts
    'time1', 'time3',                              # log durations
    'goal',                                        # log1p of the goal
]

In [50]:
# keep the labels aside while `train` still carries the 'final_status' column
target = train.loc[:, 'final_status']

In [51]:
# reduce both frames to the hand-built feature columns
train, test = train[cols_to_use], test[cols_to_use]

In [52]:
# place the hand-built features and the text count features side by side
X_train, X_test = [
    pd.concat([features, text_counts], axis=1)
    for features, text_counts in ((train, train_text), (test, test_text))
]

In [53]:
# sanity check: both matrices must have the same number of columns.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(X_train.shape)
print(X_test.shape)

(108129, 109)
(63465, 109)


### Model Training

In [57]:
# wrap the feature frames in xgboost DMatrix objects; only the training matrix
# is given labels
dtrain = xgb.DMatrix(X_train, label=target)
dtest = xgb.DMatrix(X_test)

In [58]:
# booster settings handed to xgb.cv / xgb.train
params = dict(
    objective='binary:logistic',
    eval_metric='error',
    eta=0.025,
    max_depth=6,
    subsample=0.7,
    colsample_bytree=0.7,
    min_child_weight=5,
)

In [None]:
# 5-fold cross-validation of the settings above.
# You can probably get better accuracy with rounds > 1000.
# nfold is a plain int: the Python 2 long literal `5L` is a syntax error on Python 3.
bst = xgb.cv(params, dtrain, num_boost_round=3000, nfold=5)

[0]	cv-test-error:0.316412+0.001646	cv-train-error:0.313944+0.002180
[1]	cv-test-error:0.314469+0.002762	cv-train-error:0.311762+0.002445
[2]	cv-test-error:0.314322+0.002378	cv-train-error:0.310712+0.002235
[3]	cv-test-error:0.313045+0.001742	cv-train-error:0.309771+0.001499
[4]	cv-test-error:0.312416+0.002054	cv-train-error:0.309459+0.001371
[5]	cv-test-error:0.312185+0.003046	cv-train-error:0.309140+0.000953
[6]	cv-test-error:0.312425+0.002733	cv-train-error:0.309332+0.000811
[7]	cv-test-error:0.312573+0.003049	cv-train-error:0.309965+0.001047
[8]	cv-test-error:0.312574+0.002844	cv-train-error:0.309121+0.000766
[9]	cv-test-error:0.312444+0.002691	cv-train-error:0.309115+0.000822
[10]	cv-test-error:0.312157+0.002536	cv-train-error:0.309165+0.000762
[11]	cv-test-error:0.312749+0.002566	cv-train-error:0.309718+0.000845
[12]	cv-test-error:0.312795+0.002787	cv-train-error:0.310016+0.000985
[13]	cv-test-error:0.313008+0.002880	cv-train-error:0.310511+0.001284
[14]	cv-test-error:0.312897+0.

In [82]:
# fit the final booster on the full training matrix
bst_train = xgb.train(params=params, dtrain=dtrain, num_boost_round=3000)

In [83]:
# model output for every test row (thresholded into 0/1 further down)
p_test = bst_train.predict(data=dtest)

In [84]:
# `test` was reduced to feature columns earlier, so the project ids are
# re-read from the original file
test1 = pd.read_csv('/home/dar/challenge/Machine_Learning/datafiles/test.csv')

# submission frame: project_id first, then the model output
sub = pd.DataFrame(
    {'project_id': test1['project_id'], 'final_status': p_test},
    columns=['project_id', 'final_status']
)

In [85]:
# turn the model output into class labels: 1 when strictly above 0.5, else 0
sub['final_status'] = (sub['final_status'] > 0.5).astype(int)

In [86]:
# write the submission file without the index column (scored ~0.70)
sub.to_csv(path_or_buf="xgb3.csv", index=False)