In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

In [40]:
# Read the three raw CSV files (paths are relative to the notebook's working directory).
users, test_users, sessions = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train_users_2.csv',
        '../data/test_users.csv',
        '../data/sessions.csv',
    )
)

In [41]:
# Binary label: 0 when country_destination is 'NDF', 1 for any other value.
# A vectorised comparison gives the same column as the former row-wise
# apply(axis=1) without a Python-level call per row.
users['target'] = (users['country_destination'] != 'NDF').astype(int)

In [42]:
# Class balance check: number of non-null ids in each target class.
users.groupby('target')['id'].count()

target
0    124543
1     88908
Name: id, dtype: int64

In [43]:
# Balance the classes: keep the first 88000 rows of each target value
# (in their current order), class 0 rows followed by class 1 rows.
users = pd.concat([
    users[users['target'] == label].head(88000)
    for label in (0, 1)
])

In [46]:
# Shuffle all rows so the two classes are interleaved. The seed is fixed so
# that re-running the notebook yields the same row order (and therefore the
# same cross-validation folds further down).
users = users.sample(frac=1, random_state=42)

In [79]:
# True when gender is anything other than the '-unknown-' / 'OTHER' placeholders.
users['gender_revealed'] = ~users['gender'].isin(['-unknown-', 'OTHER'])

In [71]:
def process_age(age, reference_year=2015):
    """Normalise a raw age value to a plausible age, or -1 when unusable.

    Values above 1000 are treated as a birth year and converted to an age
    relative to ``reference_year``. The (possibly converted) value is then
    accepted only if it lies in ``[0, 100)``; everything else, including NaN
    (for which every comparison is False), maps to the sentinel -1.

    Previously the converted birth year was returned unchecked, so e.g. 1900
    produced 115 and 2020 produced -5, which contradicted the ``< 100`` rule
    applied to directly entered ages.

    Parameters
    ----------
    age : float or int
        Raw value from the ``age`` column.
    reference_year : int, default 2015
        Year used to turn a birth year into an age.

    Returns
    -------
    float or int
        The cleaned age, or -1 if the value is missing or implausible.
    """
    if age > 1000:
        # Looks like a birth year rather than an age.
        age = reference_year - age
    if 0 <= age < 100:
        return age
    return -1

# Clean the age column element-wise. Mapping over the single column avoids
# building a row object for every record, as the former apply(axis=1) did.
users['age'] = users['age'].map(process_age)

# age_group is the number of upper bounds the cleaned age falls below:
# the -1 sentinel is below all eight bounds (group 8), an age of 40 is below
# only 60 and 100 (group 2).
age_upper_bounds = [0, 18, 25, 30, 35, 40, 60, 100]
users['age_group'] = sum((users['age'] < bound).astype(int) for bound in age_upper_bounds)

In [81]:
# signup_flow_group = how many of the cut-offs 3, 7 and 100 the signup_flow
# value lies strictly below (summing the boolean masks yields an integer count).
users['signup_flow_group'] = sum(
    users['signup_flow'] < cutoff for cutoff in (3, 7, 100)
)

In [91]:
# Affiliate channels kept as their own category; every other value
# (including missing ones) is collapsed into 'unimportant'.
kept_aff_channels = {'api', 'other', 'content'}

# Vectorised replacement for the former row-wise apply(axis=1): keep the
# original value where it is in the kept set, otherwise use 'unimportant'.
users['affiliate_channel_group'] = users['affiliate_channel'].where(
    users['affiliate_channel'].isin(kept_aff_channels), 'unimportant')

In [99]:
# Device types kept as their own category; every other value
# (including missing ones) is collapsed into 'unimportant'.
kept_first_device_types = {'Mac Desktop', 'Other/Unknown'}

# Vectorised replacement for the former row-wise apply(axis=1): keep the
# original value where it is in the kept set, otherwise use 'unimportant'.
users['first_device_type_group'] = users['first_device_type'].where(
    users['first_device_type'].isin(kept_first_device_types), 'unimportant')

In [48]:
# Replace missing first_affiliate_tracked values with the '-unknown-' placeholder.
users['first_affiliate_tracked'] = users['first_affiliate_tracked'].where(
    users['first_affiliate_tracked'].notna(), '-unknown-')

In [100]:
# Preview the first rows to sanity-check the engineered columns.
users.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,...,signup_app,first_device_type,first_browser,country_destination,target,age_group,gender_revealed,signup_flow_group,affiliate_channel_group,first_device_type_group
51166,l1nce2xpuv,2012-12-05,20121205202520,,FEMALE,40.0,facebook,0,en,direct,...,Web,Windows Desktop,IE,NDF,0,2,True,3,unimportant,unimportant
79164,ggldfdrdv5,2013-05-24,20130524011757,2013-05-24,MALE,26.0,basic,0,en,direct,...,Web,Mac Desktop,Chrome,NL,1,5,True,3,unimportant,Mac Desktop
4200,k8n0y2rphe,2011-04-12,20110412203206,,-unknown-,-1.0,facebook,2,en,direct,...,Web,Windows Desktop,IE,NDF,0,8,False,3,unimportant,unimportant
36723,3yfheape9l,2012-08-16,20120816074816,,FEMALE,-1.0,basic,0,en,direct,...,Web,iPad,Mobile Safari,NDF,0,8,True,3,unimportant,unimportant
75448,cqbratofs5,2013-05-07,20130507161522,2013-10-26,MALE,36.0,basic,0,en,sem-brand,...,Web,Windows Desktop,Chrome,US,1,3,True,3,unimportant,unimportant


In [126]:
# Columns excluded from the feature matrix: identifiers, dates, the label and
# its source column, and the raw columns that the *_group / gender_revealed
# features above were derived from.
to_drop = [
    'date_first_booking',
    'id',
    'target',
    'country_destination',
    'date_account_created',
    'timestamp_first_active',
    'age',
    'gender',
    'signup_flow',
    'affiliate_channel',
    'first_device_type',
    'language',
]
data = users.drop(columns=to_drop)

In [127]:
# Display the feature frame (pandas truncates the rendering of long frames).
data

Unnamed: 0,signup_method,affiliate_provider,first_affiliate_tracked,signup_app,first_browser,age_group,gender_revealed,signup_flow_group,affiliate_channel_group,first_device_type_group
51166,facebook,direct,linked,Web,IE,2,True,3,unimportant,unimportant
79164,basic,direct,linked,Web,Chrome,5,True,3,unimportant,Mac Desktop
4200,facebook,direct,untracked,Web,IE,8,False,3,unimportant,unimportant
36723,basic,direct,untracked,Web,Mobile Safari,8,True,3,unimportant,unimportant
75448,basic,google,omg,Web,Chrome,3,True,3,unimportant,unimportant
...,...,...,...,...,...,...,...,...,...,...
57734,facebook,facebook-open-graph,untracked,Web,Firefox,2,True,3,other,Mac Desktop
119297,basic,direct,linked,Web,IE,8,False,3,unimportant,unimportant
106201,facebook,google,omg,Web,Chrome,5,True,3,unimportant,unimportant
3716,basic,direct,untracked,Web,IE,3,False,3,unimportant,unimportant


In [128]:
# Label vector: the binary target column created earlier.
y = users.loc[:, 'target']
# Feature matrix: get_dummies expands the string-valued columns of `data`
# into indicator columns and leaves the numeric/boolean ones as they are.
X = pd.get_dummies(data)

In [129]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Random forest with trees capped at depth 14. random_state is fixed so the
# bootstrap samples and feature sub-sampling - and hence the reported score -
# are the same on every run.
model = RandomForestClassifier(max_depth=14, random_state=42)

# Mean of the per-fold scores from 4-fold cross-validation
# (the classifier's default scorer is used).
cross_val_score(model, X, y, cv=4).mean()

0.6819034090909091