In [1]:
import numpy as np
import pandas as pd

In [2]:
train = pd.read_csv('./train_users_2.csv')
test = pd.read_csv('./test_users.csv')
train.drop('id', axis=1, inplace=True)
labels = train['country_destination'].values
train.drop('country_destination', axis=1, inplace=True)
print 'There are %d data points in training set.' % (train.shape[0])
print train.dtypes

There are 213451 data points in training set.
date_account_created        object
timestamp_first_active       int64
date_first_booking          object
gender                      object
age                        float64
signup_method               object
signup_flow                  int64
language                    object
affiliate_channel           object
affiliate_provider          object
first_affiliate_tracked     object
signup_app                  object
first_device_type           object
first_browser               object
dtype: object


# Data cleaning:
## According to the data exploration notebook, there are a few steps we need to clean the data
1. For age, we need to replace users with invalid age(too large or too small) with nan
2. For gender, we need to replace unknown gender with nan
3. For timestamp, we need to delete the data with NULL timestamp
4. Convert categorical features to type "category"
5. Convert date and time data to type "date"

In [3]:
train_index = train.shape[0]
id_test = test['id']
df_all = pd.concat((train, test), axis=0, ignore_index=True)
df_all.drop(['id'], axis=1, inplace=True)
print df_all.columns

Index([u'affiliate_channel', u'affiliate_provider', u'age',
       u'date_account_created', u'date_first_booking',
       u'first_affiliate_tracked', u'first_browser', u'first_device_type',
       u'gender', u'language', u'signup_app', u'signup_flow', u'signup_method',
       u'timestamp_first_active'],
      dtype='object')


### Concat train and test data to do data cleaning

In [4]:
df_all.loc[df_all['age']>95] = np.nan
df_all.loc[df_all['age']<15] = np.nan
df_all['gender'].replace('-unknown', np.nan, inplace=True)
df_all = df_all.dropna(axis=0, how='all')
categorical = ['affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'first_browser'
              , 'first_device_type', 'gender', 'language', 'signup_app', 'signup_flow', 'signup_method']
for cat in categorical:
    df_all = pd.concat((df_all, pd.get_dummies(df_all[cat].astype('category'), prefix=cat)), axis=1)
    df_all.drop([cat], axis=1, inplace=True)

In [5]:
# Now let us take a look at missing data
df_null = df_all.isnull().sum() / df_all.shape[0] * 100
print df_null[df_null>0]

# Since more than half of the data for date_first_booking is missing, I will simply drop it
df_all.drop(['date_first_booking'], axis=1, inplace=True)

age                   42.856567
date_first_booking    67.892596
dtype: float64


### Convert date_created to year, month, day

In [6]:
date_created = df_all['date_account_created'].apply(lambda x:x.split('-'))
df_all['year_created'] = date_created.apply(lambda x: int(x[0]))
df_all['month_created'] = date_created.apply(lambda x: int(x[1]))
df_all['day_created'] = date_created.apply(lambda x: int(x[2]))
df_all.drop(['date_account_created'], axis=1, inplace=True)

### Convert time first active to year, month, day

In [7]:
time_active = df_all['timestamp_first_active'].astype(int).astype(str).apply(lambda x:[x[0:4], x[4:6], x[6:8]])
df_all['year_active'] = time_active.apply(lambda x:int(x[0]))
df_all['month_active'] = time_active.apply(lambda x:int(x[1]))
df_all['day_active'] = time_active.apply(lambda x:int(x[2]))
df_all.drop(['timestamp_first_active'], axis=1, inplace=True)

In [8]:
# Fill the nan age with -1
df_all.fillna(value=0, inplace=True)

# Training phase : Model selection
1. Split data into train and test subsets, using the data point count we had before;
2. This is a multi-class classification problem, we can try DecisionTree, LogisticRegression and SVM to begin with

In [9]:
from sklearn.preprocessing import LabelEncoder
values = df_all.values
X_train = values[:train_index]
X_test = values[train_index:]
le = LabelEncoder()
y = le.fit_transform(labels)

In [10]:
# Implement the ndcg measure used by Kaggle, here I used the hack for this problem that IDCG_k is 1 if the correct result
# is in the predicted list and zero otherwise. 
from time import time
def ndcg(y_true, y_pred, k=5):
    ''' Normalized Discounted Cumulative Score.
    
    Parameters
    ----------
    
    y_true : list, shape=[n_samples]
             True class labels
    y_pred : list, shape=[n_samples, n_classes]
             Predicted probabilities
    k : int
        Number of probabilities to use
    
    Return
    ---------
    score: float
           The mean NDCG score on the data set
    '''
    
    def dcg(y_true, y_pred, k=5):
        order = sorted(range(len(y_pred)), key=lambda k:y_pred[k], reverse=True)
        # Assume the classed are dictinct
        for i in range(1,k+1):
            if order[i-1] == y_true:
                return 1/np.log2(i+1)
        return 0
    
    score = []
    for i in range(len(y_true)):
        score.append(dcg(y_true[i], y_pred[i], k))
    return np.mean(score)

def train_predict(X_train, y, X_test, y_test, clf):
    start = time()
    clf.fit(X_train, y)
    end = time()
    train_time = end - start
      
    pred = clf.predict_proba(X_train)
    score = ndcg(y, pred)
    
    start = time()
    pred_test = clf.predict_proba(X_test)
    end = time()
    predict_time = end - start
    
    score_test = ndcg(y_test, pred_test)
    print "Trained model %s on %d data points in %.4f seconds." % (clf.__class__.__name__,X_train.shape[0],train_time)
    print "Predicted on the training set in %.4f seconds." % (predict_time)
    print "The score on the training set is %.4f and that on the test set is %.4f" % (score, score_test)
    print "\n"

In [11]:
# Prepare the data set with increasing size for comparison among the classifiers
size = X_train.shape[0]/4

train_1 = X_train[0:size]
test_1 = y[0:size]
train_2 = X_train[0:size*2]
test_2 = y[0:size*2]
train_3 = X_train[0:size*3]
test_3 = y[0:size*3]

test_X = X_train[size*3:]
test_y = y[size*3:]

In [12]:
# Initialize the classifiers for experiment
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
RAND_SEED = 0
dtclf = DecisionTreeClassifier(min_samples_split=20, random_state=RAND_SEED)
lrclf = LogisticRegression()
nbclf = MultinomialNB()
clfs = [dtclf, lrclf, nbclf]

In [19]:
for clf in clfs:
    train_predict(train_1, test_1, test_X, test_y, clf)
    train_predict(train_2, test_2, test_X, test_y, clf)
    train_predict(train_3, test_3, test_X, test_y, clf)

Trained model DecisionTreeClassifier on 53362 data points in 0.6148 seconds.
Predicted on the training set in 0.0337 seconds.
The score on the training set is 0.8596 and that on the test set is 0.6507


Trained model DecisionTreeClassifier on 106724 data points in 2.2828 seconds.
Predicted on the training set in 0.0362 seconds.
The score on the training set is 0.8607 and that on the test set is 0.6905


Trained model DecisionTreeClassifier on 160086 data points in 2.2978 seconds.
Predicted on the training set in 0.0428 seconds.
The score on the training set is 0.8666 and that on the test set is 0.7078


Trained model LogisticRegression on 53362 data points in 6.4993 seconds.
Predicted on the training set in 0.0870 seconds.
The score on the training set is 0.7758 and that on the test set is 0.8221


Trained model LogisticRegression on 106724 data points in 9.5587 seconds.
Predicted on the training set in 0.0938 seconds.
The score on the training set is 0.7873 and that on the test set is

** Classifier 1: Decision Tree **

| Training Set Size | Training Time | Prediction Time (test) | NDCG (train)| NDCG (test) |
| :---------------: | :---------------------: | :--------------------: | :--------------: | :-------------: |
| 53362             |  0.6148                 |   0.0337               | 0.8596           |  0.6507         |
| 106724            |  2.2828                 |   0.0362               | 0.8607           |  0.6905         |
| 160086            |  2.2978                 |   0.0428               | 0.7873           |  0.8223         |

** Classifier 2: Logistic Regression **

| Training Set Size | Training Time | Prediction Time (test) | NDCG (train)| NDCG (test) |
| :---------------: | :---------------------: | :--------------------: | :--------------: | :-------------: |
| 53362             |  6.4993                 |   0.0870               | 0.7758           |  0.8221         |
| 106724            |  9.5587                 |   0.0938               | 0.7873           |  0.8223         |
| 160086            |  13.2305                |   0.1632               | 0.8016           |  0.8223         |

** Classifier 3: Multinomial Naive Bayes **

| Training Set Size | Training Time | Prediction Time (test) | NDCG (train)| NDCG (test) |
| :---------------: | :---------------------: | :--------------------: | :--------------: | :-------------: |
| 53362             |  0.0779                 |   0.0789               | 0.7607           |  0.6616         |
| 106724            |  0.1998                 |   0.1644               | 0.7813           |  0.8062         |
| 160086            |  0.5288                 |   0.1133               | 0.7945           |  0.8091         |

## Training phase: Cross validation
As we can see that when training on the whole data set, DecisionTree is a better model in training time, and NDCG on the test set, so I will dig deeper into DecisionTree by cross-validation on the parameters.

In [14]:
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold
kf = KFold(n=X_train.shape[0], n_folds=4)
params = {'criterion': ['gini', 'entropy'], 'min_samples_split':[50, 100, 150, 200, 250]}
clf = DecisionTreeClassifier(random_state=RAND_SEED)
ndcg_scorer = make_scorer(ndcg, needs_proba=True)

In [15]:
grid = GridSearchCV(clf, params, ndcg_scorer)
grid.fit(X_train, y)

GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=0, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'min_samples_split': [50, 100, 150, 200, 250], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=make_scorer(ndcg, needs_proba=True), verbose=0)

In [16]:
best = grid.best_estimator_
print best

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=200, min_weight_fraction_leaf=0.0,
            presort=False, random_state=0, splitter='best')


In [17]:
pred = best.predict_proba(X_train)
score = ndcg(y, pred)

In [18]:
print score

0.816928123685


## Training Phase: Ensemble learning
1. Random forest classifier
2. Ada boosted tree
3. Gradient boosted tree

In [25]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from xgboost.sklearn import XGBClassifier
rfclf = RandomForestClassifier(criterion='entropy', min_samples_split=20, random_state=RAND_SEED, class_weight='balanced')
adclf = AdaBoostClassifier(base_estimator=best, random_state=RAND_SEED)
xgclf = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25, objective='multi:softprob', subsample=0.5, 
                     colsample_bytree=0.5, seed=RAND_SEED)
eclfs = [rfclf, adclf, xgclf]

In [26]:
for clf in eclfs:
    train_predict(train_1, test_1, test_X, test_y, clf)
    train_predict(train_2, test_2, test_X, test_y, clf)
    train_predict(train_3, test_3, test_X, test_y, clf)

Trained model RandomForestClassifier on 53362 data points in 0.7124 seconds.
Predicted on the training set in 0.1468 seconds.
The score on the training set is 0.7614 and that on the test set is 0.5895


Trained model RandomForestClassifier on 106724 data points in 1.5294 seconds.
Predicted on the training set in 0.1748 seconds.
The score on the training set is 0.7582 and that on the test set is 0.6114


Trained model RandomForestClassifier on 160086 data points in 2.7595 seconds.
Predicted on the training set in 0.1866 seconds.
The score on the training set is 0.7654 and that on the test set is 0.6482


Trained model AdaBoostClassifier on 53362 data points in 22.5017 seconds.
Predicted on the training set in 1.5054 seconds.
The score on the training set is 0.8060 and that on the test set is 0.7551


Trained model AdaBoostClassifier on 106724 data points in 58.8780 seconds.
Predicted on the training set in 1.4139 seconds.
The score on the training set is 0.8048 and that on the test set 

In [34]:
# Generate submission file
y_pred = xgclf.predict_proba(X_test)
ids = []
pred = []
for i in range(len(y_pred)):
    ids += [id_test[i]] * 5
    order = sorted(range(len(y_pred[i])), key=lambda k:y_pred[i][k], reverse=True)
    pred += le.inverse_transform(order[:5]).tolist()

In [40]:
submission = pd.DataFrame(np.column_stack((ids, pred)), columns=['id', 'country'])
submission.to_csv('sub.csv', index=False)