In [14]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from lib.ndcg import *

%pylab inline
import seaborn as sns
sns.set_style('darkgrid')

#some ipython options
pd.set_option('display.max_rows', 400)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 2000)

np.random.seed(0)

Populating the interactive namespace from numpy and matplotlib


In [15]:
#feature engineering
#Loading data
df_train = pd.read_csv('../input/train_users_2.csv')
df_test = pd.read_csv('../input/test_users.csv')
#sessions = pd.read_csv('../input/sessions.csv')

#df_train = pd.read_csv('train_sess_01.csv', header= 0, sep = ',')
#df_test = pd.read_csv('test_sess_01.csv', header= 0, sep = ',')

In [16]:
df_train = df_train.iloc[:,1:]
df_test = df_test.iloc[:,1:]

In [17]:
countries = pd.read_csv('../input/countries.csv') 

labels = df_train['country_destination'].values
df_train = df_train.drop(['country_destination'], axis=1)
id_test = df_test['id']
piv_train = df_train.shape[0]

##reformat countries
otherdf = {'country_destination': 'other', 
            'lat_destination':countries['lat_destination'].median(),
            'lng_destination':countries['lng_destination'].median(),
            'distance_km':countries['distance_km'].median(),
            'destination_km2':countries['destination_km2'].median(),
            #'destination_language':'eng',
            'language_levenshtein_distance':countries['language_levenshtein_distance'].median()
           }
ndf = { 'country_destination': 'NDF', 
        'lat_destination': countries['lat_destination'][9],
        'lng_destination': countries['lng_destination'][9],
        'distance_km':-countries['distance_km'][9],
        'destination_km2':countries['destination_km2'][9],
        #'destination_language':'eng',
        'language_levenshtein_distance':0.0
           }
countries = countries.append(otherdf, ignore_index = True)
countries = countries.append(ndf, ignore_index = True)

#df_train['language_levenshtein_distance'] =  map(lambda x: countries[countries['country_destination'] == x]['language_levenshtein_distance'].values[0], 
                                                #df_train['country_destination'])

#df_test['language_levenshtein_distance'] =  map(lambda x: countries[countries['country_destination'] == x]['language_levenshtein_distance'].values[0], 
                                                #df_train['country_destination'])

KeyError: 'id'

In [6]:
def data_reset(df_t):
    """
    df_t = pd.merge(df_t, 
                   sessions_data, 
                   left_on='id', 
                   right_on = 'user_id', 
                   how = 'left')
    """
    dac = np.vstack(df_t.date_account_created.astype(str).apply(lambda x: list(map(int, x.split('-')))).values)
    df_t['dac_year'] = dac[:,0]
    df_t['dac_month'] = dac[:,1]
    df_t['dac_day'] = dac[:,2]
    df_t = df_t.drop(['date_account_created'], axis=1)

    tfa = np.vstack(df_t.timestamp_first_active.astype(str).apply(lambda x: list(map(int, [x[:4],x[4:6],x[6:8],x[8:10],x[10:12],x[12:14]]))).values)
    df_t['tfa_year'] = tfa[:,0]
    df_t['tfa_month'] = tfa[:,1]
    df_t['tfa_day'] = tfa[:,2]
    df_t = df_t.drop(['timestamp_first_active'], axis=1)
    df_t = df_t.drop(['id', 'date_first_booking'], axis=1)
    #df_t = df_t.drop(['id', 'date_first_booking','user_id'], axis=1)
    df_t = df_t.fillna(-1)
    
    return df_t

In [None]:
#sessions_data = pd.read_csv('../input/sessions_reformat.csv')
df_train = data_reset(df_train)
df_test = data_reset(df_test)
#df_train = df_train.drop(['country_destination'], axis=1)

In [8]:
def featuresOnehot(df_train, df_test, oneHotCols, featureCols):
    df_t = pd.concat((df_train, df_test), axis=0, ignore_index=True)
    av = df_t.age.values
    df_t['age'] = np.where(np.logical_or(av<14, av>100), -1, av)
    features = df_t[featureCols]
    print list(features), len(list(features))
    featureDF = pd.get_dummies(features, 
                   columns = oneHotCols,
                   prefix = oneHotCols,
                   prefix_sep = '_',
                   dummy_na = False)
    
    return featureDF

In [9]:
oneHotCols = ['gender', 
             'signup_method', 
             'signup_flow', 
             'language', 
             'affiliate_channel', 
             'affiliate_provider', 
             'first_affiliate_tracked', 
             'signup_app', 
             #'signup_method',
             'first_device_type', 
             'first_browser']

featureCols = ['gender',
               'age',
               'signup_method',
               'signup_flow',
               'language',
               'affiliate_channel',
               'affiliate_provider',
               'first_affiliate_tracked',
               'signup_app',
               #'signup_method',
               'first_device_type',
               'first_browser']

In [10]:
df_all = featuresOnehot(df_train, df_test, oneHotCols, featureCols)

['gender', 'age', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser'] 11


In [10]:
vals = df_all.values
X = vals[:piv_train]
le = LabelEncoder()
y = le.fit_transform(labels)   
X_test = vals[piv_train:]

In [None]:
xgb = XGBClassifier(max_depth= 6, learning_rate=0.1, n_estimators=300,
                    objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)  

xgb.fit(X, y)

In [None]:
y_pred = xgb.predict_proba(X_test) 

#Taking the 5 classes with highest probabilities
ids = []  #list of ids
cts = []  #list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

#Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('sub_sess_xgb_Feb8.csv',index=False)

In [None]:
# randomforest clf with all of our new tricks:
from sklearn.ensemble import RandomForestClassifier as RF

#build a random forest classifier
# n_estimators - number of trees- dial this up for increased accuracy at the cost of overfitting and increased runtime
# n_jobs - the parelelism with which the forest is trained. You might want to dial this down if you are not running a quadcore
# class_weight - weighting tactic. Since we are dealing with a hidiously imbalanced dataset, balanced is important
#    this means that the RF classifier optimizes as if there were as many move observations as no-move observations
#    the cost is dicreased overall accuracy since we are effectivly discounting no-move observations
# min_sample_split and min_weight_fraction_leaf - these are effectivly regularization options
#   they effect how much data is needed to support each tree split and ensure that the trees don't get too deep
#   at the cost of overall accuracy

rfclf = RF(n_estimators= 500, n_jobs = 8, class_weight = 'balanced', 
           min_samples_split = 20, min_weight_fraction_leaf =0.0001,
           verbose = 0)

# train the model on the data
rfclf.fit(X, y)

In [None]:
y_pred = xgb.predict_proba(X_test) 

#Taking the 5 classes with highest probabilities
ids = []  #list of ids
cts = []  #list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

#Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('sub_sess01_rb_Feb8.csv',index=False)

## Cross Validation

In [11]:
import numpy as np
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import make_scorer

def dcg_score(y_true, y_score, k=5):
    """Discounted cumulative gain (DCG) at rank K.

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array, shape = [n_samples, n_classes]
        Predicted scores.
    k : int
        Rank.

    Returns
    -------
    score : float
    """
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])

    gain = 2 ** y_true - 1

    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gain / discounts)


def ndcg_score(ground_truth, predictions, k=5):
    """Normalized discounted cumulative gain (NDCG) at rank K.

    Normalized Discounted Cumulative Gain (NDCG) measures the performance of a
    recommendation system based on the graded relevance of the recommended
    entities. It varies from 0.0 to 1.0, with 1.0 representing the ideal
    ranking of the entities.

    Parameters
    ----------
    ground_truth : array, shape = [n_samples]
        Ground truth (true labels represended as integers).
    predictions : array, shape = [n_samples, n_classes]
        Predicted probabilities.
    k : int
        Rank.

    Returns
    -------
    score : float

    Example
    -------
    >>> ground_truth = [1, 0, 2]
    >>> predictions = [[0.15, 0.55, 0.2], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> score = ndcg_score(ground_truth, predictions, k=2)
    1.0
    >>> predictions = [[0.9, 0.5, 0.8], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> score = ndcg_score(ground_truth, predictions, k=2)
    0.6666666666
    """
    lb = LabelBinarizer()
    lb.fit(range(len(predictions) + 1))
    T = lb.transform(ground_truth)

    scores = []

    # Iterate over each y_true and compute the DCG score
    for y_true, y_score in zip(T, predictions):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        score = float(actual) / float(best)
        scores.append(score)

    return np.mean(scores)


# NDCG Scorer function
ndcg_scorer = make_scorer(ndcg_score, needs_proba=True, k=5)


In [1]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

xgb = XGBClassifier(max_depth= 5, learning_rate=0.1, n_estimators=8,
                    objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)  

xgb.fit(X_train, y_train)

NameError: name 'X' is not defined

In [None]:
y_pred_lbs = []  #list of countries
for i in range(len(y_pred)):
    y_pred_lbs.append(le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist())

#from lib import ndcg
preds = pd.DataFrame(y_pred_lbs)
truth = pd.Series(le.inverse_transform(y_test))

print ndcg_score(preds, truth)

In [None]:
param = {'max_depth':8, 'eta':0.1,
         'subsample':0.75, 'seed':1,
         'colsample_bytree':0.75,
         'objective':'multi:softprob',
         'eval_metric': 'mlogloss',
         'num_class':12,
         'verbose':0,
         'nthread':4}

In [13]:
from sklearn.grid_search import GridSearchCV
import xgboost
xgb_model = xgboost.XGBClassifier(objective="multi:softprob", nthread=-1)

clf = GridSearchCV(
    xgb_model,
    {
        'max_depth': [1, 2, 3],
        'n_estimators': [4, 5, 6],
        'learning_rate': [0.1, 0.2],
    },
    cv=10,
    verbose=10,
    n_jobs=1,
    scoring=ndcg_scorer
)

clf.fit(X, y)

Fitting 10 folds for each of 18 candidates, totalling 180 fits
[CV] n_estimators=4, learning_rate=0.1, max_depth=1 ..................
[CV]  n_estimators=4, learning_rate=0.1, max_depth=1, score=0.799007 -  19.1s
[CV] n_estimators=4, learning_rate=0.1, max_depth=1 ..................
[CV]  n_estimators=4, learning_rate=0.1, max_depth=1, score=0.806662 -  18.2s
[CV] n_estimators=4, learning_rate=0.1, max_depth=1 ..................


[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:   19.1s


MemoryError: 

In [None]:
##feature importance
import operator
importance = gbdt.get_fscore()
importance = sorted(importance.items(), key=operator.itemgetter(1))

In [None]:
imdf = pd.DataFrame(importance, columns=['feature', 'fscore'])
imdf['fscore'] = imdf['fscore'] / imdf['fscore'].sum()

XList = list(df_all)
feat=[XList[int(filter(lambda x: x in '0123456789.', imdf.feature[i]))] for i in range(len(imdf))]

xgbFImportance = pd.DataFrame(sorted(zip(imdf.fscore, feat), 
             reverse=True), columns=['Importance','Feature'])

plt.figure()
xgbFImportance.plot()
xgbFImportance.head(60).plot(kind='barh', x='Feature', y='Importance', legend=False, figsize=(10, 18))
plt.title('XGBoost Feature Importance')
plt.xlabel('relative importance')

feats = [f for f in xgbFImportance.head(60).Feature]

In [None]:
df_imp = df_all[feats]
#Splitting train and test
vals = df_imp.values
X = vals[:piv_train]
le = LabelEncoder()
y = le.fit_transform(labels)   
X_test = vals[piv_train:]

In [None]:
y_pred = bst.predict( xgb_test )

In [None]:
y_pred_lbs = []  #list of countries
for i in range(len(y_pred)):
    y_pred_lbs.append(le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist())

from lib import ndcg
preds = pd.DataFrame(y_pred_lbs)
truth = pd.Series(le.inverse_transform(y_test))
sc = score_predictions(preds, truth)

print sc.mean(axis=0)

In [None]:
xgbt.cv(param, 
        xgb_train, 
        num_round, 
        nfold=5,
        metrics={'merror'}, 
        seed = 0)

In [None]:
from sklearn.cross_validation import KFold
import xgboost as xgbt
from sklearn.grid_search import GridSearchCV

k_fold = KFold(len(X), 5)

In [None]:
def gridsearch(param):
    num_round = 30
    ndcglist = []
    for k, (train, test) in enumerate(k_fold):
        xgb_train = xgbt.DMatrix(X[train], label= y[train])
        xgb_test = xgbt.DMatrix(X[test], label= y[test])
        watchlist = [ (xgb_train,'train'), (xgb_test, 'test') ]
        bst = xgbt.train(param, xgb_train, num_round, watchlist)
        y_pred = bst.predict( xgb_test )
        y_pred_lbs = []  #list of countries
        for i in range(len(y_pred)):
            y_pred_lbs.append(le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist())
    
    from lib import ndcg
    preds = pd.DataFrame(y_pred_lbs)
    truth = pd.Series(le.inverse_transform(y[test]))
    sc = score_predictions(preds, truth)
    ndcglist.append(sc.mean(axis=0))
    return mean(ndcglist)

In [None]:
df_res = pd.DataFrame(results, columns = ['max_depth', 'eta', 'sub', 'col', 'ndcg'])

In [None]:
df_train = pd.merge(df_train, 
                   sessions_data, 
                   left_on='id', 
                   right_on = 'user_id', 
                   how = 'left')

df_test = pd.merge(df_test, 
                   sessions_data, 
                   left_on='id', 
                   right_on = 'user_id', 
                   how = 'left')

df_train = df_train.drop(['id', 'date_first_booking','user_id'], axis=1)
df_test = df_test.drop(['id', 'date_first_booking','user_id'], axis=1)

df_train.fillna(-1)
df_test.fillna(-1)

#Creating a DataFrame with train+test data
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
#Removing id and date_first_booking
df_all = df_all.drop(['id', 'date_first_booking','user_id'], axis=1)
#Filling nan
df_all = df_all.fillna(-1)

#####Feature engineering#######
#date_account_created
dac = np.vstack(df_all.date_account_created.astype(str).apply(lambda x: list(map(int, x.split('-')))).values)
df_all['dac_year'] = dac[:,0]
df_all['dac_month'] = dac[:,1]
df_all['dac_day'] = dac[:,2]
df_all = df_all.drop(['date_account_created'], axis=1)

#timestamp_first_active
tfa = np.vstack(df_all.timestamp_first_active.astype(str).apply(lambda x: list(map(int, [x[:4],x[4:6],x[6:8],x[8:10],x[10:12],x[12:14]]))).values)
df_all['tfa_year'] = tfa[:,0]
df_all['tfa_month'] = tfa[:,1]
df_all['tfa_day'] = tfa[:,2]
df_all = df_all.drop(['timestamp_first_active'], axis=1)

In [None]:
#Splitting train and test
vals = df_all.values
X = vals[:piv_train]
le = LabelEncoder()
y = le.fit_transform(labels)   
X_test = vals[piv_train:]

## Reformat sessions data

sessions = pd.read_csv('../input/sessions.csv')

grpby = sessions.groupby(['user_id'])['secs_elapsed'].sum().reset_index()
grpby.columns = ['user_id','secs_elapsed']

action_type = pd.pivot_table(sessions, index = ['user_id'],columns = ['action_type'],values = 'action',aggfunc=len,fill_value=0).reset_index()
action_type = action_type.drop(['booking_response'],axis=1)

device_type = pd.pivot_table(sessions, index = ['user_id'],columns = ['device_type'],values = 'action',aggfunc=len,fill_value=0).reset_index()
device_type = device_type.drop(['Blackberry','Opera Phone','iPodtouch','Windows Phone'],axis=1)

sessions_data = pd.merge(action_type,device_type,on='user_id',how='inner')
sessions_data = pd.merge(sessions_data,grpby,on='user_id',how='inner')

def oneHotfeats(oneHotCols, df_t):
    av = df_t.age.values
    df_t['age'] = np.where(np.logical_or(av<14, av>100), -1, av)
    
    featureDF = pd.get_dummies(df_t, 
                   columns = oneHotCols,
                   prefix = oneHotCols,
                   prefix_sep = '_',
                   dummy_na = False)
        
    return featureDF

#Age
av = df_all.age.values
df_all['age'] = np.where(np.logical_or(av<14, av>100), -1, av)

#One-hot-encoding features
ohe_feats = ['gender', 
             'signup_method', 
             'signup_flow', 
             'language', 
             'affiliate_channel', 
             'affiliate_provider', 
             'first_affiliate_tracked', 
             'signup_app', 
             'signup_method',
             'first_device_type', 
             'first_browser']

for f in ohe_feats:
    df_all_dummy = pd.get_dummies(df_all[f], prefix=f)
    df_all = df_all.drop([f], axis=1)
    df_all = pd.concat((df_all, df_all_dummy), axis=1)

In [None]:
## study correlation between 'date_first_booking' isnull and 'NDF'
df_train[(df_train['date_first_booking'].isnull()) & (df_train['country_destination'] != 'NDF')]