See https://www.kaggle.com/kashnitsky/correct-time-aware-cross-validation-scheme/notebook

In [1]:
# Import libraries and set desired options
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [2]:
# A helper function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

Read training and test sets, sort train set by session start time.

In [3]:
train_df = pd.read_csv('../../../data/train_sessions.csv',
                       index_col='session_id', parse_dates=['time1'])
test_df = pd.read_csv('../../../data/test_sessions.csv',
                      index_col='session_id', parse_dates=['time1'])

# Sort the data by time
train_df = train_df.sort_values(by='time1')

# Look at the first rows of the training set
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,,,,,,...,,,,,,,,,,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,,...,,,,,,,,,,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


scale the site values?

In [10]:
sites = ['site%s' % i for i in range(1, 11)]
train_df[sites] = StandardScaler().fit_transform(train_df[sites])
test_df[sites] = StandardScaler().fit_transform(test_df[sites])


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [11]:
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,-0.439845,2013-01-12 08:05:57,-0.44029,2013-01-12 08:05:57,,,,,,,...,,,,,,,,,,0
54843,-0.439845,2013-01-12 08:37:23,-0.44029,2013-01-12 08:37:23,-0.442036,2013-01-12 09:07:07,-0.443631,2013-01-12 09:07:09,,,...,,,,,,,,,,0
77292,-0.317035,2013-01-12 08:50:13,-0.318359,2013-01-12 08:50:14,-0.32023,2013-01-12 08:50:15,-0.322756,2013-01-12 08:50:15,-0.324863,2013-01-12 08:50:16,...,2013-01-12 08:50:16,-0.329161,2013-01-12 08:50:16,-0.354077,2013-01-12 08:50:16,-0.333206,2013-01-12 08:50:17,-0.335591,2013-01-12 08:50:17,0
114021,-0.317173,2013-01-12 08:50:17,-0.318086,2013-01-12 08:50:17,-0.320503,2013-01-12 08:50:18,-0.322485,2013-01-12 08:50:18,-0.324999,2013-01-12 08:50:18,...,2013-01-12 08:50:18,-0.329296,2013-01-12 08:50:19,-0.332534,2013-01-12 08:50:19,-0.333608,2013-01-12 08:50:19,-0.335591,2013-01-12 08:50:20,0
146670,-0.316897,2013-01-12 08:50:20,-0.317812,2013-01-12 08:50:20,-0.320639,2013-01-12 08:50:20,-0.32262,2013-01-12 08:50:21,-0.324321,2013-01-12 08:50:21,...,2013-01-12 08:50:21,-0.32943,2013-01-12 08:50:21,-0.331731,2013-01-12 08:50:22,-0.333608,2013-01-12 08:50:22,-0.335457,2013-01-12 08:50:22,0


#### Transform data into format which can be fed into CountVectorizer

In [14]:
sites = ['site%s' % i for i in range(1, 11)]
train_df[sites].fillna(0).astype('float64').to_csv('train_sessions_text.txt', 
                                               sep=' ', 
                       index=None, header=None)
test_df[sites].fillna(0).astype('float64').to_csv('test_sessions_text.txt', 
                                              sep=' ', 
                       index=None, header=None)

In [15]:
!head -5 train_sessions_text.txt

-0.4398445835266358 -0.4402895577217554 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
-0.4398445835266358 -0.4402895577217554 -0.4420357045518497 -0.4436314143959681 0.0 0.0 0.0 0.0 0.0 0.0
-0.3170350082463937 -0.318359462525752 -0.32023045614633877 -0.32275607376874177 -0.3248630663844369 -0.3261765175129952 -0.3291610499754904 -0.3540767323063248 -0.3332063380617637 -0.33559051105334503
-0.3171729965332254 -0.31808576983170933 -0.32050264664556893 -0.32248474864948873 -0.32499857245293884 -0.3260410234718396 -0.32929553608784684 -0.3325336653487158 -0.3336078882260767 -0.33559051105334503
-0.31689701995956193 -0.3178120771376667 -0.32063874189518404 -0.3226204112091153 -0.324321042110429 -0.3252280592249061 -0.3294300222002033 -0.3317308181329043 -0.3336078882260767 -0.33545704303003204


#### Fit CountVectorizer and transform data with it.

In [16]:
%%time
cv = CountVectorizer(ngram_range=(1, 3), max_features=50000)
with open('train_sessions_text.txt') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
with open('test_sessions_text.txt') as inp_test_file:
    X_test = cv.transform(inp_test_file)
X_train.shape, X_test.shape

CPU times: user 13.3 s, sys: 439 ms, total: 13.7 s
Wall time: 12.8 s


#### Save train targets into a separate vector.

In [17]:
y_train = train_df['target'].astype('int').values

#### We'll be performing time series cross-validation

We will be performing time series cross-validation, see sklearn TimeSeriesSplit and this dicussion on StackOverflow.

http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html

https://stats.stackexchange.com/questions/14099/using-k-fold-cross-validation-for-time-series-model-selection

In [18]:
time_split = TimeSeriesSplit(n_splits=10)

In [19]:
[(el[0].shape, el[1].shape) for el in time_split.split(X_train)]

[((23051,), (23051,)),
 ((46102,), (23051,)),
 ((69153,), (23051,)),
 ((92204,), (23051,)),
 ((115255,), (23051,)),
 ((138306,), (23051,)),
 ((161357,), (23051,)),
 ((184408,), (23051,)),
 ((207459,), (23051,)),
 ((230510,), (23051,))]

#### Perform time series cross-validation with logistic regression.

In [20]:
logit = LogisticRegression(C=1, random_state=17, solver='lbfgs')

In [21]:
%%time

cv_scores = cross_val_score(logit, X_train, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=4)

CPU times: user 58.8 ms, sys: 93.3 ms, total: 152 ms
Wall time: 6.75 s


In [22]:
cv_scores, cv_scores.mean()

(array([0.76359174, 0.60541068, 0.82961761, 0.94347401, 0.80920452,
        0.86005269, 0.8541908 , 0.8198222 , 0.89161556, 0.86047833]),
 0.8237458118524078)

#### Train logistic regression with all training data, make predictions for test set and form a submission file.

In [23]:
logit.fit(X_train, y_train)



LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [24]:
logit_test_pred = logit.predict_proba(X_test)[:, 1]
#write_to_submission_file(logit_test_pred, '03-subm1.csv') # 0.91288

#### Now we'll add some time features: indicators of morning, day, evening and night.

In [25]:
def add_time_features(df, X_sparse):
    hour = df['time1'].apply(lambda ts: ts.hour)
    morning = ((hour >= 7) & (hour <= 11)).astype('int')
    day = ((hour >= 12) & (hour <= 18)).astype('int')
    evening = ((hour >= 19) & (hour <= 23)).astype('int')
    night = ((hour >= 0) & (hour <= 6)).astype('int')
    
    
    X = hstack([X_sparse, morning.values.reshape(-1, 1), 
                day.values.reshape(-1, 1), evening.values.reshape(-1, 1), 
                night.values.reshape(-1, 1)])
    return X

In [26]:
%%time
X_train_new = add_time_features(train_df.fillna(0), X_train)
X_test_new = add_time_features(test_df.fillna(0), X_test)

CPU times: user 5.24 s, sys: 200 ms, total: 5.44 s
Wall time: 2.13 s


In [27]:
X_train_new.shape, X_test_new.shape

((253561, 50004), (82797, 50004))

In [28]:
%%time
cv_scores = cross_val_score(logit, X_train_new, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=4)

CPU times: user 75.8 ms, sys: 61.2 ms, total: 137 ms
Wall time: 6.31 s


In [29]:
cv_scores, cv_scores.mean()

(array([0.82938441, 0.75686934, 0.90247745, 0.96817116, 0.89383703,
        0.93623935, 0.91427464, 0.91282831, 0.92952822, 0.91617945]),
 0.8959789355368482)

In [30]:
def add_start_month_feature(df, X_sparse):
    foo = pd.DataFrame(index=df.index)
    bar = df['time1'].apply(lambda ts: 100 * ts.year + ts.month).astype('float64')
    foo['scaled_month'] = StandardScaler().fit_transform(bar.values.reshape(-1, 1))
    
    X = hstack([X_sparse, foo])
    return X

In [31]:
%%time
X_train_new2 = add_start_month_feature(train_df.fillna(0), X_train_new)
X_test_new2 = add_start_month_feature(test_df.fillna(0), X_test_new)

CPU times: user 5.07 s, sys: 61.5 ms, total: 5.13 s
Wall time: 1.96 s


In [32]:
X_train_new2.shape, X_test_new2.shape

((253561, 50005), (82797, 50005))

In [33]:
%%time
cv_scores = cross_val_score(logit, X_train_new2, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=4)

CPU times: user 76.9 ms, sys: 59.6 ms, total: 137 ms
Wall time: 6.5 s


In [34]:
cv_scores, cv_scores.mean()

(array([0.82526234, 0.75767937, 0.92578353, 0.9698667 , 0.89427597,
        0.93754368, 0.91713076, 0.91327933, 0.92947603, 0.91773702]),
 0.8988034711758217)

In [35]:
def add_start_hour_feature(df, X_sparse):    
    foo = pd.DataFrame(index=df.index)
    bar = df['time1'].apply(lambda ts: ts.hour).astype(np.float64)
    foo['scaled_start_hour'] = StandardScaler().fit_transform(bar.values.reshape(-1, 1))
    
    X = hstack([X_sparse, foo])
    return X

In [36]:
%%time
X_train_new3 = add_start_month_feature(train_df.fillna(0), X_train_new2)
X_test_new3 = add_start_month_feature(test_df.fillna(0), X_test_new2)

CPU times: user 5.09 s, sys: 120 ms, total: 5.21 s
Wall time: 2.05 s


In [29]:
X_train_new3.shape, X_test_new3.shape

((253561, 50006), (82797, 50006))

In [37]:
%%time
cv_scores = cross_val_score(logit, X_train_new3, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=4)

CPU times: user 83 ms, sys: 55.8 ms, total: 139 ms
Wall time: 6.78 s


In [38]:
cv_scores, cv_scores.mean()

(array([0.82267842, 0.7568771 , 0.92861516, 0.96980591, 0.89408883,
        0.93745827, 0.91727875, 0.91326471, 0.92963406, 0.91727174]),
 0.898697296368355)

#### Now we tune regularization parameter C.

In [39]:
c_values = np.logspace(-2, 2, 10)

logit_grid_searcher = GridSearchCV(estimator=logit, param_grid={'C': c_values},
                                  scoring='roc_auc', n_jobs=1, cv=time_split, verbose=1)

In [40]:
%%time
logit_grid_searcher.fit(X_train_new3, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  2.2min finished


CPU times: user 13min 6s, sys: 3.87 s, total: 13min 10s
Wall time: 2min 15s




GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=1,
       param_grid={'C': array([1.00000e-02, 2.78256e-02, 7.74264e-02, 2.15443e-01, 5.99484e-01,
       1.66810e+00, 4.64159e+00, 1.29155e+01, 3.59381e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [41]:
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

(0.9065266714927953, {'C': 0.21544346900318834})

In [42]:
logit_test_pred3 = logit_grid_searcher.predict_proba(X_test_new3)[:, 1]
write_to_submission_file(logit_test_pred3, 'submissions/05-subm1.csv') # 0.73594