In [64]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, GridSearchCV, TimeSeriesSplit
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

In [44]:
#the helper function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label='session_id'):
    """Write predictions to a CSV file in the submission layout.

    Parameters
    ----------
    predicted_labels : array-like of shape (n_samples,)
        Predicted values, one per test row, in test-set order. Lists,
        NumPy arrays and pandas Series are accepted; a Series is used
        positionally (its own index is ignored).
    out_file : str or file-like
        Destination handed to ``DataFrame.to_csv``.
    target : str, default 'target'
        Name of the prediction column.
    index_label : str, default 'session_id'
        Header of the index column; ids are numbered 1..n_samples.
    """
    # Convert up front: plain lists have no `.shape`, and a Series passed
    # straight to DataFrame(..., index=...) would be re-aligned to the new
    # 1..n index (shifting values and introducing NaN) instead of relabelled.
    predictions = np.asarray(predicted_labels)
    session_ids = np.arange(1, predictions.shape[0] + 1)
    predicted_df = pd.DataFrame(predictions, index=session_ids, columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)
    

Read training and test sets, sort train by session start time

In [3]:
# names of the ten per-visit timestamp columns: time1, ..., time10
times = ['time{}'.format(i) for i in range(1, 11)]

# load both session tables, keyed by session_id
train_df = pd.read_csv('../../data/train_sessions.csv', index_col='session_id')
test_df = pd.read_csv('../../data/test_sessions.csv', index_col='session_id')

# parse the timestamp columns of each table into the datetime type
for _sessions in (train_df, test_df):
    _sessions[times] = _sessions[times].apply(pd.to_datetime)

# put the training sessions in chronological order of their first visit
train_df = train_df.sort_values(by='time1')
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


Transform the data into a format that can be fed to CountVectorizer

In [4]:
sites = ['site%s' % i for i in range(1, 11)]

# One line per session: the ten site ids separated by spaces, with missing
# visits written as 0. `index` and `header` are documented as boolean flags,
# so pass False explicitly rather than relying on None being falsy.
for sessions_df, text_path in ((train_df, 'train_sessions_text.txt'),
                               (test_df, 'test_sessions_text.txt')):
    (sessions_df[sites]
     .fillna(0)
     .astype('int')
     .to_csv(text_path, sep=' ', index=False, header=False))

In [5]:
!head -5 train_sessions_text.txt

56 55 0 0 0 0 0 0 0 0
56 55 56 55 0 0 0 0 0 0
946 946 951 946 946 945 948 784 949 946
945 948 949 948 945 946 947 945 946 946
947 950 948 947 950 952 946 951 946 947


In [7]:
cv = CountVectorizer()

In [8]:
cv.fit_transform(['site_1', 'site_17', 'site_2'])

<3x3 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [10]:
cv.fit_transform(['site_1 site_17 site_2', 
                  'site_2 site_2 site_1'])

<2x3 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [13]:
cv.fit_transform(['this movie is awful', 
                  'enjoyed this movie, this movie is']).todense()

matrix([[1, 0, 1, 1, 1],
        [0, 1, 1, 2, 2]], dtype=int64)

In [17]:
cv.vocabulary_

{'this': 4, 'movie': 3, 'is': 2, 'awful': 0, 'enjoyed': 1}

In [22]:
X_sparse = cv.fit_transform(['this movie is awful', 
                  'enjoyed this movie, this movie is'])

In [23]:
X_sparse

<2x5 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [24]:
X_sparse.todense()  # per-document token counts; column j is the token whose cv.vocabulary_ index is j

matrix([[1, 0, 1, 1, 1],
        [0, 1, 1, 2, 2]], dtype=int64)

In [25]:
X_sparse.data  # the stored (non-zero) values themselves, in storage order -- not a count of them

array([1, 1, 1, 1, 2, 2, 1, 1], dtype=int64)

In [26]:
X_sparse.indices  # column index of each stored value, in the same order as X_sparse.data

array([4, 3, 2, 0, 4, 3, 2, 1])

In [29]:
X_sparse.nonzero()  # (row indices, column indices) of the non-zero entries, i.e. coordinates such as (0, 4)

(array([0, 0, 0, 0, 1, 1, 1, 1]), array([4, 3, 2, 0, 4, 3, 2, 1]))

In [30]:
cv.fit_transform(['site_1 site_17 site_2', 
                  'site_2 site_2 site_1']).todense()

matrix([[1, 1, 1],
        [1, 0, 2]], dtype=int64)

Fit CountVectorizer and transform data with it

In [31]:
%%time
# Iterating over an open text file yields one line at a time, so every
# session line is treated as one document by the vectorizer.
# NOTE(review): CountVectorizer's default token_pattern keeps only tokens of
# two or more characters, so the '0' padding and any single-digit site ids
# appear to be dropped from the vocabulary -- confirm this is intended.
cv = CountVectorizer()
with open('train_sessions_text.txt') as train_lines, \
        open('test_sessions_text.txt') as test_lines:
    # learn the vocabulary on train only, then reuse it for test
    X_train = cv.fit_transform(train_lines)
    X_test = cv.transform(test_lines)
print(X_train.shape, X_test.shape)

(253561, 41592) (82797, 41592)
Wall time: 2.93 s


Save train targets in a separate vector

In [32]:
y_train = train_df['target'].astype('int')

Train logistic regression

In [38]:
%%time
logit = LogisticRegression(C=1.0, random_state=17, max_iter=1000)
cv_scores = cross_val_score(logit, X_train, y_train, cv=5, scoring='roc_auc', n_jobs=-1)

Wall time: 17 s


In [39]:
cv_scores

array([0.91385476, 0.83188647, 0.87616948, 0.89120092, 0.91287135])

In [40]:
cv_scores.mean()

0.8851965954544507

In [41]:
%%time
logit.fit(X_train, y_train)

Wall time: 9.16 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=17, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [42]:
test_pred_logit1 = logit.predict_proba(X_test)[:, 1]
test_pred_logit1

array([2.31013640e-03, 4.60518851e-09, 1.89818288e-08, ...,
       8.74995441e-03, 4.53537652e-04, 2.15097801e-05])

In [45]:
# CV 0.885
write_to_submission_file(test_pred_logit1, 'logit_subm1.txt') # 0.908 ROC AUC on the Public Leaderboard (LB)

Time features
- hour when the session started (used only to derive the four binary indicators below; the hour itself is not added as a feature)
- morning
- day
- evening
- night

In [46]:
def add_time_features(df, X_sparse):
    """Append four part-of-day indicator columns to a sparse feature matrix.

    The hour of ``df['time1']`` is bucketed into morning (7-11), day (12-18),
    evening (19-23) and night (0-6); one 0/1 column per bucket is stacked to
    the right of ``X_sparse`` in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``time1`` column, row-aligned with ``X_sparse``.
        Values may be datetimes or anything ``pd.to_datetime`` can parse.
    X_sparse : sparse matrix of shape (n_samples, n_features)
        Existing features.

    Returns
    -------
    sparse matrix of shape (n_samples, n_features + 4)
        The result of ``scipy.sparse.hstack``.
    """
    # Vectorised hour extraction instead of a Python-level apply per row;
    # pd.to_datetime also covers object-dtype or string timestamps.
    hour = pd.to_datetime(df['time1']).dt.hour

    # Inclusive hour ranges, in output column order: morning, day, evening, night.
    day_parts = [(7, 11), (12, 18), (19, 23), (0, 6)]
    indicators = [
        ((hour >= start) & (hour <= end)).astype('int').values.reshape(-1, 1)
        for start, end in day_parts
    ]
    return hstack([X_sparse] + indicators)

In [48]:
%%time
# add_time_features only reads the 'time1' column, so the frames are passed
# as-is: filling every NaN/NaT in the whole frame with 0 first is unnecessary
# work and puts integer 0 into the datetime columns.
X_train_with_time = add_time_features(train_df, X_train)
X_test_with_time = add_time_features(test_df, X_test)

Wall time: 1min 48s


In [49]:
X_train_with_time.shape, X_test_with_time.shape

((253561, 41596), (82797, 41596))

In [50]:
%%time
cv_scores = cross_val_score(logit, X_train_with_time, y_train, cv=5, scoring='roc_auc', n_jobs=-1)

Wall time: 21 s


In [51]:
cv_scores

array([0.92488902, 0.90687982, 0.93177151, 0.94325127, 0.94759765])

In [52]:
cv_scores.mean()

0.9308778523791353

In [53]:
%%time
logit.fit(X_train_with_time, y_train)

Wall time: 10.2 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=17, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [54]:
test_pred_logit2 = logit.predict_proba(X_test_with_time)[:, 1]

In [55]:
test_pred_logit2

array([6.30599902e-05, 5.56354501e-08, 3.90223645e-08, ...,
       2.84520971e-04, 1.35706451e-05, 6.04165827e-07])

In [62]:
# CV .9308
write_to_submission_file(test_pred_logit2, 'logit_subm2.txt') # ROC AUC 0.93578 Public LB

Perform time series cross-validation

In [65]:
time_split = TimeSeriesSplit(n_splits=10)

In [66]:
# (train indices shape, validation indices shape) for each successive split
[(train_idx.shape, valid_idx.shape)
 for train_idx, valid_idx in time_split.split(X_train_with_time)]

[((23051,), (23051,)),
 ((46102,), (23051,)),
 ((69153,), (23051,)),
 ((92204,), (23051,)),
 ((115255,), (23051,)),
 ((138306,), (23051,)),
 ((161357,), (23051,)),
 ((184408,), (23051,)),
 ((207459,), (23051,)),
 ((230510,), (23051,))]

Perform time series cross-validation with logistic regression

In [67]:
logit = LogisticRegression(C=1, random_state=17, solver='liblinear')

In [68]:
%%time
cv_scores = cross_val_score(logit, X_train_with_time, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=-1)

Wall time: 16.6 s


In [69]:
cv_scores, cv_scores.mean()

(array([0.88671713, 0.7699798 , 0.92192597, 0.96726586, 0.89754884,
        0.93882954, 0.95156986, 0.92538458, 0.95002682, 0.94358576]),
 0.9152834162990182)

Tune regularization parameter C

In [70]:
c_values = np.logspace(-2, 2, 10)
logit_grid_searcher = GridSearchCV(estimator=logit, param_grid={'C': c_values}, 
                                  scoring='roc_auc', n_jobs=-1, cv=time_split, verbose=1)

In [71]:
%%time
logit_grid_searcher.fit(X_train_with_time, y_train) # long execution 3 min 42s

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   12.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.7min finished


Wall time: 3min 42s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
             error_score=nan,
             estimator=LogisticRegression(C=1, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=17, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': array([1.00000000e-02, 2.78255940e-02, 7.74263683e-02, 2.15443469e-01,
       5.99484250e-01, 1.66810054e+00, 4.64158883e+00, 1.29154967e+01,
       3.59381366e+01, 1.00000000e+02])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
   

In [72]:
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

(0.9155297752725868, {'C': 0.5994842503189409})

In [63]:
# non time series CV
# This cell used to call `logit_grid_searcher`, which on a top-to-bottom run is
# the TimeSeriesSplit search fitted above, so logit_subm3.txt came out identical
# to logit_subm4.txt. Fit a separate searcher with ordinary k-fold CV instead.
# NOTE(review): cv=5 is assumed, to match the earlier cross_val_score calls;
# the configuration of the searcher that produced the recorded score is not
# preserved in the notebook -- confirm, and expect the LB number to differ slightly.
logit_grid_searcher_kfold = GridSearchCV(estimator=logit, param_grid={'C': c_values},
                                         scoring='roc_auc', n_jobs=-1, cv=5, verbose=1)
logit_grid_searcher_kfold.fit(X_train_with_time, y_train)
test_pred_logit3 = logit_grid_searcher_kfold.predict_proba(X_test_with_time)[:, 1]
write_to_submission_file(test_pred_logit3, 'logit_subm3.txt') # Public LB 0.93406

In [73]:
# with time series CV: predictions come from the grid search run with cv=time_split
# (logit_grid_searcher, refit=True, so the best C is refitted on the full training set)
test_pred_logit4 = logit_grid_searcher.predict_proba(X_test_with_time)[:, 1]
write_to_submission_file(test_pred_logit4, 'logit_subm4.txt') # Public LB 0.93740