In [139]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score, GridSearchCV, TimeSeriesSplit
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from datetime import date

In [83]:
#the helper function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label='session_id'):
    """Write predictions to a CSV submission file.

    Rows are numbered 1..n in the index column, in the order in which the
    predictions are given.

    Parameters
    ----------
    predicted_labels : array-like
        Predictions, one per row. NumPy arrays, plain lists and pandas
        Series are accepted; values are always taken positionally.
    out_file : str or file-like
        Destination handed to ``DataFrame.to_csv``.
    target : str, default 'target'
        Header of the prediction column.
    index_label : str, default 'session_id'
        Header of the 1-based index column.
    """
    # Convert up front: a list has no ``.shape``, and a pandas Series would be
    # re-aligned on its own index by the DataFrame constructor instead of
    # being used in its given order.
    predicted_values = np.asarray(predicted_labels)
    predicted_df = pd.DataFrame(predicted_values,
                                index=np.arange(1, predicted_values.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)
    

Read training and test sets, sort train by session start time

In [84]:
# names of the ten per-page timestamp columns: time1, ..., time10
times = ['time%s' % i for i in range(1, 11)]

train_df = pd.read_csv('../../data/train_sessions.csv', index_col='session_id')
test_df = pd.read_csv('../../data/test_sessions.csv', index_col='session_id')

# parse the timestamp columns of both frames into the datetime type
for sessions in (train_df, test_df):
    sessions[times] = sessions[times].apply(pd.to_datetime)

# put the training sessions in chronological order of their first page visit
train_df = train_df.sort_values(by='time1')
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


Transform the data into a format that can be fed into CountVectorizer

In [85]:
# names of the ten visited-site columns: site1, ..., site10
sites = ['site%s' % i for i in range(1, 11)]

# One line per session: the ten site IDs separated by spaces, with missing
# visits written as 0.
for frame, text_path in ((train_df, 'train_sessions_text.txt'),
                         (test_df, 'test_sessions_text.txt')):
    site_ids = frame[sites].fillna(0).astype('int')
    site_ids.to_csv(text_path, sep=' ', index=None, header=None)

In [86]:
!head -5 train_sessions_text.txt

56 55 0 0 0 0 0 0 0 0
56 55 56 55 0 0 0 0 0 0
946 946 951 946 946 945 948 784 949 946
945 948 949 948 945 946 947 945 946 946
947 950 948 947 950 952 946 951 946 947


In [87]:
cv = CountVectorizer()

In [88]:
cv.fit_transform(['site_1', 'site_17', 'site_2'])

<3x3 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [89]:
cv.fit_transform(['site_1 site_17 site_2', 
                  'site_2 site_2 site_1'])

<2x3 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [90]:
cv.fit_transform(['this movie is awful', 
                  'enjoyed this movie, this movie is']).todense()

matrix([[1, 0, 1, 1, 1],
        [0, 1, 1, 2, 2]], dtype=int64)

In [91]:
cv.vocabulary_

{'this': 4, 'movie': 3, 'is': 2, 'awful': 0, 'enjoyed': 1}

In [92]:
X_sparse = cv.fit_transform(['this movie is awful', 
                  'enjoyed this movie, this movie is'])

In [93]:
X_sparse

<2x5 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [94]:
X_sparse.todense() # dense view: one row per document, one column per vocabulary word, cells hold word counts

matrix([[1, 0, 1, 1, 1],
        [0, 1, 1, 2, 2]], dtype=int64)

In [95]:
X_sparse.data # the stored non-zero values themselves (here: word counts), not how many there are

array([1, 1, 1, 1, 2, 2, 1, 1], dtype=int64)

In [96]:
X_sparse.indices # column (vocabulary) index of each stored value, listed row by row (CSR layout)

array([4, 3, 2, 0, 4, 3, 2, 1])

In [97]:
X_sparse.nonzero() # (row indices, column indices) of the non-zero entries; paired up they give (row, col) coordinates

(array([0, 0, 0, 0, 1, 1, 1, 1]), array([4, 3, 2, 0, 4, 3, 2, 1]))

In [98]:
cv.fit_transform(['site_1 site_17 site_2', 
                  'site_2 site_2 site_1']).todense()

matrix([[1, 1, 1],
        [1, 0, 2]], dtype=int64)

Fit CountVectorizer and transform data with it

In [99]:
%%time
# Bag of site-ID n-grams (1 to 3 consecutive sites), vocabulary capped at
# 50000 terms. The vocabulary is learned from the training sessions only and
# then re-used to encode the test sessions.
# NOTE(review): CountVectorizer's default token_pattern ignores
# single-character tokens, so site IDs 0-9 (including the 0 used as padding)
# never enter the vocabulary - confirm this is intended.
cv = CountVectorizer(max_features=50000, ngram_range=(1, 3))
with open('train_sessions_text.txt') as train_lines:
    X_train = cv.fit_transform(train_lines)
with open('test_sessions_text.txt') as test_lines:
    X_test = cv.transform(test_lines)
print(X_train.shape, X_test.shape)

(253561, 50000) (82797, 50000)
Wall time: 9.46 s


Save train targets in a separate vector

In [100]:
y_train = train_df['target'].astype('int')

Train logistic regression

In [61]:
%%time
logit = LogisticRegression(C=1.0, random_state=17, max_iter=1000)
cv_scores = cross_val_score(logit, X_train, y_train, cv=5, scoring='roc_auc', n_jobs=-1)

Wall time: 14.2 s


In [62]:
cv_scores

array([0.91385476, 0.83188647, 0.87616948, 0.89120092, 0.91287135])

In [63]:
cv_scores.mean()

0.8851965954544507

In [23]:
%%time
logit.fit(X_train, y_train)

Wall time: 7.74 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=17, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
test_pred_logit1 = logit.predict_proba(X_test)[:, 1]
test_pred_logit1

array([2.31013640e-03, 4.60518851e-09, 1.89818288e-08, ...,
       8.74995441e-03, 4.53537652e-04, 2.15097801e-05])

In [25]:
# CV 0.885
write_to_submission_file(test_pred_logit1, 'logit_subm1.txt') # 0.908 ROC AUC Public Leaderboard (LB)

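Time features (binary flags derived from the hour when the session started, `time1`)
- morning: hours 7 to 11
- day: hours 12 to 18
- evening: hours 19 to 23
- night: hours 0 to 6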

In [106]:
def add_time_features(df, X_sparse):
    """Append four binary time-of-day columns to a sparse feature matrix.

    The flags are derived from the hour of ``df['time1']`` and appended in
    this order: morning (7-11), day (12-18), evening (19-23), night (0-6).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``time1`` column; only that column is read. Its rows
        must be in the same order as the rows of ``X_sparse``.
    X_sparse : sparse matrix
        Existing features, one row per row of ``df``.

    Returns
    -------
    The sparse matrix produced by ``scipy.sparse.hstack``: ``X_sparse``
    followed by the four flag columns.
    """
    # Vectorised hour extraction instead of a Python-level lambda per row.
    # pd.to_datetime leaves an already-parsed column as it is and lets a
    # column of timestamp strings work too.
    hour = pd.to_datetime(df['time1']).dt.hour
    morning = ((hour >= 7) & (hour <= 11)).astype('int')
    day = ((hour >= 12) & (hour <= 18)).astype('int')
    evening = ((hour >= 19) & (hour <= 23)).astype('int')
    night = ((hour >= 0) & (hour <= 6)).astype('int')
    # hstack needs 2-D blocks, so each flag becomes an (n, 1) column.
    flag_columns = [flag.values.reshape(-1, 1)
                    for flag in (morning, day, evening, night)]
    return hstack([X_sparse] + flag_columns)

In [107]:
%%time
# add_time_features reads only the 'time1' column, so the frames are passed
# as they are; filling every column of the frame with 0 first is unnecessary
# work.
X_train_with_time = add_time_features(train_df, X_train)
X_test_with_time = add_time_features(test_df, X_test)

Wall time: 1min 33s


In [108]:
X_train_with_time.shape, X_test_with_time.shape

((253561, 50004), (82797, 50004))

In [109]:
%%time
cv_scores = cross_val_score(logit, X_train_with_time, y_train, cv=5, scoring='roc_auc', n_jobs=-1)

Wall time: 17.7 s


In [110]:
cv_scores, cv_scores.mean()
# 0.9308778523791353

(array([0.91113585, 0.90286512, 0.92285701, 0.94908471, 0.94677201]),
 0.9265429395373801)

In [111]:
%%time
logit.fit(X_train_with_time, y_train)

Wall time: 10.6 s


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=17, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [33]:
test_pred_logit2 = logit.predict_proba(X_test_with_time)[:, 1]

In [34]:
test_pred_logit2

array([6.30599902e-05, 5.56354501e-08, 3.90223645e-08, ...,
       2.84520971e-04, 1.35706451e-05, 6.04165827e-07])

In [35]:
# CV .9308
write_to_submission_file(test_pred_logit2, 'logit_subm2.txt') # ROC AUC 0.93578 Public LB

Perform time series cross-validation

In [101]:
time_split = TimeSeriesSplit(n_splits=10)

In [102]:
[(el[0].shape, el[1].shape) for el in time_split.split(X_train)]

[((23051,), (23051,)),
 ((46102,), (23051,)),
 ((69153,), (23051,)),
 ((92204,), (23051,)),
 ((115255,), (23051,)),
 ((138306,), (23051,)),
 ((161357,), (23051,)),
 ((184408,), (23051,)),
 ((207459,), (23051,)),
 ((230510,), (23051,))]

Perform time series cross-validation with logistic regression

In [103]:
logit = LogisticRegression(C=1, random_state=17, solver='liblinear')

In [104]:
%%time
cv_scores = cross_val_score(logit, X_train, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=-1)

Wall time: 21 s


In [105]:
cv_scores, cv_scores.mean()

(array([0.83141992, 0.64669651, 0.87991837, 0.9631551 , 0.84221519,
        0.87840596, 0.94475893, 0.8532181 , 0.92987836, 0.90752702]),
 0.8677193446907168)

Tune regularization parameter C

In [130]:
c_values = np.logspace(-1, 0, 20)
c_values

array([0.1       , 0.11288379, 0.1274275 , 0.14384499, 0.16237767,
       0.18329807, 0.20691381, 0.23357215, 0.26366509, 0.29763514,
       0.33598183, 0.37926902, 0.42813324, 0.48329302, 0.54555948,
       0.61584821, 0.6951928 , 0.78475997, 0.88586679, 1.        ])

In [131]:
c_values = np.logspace(-1, 0, 20)
logit_grid_searcher = GridSearchCV(estimator=logit, param_grid={'C': c_values}, 
                                  scoring='roc_auc', n_jobs=-1, cv=time_split, verbose=1)

In [132]:
%%time
logit_grid_searcher.fit(X_train_with_time, y_train) # long execution 4 min 18s

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   25.3s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  4.2min finished


Wall time: 4min 18s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
             error_score=nan,
             estimator=LogisticRegression(C=1, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=17, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': array([0.1       , 0.11288379, 0.1274275 , 0.14384499, 0.16237767,
       0.18329807, 0.20691381, 0.23357215, 0.26366509, 0.29763514,
       0.33598183, 0.37926902, 0.42813324, 0.48329302, 0.54555948,
       0.61584821, 0.6951928 , 0.78475997, 0.88586679, 1.     

In [133]:
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_
# c_values = np.logspace(-2, 2, 10) -> (0.9173776310405486, {'C': 0.21544346900318834})
# c_values = np.logspace(-1, 1, 20) -> (0.9174831924737227, {'C': 0.26366508987303583})
# c_values = np.logspace(-1, 0, 20) -> (0.9174831924737227, {'C': 0.26366508987303583})

(0.9174831924737227, {'C': 0.26366508987303583})

In [134]:
# with time series CV
test_pred_logit6 = logit_grid_searcher.predict_proba(X_test_with_time)[:, 1]
write_to_submission_file(test_pred_logit6, 'logit_subm6.txt') 
# submit#5 -> Public LB 0.94242
# submit#6 -> Public LB 0.94216

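Weekday features
- weekday: day of the week on which the session started (0 = Monday, ..., 6 = Sunday)
- is_weekend: 1 if the session started on Saturday or Sunday (weekday > 4), otherwise 0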


In [150]:
def add_weekday_features(df, X_sparse):
    """Append a weekday column and a weekend flag to a sparse feature matrix.

    Both columns are derived from ``df['time1']``:

    * weekday    - 0 for Monday through 6 for Sunday;
    * is_weekend - 1 when weekday > 4 (Saturday or Sunday), otherwise 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``time1`` column; only that column is read. Its rows
        must be in the same order as the rows of ``X_sparse``.
    X_sparse : sparse matrix
        Existing features, one row per row of ``df``.

    Returns
    -------
    The sparse matrix produced by ``scipy.sparse.hstack``: ``X_sparse``
    followed by the weekday and is_weekend columns.
    """
    # Vectorised weekday lookup (Monday=0) instead of building a
    # datetime.date per row. pd.to_datetime leaves an already-parsed column
    # as it is and lets a column of timestamp strings work too.
    weekday = pd.to_datetime(df['time1']).dt.weekday
    is_weekend = (weekday > 4).astype('int')
    # hstack needs 2-D blocks, so each feature becomes an (n, 1) column.
    return hstack([X_sparse,
                   weekday.values.reshape(-1, 1),
                   is_weekend.values.reshape(-1, 1)])

In [151]:
%%time
# add_weekday_features reads only the 'time1' column, so the frames are
# passed as they are; filling every column of the frame with 0 first is
# unnecessary work.
X_train_with_time_weekday = add_weekday_features(train_df, X_train_with_time)
X_test_with_time_weekday = add_weekday_features(test_df, X_test_with_time)

Wall time: 1min 37s


In [153]:
X_train_with_time_weekday.shape, X_test_with_time_weekday.shape

((253561, 50006), (82797, 50006))

In [154]:
%%time
cv_scores = cross_val_score(logit, X_train_with_time_weekday, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=-1)

Wall time: 24.4 s


In [155]:
cv_scores, cv_scores.mean()

(array([0.89170179, 0.76381627, 0.84749276, 0.97732513, 0.89959147,
        0.93790506, 0.95868884, 0.92915544, 0.95703246, 0.9520673 ]),
 0.9114776513404363)

In [156]:
%%time
logit.fit(X_train_with_time_weekday, y_train)

Wall time: 12.1 s


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=17, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [158]:
test_pred_logit7 = logit.predict_proba(X_test_with_time_weekday)[:, 1]
write_to_submission_file(test_pred_logit7, 'logit_subm7.txt') # public score: 0.94547

Tune the C parameter again for the extended feature set (with weekday features)

In [159]:
%%time
logit_grid_searcher.fit(X_train_with_time_weekday, y_train) # long execution 4 min 18s

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   26.1s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  4.7min finished


Wall time: 4min 43s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
             error_score=nan,
             estimator=LogisticRegression(C=1, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=17, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': array([0.1       , 0.11288379, 0.1274275 , 0.14384499, 0.16237767,
       0.18329807, 0.20691381, 0.23357215, 0.26366509, 0.29763514,
       0.33598183, 0.37926902, 0.42813324, 0.48329302, 0.54555948,
       0.61584821, 0.6951928 , 0.78475997, 0.88586679, 1.     

In [160]:
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

(0.9135310285839562, {'C': 0.29763514416313175})

In [161]:
test_pred_logit = logit_grid_searcher.predict_proba(X_test_with_time_weekday)[:, 1]
write_to_submission_file(test_pred_logit, 'logit_subm8.txt') # public score -> 0.94882