In [1]:
# baseline 0.95214

In [2]:
# Logit Tf-Idf 6 features

In [3]:
# from Correct time-aware cross-validation scheme

In [4]:
# Import libraries and set desired options
import numpy as np
import pandas as pd
#from scipy.sparse import hstack
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

In [5]:
from sklearn.preprocessing import StandardScaler

In [6]:
# A helper function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [7]:
train_df = pd.read_csv('train_sessions.csv',
                       index_col='session_id')
test_df = pd.read_csv('test_sessions.csv',
                      index_col='session_id')

# Convert time1, ..., time10 columns to datetime type
times = ['time%s' % i for i in range(1, 11)]
train_df[times] = train_df[times].apply(pd.to_datetime)
test_df[times] = test_df[times].apply(pd.to_datetime)

# Sort the data by time
train_df = train_df.sort_values(by='time1')

# Look at the first rows of the training set
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [8]:
sites = ['site%s' % i for i in range(1, 11)]
train_df[sites].fillna(0).astype('int').to_csv('train_sessions_text.txt', 
                                               sep=' ', index=None, header=None)
test_df[sites].fillna(0).astype('int').to_csv('test_sessions_text.txt', 
                                              sep=' ', index=None, header=None)

In [9]:
!head -5 train_sessions_text.txt

56 55 0 0 0 0 0 0 0 0
56 55 56 55 0 0 0 0 0 0
946 946 951 946 946 945 948 784 949 946
945 948 949 948 945 946 947 945 946 946
947 950 948 947 950 952 946 951 946 947


In [10]:
%%time
cv = CountVectorizer(ngram_range=(1, 3), max_features=50000)
with open('train_sessions_text.txt') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
with open('test_sessions_text.txt') as inp_test_file:
    X_test = cv.transform(inp_test_file)
X_train.shape, X_test.shape

Wall time: 9.91 s


In [11]:
y_train = train_df['target'].astype('int')

In [12]:
time_split = TimeSeriesSplit(n_splits=10)

In [13]:
[(el[0].shape, el[1].shape) for el in time_split.split(X_train)]

[((23051,), (23051,)),
 ((46102,), (23051,)),
 ((69153,), (23051,)),
 ((92204,), (23051,)),
 ((115255,), (23051,)),
 ((138306,), (23051,)),
 ((161357,), (23051,)),
 ((184408,), (23051,)),
 ((207459,), (23051,)),
 ((230510,), (23051,))]

In [14]:
logit = LogisticRegression(C=1, random_state=17)

In [15]:
%%time

cv_scores = cross_val_score(logit, X_train, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=2) # hangs with n_jobs > 1, and locally this runs much faster

Wall time: 31.7 s


In [16]:
cv_scores, cv_scores.mean()

(array([0.83141992, 0.64669477, 0.87991917, 0.96315292, 0.84221701,
        0.87840872, 0.94475732, 0.85321644, 0.92987908, 0.90752818]),
 0.8677193536148717)

In [17]:
logit.fit(X_train, y_train)



LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [18]:
logit_test_pred = logit.predict_proba(X_test)[:, 1]
write_to_submission_file(logit_test_pred, 'subm1.csv') # 0.91288

In [19]:
def add_time_features(df, X_sparse):
    hour = df['time1'].apply(lambda ts: ts.hour)
    morning = ((hour >= 7) & (hour <= 11)).astype('int')
    day = ((hour >= 12) & (hour <= 18)).astype('int')
    evening = ((hour >= 19) & (hour <= 23)).astype('int')
    night = ((hour >= 0) & (hour <= 6)).astype('int')
    X = hstack([X_sparse, morning.values.reshape(-1, 1), 
                day.values.reshape(-1, 1), evening.values.reshape(-1, 1), 
                night.values.reshape(-1, 1)])
    return X

In [20]:
%%time
X_train_new = add_time_features(train_df.fillna(0), X_train)
X_test_new = add_time_features(test_df.fillna(0), X_test)

Wall time: 1min 24s


In [21]:
X_train_new.shape, X_test_new.shape

((253561, 50004), (82797, 50004))

In [22]:
%%time
cv_scores = cross_val_score(logit, X_train_new, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=2) # hangs with n_jobs > 1, and locally this runs much faster

Wall time: 29 s


In [23]:
cv_scores, cv_scores.mean()

(array([0.87652278, 0.75122979, 0.93062182, 0.97864617, 0.90399606,
        0.93831379, 0.96248761, 0.92731291, 0.94885607, 0.94043603]),
 0.915842304479402)

In [24]:
logit.fit(X_train_new, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [25]:
logit_test_pred2 = logit.predict_proba(X_test_new)[:, 1]
write_to_submission_file(logit_test_pred2, 'subm2.csv') # 0.93843

In [26]:
c_values = np.logspace(-2, 2, 10)

logit_grid_searcher = GridSearchCV(estimator=logit, param_grid={'C': c_values},
                                  scoring='roc_auc', n_jobs=2, cv=time_split, verbose=1)

In [27]:
%%time
logit_grid_searcher.fit(X_train_new, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   49.4s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:  6.9min finished


Wall time: 7min 3s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=2,
       param_grid={'C': array([1.00000e-02, 2.78256e-02, 7.74264e-02, 2.15443e-01, 5.99484e-01,
       1.66810e+00, 4.64159e+00, 1.29155e+01, 3.59381e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [28]:
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

(0.9173776926166856, {'C': 0.21544346900318834})

In [29]:
logit_test_pred3 = logit_grid_searcher.predict_proba(X_test_new)[:, 1]
write_to_submission_file(logit_test_pred3, 'subm3.csv') # 0.94242

In [30]:
#GridSearchCV?

In [31]:
def add_day_features(df, X_sparse):
    day = df['time1'].apply(lambda ts: ts.weekday())
    weekend = ((day >= 5)).astype('int')
    weekday = ((day < 5)).astype('int')
    X = hstack([X_sparse, weekend.values.reshape(-1, 1), 
                weekday.values.reshape(-1, 1)])
    return X

In [32]:
%%time
X_train_new1 = add_time_features(train_df.fillna(0), X_train_new)
X_test_new1 = add_time_features(test_df.fillna(0), X_test_new)

Wall time: 1min 25s


In [33]:
X_train_new1.shape, X_test_new1.shape

((253561, 50008), (82797, 50008))

In [34]:
%%time
cv_scores = cross_val_score(logit, X_train_new1, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=2) # hangs with n_jobs > 1, and locally this runs much faster

Wall time: 29.3 s


In [35]:
cv_scores, cv_scores.mean()

(array([0.87806863, 0.75450051, 0.93075898, 0.97850289, 0.90417853,
        0.9384806 , 0.96237984, 0.92768015, 0.94890392, 0.94062803]),
 0.9164082065409461)

In [36]:
%%time
logit_grid_searcher.fit(X_train_new1, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   50.6s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:  7.3min finished


Wall time: 7min 26s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=2,
       param_grid={'C': array([1.00000e-02, 2.78256e-02, 7.74264e-02, 2.15443e-01, 5.99484e-01,
       1.66810e+00, 4.64159e+00, 1.29155e+01, 3.59381e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [37]:
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_
# (0.9183503264075926, {'C': 0.21544346900318834})

(0.9183503264075926, {'C': 0.21544346900318834})

In [38]:
logit_test_pred4 = logit_grid_searcher.predict_proba(X_test_new1)[:, 1]
write_to_submission_file(logit_test_pred4, 'subm_self1.csv')

In [39]:
def label_unique_site_sum(row):
    s = set()
    for i in sites:
        if row[i] != 0:
            s.add(row[i])
            pass
        pass
    return sum(s)

In [40]:
def label_unique_site(row):
    s = set()
    for i in sites:
        if row[i] != 0:
            s.add(row[i])
            pass
        pass
    return len(s)

In [41]:

def add_month_features(df, X_sparse):
    month = df['time1'].apply(lambda ts: ts.month)
    holiday = (((month >= 6) & (month <= 9)) | (month >= 12) | (month <= 1)).astype('int')
    schoolday = (((month >= 1) & (month <= 5)) | ((month > 9) & (month < 1))).astype('int')
    n_unique_web = df.apply(lambda row: label_unique_site(row), axis=1)
    sum_unique_web = df.apply(lambda row: label_unique_site_sum(row), axis=1)
    
    X = csr_matrix(hstack([X_sparse, holiday.values.reshape(-1, 1), 
                schoolday.values.reshape(-1, 1)
                           , n_unique_web.values.reshape(-1, 1)
                           , sum_unique_web.values.reshape(-1, 1)
                          ]))
    return X

In [42]:
#%%time
#X_train_new2 = add_month_features(train_df.fillna(0), X_train_new1)
#X_test_new2 = add_month_features(test_df.fillna(0), X_test_new1)

Wall time: 2min 45s


In [43]:
#%%time
#cv_scores = cross_val_score(logit, X_train_new2, y_train, cv=time_split, 
 #                           scoring='roc_auc', n_jobs=2) # hangs with n_jobs > 1, and locally this runs much faster

Wall time: 18 s


In [44]:
#%%time
#logit_grid_searcher.fit(X_train_new2, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  1.0min
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:  2.3min finished


Wall time: 2min 24s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=2,
       param_grid={'C': array([1.00000e-02, 2.78256e-02, 7.74264e-02, 2.15443e-01, 5.99484e-01,
       1.66810e+00, 4.64159e+00, 1.29155e+01, 3.59381e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [45]:
#logit_grid_searcher.best_score_, logit_grid_searcher.best_params_
# (0.8717104095946803, {'C': 1.6681005372000592})

(0.8717104095946803, {'C': 1.6681005372000592})

In [46]:
#logit_test_pred5 = logit_grid_searcher.predict_proba(X_test_new2)[:, 1]
#write_to_submission_file(logit_test_pred5, 'subm_self2.csv')

In [47]:

def add_only_month_features(df, X_sparse):
    month = df['time1'].apply(lambda ts: ts.month)
    holiday = (((month >= 6) & (month <= 9)) | (month >= 12) | (month <= 1)).astype('int')
    schoolday = (((month >= 1) & (month <= 5)) | ((month > 9) & (month < 1))).astype('int')
    
    X = csr_matrix(hstack([X_sparse, holiday.values.reshape(-1, 1), 
                schoolday.values.reshape(-1, 1)
                          ]))
    return X

In [48]:
#%%time
#X_train_new3 = add_only_month_features(train_df.fillna(0), X_train_new1)
#X_test_new3 = add_only_month_features(test_df.fillna(0), X_test_new1)

Wall time: 1min 25s


In [49]:
#%%time
#cv_scores = cross_val_score(logit, X_train_new3, y_train, cv=time_split, 
#                            scoring='roc_auc', n_jobs=2) # hangs with n_jobs > 1, and locally this runs much faster

Wall time: 35.7 s


In [50]:
#%%time
#logit_grid_searcher.fit(X_train_new3, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   53.5s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:  8.0min finished


Wall time: 8min 10s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=2,
       param_grid={'C': array([1.00000e-02, 2.78256e-02, 7.74264e-02, 2.15443e-01, 5.99484e-01,
       1.66810e+00, 4.64159e+00, 1.29155e+01, 3.59381e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [51]:
#logit_grid_searcher.best_score_, logit_grid_searcher.best_params_
# (0.9148358099076543, {'C': 0.21544346900318834})

(0.9148358099076543, {'C': 0.21544346900318834})

In [52]:
#logit_test_pred4 = logit_grid_searcher.predict_proba(X_test_new3)[:, 1]
#write_to_submission_file(logit_test_pred4, 'subm_self3.csv')

In [55]:
def label_unique_site_sum(row):
    s = set()
    for i in sites:
        if row[i] != 0:
            s.add(row[i])
            pass
        pass
    return sum(s)

In [54]:
def label_unique_site(row):
    s = set()
    for i in sites:
        if row[i] != 0:
            s.add(row[i])
            pass
        pass
    return len(s)

In [90]:
def add_stand_features(df, X_sparse):
    full_new_feat = pd.DataFrame(index=df.index)
    full_new_feat['start_hour'] = df['time1'].apply(lambda ts: ts.hour).astype('float64')
    full_new_feat['day_in_month'] = df['time1'].apply(lambda ts: ts.day ).astype('float64')
    full_new_feat['sum_unique_sites'] = df.apply (lambda row: label_unique_site_sum (row),axis=1).astype('float64')
    full_new_feat['n_unique_sites'] = df.apply (lambda row: label_unique_site (row),axis=1).astype('float64')
    
    features = ['start_hour', 'day_in_month', 'sum_unique_sites', 'n_unique_sites']
    tmp = StandardScaler().fit_transform(full_new_feat[features])
    X = csr_matrix(hstack([X_sparse, tmp[:,:]]))
    
    return X

In [57]:
#%%time
#X_train_new4 = add_only_month_features(train_df.fillna(0), X_train)
#X_test_new4 = add_only_month_features(test_df.fillna(0), X_test)

Wall time: 1min 26s


In [59]:
#%%time
#cv_scores = cross_val_score(logit, X_train_new4, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=2) # hangs with n_jobs > 1, and locally this runs much faster

Wall time: 36.6 s


In [60]:
#%%time
#logit_grid_searcher.fit(X_train_new4, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   48.5s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:  7.6min finished


Wall time: 7min 44s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=2,
       param_grid={'C': array([1.00000e-02, 2.78256e-02, 7.74264e-02, 2.15443e-01, 5.99484e-01,
       1.66810e+00, 4.64159e+00, 1.29155e+01, 3.59381e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [61]:
#logit_grid_searcher.best_score_, logit_grid_searcher.best_params_
# (0.8683678963028504, {'C': 0.21544346900318834})

(0.8683678963028504, {'C': 0.21544346900318834})

In [62]:
logit_test_pred5 = logit_grid_searcher.predict_proba(X_test_new4)[:, 1]
write_to_submission_file(logit_test_pred5, 'subm_self5.csv')

In [None]:
### 6

In [68]:
#month = train_df['time1'].apply(lambda ts: ts.month)
#holiday = (((month >= 6) & (month <= 9)) | (month >= 12) | (month <= 1)).astype('int')

In [69]:
#month.head()

session_id
21669     1
54843     1
77292     1
114021    1
146670    1
Name: time1, dtype: int64

In [70]:
#holiday.head()

session_id
21669     1
54843     1
77292     1
114021    1
146670    1
Name: time1, dtype: int32

In [85]:
#full_new_feat = pd.DataFrame(index=train_df.index)
#full_new_feat['start_hour'] = train_df['time1'].apply(lambda ts: ts.hour).astype('float64')
#full_new_feat['day_in_month'] = train_df['time1'].apply(lambda ts: ts.day ).astype('float64')

In [86]:
#full_new_feat.head()

Unnamed: 0_level_0,start_hour,day_in_month
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1
21669,8.0,12.0
54843,8.0,12.0
77292,8.0,12.0
114021,8.0,12.0
146670,8.0,12.0


In [87]:
#tmp = StandardScaler().fit_transform(full_new_feat[['start_hour', 'day_in_month']])

In [88]:
#len(tmp)

253561

In [89]:
#tmp

array([[-1.35736646, -0.85723139],
       [-1.35736646, -0.85723139],
       [-1.35736646, -0.85723139],
       ...,
       [ 3.39034912,  1.50227179],
       [ 3.39034912,  1.50227179],
       [ 3.39034912,  1.50227179]])

In [79]:
#holiday.head()

session_id
21669     1
54843     1
77292     1
114021    1
146670    1
Name: time1, dtype: int32

In [83]:
#len(holiday.values.reshape(-1, 1))

253561

In [91]:
#%%time
#X_train_new6 = add_stand_features(train_df.fillna(0), X_train_new1)
#X_test_new6 = add_stand_features(test_df.fillna(0), X_test_new1)

Wall time: 2min 44s


In [92]:
#%%time
#cv_scores = cross_val_score(logit, X_train_new6, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=2) # hangs with n_jobs > 1, and locally this runs much faster

Wall time: 30.8 s


In [93]:
##%%time
#logit_grid_searcher.fit(X_train_new6, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   53.8s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:  8.0min finished


Wall time: 8min 5s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=2,
       param_grid={'C': array([1.00000e-02, 2.78256e-02, 7.74264e-02, 2.15443e-01, 5.99484e-01,
       1.66810e+00, 4.64159e+00, 1.29155e+01, 3.59381e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [94]:
#logit_grid_searcher.best_score_, logit_grid_searcher.best_params_
# (0.9081512175544284, {'C': 0.21544346900318834})

(0.9081512175544284, {'C': 0.21544346900318834})

In [None]:
### 7

In [95]:
fastlogist = LogisticRegression(C=0.21544346900318834, random_state=17)

In [97]:
def add_features7(df, X_sparse):
    hour = df['time1'].apply(lambda ts: ts.hour)
    morning = ((hour >= 7) & (hour <= 11)).astype('int')
    day = ((hour >= 12) & (hour <= 18)).astype('int')
    evening = ((hour >= 19) & (hour <= 23)).astype('int')
    night = ((hour >= 0) & (hour <= 6)).astype('int')
    
    dayinweek = df['time1'].apply(lambda ts: ts.weekday())
    weekend = ((dayinweek >= 5)).astype('int')
    weekday = ((dayinweek < 5)).astype('int')
    
    full_new_feat = pd.DataFrame(index=df.index)
    full_new_feat['start_hour'] = df['time1'].apply(lambda ts: ts.hour).astype('float64')
    full_new_feat['day_in_month'] = df['time1'].apply(lambda ts: ts.day ).astype('float64')
    full_new_feat['dayinweek'] = df['time1'].apply (lambda ts: ts.weekday()).astype('float64')
    full_new_feat['n_unique_sites'] = df.apply (lambda row: label_unique_site (row),axis=1).astype('float64')
   
    
    features = ['start_hour', 'day_in_month', 'dayinweek', 'n_unique_sites']
    tmp = StandardScaler().fit_transform(full_new_feat[features])
    
    X = hstack([X_sparse, morning.values.reshape(-1, 1), 
                day.values.reshape(-1, 1), evening.values.reshape(-1, 1), 
                night.values.reshape(-1, 1)
                , weekend.values.reshape(-1, 1)
                , weekday.values.reshape(-1, 1)
                , tmp[:, :]
               ])
    return X

In [98]:
#%%time
#X_train_new7 = add_features7(train_df.fillna(0), X_train)
#X_test_new7 = add_features7(test_df.fillna(0), X_test)

Wall time: 2min 7s


In [99]:
#%%time
#cv_scores = cross_val_score(fastlogist, X_train_new7, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=2) # hangs with n_jobs > 1, and locally this runs much faster

Wall time: 16.9 s


In [100]:
#cv_scores, cv_scores.mean()
# 0.9081530648461433

(array([0.80807699, 0.71559309, 0.91968498, 0.97513569, 0.89239026,
        0.95858788, 0.94293394, 0.92419185, 0.97359981, 0.97133616]),
 0.9081530648461433)

In [102]:
#%%time
#fastlogist.fit(X_train_new7, y_train)



Wall time: 6.47 s


LogisticRegression(C=0.21544346900318834, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=17,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)

In [103]:
#logit_test_pred7 = fastlogist.predict_proba(X_test_new7)[:, 1]
#write_to_submission_file(logit_test_pred7, 'subm_self7.csv')

In [104]:
#%%time
#logit_grid_searcher.fit(X_train_new7, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   54.1s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:  7.8min finished


Wall time: 7min 57s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=2,
       param_grid={'C': array([1.00000e-02, 2.78256e-02, 7.74264e-02, 2.15443e-01, 5.99484e-01,
       1.66810e+00, 4.64159e+00, 1.29155e+01, 3.59381e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [105]:
#logit_grid_searcher.best_score_, logit_grid_searcher.best_params_
# (0.9081530648461433, {'C': 0.21544346900318834})

(0.9081530648461433, {'C': 0.21544346900318834})

In [None]:
#### 8

In [None]:
### no more use the original data nummber

In [None]:
#train_df.head()

In [110]:
alice_web_statistic = {}
alice_web_count_statistic = {}

def getalice_feature(row):
    s = set()
    for i in sites:
        webid = row[i]
        s.add(webid)
        if webid in alice_web_statistic:
            alice_web_statistic[webid] = alice_web_statistic[webid] + 1
        else:
            alice_web_statistic[webid] = 1
            pass
        pass
    webcount = len(s)
    if webcount in alice_web_count_statistic:
        alice_web_count_statistic[webcount] = alice_web_count_statistic[webcount] + 1
    else:
        alice_web_count_statistic[webcount] = 1
        pass
    pass

train_df[train_df['target'] == 1][sites].fillna(0).astype('int').apply(lambda row: getalice_feature(row),axis=1)

session_id
251175    None
196388    None
172448    None
70129     None
206254    None
167235    None
54979     None
77902     None
203387    None
104441    None
76310     None
195629    None
213093    None
157429    None
65043     None
78974     None
226267    None
220102    None
138364    None
144809    None
131623    None
223045    None
96047     None
153722    None
141517    None
78455     None
28854     None
137033    None
19962     None
16185     None
          ... 
70568     None
61740     None
186166    None
26501     None
186526    None
10380     None
213886    None
248700    None
242618    None
127373    None
42411     None
3041      None
4129      None
25151     None
203856    None
86083     None
213421    None
80207     None
167067    None
183428    None
252357    None
148268    None
106636    None
223749    None
25272     None
188473    None
40139     None
205622    None
60720     None
244233    None
Length: 2297, dtype: object

In [111]:
#alice_web_count_statistic

{4: 241, 3: 141, 5: 370, 6: 373, 10: 61, 9: 206, 8: 336, 7: 467, 2: 74, 1: 28}

In [113]:
#alice_web_statistic

In [118]:
alice_web_count = [(k, alice_web_count_statistic[k]) for k in sorted(alice_web_count_statistic, key=alice_web_count_statistic.get, reverse=True)][: 5]

In [119]:
alice_love_web = [(k, alice_web_statistic[k]) for k in sorted(alice_web_statistic, key=alice_web_statistic.get, reverse=True)][: 10]

In [121]:
#alice_web_statistic[0]

In [144]:
def match_alice_love_web(row):
    count = 0
    for i in sites:
        if row[i] in alice_love_web:
            count += 1
            pass
        pass
    return count > 5

In [145]:
def match_alice_web_count(row):
    s = set()
    for i in sites:
        if row[i] != 0:
            s.add(row[i])
            pass
        pass
    return sum(s) in alice_web_count

In [146]:
def add_features8(df, X_sparse):
    hour = df['time1'].apply(lambda ts: ts.hour)
    morning = ((hour >= 7) & (hour <= 11)).astype('int')
    day = ((hour >= 12) & (hour <= 18)).astype('int')
    evening = ((hour >= 19) & (hour <= 23)).astype('int')
    night = ((hour >= 0) & (hour <= 6)).astype('int')
    
    dayinweek = df['time1'].apply(lambda ts: ts.weekday())
    weekend = ((dayinweek >= 5)).astype('int')
    weekday = ((dayinweek < 5)).astype('int')
    
    web_count_match = df.apply (lambda row: match_alice_web_count (row),axis=1).astype('int')
    
    web_type = df.apply (lambda row: match_alice_love_web (row),axis=1)
   
    
    X = hstack([X_sparse, morning.values.reshape(-1, 1), 
                day.values.reshape(-1, 1), evening.values.reshape(-1, 1), 
                night.values.reshape(-1, 1)
                , weekend.values.reshape(-1, 1)
                , weekday.values.reshape(-1, 1)
                , web_count_match.values.reshape(-1, 1)
                , web_type.values.reshape(-1, 1)
               ])
    return X

In [147]:
#%%time
#X_train_new8 = add_features8(train_df.fillna(0), X_train)
#X_test_new8 = add_features8(test_df.fillna(0), X_test)

Wall time: 2min 24s


In [148]:
#%%time
#cv_scores = cross_val_score(fastlogist, X_train_new8, y_train, cv=time_split, 
#                            scoring='roc_auc', n_jobs=2) # hangs with n_jobs > 1, and locally this runs much faster

Wall time: 15.8 s


In [149]:
#cv_scores, cv_scores.mean()
# 0.914169880747776

(array([0.88686294, 0.75522478, 0.87320259, 0.97673136, 0.91206056,
        0.94753728, 0.95367394, 0.93131575, 0.95734345, 0.94774616]),
 0.914169880747776)

In [127]:
#%%time
#fastlogist.fit(X_train_new8, y_train)



Wall time: 6.46 s


LogisticRegression(C=0.21544346900318834, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=17,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)

In [128]:
#logit_test_pred8 = fastlogist.predict_proba(X_test_new8)[:, 1]
#write_to_submission_file(logit_test_pred8, 'subm_self8.csv')

In [129]:
#%%time
#logit_grid_searcher.fit(X_train_new8, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   54.4s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:  7.9min finished


Wall time: 8min 3s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=2,
       param_grid={'C': array([1.00000e-02, 2.78256e-02, 7.74264e-02, 2.15443e-01, 5.99484e-01,
       1.66810e+00, 4.64159e+00, 1.29155e+01, 3.59381e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [130]:
#logit_grid_searcher.best_score_, logit_grid_searcher.best_params_
# (0.9081530648461433, {'C': 0.21544346900318834})

(0.9081530648461433, {'C': 0.21544346900318834})

In [None]:
### 9

In [150]:

alice_web_count_score = {}

count = 0
for key, val in alice_web_count_statistic.items():
    count += val
    pass
for key, val in alice_web_count_statistic.items():
    alice_web_count_score[key] = val * 1.0 / count
    pass

In [151]:

alice_web_score = {}
count = 0

for key, val in alice_web_statistic.items():
    count += val
    pass
for key, val in alice_web_statistic.items():
    alice_web_score[key] = val * 1.0 / count
    pass

In [152]:
def get_web_score(row):
    score = 0
    for i in sites:
        if row[i] != 0:
            if row[i] in alice_web_count_score:
                score += alice_web_count_score[row[i]]
            pass
        pass
    return score
    pass

In [158]:
def get_web_count_score(row):
    count = 0
    for i in sites:
        if row[i] != 0:
            count += 1
            pass
        pass
    if count in alice_web_count_score:
        return alice_web_count_score[count]
    else:
        return 0
    pass

In [163]:
def add_features9(df, X_sparse):
    hour = df['time1'].apply(lambda ts: ts.hour)
    morning = ((hour >= 7) & (hour <= 11)).astype('int')
    day = ((hour >= 12) & (hour <= 18)).astype('int')
    evening = ((hour >= 19) & (hour <= 23)).astype('int')
    night = ((hour >= 0) & (hour <= 6)).astype('int')
    
    dayinweek = df['time1'].apply(lambda ts: ts.weekday())
    weekend = ((dayinweek >= 5)).astype('int')
    weekday = ((dayinweek < 5)).astype('int')
    
    #web_count_score = df.apply (lambda row: get_web_count_score (row),axis=1).astype('float64')  
    web_score = df.apply (lambda row: get_web_score (row),axis=1).astype('float64')
   
    
    X = hstack([X_sparse, morning.values.reshape(-1, 1), 
                day.values.reshape(-1, 1), evening.values.reshape(-1, 1), 
                night.values.reshape(-1, 1)
                , weekend.values.reshape(-1, 1)
                , weekday.values.reshape(-1, 1)
                #, web_count_score.values.reshape(-1, 1)
                , web_score.values.reshape(-1, 1)
               ])
    return X

In [164]:
%%time
X_train_new9 = add_features9(train_df.fillna(0), X_train)
X_test_new9 = add_features9(test_df.fillna(0), X_test)

Wall time: 2min 4s


In [165]:
%%time
cv_scores = cross_val_score(fastlogist, X_train_new9, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=2) # hangs with n_jobs > 1, and locally this runs much faster

Wall time: 15.7 s


In [166]:
cv_scores, cv_scores.mean()
# 0.9141953509312539

(array([0.88698234, 0.75539107, 0.87293626, 0.97672051, 0.9120343 ,
        0.94751517, 0.95366912, 0.93137957, 0.95714482, 0.94818035]),
 0.9141953509312539)

In [167]:
np.logspace(-2, 2, 10)

array([1.00000000e-02, 2.78255940e-02, 7.74263683e-02, 2.15443469e-01,
       5.99484250e-01, 1.66810054e+00, 4.64158883e+00, 1.29154967e+01,
       3.59381366e+01, 1.00000000e+02])

In [None]:
## 10

In [170]:
np.linspace(0.1, 1, 10)

array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])

In [171]:
def add_features10(df, X_sparse):
    hour = df['time1'].apply(lambda ts: ts.hour)
    morning = ((hour >= 7) & (hour <= 11)).astype('int')
    day = ((hour >= 12) & (hour <= 18)).astype('int')
    evening = ((hour >= 19) & (hour <= 23)).astype('int')
    night = ((hour >= 0) & (hour <= 6)).astype('int')
    
    dayinweek = df['time1'].apply(lambda ts: ts.weekday())
    week1 = ((dayinweek == 0)).astype('int')
    week2 = ((dayinweek == 1)).astype('int')
    week3 = ((dayinweek == 2)).astype('int')
    week4 = ((dayinweek == 3)).astype('int')
    week5 = ((dayinweek == 4)).astype('int')
    week6 = ((dayinweek == 5)).astype('int')
    week7 = ((dayinweek == 6)).astype('int')
   
    
    X = hstack([X_sparse, morning.values.reshape(-1, 1), 
                day.values.reshape(-1, 1), evening.values.reshape(-1, 1), 
                night.values.reshape(-1, 1)
                , week1.values.reshape(-1, 1)
                , week2.values.reshape(-1, 1)
                , week3.values.reshape(-1, 1)
                , week4.values.reshape(-1, 1)
                , week5.values.reshape(-1, 1)
                , week6.values.reshape(-1, 1)
                , week7.values.reshape(-1, 1)
               ])
    return X

In [172]:
%%time
X_train_new10 = add_features10(train_df.fillna(0), X_train)
X_test_new10 = add_features10(test_df.fillna(0), X_test)

Wall time: 1min 24s


In [173]:
%%time
cv_scores = cross_val_score(fastlogist, X_train_new10, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=-1) # hangs with n_jobs > 1, and locally this runs much faster

Wall time: 11.1 s


In [174]:
cv_scores, cv_scores.mean()
# 0.9138388958077528

(array([0.81850013, 0.79404042, 0.90053997, 0.98191459, 0.93380106,
        0.96377177, 0.94524072, 0.9499836 , 0.88747843, 0.96311826]),
 0.9138388958077528)

In [175]:
logit_grid_searcher2 = GridSearchCV(estimator=logit, param_grid={'C': np.linspace(0.1, 1, 10)},
                                  scoring='roc_auc', n_jobs=-1, cv=time_split, verbose=1)

In [176]:
logit_grid_searcher2.fit(X_train_new10, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   38.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.2min finished


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [177]:
logit_grid_searcher2.best_score_, logit_grid_searcher2.best_params_

(0.9139053650214148, {'C': 0.30000000000000004})

In [178]:
#logit_grid_searcher3 = GridSearchCV(estimator=logit, param_grid={'C': np.linspace(0.1, 1, 10)},
#                                  scoring='accuracy', n_jobs=-1, cv=time_split, verbose=1)

In [179]:
#logit_grid_searcher3.fit(X_train_new10, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   38.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.1min finished


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [180]:
#logit_grid_searcher3.best_score_, logit_grid_searcher3.best_params_

(0.9913496160687172, {'C': 0.1})

In [185]:
#logit_test_pred10 = logit_grid_searcher3.predict_proba(X_test_new10)[:, 1]
#write_to_submission_file(logit_test_pred10, 'subm_self10.csv')

In [182]:
#logit_grid_searcher4 = GridSearchCV(estimator=logit, param_grid={'C': c_values},
#                                  scoring='accuracy', n_jobs=-1, cv=time_split, verbose=1)

In [183]:
#logit_grid_searcher4.fit(X_train_new10, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   25.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  4.5min finished


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': array([1.00000e-02, 2.78256e-02, 7.74264e-02, 2.15443e-01, 5.99484e-01,
       1.66810e+00, 4.64159e+00, 1.29155e+01, 3.59381e+01, 1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=1)

In [184]:
#logit_grid_searcher4.best_score_, logit_grid_searcher4.best_params_

(0.9916185848770118, {'C': 0.01})

In [186]:
#logit_test_pred11 = logit_grid_searcher4.predict_proba(X_test_new10)[:, 1]
#write_to_submission_file(logit_test_pred11, 'subm_self11.csv')

In [188]:
np.logspace(-2, 2, 10)

array([1.00000000e-02, 2.78255940e-02, 7.74263683e-02, 2.15443469e-01,
       5.99484250e-01, 1.66810054e+00, 4.64158883e+00, 1.29154967e+01,
       3.59381366e+01, 1.00000000e+02])

In [187]:
np.logspace(-3, 1, 10)

array([1.00000000e-03, 2.78255940e-03, 7.74263683e-03, 2.15443469e-02,
       5.99484250e-02, 1.66810054e-01, 4.64158883e-01, 1.29154967e+00,
       3.59381366e+00, 1.00000000e+01])

In [189]:
c_values = np.logspace(-3, 1, 10)

logit_grid_searcher_1 = GridSearchCV(estimator=logit, param_grid={'C': c_values},
                                  scoring='roc_auc', n_jobs=-1, cv=time_split, verbose=1)

In [190]:
#logit_grid_searcher_1.fit(X_train_new10, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   15.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.2min finished


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': array([1.00000e-03, 2.78256e-03, 7.74264e-03, 2.15443e-02, 5.99484e-02,
       1.66810e-01, 4.64159e-01, 1.29155e+00, 3.59381e+00, 1.00000e+01])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [191]:
#logit_grid_searcher_1.best_score_, logit_grid_searcher_1.best_params_
# 0.913374049587434

(0.913374049587434, {'C': 0.46415888336127775})

In [192]:
#%%time
#logit_grid_searcher_1.fit(X_train_new1, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   14.8s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.3min finished


Wall time: 2min 27s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': array([1.00000e-03, 2.78256e-03, 7.74264e-03, 2.15443e-02, 5.99484e-02,
       1.66810e-01, 4.64159e-01, 1.29155e+00, 3.59381e+00, 1.00000e+01])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [193]:
#logit_grid_searcher_1.best_score_, logit_grid_searcher_1.best_params_
# 0.9181526261645014

(0.9181526261645014, {'C': 0.1668100537200059})

In [194]:
np.linspace(0.15, 0.25, 10)

array([0.15      , 0.16111111, 0.17222222, 0.18333333, 0.19444444,
       0.20555556, 0.21666667, 0.22777778, 0.23888889, 0.25      ])

In [195]:
logit_grid_searcher_2 = GridSearchCV(estimator=logit, param_grid={'C': np.linspace(0.15, 0.25, 10)},
                                  scoring='roc_auc', n_jobs=-1, cv=time_split, verbose=1)

In [196]:
%%time
logit_grid_searcher_2.fit(X_train_new1, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   36.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.6min finished


Wall time: 1min 45s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': array([0.15   , 0.16111, 0.17222, 0.18333, 0.19444, 0.20556, 0.21667,
       0.22778, 0.23889, 0.25   ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [197]:
logit_grid_searcher_2.best_score_, logit_grid_searcher_2.best_params_

(0.9183729937045094, {'C': 0.22777777777777777})

In [198]:
logit_grid_searcher_3 = GridSearchCV(estimator=logit, param_grid={'C': np.linspace(0.22, 0.6, 20)},
                                  scoring='roc_auc', n_jobs=-1, cv=time_split, verbose=1)

In [199]:
%%time
logit_grid_searcher_3.fit(X_train_new1, y_train)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   39.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  3.9min finished


Wall time: 4min 4s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': array([0.22, 0.24, 0.26, 0.28, 0.3 , 0.32, 0.34, 0.36, 0.38, 0.4 , 0.42,
       0.44, 0.46, 0.48, 0.5 , 0.52, 0.54, 0.56, 0.58, 0.6 ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [200]:
logit_grid_searcher_3.best_score_, logit_grid_searcher_3.best_params_

(0.9183505761017501, {'C': 0.24})

In [201]:
logit_grid_searcher_4 = GridSearchCV(estimator=logit, param_grid={'C': np.linspace(0.22, 0.25, 10)},
                                  scoring='roc_auc', n_jobs=-1, cv=time_split, verbose=1)

In [202]:
%%time
logit_grid_searcher_4.fit(X_train_new1, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   38.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.6min finished


Wall time: 1min 46s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': array([0.22   , 0.22333, 0.22667, 0.23   , 0.23333, 0.23667, 0.24   ,
       0.24333, 0.24667, 0.25   ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [203]:
logit_grid_searcher_4.best_score_, logit_grid_searcher_4.best_params_

(0.9183754679584685, {'C': 0.22666666666666666})

In [204]:
logit_grid_searcher_5 = GridSearchCV(estimator=logit, param_grid={'C': np.linspace(0.223, 0.229, 10)},
                                  scoring='roc_auc', n_jobs=-1, cv=time_split, verbose=1)

In [205]:
%%time
logit_grid_searcher_5.fit(X_train_new1, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   40.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.6min finished


Wall time: 1min 46s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise-deprecating',
       estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=17, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'C': array([0.223  , 0.22367, 0.22433, 0.225  , 0.22567, 0.22633, 0.227  ,
       0.22767, 0.22833, 0.229  ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [206]:
logit_grid_searcher_5.best_score_, logit_grid_searcher_5.best_params_

(0.9183764088859656, {'C': 0.227})

In [208]:
logit_test_pred12 = logit_grid_searcher_5.predict_proba(X_test_new1)[:, 1]
write_to_submission_file(logit_test_pred12, 'subm_self12.csv')

In [None]:
### 11