In [122]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler
%matplotlib inline


In [2]:
sites = ['site' + str(i) for i in range(1,11)]
times = ['time' + str(i) for i in range(1, 11)]

df_train = pd.read_csv('D:/alice/train.csv', index_col='session_id', parse_dates=times)
df_test = pd.read_csv('D:/alice/test.csv', index_col='session_id', parse_dates=times)

In [3]:
df_train[sites] = df_train[sites].fillna(0).astype('int')
df_test[sites] = df_test[sites].fillna(0).astype('int')

In [4]:
# site names dict
with open('D:/alice/site_dic.pkl', 'rb') as input_file:
    site_dict = pickle.load(input_file)

In [5]:
# Inverted dict
new_dict = {}
for key in site_dict:
    new_dict[site_dict[key]] = key

In [6]:
df_train[sites].fillna(0).to_csv('train_sessions_text.txt', 
                                 sep=' ', index=None, header=None)
df_test[sites].fillna(0).to_csv('test_sessions_text.txt', 
                                sep=' ', index=None, header=None)

In [7]:
# for i in df_train[sites]:
#     df_train[i] = df_train[i].apply(lambda x:new_dict[x] if x!=0 else '0')

In [8]:
list_sites = []
for row in df_train[sites].values:
    row_sites = ' '. join([str(x) for x in row if x!=0])
    list_sites.append(row_sites)

In [9]:
list_sites_names = []
for row in df_train[sites].values:
    row_sites = ' '.join([new_dict[x] for x in row if x!=0])
    list_sites_names.append(row_sites)


In [210]:
cv = CountVectorizer(ngram_range=(1, 3), max_features=60000)

In [211]:
%%time

with open('train_sessions_text.txt') as inp_train_file:
    X = cv.fit_transform(inp_train_file)
with open('test_sessions_text.txt') as inp_test_file:
    X_test = cv.transform(inp_test_file)

print(X.shape, X_test.shape)

(253561, 60000) (82797, 60000)
Wall time: 30.2 s


In [212]:
def get_auc_lr_valid(X, y, C=1.0, seed=17, ratio = 0.9):
    
    # Split the data into the training and validation sets
    idx = int(round(X.shape[0] * ratio))
    
    # Classifier training
    lr = LogisticRegression(C=C, random_state=seed, solver='lbfgs', max_iter=300).fit(X[:idx, :], y[:idx])
    
    # Prediction for validation set
    y_pred = lr.predict_proba(X[idx:, :])[:, 1]
    
    # Calculate the quality
    score = roc_auc_score(y[idx:], y_pred)
    
    return score

In [213]:
y = df_train['target'].values

In [14]:
# %%time
# Calculate metric on the validation set. 90% of train data for training. 10% for validation.

# print(get_auc_lr_valid(X, y))

In [201]:
time_split = TimeSeriesSplit(n_splits=10)

In [202]:
logit = LogisticRegression(C=1, random_state=17, solver='liblinear')

In [17]:
%%time

cv_scores = cross_val_score(logit, X, y, cv=time_split,
                            scoring='roc_auc', n_jobs=-1)  # hangs with n_jobs > 1, and locally this runs much faster

Wall time: 27.6 s


In [53]:
X.shape

(253561, 60000)

In [52]:
[(el[0].shape, el[1].shape) for el in time_split.split(X)]


[((23051,), (23051,)),
 ((46102,), (23051,)),
 ((69153,), (23051,)),
 ((92204,), (23051,)),
 ((115255,), (23051,)),
 ((138306,), (23051,)),
 ((161357,), (23051,)),
 ((184408,), (23051,)),
 ((207459,), (23051,)),
 ((230510,), (23051,))]

In [18]:
cv_scores.mean()

0.944228607775516

In [19]:
# Function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = range(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [20]:
logit.fit(X, y)

LogisticRegression(C=1, random_state=17, solver='liblinear')

In [21]:
logit_test_pred = logit.predict_proba(X_test)[:, 1]
write_to_submission_file(logit_test_pred, 'subm1.csv') # 0.91288

In [216]:
def time_between(row, first_site=1, last_site=0):    
    if last_site == 0:
        last_site = row['count_sites']
    if first_site > last_site:
        first_site, last_site = last_site, first_site
        return row['time' + str(last_site)] - row['time' + str(first_site)]
    return row['time' + str(last_site)] - row['time' + str(first_site)]

def count_row(row):
    counter = 0
    for i in row:
        if i != 0:
            counter += 1
    return counter

def total_time(row, mean=False):
    last_site_number = row['count_sites']
    if mean == True and last_site_number != 1:      
        return (row['time' + str(last_site_number)] - row['time1']) / last_site_number
    
    
    return row['time' + str(last_site_number)] - row['time1']

In [240]:
def add_time_features(df, X_sparse):
    df['count_sites'] = df[sites].apply(lambda x: count_row(x), axis=1).values
    dummied_count_sites = pd.get_dummies(df, columns=['count_sites'], sparse=True)[
        ['count_sites_' + str(i) for i in range(1, 10)]]
    
    
    df['session_during_sec'] = df.apply(lambda x: total_time(x).seconds, axis=1) + 0.001
    df['mean_time_per_site'] = df.apply(lambda x: total_time(x, mean=True).seconds, axis=1) + 0.001
    
    
    hour = df['time1'].apply(lambda ts: ts.hour)
    month = df['time1'].apply(lambda ts: ts.month)
    is_january = (month == 1).astype('int')
    is_may = (month == 5).astype('int')
    morning = ((hour >= 7) & (hour <= 11)).astype('int')
    day = ((hour >= 12) & (hour <= 18)).astype('int')
    evening = ((hour >= 19) & (hour <= 23)).astype('int')
    night = ((hour >= 0) & (hour <= 6)).astype('int')

    X = hstack([X_sparse, morning.values.reshape(-1, 1),
                day.values.reshape(-1, 1), evening.values.reshape(-1, 1),
                night.values.reshape(-1, 1), dummied_count_sites.values,
                is_january.values.reshape(-1, 1), is_may.values.reshape(-1, 1),
#                 np.log(df['session_during_sec'].values.reshape(-1, 1)),
#                np.log(df['mean_time_per_site'].values.reshape(-1, 1))
               ])
    
    return X

In [218]:
df_train.apply(lambda x: total_time(x).seconds, axis=1)

session_id
1           0
2          26
3           7
4         270
5         246
         ... 
253557     59
253558      3
253559     87
253560      2
253561      3
Length: 253561, dtype: int64

In [219]:
df_train.apply(lambda x: total_time(x, mean=True).seconds, axis=1)

session_id
1          0
2          2
3          0
4         27
5         24
          ..
253557     5
253558     0
253559    17
253560     0
253561     0
Length: 253561, dtype: int64

In [205]:
X[:, 59999:]

<253561x1 sparse matrix of type '<class 'numpy.int64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [241]:
%%time
X_train_new = add_time_features(df_train.fillna(0), X)
X_test_new = add_time_features(df_test.fillna(0), X_test)

Wall time: 1min 20s


In [242]:
X_train_new.shape, X_test_new.shape


((253561, 60015), (82797, 60015))

In [243]:
%%time
cv_scores = cross_val_score(logit, X_train_new, y, cv=time_split, 
                            scoring='roc_auc', n_jobs=-1) # hangs with n_jobs > 1, and locally this runs much faster

Wall time: 27.3 s


In [244]:
cv_scores, cv_scores.mean()

(array([0.93756605, 0.95362358, 0.96835939, 0.96955161, 0.96539819,
        0.98082901, 0.97900392, 0.96912628, 0.97684463, 0.97808857]),
 0.9678391252732007)

In [155]:
logit.fit(X_train_new, y)

LogisticRegression(C=1, random_state=17, solver='liblinear')

In [156]:
logit_test_pred2 = logit.predict_proba(X_test_new)[:, 1]
write_to_submission_file(logit_test_pred2, 'subm2.csv') 

In [214]:
df_train

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time7,site8,time8,site9,time9,site10,time10,target,count_sites,session_during_sec
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,718,2014-02-20 10:02:45,0,NaT,0,NaT,0,NaT,0,NaT,...,NaT,0,NaT,0,NaT,0,NaT,0,1,0
2,890,2014-02-22 11:19:50,941,2014-02-22 11:19:50,3847,2014-02-22 11:19:51,941,2014-02-22 11:19:51,942,2014-02-22 11:19:51,...,2014-02-22 11:19:52,3846,2014-02-22 11:19:52,1516,2014-02-22 11:20:15,1518,2014-02-22 11:20:16,0,10,26
3,14769,2013-12-16 16:40:17,39,2013-12-16 16:40:18,14768,2013-12-16 16:40:19,14769,2013-12-16 16:40:19,37,2013-12-16 16:40:19,...,2013-12-16 16:40:20,14768,2013-12-16 16:40:21,14768,2013-12-16 16:40:22,14768,2013-12-16 16:40:24,0,10,7
4,782,2014-03-28 10:52:12,782,2014-03-28 10:52:42,782,2014-03-28 10:53:12,782,2014-03-28 10:53:42,782,2014-03-28 10:54:12,...,2014-03-28 10:55:12,782,2014-03-28 10:55:42,782,2014-03-28 10:56:12,782,2014-03-28 10:56:42,0,10,270
5,22,2014-02-28 10:53:05,177,2014-02-28 10:55:22,175,2014-02-28 10:55:22,178,2014-02-28 10:55:23,177,2014-02-28 10:55:23,...,2014-02-28 10:55:59,177,2014-02-28 10:55:59,177,2014-02-28 10:57:06,178,2014-02-28 10:57:11,0,10,246
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253557,3474,2013-11-25 10:26:54,3474,2013-11-25 10:26:58,141,2013-11-25 10:27:03,2428,2013-11-25 10:27:04,106,2013-11-25 10:27:13,...,2013-11-25 10:27:28,2428,2013-11-25 10:27:40,2428,2013-11-25 10:27:52,148,2013-11-25 10:27:53,0,10,59
253558,12727,2013-12-03 16:01:15,12727,2013-12-03 16:01:16,2215,2013-12-03 16:01:16,38,2013-12-03 16:01:17,2215,2013-12-03 16:01:17,...,2013-12-03 16:01:18,2215,2013-12-03 16:01:18,23,2013-12-03 16:01:18,21,2013-12-03 16:01:18,0,10,3
253559,2661,2013-12-09 14:05:03,15004,2013-12-09 14:05:10,5562,2013-12-09 14:05:10,5562,2013-12-09 14:06:29,5562,2013-12-09 14:06:30,...,NaT,0,NaT,0,NaT,0,NaT,0,5,87
253560,812,2013-12-19 15:20:22,676,2013-12-19 15:20:22,814,2013-12-19 15:20:22,22,2013-12-19 15:20:22,39,2013-12-19 15:20:22,...,2013-12-19 15:20:23,570,2013-12-19 15:20:23,22,2013-12-19 15:20:24,570,2013-12-19 15:20:24,0,10,2


In [157]:
c_values = np.logspace(-2, 2, 10)

logit_grid_searcher = GridSearchCV(estimator=logit, param_grid={'C': c_values},
                                  scoring='roc_auc', n_jobs=-1, cv=time_split, verbose=1)

In [158]:
%%time
logit_grid_searcher.fit(X_train_new, y)

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Wall time: 4min 45s


GridSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=10, test_size=None),
             estimator=LogisticRegression(C=1, random_state=17,
                                          solver='liblinear'),
             n_jobs=-1,
             param_grid={'C': array([1.00000000e-02, 2.78255940e-02, 7.74263683e-02, 2.15443469e-01,
       5.99484250e-01, 1.66810054e+00, 4.64158883e+00, 1.29154967e+01,
       3.59381366e+01, 1.00000000e+02])},
             scoring='roc_auc', verbose=1)

In [159]:
logit_grid_searcher.best_score_, logit_grid_searcher.best_params_

(0.9663522269135161, {'C': 0.5994842503189409})

In [None]:
#0.96

In [41]:
logit_test_pred3 = logit_grid_searcher.predict_proba(X_test_new)[:, 1]
write_to_submission_file(logit_test_pred3, 'subm3.csv') # 0.94242