In [2]:
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

### Download data from Kaggle

In [3]:
#!kaggle competitions download -c catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2
#!unzip catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2.zip;
#!rm catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2.zip;
#!mkdir data
#!mv tra* tes* data/
#!mv s* data/

### Data preparation

In [4]:
# Read the train and test session tables the same way, keyed by session_id.
df_train, df_test = (
    pd.read_csv(f"data/{split}_sessions.csv", index_col="session_id")
    for split in ("train", "test")
)

In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 253561 entries, 1 to 253561
Data columns (total 21 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   site1   253561 non-null  int64  
 1   time1   253561 non-null  object 
 2   site2   250098 non-null  float64
 3   time2   250098 non-null  object 
 4   site3   246919 non-null  float64
 5   time3   246919 non-null  object 
 6   site4   244321 non-null  float64
 7   time4   244321 non-null  object 
 8   site5   241829 non-null  float64
 9   time5   241829 non-null  object 
 10  site6   239495 non-null  float64
 11  time6   239495 non-null  object 
 12  site7   237297 non-null  float64
 13  time7   237297 non-null  object 
 14  site8   235224 non-null  float64
 15  time8   235224 non-null  object 
 16  site9   233084 non-null  float64
 17  time9   233084 non-null  object 
 18  site10  231052 non-null  float64
 19  time10  231052 non-null  object 
 20  target  253561 non-null  int64  
dtypes: float64

#### As we can see, the time columns are not datetime objects, so we need to convert them

In [6]:
# Parse the time1..time10 columns into datetimes in both frames.
time_list = [f"time{i}" for i in range(1, 11)]
for frame in (df_train, df_test):
    frame[time_list] = frame[time_list].apply(pd.to_datetime)

In [7]:
# Order the sessions chronologically: time1 is the first timestamp of a session,
# so it is treated as the session start.
df_train = df_train.sort_values("time1")

In [8]:
df_train.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [9]:
# site2..site10 are stored as floats, but they are site ids and can be integers.
# NaN cannot be cast to int, so empty slots are filled with 0 before the cast.
sites = [f"site{i}" for i in range(1, 11)]
for frame in (df_train, df_test):
    frame[sites] = frame[sites].fillna(0).astype("int")

In [10]:
# Load the site dictionary.
# NOTE(review): pickle.load can execute arbitrary code; only open this file
# when it comes from a trusted source.
with open("./data/site_dic.pkl", "rb") as f:
    sites_dict = pickle.load(f)

# Turn the dictionary into a lookup table: dict values become the index,
# dict keys become the "site" column.
sites_dict_df = pd.DataFrame(
    {"site": list(sites_dict.keys())},
    index=list(sites_dict.values()),
)

In [11]:
sites_dict_df.shape

(48371, 1)

In [12]:
# create target variable
y_train = df_train["target"]

In [13]:
# Stack train (without the label) on top of test so both are encoded together;
# the train row count marks where to split them apart again.
idx_split_tr_test = len(df_train)
dffull_train_test = pd.concat([df_train.drop(columns="target"), df_test])

## first model: consider only sites 

#### __Idea__: build a model without time variables, considering only the sites visited in each session. We can identify Alice by her favorite sites, i.e. the ones she visits most often.

#### We want to build a matrix where every column is a boolean variable showing whether a given site from the dictionary was visited.

In [14]:
#dataset for OHE
df_sites = dffull_train_test[sites]
df_sites.head()

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21669,56,55,0,0,0,0,0,0,0,0
54843,56,55,56,55,0,0,0,0,0,0
77292,946,946,951,946,946,945,948,784,949,946
114021,945,948,949,948,945,946,947,945,946,946
146670,947,950,948,947,950,952,946,951,946,947


In [15]:
# Build the session-by-site sparse matrix directly from CSR components:
#   data    - a 1 for every (session, slot) pair,
#   indices - the site id found in that slot, used as the column index,
#   indptr  - row boundaries, one row per session, n_slots entries per row.
# The row width is read from the frame instead of being hard-coded, and the
# components are numpy arrays rather than a multi-million-element Python list.
n_slots = df_sites.shape[1]
sites_flatten = df_sites.values.flatten()
df_sites_sparse = csr_matrix((np.ones(sites_flatten.shape[0], dtype=int),
                              sites_flatten,
                              np.arange(0, sites_flatten.shape[0] + n_slots, n_slots)))
# Column 0 only collects the 0 placeholders used for empty slots, so drop it.
# NOTE(review): a site repeated within one session produces duplicate column
# indices; scipy appears to treat those as summed, so cells would hold visit
# counts rather than strict 0/1 flags - confirm if booleans are required.
df_sites_sparse = df_sites_sparse[:, 1:]

#### We got the same number of features as there are entries in the site dictionary

In [16]:
df_sites_sparse.shape

(336358, 48371)

#### Split the sparse matrix back into train/test

In [17]:
# Rows before the split index came from train, the rest from test.
X_sites_train, X_sites_test = (
    df_sites_sparse[:idx_split_tr_test],
    df_sites_sparse[idx_split_tr_test:],
)

In [18]:
#check dimensions
assert X_sites_train.shape[0] == y_train.shape[0]
assert X_sites_test.shape[0] == df_test.shape[0]

#### Function to calculate model performance on hold-out dataset

In [19]:
def get_auc_lr_valid(X, y, C=1.0, train_ratio=0.9, seed=17):
    """Fit logistic regression on the leading rows and score ROC AUC on the rest.

    The split is positional, not random: the first ``train_ratio`` share of
    the rows is used for fitting and the remaining rows form the hold-out
    set, so the row order of ``X`` and ``y`` decides what is held out.

    Parameters
    ----------
    X : matrix with a ``shape`` attribute that supports row slicing
        Feature matrix, one row per sample.
    y : array-like
        Labels, one per row of ``X``. Converted with ``np.asarray`` so the
        split is positional even when ``y`` is a Series with a custom index.
    C : float, default 1.0
        Passed to ``LogisticRegression`` as ``C``.
    train_ratio : float, default 0.9
        Share of leading rows used for fitting.
    seed : int, default 17
        Passed to ``LogisticRegression`` as ``random_state``.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on the hold-out rows.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length, or if ``train_ratio`` leaves the
        training part or the hold-out part empty.
    """
    n_rows = X.shape[0]
    # Plain array: slicing below must not depend on the labels of y's index.
    y_arr = np.asarray(y)
    if y_arr.shape[0] != n_rows:
        raise ValueError(
            f"X has {n_rows} rows but y has {y_arr.shape[0]} labels"
        )

    train_slice = int(train_ratio * n_rows)
    if not 0 < train_slice < n_rows:
        raise ValueError(
            f"train_ratio={train_ratio} leaves an empty train or hold-out part "
            f"for {n_rows} rows"
        )

    X_train, X_holdout = X[:train_slice], X[train_slice:]
    y_train, y_holdout = y_arr[:train_slice], y_arr[train_slice:]

    log_reg = LogisticRegression(C=C, n_jobs=-1, random_state=seed)
    log_reg.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    holdout_pred = log_reg.predict_proba(X_holdout)[:, 1]

    return roc_auc_score(y_holdout, holdout_pred)
    

#### Get the ROC AUC score on the hold-out part of the training data

In [20]:
%%time 
get_auc_lr_valid(X_sites_train, y_train)

CPU times: user 143 ms, sys: 64.3 ms, total: 207 ms
Wall time: 4.61 s


0.9197957084494166

#### Re-train model on the whole data

In [21]:
%%time
# Refit on all training sessions (no hold-out) before predicting on test.
log_reg = LogisticRegression(n_jobs=-1, random_state=17)
log_reg.fit(X_sites_train, y_train)

CPU times: user 101 ms, sys: 16.4 ms, total: 117 ms
Wall time: 4.24 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='l2', random_state=17,
                   solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

#### Create Kaggle file submission

In [22]:
def create_submission_file(predicted_labels, out_file,
                            target = "target", index_label="session_id"):
    """Write predictions to a CSV with a 1-based id column.

    Parameters
    ----------
    predicted_labels : array with a ``shape`` attribute
        One prediction per row; row ``i`` (0-based) is written with id ``i + 1``.
    out_file : path or writable buffer
        Destination handed to ``DataFrame.to_csv``.
    target : str, default "target"
        Header of the prediction column.
    index_label : str, default "session_id"
        Header of the id column.
    """
    n_rows = predicted_labels.shape[0]
    # Ids run 1..n_rows in the order the predictions were given.
    row_ids = np.arange(1, n_rows + 1)
    submission = pd.DataFrame(data=predicted_labels, index=row_ids, columns=[target])
    submission.to_csv(out_file, index_label=index_label)

In [23]:
test_pred = log_reg.predict_proba(X_sites_test)[:,1]

In [24]:
create_submission_file(test_pred, "first_submission.csv")

# First submission 
### Public leaderboard: 0.90734

## Add more features

#### __Idea__: The first model was based only on sites. We can improve it by adding information about time

#### create new dataframe to combine different features

In [25]:
# Empty feature frames that share the row index of the train / test sessions.
new_feat_train, new_feat_test = (
    pd.DataFrame(index=frame.index) for frame in (df_train, df_test)
)

#### Add new feature year_month, that shows month and year of the session. This feature will contribute as linear trend.

In [26]:
# Encode the session start as YYYYMM (e.g. 201301) with the vectorized .dt
# accessor instead of a per-row Python lambda.
# NOTE(review): YYYYMM is not evenly spaced across a year boundary
# (201312 -> 201401), so it is only roughly a linear trend.
new_feat_train["year_month"] = df_train["time1"].dt.year * 100 + df_train["time1"].dt.month
new_feat_test["year_month"] = df_test["time1"].dt.year * 100 + df_test["time1"].dt.month

#### In linear models we need to scale our features to get a better learning process

In [27]:
# Fit the scaler on train only, then apply the same transform to train and test.
train_year_month = new_feat_train["year_month"].to_numpy().reshape(-1, 1)
test_year_month = new_feat_test["year_month"].to_numpy().reshape(-1, 1)

scaler = StandardScaler()
scaler.fit(train_year_month)

new_feat_train["year_month_scaled"] = scaler.transform(train_year_month)
new_feat_test["year_month_scaled"] = scaler.transform(test_year_month)

In [28]:
# Append the scaled year_month column to the site matrix.
# The stacked result is wrapped in csr_matrix so the combined matrix stays
# sparse and in CSR form.
year_month_train_col = new_feat_train["year_month_scaled"].values.reshape(-1, 1)
X_sites_train_new = csr_matrix(hstack([X_sites_train, year_month_train_col]))

In [29]:
%%time
get_auc_lr_valid(X_sites_train_new, y_train)

CPU times: user 48.8 ms, sys: 15.8 ms, total: 64.6 ms
Wall time: 3.88 s


0.9198902054055882

#### Try second submission

In [30]:
%%time
# Refit on all training sessions with the extended feature matrix
# (sites + year_month_scaled) using the default regularization strength.
log_reg = LogisticRegression(n_jobs=-1, random_state=17)
log_reg.fit(X_sites_train_new, y_train)

CPU times: user 23.8 ms, sys: 8.01 ms, total: 31.9 ms
Wall time: 4.33 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='l2', random_state=17,
                   solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)

In [31]:
# Build the test matrix with the same extra column as the training matrix.
year_month_test_col = new_feat_test["year_month_scaled"].values.reshape(-1, 1)
X_sites_test_new = csr_matrix(hstack([X_sites_test, year_month_test_col]))

# Probability of the positive class for every test session.
test_pred = log_reg.predict_proba(X_sites_test_new)[:, 1]
create_submission_file(predicted_labels=test_pred, out_file="second_submission.csv")

# Second submission

## Public Leaderboard: 0.90842

### Adding new features: start_hour and is_morning

In [32]:
# Hour of the session start, read with the vectorized .dt accessor instead of
# a per-row lambda.
new_feat_train["start_hour"] = df_train["time1"].dt.hour
# 1 when the session started at 11:59 or earlier, else 0.
new_feat_train["is_morning"] = (new_feat_train["start_hour"] <= 11).astype("int")

In [33]:
# Standardize start_hour; the raw hours in the column are replaced by the
# scaled values, and the fitted scaler is kept for the test set.
time_scaler = StandardScaler()
start_hour_raw = new_feat_train["start_hour"].values.reshape(-1, 1)
new_feat_train["start_hour"] = time_scaler.fit_transform(start_hour_raw)

In [34]:
# Site matrix plus every engineered feature except the unscaled year_month.
extra_train_feats = new_feat_train.drop(columns="year_month").to_numpy()
X_sites_train_new = csr_matrix(hstack([X_sites_train, extra_train_feats]))

In [35]:
# Same time features for the test sessions.
new_feat_test["start_hour"] = df_test["time1"].dt.hour
# is_morning is derived from the raw hour, before the column is scaled.
new_feat_test["is_morning"] = (new_feat_test["start_hour"] <= 11).astype("int")
# Reuse the scaler fitted on train so both sets share one transform.
new_feat_test["start_hour"] = time_scaler.transform(new_feat_test["start_hour"].values.reshape(-1,1))

In [36]:
# Site matrix plus every engineered feature except the unscaled year_month.
extra_test_feats = new_feat_test.drop(columns="year_month").to_numpy()
X_sites_test_new = csr_matrix(hstack([X_sites_test, extra_test_feats]))

### tune regularization

In [39]:
import pprint

In [43]:
# Preview the candidate values: 10 points spaced evenly on a log scale
# from 1e-3 to 1e1.
c_preview = np.logspace(-3, 1, 10)
for x in c_preview:
    print(f"{x:.3f}")

0.001
0.003
0.008
0.022
0.060
0.167
0.464
1.292
3.594
10.000


In [44]:
# Score each candidate C on the hold-out part of the training data.
c_grid = np.logspace(-3, 1, 10)
for C in c_grid:
    roc_auc = get_auc_lr_valid(X_sites_train_new, y_train, C=C)
    print(f"C={C}, roc_auc= {roc_auc}")

C=0.001, roc_auc= 0.9341581891122599
C=0.0027825594022071257, roc_auc= 0.9430479978117765
C=0.007742636826811269, roc_auc= 0.9516763548689784
C=0.021544346900318832, roc_auc= 0.9578539801634889
C=0.05994842503189409, roc_auc= 0.9605764284135743
C=0.1668100537200059, roc_auc= 0.960868674591127
C=0.46415888336127775, roc_auc= 0.9597175024062005
C=1.2915496650148828, roc_auc= 0.9583571085259803
C=3.593813663804626, roc_auc= 0.9573639847645546
C=10.0, roc_auc= 0.9553985688390702


### Train on the whole data

In [49]:
%%time
# C with the highest hold-out ROC AUC in the grid search above.
best_C = 0.1668100537200059
log_reg = LogisticRegression(C=best_C, n_jobs=-1, random_state=17)
log_reg.fit(X_sites_train_new, y_train)

CPU times: user 59.5 ms, sys: 47.7 ms, total: 107 ms
Wall time: 5.2 s


LogisticRegression(C=0.1668100537200059, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=-1, penalty='l2',
                   random_state=17, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

### Create third submission

In [50]:
# Positive-class probabilities from the tuned model, written out for Kaggle.
test_pred = log_reg.predict_proba(X_sites_test_new)[:, 1]
create_submission_file(predicted_labels=test_pred, out_file="third_submission.csv")

# Third submission
## Public Leaderboard: 0.92642