<center>
<img src="../../img/ods_stickers.jpg">
## Open Machine Learning Course
<center>
Author: Yury Kashnitsky, Data Scientist at Mail.Ru Group

This material is subject to the terms and conditions of the license [Creative Commons CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/). Free use is permitted for any non-comercial purpose with an obligatory indication of the names of the authors and of the source.

## <center>Assignment #6. Part 1
### <center> Beating benchmarks in "Catch Me If You Can: Intruder Detection through Webpage Session Tracking"
    
[Competition](https://www.kaggle.com/c/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2). The task is to beat "Assignment 6 baseline".

In [1]:
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
import os
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack, vstack
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.feature_extraction.text import TfidfVectorizer

from datetime import datetime
from tqdm import tqdm, tqdm_notebook
import pickle
import glob


Reading original data

In [24]:

PATH_TO_DATA = ('/Users/lucky/.kaggle/competitions/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2')

alice_log = os.path.join(PATH_TO_DATA, 'train', 'Alice_log.csv')
others_logs = os.path.join(PATH_TO_DATA, 'train', 'other_user_logs', '*.csv')
fulltrain_csv = os.path.join(PATH_TO_DATA, 'train', 'fulltrain.csv')

# load dictionary
with open(os.path.join(PATH_TO_DATA, 'site_dic.pkl'), "rb") as input_file:
    site_dict = pickle.load(input_file)

def load_data(filename, session_len=10, timeout=1800):
    sessions = []
    default_session = dict(
        list(zip(['site'+str(i) for i in range(1, 11)], [None]*10)) + 
        list(zip(['time'+str(i) for i in range(1, 11)], [None]*10)))
    session = default_session.copy()
    with open(filename, 'rt') as f:
        f.readline() # skip header
        counter = 0
        last_ts = None
        
        for i, line in enumerate(f):
            (ts, site) = (None, None)
            try:
                ts, site = line.split(',')
            except Exception:
                print('Split error in file "{}" line# {}'.format(filename, i))
                next
            ts = datetime.strptime(ts, '%Y-%m-%d %H:%M:%S')
            site = site.replace('\n', '')
            if last_ts == None:
                last_ts = ts
            if (counter == session_len) or ((ts - last_ts).total_seconds() > timeout): # end of session 
                sessions.append(session)
                
                # reset counters & buffers
                counter = 0
                last_ts = None
                session = default_session.copy()
            else:
                session['site'+str(counter+1)] = site_dict[site]
                session['time'+str(counter+1)] = datetime.strftime(ts, '%Y-%m-%d %H:%M:%S')
                
                # update counters
                counter += 1
                last_ts = ts
        
        if session['site1'] is not None and session['time1'] is not None:
            sessions.append(session)

    return pd.DataFrame.from_dict(sessions)
 
def prepare_train_file(fname):
    alice_df = load_data(alice_log)
    alice_df['target'] = 1

    dataframes = [alice_df]
    tqdm.monitor_interval = 0
    for filename in tqdm_notebook(glob.glob(others_logs)):
        other_user_df = load_data(filename)
        other_user_df['target'] = 0
        dataframes.append(other_user_df)

    full_df = pd.concat(dataframes)
    full_df.reset_index(drop=True, inplace=True)
    full_df.to_csv(fname, index_label='session_id')
    return

if not os.path.exists(fulltrain_csv):
    prepare_train_file(fulltrain_csv)


In [25]:
train_sessions_file = fulltrain_csv if fulltrain_csv else os.path.join(PATH_TO_DATA, 'test_sessions.csv')

train_df = pd.read_csv(train_sessions_file, index_col='session_id')
test_df = pd.read_csv(os.path.join(PATH_TO_DATA, 'test_sessions.csv'), index_col='session_id')
train_df = train_df.dropna(axis=0, how='all')
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 227890 entries, 0 to 227889
Data columns (total 21 columns):
site1     227890 non-null int64
site10    211136 non-null float64
site2     224997 non-null float64
site3     222746 non-null float64
site4     220689 non-null float64
site5     218971 non-null float64
site6     217335 non-null float64
site7     215749 non-null float64
site8     214152 non-null float64
site9     212674 non-null float64
time1     227890 non-null object
time10    211136 non-null object
time2     224997 non-null object
time3     222746 non-null object
time4     220689 non-null object
time5     218971 non-null object
time6     217335 non-null object
time7     215749 non-null object
time8     214152 non-null object
time9     212674 non-null object
target    227890 non-null int64
dtypes: float64(9), int64(2), object(10)
memory usage: 38.3+ MB


In [26]:
%%time

site_cols = ['site'+str(i) for i in range(1, 11)]
time_cols = ['time'+str(i) for i in range(1, 11)]

def prepare_dataset(df, is_test=False):
    
    sites = df[site_cols].fillna(0).astype(int)
    times = df[time_cols].fillna(method='ffill', axis=1)
    for c in time_cols:
        times[c] = pd.to_datetime(times[c])

    list_to_concat = [sites, times]
    if not is_test:
        list_to_concat.append(train_df['target'])
    df = pd.concat(list_to_concat, axis=1)

    df['#unique_sites'] = sites.nunique(axis=1) / 10 # to scale the feature we divide it by 10

    df['hour_of_day'] = df['time1'].dt.hour
    df['day_of_week'] = df['time1'].dt.dayofweek
    df['weekend'] = df['day_of_week'].apply(lambda d: 1 if d==5 or d==6 else 0)
    df['part_of_day'] = df['hour_of_day'].apply(lambda h: 1 if h > 11 and h <= 13 else (
                                                     2 if h > 15 and h <= 18 else (
                                                     3 if h > 18 and h <= 24 
                                                         else 4)))

    df['session_span'] = (df.time10 - df.time1).astype('timedelta64[s]')
    for i in range(1, 10):
        df['diff'+str(i)] = (df['time'+str(i+1)] - df['time'+str(i)]).astype('timedelta64[s]')
    
    return df
    
train_dataset = prepare_dataset(train_df)

test_dataset = prepare_dataset(test_df, is_test=True)

CPU times: user 30.8 s, sys: 557 ms, total: 31.3 s
Wall time: 31.2 s


Let's figure out top 30 popular sites for our train set:

In [27]:

reverse_site_dict = dict((v,k) for (k,v) in site_dict.items())

unique, counts = np.unique(train_dataset[train_dataset['target'] == 1][site_cols].values.flatten(), return_counts=True)
top30 = [s[0] for s in sorted(zip(unique, counts), key=lambda x: x[1], reverse=True)[0:31]]
top30.remove(0)
[reverse_site_dict[site_id] for site_id in top30]

['i1.ytimg.com',
 's.youtube.com',
 'www.youtube.com',
 'www.facebook.com',
 'www.google.fr',
 'r4---sn-gxo5uxg-jqbe.googlevideo.com',
 'r1---sn-gxo5uxg-jqbe.googlevideo.com',
 'apis.google.com',
 'r2---sn-gxo5uxg-jqbe.googlevideo.com',
 's.ytimg.com',
 'www.google.com',
 's-static.ak.facebook.com',
 'r3---sn-gxo5uxg-jqbe.googlevideo.com',
 'static.ak.facebook.com',
 'twitter.com',
 'vk.com',
 'translate.google.fr',
 'platform.twitter.com',
 'yt3.ggpht.com',
 'www.info-jeunes.net',
 'mts0.google.com',
 'clients1.google.com',
 'www.audienceinsights.net',
 'www.melty.fr',
 'gg.google.com',
 'plus.googleapis.com',
 'www.dailymotion.com',
 'mts1.google.com',
 'youwatch.org',
 'api.bing.com']

Find out average time of user's being at top 30 sites:

In [28]:
%%time

avg_ss_columns = []

def avg_session_span_for(site_id, row):
    n_visits = 0
    duration = 0
    for i in range(1, 10):
        if row['site'+str(i)] == site_id:
            n_visits += 1
            duration += row['diff'+str(i)]
    return duration/n_visits if n_visits > 0 else 0


top=top30[:10]
for site in top:
    train_dataset['avg_ss_for_'+str(site)] = train_dataset.apply(lambda r: avg_session_span_for(site, r), axis=1)
    print('*** {} [train]'.format(reverse_site_dict[site]))
    test_dataset['avg_ss_for_'+str(site)] = test_dataset.apply(lambda r: avg_session_span_for(site, r), axis=1)
    print('--- {} [test]'.format(reverse_site_dict[site]))
    avg_ss_columns.append('avg_ss_for_'+str(site))


*** i1.ytimg.com [train]
--- i1.ytimg.com [test]
*** s.youtube.com [train]
--- s.youtube.com [test]
*** www.youtube.com [train]
--- www.youtube.com [test]
*** www.facebook.com [train]
--- www.facebook.com [test]
*** www.google.fr [train]
--- www.google.fr [test]
*** r4---sn-gxo5uxg-jqbe.googlevideo.com [train]
--- r4---sn-gxo5uxg-jqbe.googlevideo.com [test]
*** r1---sn-gxo5uxg-jqbe.googlevideo.com [train]
--- r1---sn-gxo5uxg-jqbe.googlevideo.com [test]
*** apis.google.com [train]
--- apis.google.com [test]
*** r2---sn-gxo5uxg-jqbe.googlevideo.com [train]
--- r2---sn-gxo5uxg-jqbe.googlevideo.com [test]
*** s.ytimg.com [train]
--- s.ytimg.com [test]
CPU times: user 13min 54s, sys: 7.86 s, total: 14min 2s
Wall time: 14min 8s


Separate target feature 

In [29]:
y = train_dataset['target']

Build Tf-Idf features based on sites. You can use `ngram_range`=(1, 3) and `max_features`=100000 or more

In [40]:
%%time

train_sessions = train_dataset[site_cols].astype(str).apply(lambda s: ' '.join(s), axis=1)
test_sessions = test_dataset[site_cols].astype(str).apply(lambda s: ' '.join(s), axis=1)

# fit TfidfVectorizer with all sites from user sessions
vec = TfidfVectorizer(ngram_range=(1, 5), max_features=200000, stop_words=['0'])
vec = vec.fit(test_sessions.append(train_sessions))

# generate sparse matrices
train_v = vec.transform(train_sessions)
test_v = vec.transform(test_sessions)

CPU times: user 1min 4s, sys: 1.6 s, total: 1min 6s
Wall time: 1min 6s


Идеи:
* Пользователи предпочитают пользоваться определённым поисковиком и определённой соцсетью, таким образом можно ввести категориальные признаки: поисковик, соцсеть со значениями (google, yandex, mail.ru, rambler, microsoft ... и facebook, vk, odnoklassniki ...)
* Продолжительность пребывания на сайте может нести полезную информацию, поэтому разреженная матрица, где в строках сессии, а в столбцах сайты, со значением время пребывания за сессию может улучшить модель теоретически 
* Возможно сработают комбинации типа "любимые сайты по утрам" или "любимые сайты по выходным"

Add features based on the session start time: hour, whether it's morning, day or night and so on.
Scale this features and combine then with Tf-Idf based on sites (you'll need `scipy.sparse.hstack`)

In [31]:
cat_features = ['hour_of_day', 'part_of_day', 'day_of_week', 'weekend']
scalable_features = ['session_span'] + avg_ss_columns
other_features = ['#unique_sites']
all_features = cat_features + scalable_features + other_features


features_train = train_dataset[all_features]
features_test = test_dataset[all_features]

scaler = MinMaxScaler()
features_train[scalable_features] = scaler.fit_transform(features_train[scalable_features])
features_test[scalable_features] = scaler.transform(features_test[scalable_features])

In [32]:
enc = LabelEncoder()
enc.fit(cat_features)
new_cat_features = enc.transform(cat_features)

encoder = OneHotEncoder(categorical_features=new_cat_features)
train_mtx = encoder.fit_transform(features_train)
test_mtx = encoder.transform(features_test)

In [41]:
train_X = hstack([train_mtx, train_v])
test_X = hstack([test_mtx, test_v])

Perform cross-validation with logistic regression.

In [42]:
%%time
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
logitCV = LogisticRegressionCV(
        Cs=[12.33, 12.55, 13.27], #np.linspace(11, 14, 4),
        penalty='l2',
        scoring='roc_auc',
        cv=skf,
        random_state=42,
        solver='liblinear',
        n_jobs=-1,
        refit=True,
        verbose=2,
        max_iter=100,
        tol=0.0001
    )

logitCV.fit(train_X, y)


os.system('say "your program has finished"')

[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.0min finished


[LibLinear]CPU times: user 14.7 s, sys: 665 ms, total: 15.4 s
Wall time: 2min 17s


Let's evaluate our model:

In [43]:
print ('AUC_ROC for our model:', logitCV.scores_[1].mean(axis=0).max())

AUC_ROC for our model: 0.98706222552


Experiments:

 * TFxIDF+hour,pod,dow,is_weekend,session_span,#unique sites /np.logspace(-7, 7, 15); l1; 3 folds/ = 0.98138
 * TFxIDF+hour,pod,dow,is_weekend,session_span,#unique sites /np.logspace(-2, 3, 9); l2; 3 folds/ = 0.98430
 * TFxIDF+hour,pod,dow,is_weekend,session_span,#unique sites /l2; 3 folds; ngram_range=(1,2), C=12.55/ = 0.98446
 * TFxIDF only /.../ = 0.95835
 * TFxIDF+hour,pod,dow,is_weekend,session_span,#unique sites /l2; 5 folds; ngram_range=(1,3), C=11.55/ = 0.98626403243
 * ... /10 folds/ = 0.98667
 * ... /tol=0.00001/ = 0.98667
 * ...+avg_per_site = 0.98668
 * ...+TFxIDF(max_features=200000) = 0.98671
 * ...+TFxIDF(max_features=500000) = 0.98598
 * tuned hour feature+TFxIDF(max_features=200000)  = 0.98671
 * ...+day,month,year = 0.99170 (probably overfit 0.90 on kaggle)
 * ...+month = 0.98994
 * TFxIDF+hour,pod,dow,is_weekend,session_span,#unique sites, tuned hour, avg_per_site/l2; 10 folds; ngram_range=(1,3), C=12./ = 0.98807
 * ... C=13.27 = 0.98808
 * refactored = 0.98818 > 0.94894
 * top10 sites duration = 0.98817 > 0.94896
 * top15 of alice sites /3 folds/ = 0.98666 > 0.94904
 * ... /10 folds, C=14.677/ = 0.98817 > 0.94904
 * full train dataset 227890 entries = 0.98708 > 0.95029
 * full train dataset with 10 folds crossval = 0.98780 >  0.94968
 * ... with top10 sites duration = 0.98708 > 0.95028
 * ngram_range=(1,5) = 0.98706 > 0.95188

In [44]:
logitCV.C_

array([ 13.27])

Make prediction for the test set and form a submission file.

In [45]:
test_pred = logitCV.predict_proba(test_X)[:,1]
test_pred = np.array(list("{:.6f}".format(x) for x in test_pred))

In [46]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)


In [47]:
write_to_submission_file(test_pred, "assignment6_alice_submission.csv")