In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

import pickle
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression


sample_sub = pd.read_csv('Kaggle/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2/sample_submission.csv', index_col='session_id')
sample_sub.head(2)

Unnamed: 0_level_0,target
session_id,Unnamed: 1_level_1
1,0.948255
2,0.682483


In [2]:
train_df = pd.read_csv('Kaggle/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2/train_sessions.csv', index_col='session_id', parse_dates=['time1'])
train_df = train_df.sort_values(by='time1')
train_df.head(2)

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,,,,,,...,,,,,,,,,,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,,...,,,,,,,,,,0


In [3]:
test_df = pd.read_csv('Kaggle/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2/test_sessions.csv', index_col='session_id', parse_dates=['time1'])
test_df.head(2)

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,site6,time6,site7,time7,site8,time8,site9,time9,site10,time10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,29,2014-10-04 11:19:53,35.0,2014-10-04 11:19:53,22.0,2014-10-04 11:19:54,321.0,2014-10-04 11:19:54,23.0,2014-10-04 11:19:54,2211.0,2014-10-04 11:19:54,6730.0,2014-10-04 11:19:54,21.0,2014-10-04 11:19:54,44582.0,2014-10-04 11:20:00,15336.0,2014-10-04 11:20:00
2,782,2014-07-03 11:00:28,782.0,2014-07-03 11:00:53,782.0,2014-07-03 11:00:58,782.0,2014-07-03 11:01:06,782.0,2014-07-03 11:01:09,782.0,2014-07-03 11:01:10,782.0,2014-07-03 11:01:23,782.0,2014-07-03 11:01:29,782.0,2014-07-03 11:01:30,782.0,2014-07-03 11:01:53


In [4]:
with open('Kaggle/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2/site_dic.pkl', 'rb') as f:
    site_dic = pickle.load(f)
sites_dict = pd.DataFrame(list(site_dic.keys()), index=list(site_dic.values()), columns=['site'])
sites_dict.head()

Unnamed: 0,site
25075,www.abmecatronique.com
13997,groups.live.com
42436,majeureliguefootball.wordpress.com
30911,cdt46.media.tourinsoft.eu
8104,www.hdwallpapers.eu


In [5]:
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,,,,,,...,,,,,,,,,,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,,...,,,,,,,,,,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [6]:
PATH_TO_DATA = 'Kaggle/catch-me-if-you-can-intruder-detection-through-webpage-session-tracking2/'
sites = ['site%s' % i for i in range(1, 11)]
train_df[sites] = train_df[sites].fillna(0).astype('int')
test_df[sites] = test_df[sites].fillna(0).astype('int')

In [7]:
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55,2013-01-12 08:05:57,0,,0,,0,,...,,0,,0,,0,,0,,0
54843,56,2013-01-12 08:37:23,55,2013-01-12 08:37:23,56,2013-01-12 09:07:07,55,2013-01-12 09:07:09,0,,...,,0,,0,,0,,0,,0
77292,946,2013-01-12 08:50:13,946,2013-01-12 08:50:14,951,2013-01-12 08:50:15,946,2013-01-12 08:50:15,946,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948,2013-01-12 08:50:16,784,2013-01-12 08:50:16,949,2013-01-12 08:50:17,946,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948,2013-01-12 08:50:17,949,2013-01-12 08:50:18,948,2013-01-12 08:50:18,945,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947,2013-01-12 08:50:19,945,2013-01-12 08:50:19,946,2013-01-12 08:50:19,946,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950,2013-01-12 08:50:20,948,2013-01-12 08:50:20,947,2013-01-12 08:50:21,950,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946,2013-01-12 08:50:21,951,2013-01-12 08:50:22,946,2013-01-12 08:50:22,947,2013-01-12 08:50:22,0


In [8]:
y_train = train_df['target'].values
full_df = pd.concat([train_df.drop('target', axis=1), test_df])
idx_split = train_df.shape[0]

train_df[sites].fillna(0).to_csv('Kaggle/train_sessions_text.txt', sep=' ', index=None, header=None)
test_df[sites].fillna(0).to_csv('Kaggle/test_sessions_text.txt', sep=' ', index=None, header=None)

!head -3 Kaggle/train_sessions_text.txt

"head" �� ���� ����७��� ��� ���譥�
��������, �ᯮ��塞�� �ணࠬ��� ��� ������ 䠩���.


In [9]:
cv = CountVectorizer(ngram_range=(1, 3), max_features=50000)
with open('Kaggle/train_sessions_text.txt') as inp_train_file:
    X_train = cv.fit_transform(inp_train_file)
with open('Kaggle/test_sessions_text.txt') as inp_test_file:
    X_test = cv.transform(inp_test_file)
print(X_train.shape, X_test.shape)

(253561, 50000) (82797, 50000)


In [10]:
# def get_auc_lr_valid(X, y, C=1.0, seed=17, ratio = 0.9):
#     idx = int(round(X.shape[0] * ratio))
#     lr = LogisticRegression(C=C, random_state=seed, solver='lbfgs', max_iter=500).fit(X[:idx, :], y[:idx])
#     y_pred = lr.predict_proba(X[idx:, :])[:, 1]
#     score = roc_auc_score(y[idx:], y_pred)
#     return score

# print(get_auc_lr_valid(X_train, y_train))

In [11]:
# def write_to_submission_file(predicted_labels, out_file, target='target', index_label="session_id"):
#     predicted_df = pd.DataFrame(predicted_labels, index = np.arange(1, predicted_labels.shape[0] + 1), columns=[target])
#     predicted_df.to_csv(out_file, index_label=index_label)
    
# lr = LogisticRegression(C=1.0, random_state=17, solver='lbfgs', max_iter=500).fit(X_train, y_train)
# y_test = lr.predict_proba(X_test)[:, 1]
# write_to_submission_file(y_test, 'Kaggle/baseline_1.csv')

-------
##### Let's try to do improvements of the model. Baseline_1 score is 0.91

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from tune_sklearn import TuneGridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import (LogisticRegression, LogisticRegressionCV,
                                  SGDClassifier)

In [29]:
sc = StandardScaler(with_mean=False).fit(X_test)
X_test_scaled = sc.transform(X_test)

In [30]:
# Create param grid.

param_grid = {
    'C': [0.001, 0.01, 0.1],
    'penalty': ['l2'],
    'solver': ['newton-cg', 'lbfgs', 'liblinear'],
    'max_iter': [100, 500],
}


# Create grid search object
lg = LogisticRegression()
clf = GridSearchCV(lg, param_grid=param_grid, cv=4, verbose=True, n_jobs=-1)

# Fit on data

best_clf = clf.fit(X_test_scaled, y_train)

ValueError: Found input variables with inconsistent numbers of samples: [82797, 253561]

In [36]:
print(X_test_scaled.shape, X_test.shape, y_train.shape)

(82797, 50000) (82797, 50000) (253561,)


In [17]:
best_clf.best_params_

{'C': 0.1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'newton-cg'}

In [16]:
# Model
def get_auc_lr_valid(X, y, C=0.1, seed=17, ratio = 0.9):
    idx = int(round(X.shape[0] * ratio))
    lr = LogisticRegression(C=C, random_state=seed, solver='newton-cg', max_iter=100, penalty='l2').fit(X[:idx, :], y[:idx])
    y_pred = lr.predict_proba(X[idx:, :])[:, 1]
    score = roc_auc_score(y[idx:], y_pred)
    return score

print(get_auc_lr_valid(X_train, y_train))

0.9223906025824964


In [16]:
# Writing the file
def write_to_submission_file(predicted_labels, out_file, target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels, index = np.arange(1, predicted_labels.shape[0] + 1), columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)
    
lr = LogisticRegression(C=1.0, random_state=17, solver='lbfgs', max_iter=500).fit(X_train, y_train)
y_test = lr.predict_proba(X_test)[:, 1]
write_to_submission_file(y_test, 'Kaggle/baseline_1.csv')

Let's try Random Forest Classifier

In [23]:
grid_TV = TuneGridSearchCV(dt, parametrs, cv=5, n_jobs=-1)
grid_TV.fit(X_train, y_train)

grid_TV.best_params_



{'n_estimators': 10,
 'max_depth': 9,
 'min_samples_leaf': 3,
 'min_samples_split': 6}