In [1]:
import pickle
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [2]:
PATH_TO_DATA = "../../data/kaggle_alice/"
!PATH_TO_DATA=../../data/kaggle_alice/

INP_TRAIN = "train_sessions.csv"
INP_TEST  = "test_sessions.csv"
SITE_DIC = "site_dic.pkl"
SAMPLE_SUBMIT = "sample_submission.csv"

!INP_TRAIN=train_sessions.csv
!INP_TEST=test_sessions.csv
!SITE_DIC=site_dic.pkl
!SAMPLE_SUBMIT=sample_submission.csv

In [8]:
def get_auc_lr_valid(X, y, C=1.0, ratio=0.9, seed=17):
    '''
    X, y – выборка
    ratio – в каком отношении поделить выборку
    C, seed – коэф-т регуляризации и random_state 
              логистической регрессии
    '''
    train_len = int(ratio * X.shape[0])
    X_train = X[:train_len, :]
    X_valid = X[train_len:, :]
    y_train = y[:train_len]
    y_valid = y[train_len:]
    
    logit = LogisticRegression(C=C, n_jobs=-1, random_state=seed)
    
    logit.fit(X_train, y_train)
    
    valid_pred = logit.predict_proba(X_valid)[:, 1]
    
    return round(roc_auc_score(y_valid, valid_pred), 5)

In [9]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [4]:
times = ["time%s" % i for i in range(1, 11)]
sites = ["site%s" % i for i in range(1, 11)]

with open(PATH_TO_DATA + SITE_DIC, "rb") as inp_file:
    site_dic = pickle.load(inp_file)

inv_site_dic = {v: k for k, v in site_dic.items()}

In [5]:
train_df = pd.read_csv(PATH_TO_DATA + INP_TRAIN, 
                       index_col="session_id", 
                       parse_dates=times).sort_values(by="time1")
train_df[sites] = train_df[sites].fillna(0).astype("int")

test_df = pd.read_csv(PATH_TO_DATA + INP_TEST,
                       index_col="session_id", 
                       parse_dates=times)
test_df[sites] = test_df[sites].fillna(0).astype("int")

y_train = train_df["target"]
train_df.drop('target', axis=1, inplace=True)

In [6]:
train_to_text = train_df[sites].apply(
    lambda x: " ".join([str(a) for a in x.values if a != 0]), axis=1)\
               .values.reshape(len(train_df[sites]), 1)
test_to_text = test_df[sites].apply(
    lambda x: " ".join([str(a) for a in x.values if a != 0]), axis=1)\
               .values.reshape(len(test_df[sites]), 1)

In [7]:
pipeline = Pipeline([
    ("vectorize", CountVectorizer()),
    ("tfidf", TfidfTransformer())
])
pipeline.fit(train_to_text.ravel())

X_train_sparse = pipeline.transform(train_to_text.ravel())
X_test_sparse = pipeline.transform(test_to_text.ravel())

X_train_sparse.shape, X_test_sparse.shape

((253561, 41592), (82797, 41592))

In [15]:
for C in np.logspace(-3, 1, 10):
    print(C, get_auc_lr_valid(X_train_sparse, y_train, C=C))

0.001 0.77996
0.00278255940221 0.83187
0.00774263682681 0.8792
0.0215443469003 0.90236
0.0599484250319 0.91056
0.16681005372 0.91727
0.464158883361 0.92185
1.29154966501 0.92373
3.5938136638 0.92341
10.0 0.92245


In [12]:
feat_train = pd.DataFrame(index=train_df.index)
feat_test = pd.DataFrame(index=test_df.index)

In [13]:
feat_train['year_month'] = train_df['time1'].apply(lambda ts: 100 * ts.year + ts.month)
feat_test['year_month'] = test_df['time1'].apply(lambda ts: 100 * ts.year + ts.month)

scaler = StandardScaler()
feat_train['year_month_scaled'] = scaler.fit_transform(feat_train['year_month'].values.reshape(-1, 1))
feat_test['year_month_scaled'] = scaler.transform(feat_test['year_month'].values.reshape(-1, 1))

X_train_sparse_new = csr_matrix(hstack([X_train_sparse, 
                                        feat_train['year_month_scaled'].values.reshape(-1, 1)]))
X_test_sparse_new = csr_matrix(hstack([X_test_sparse, 
                                       feat_test['year_month_scaled'].values.reshape(-1, 1)]))



In [14]:
for C in np.logspace(-3, 1, 10):
    print(C, get_auc_lr_valid(X_train_sparse_new, y_train, C=C))

0.001 0.77932
0.00278255940221 0.83217
0.00774263682681 0.87618
0.0215443469003 0.89742
0.0599484250319 0.9083
0.16681005372 0.91647
0.464158883361 0.92163
1.29154966501 0.92341
3.5938136638 0.92277
10.0 0.92191


In [16]:
%%time
feat_train['start_hour'] = train_df['time1'].apply(lambda ts: ts.hour)
feat_test['start_hour'] = test_df['time1'].apply(lambda ts: ts.hour)

scaler = StandardScaler()
feat_train['start_hour_scaled'] = scaler.fit_transform(feat_train['start_hour'].values.reshape(-1, 1))
feat_test['start_hour_scaled'] = scaler.transform(feat_test['start_hour'].values.reshape(-1, 1))

X_train_sparse_new = csr_matrix(hstack([X_train_sparse_new, 
                                        feat_train['start_hour_scaled'].values.reshape(-1, 1)]))
X_test_sparse_new = csr_matrix(hstack([X_test_sparse_new, 
                                       feat_test['start_hour_scaled'].values.reshape(-1, 1)]))

for C in np.logspace(-3, 1, 10):
    print(C, get_auc_lr_valid(X_train_sparse_new, y_train, C=C))



0.001 0.9193
0.00278255940221 0.92766
0.00774263682681 0.93361
0.0215443469003 0.94103
0.0599484250319 0.95025
0.16681005372 0.95743
0.464158883361 0.96158
1.29154966501 0.963
3.5938136638 0.96259
10.0 0.96126
CPU times: user 16.5 s, sys: 72 ms, total: 16.6 s
Wall time: 16.5 s


In [17]:
%%time
feat_train['weekday'] = train_df['time1'].apply(lambda ts: ts.dayofweek)
feat_test['weekday'] = test_df['time1'].apply(lambda ts: ts.dayofweek)

scaler = StandardScaler()
feat_train['weekday_scaled'] = scaler.fit_transform(feat_train['weekday'].values.reshape(-1, 1))
feat_test['weekday_scaled'] = scaler.transform(feat_test['weekday'].values.reshape(-1, 1))

X_train_sparse_new = csr_matrix(hstack([X_train_sparse_new, 
                                        feat_train['weekday_scaled'].values.reshape(-1, 1)]))
X_test_sparse_new = csr_matrix(hstack([X_test_sparse_new, 
                                       feat_test['weekday_scaled'].values.reshape(-1, 1)]))

for C in np.logspace(-3, 1, 10):
    print(C, get_auc_lr_valid(X_train_sparse_new, y_train, C=C))



0.001 0.96173
0.00278255940221 0.96539
0.00774263682681 0.96766
0.0215443469003 0.96969
0.0599484250319 0.9729
0.16681005372 0.9751
0.464158883361 0.97592
1.29154966501 0.97518
3.5938136638 0.97351
10.0 0.97139
CPU times: user 18.5 s, sys: 64 ms, total: 18.6 s
Wall time: 18.6 s


In [19]:
feat_train['morning'] = train_df['time1'].apply(lambda ts: int(ts.hour > 4 and ts.hour <= 11))
feat_test['morning'] = test_df['time1'].apply(lambda ts: int(ts.hour > 4 and ts.hour <= 11))

X_train_sparse_new = csr_matrix(hstack([X_train_sparse_new,  
                                        feat_train['morning'].values.reshape(-1, 1)]))
X_test_sparse_new = csr_matrix(hstack([X_test_sparse_new, 
                                       feat_test['morning'].values.reshape(-1, 1)]))

for C in np.logspace(-3, 1, 10):
    print(C, get_auc_lr_valid(X_train_sparse_new, y_train, C=C))

0.001 0.93055
0.00278255940221 0.96539
0.00774263682681 0.96887
0.0215443469003 0.9704
0.0599484250319 0.97295
0.16681005372 0.97481
0.464158883361 0.97566
1.29154966501 0.97536
3.5938136638 0.9742
10.0 0.97266


In [20]:
%%time
logit = LogisticRegression(C=0.5, n_jobs=-1, random_state=17)
logit.fit(X_train_sparse_new, y_train)
y_pred = logit.predict_proba(X_test_sparse_new)[:, 1]

write_to_submission_file(y_pred, PATH_TO_DATA + "/submit/tfidf_yms_shs_ws_m.csv")

CPU times: user 2.2 s, sys: 8 ms, total: 2.21 s
Wall time: 2.21 s


In [32]:
feat_train['is_youtube'] = train_df['site1'].apply(lambda s: 1 if ("youtube" in inv_site_dic[s]) or ("ytimg" in inv_site_dic[s]) else 0)
feat_test['is_youtube'] = test_df['site1'].apply(lambda s: 1 if ("youtube" in inv_site_dic[s]) or ("ytimg" in inv_site_dic[s]) else 0)

X_train_sparse_site = csr_matrix(hstack([X_train_sparse_new,  
                                        feat_train['is_youtube'].values.reshape(-1, 1)]))
X_test_sparse_site = csr_matrix(hstack([X_test_sparse_new, 
                                       feat_test['is_youtube'].values.reshape(-1, 1)]))

feat_train['is_social'] = train_df['site1'].apply(lambda s: 1 if ("facebook" in inv_site_dic[s]) or ("vk_" in inv_site_dic[s]) else 0)
feat_test['is_social'] = test_df['site1'].apply(lambda s: 1 if ("facebook" in inv_site_dic[s]) or ("vk_" in inv_site_dic[s]) else 0)

X_train_sparse_site = csr_matrix(hstack([X_train_sparse_site,  
                                        feat_train['is_social'].values.reshape(-1, 1)]))
X_test_sparse_site = csr_matrix(hstack([X_test_sparse_site, 
                                       feat_test['is_social'].values.reshape(-1, 1)]))

feat_train['is_google'] = train_df['site1'].apply(lambda s: 1 if "google" in inv_site_dic[s] else 0)
feat_test['is_google'] = test_df['site1'].apply(lambda s: 1 if "google" in inv_site_dic[s] else 0)

X_train_sparse_site = csr_matrix(hstack([X_train_sparse_site,  
                                        feat_train['is_google'].values.reshape(-1, 1)]))
X_test_sparse_site = csr_matrix(hstack([X_test_sparse_site, 
                                       feat_test['is_google'].values.reshape(-1, 1)]))

for C in np.logspace(-3, 1, 10):
    print(C, get_auc_lr_valid(X_train_sparse_site, y_train, C=C))

0.001 0.82335
0.00278255940221 0.9255
0.00774263682681 0.9612
0.0215443469003 0.96871
0.0599484250319 0.97233
0.16681005372 0.97461
0.464158883361 0.97552
1.29154966501 0.97526
3.5938136638 0.97408
10.0 0.97251


In [33]:
%%time
logit = LogisticRegression(C=0.5, n_jobs=-1, random_state=17)
logit.fit(X_train_sparse_site, y_train)
y_pred = logit.predict_proba(X_test_sparse_site)[:, 1]

write_to_submission_file(y_pred, PATH_TO_DATA + "/submit/tfidf_yms_shs_ws_m_sites.csv")

CPU times: user 2.56 s, sys: 8 ms, total: 2.57 s
Wall time: 2.57 s


In [61]:
train_df[y_train == 1][sites].head()

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
251175,270,270,270,21,21,7832,21,7832,30,7832
196388,29,7832,37,7832,7832,29,7832,29,7832,7832
172448,29,7832,7832,29,37,7832,29,7832,29,270
70129,167,167,1515,167,37,1514,855,1515,855,1514
206254,1520,1522,1522,1515,1515,1524,1514,1515,1520,1521


In [69]:
train_df[y_train == 1][sites].stack().value_counts()

77       1382
80       1354
76       1307
29        897
21        857
81        609
879       522
22        522
75        451
82        447
23        437
35        381
881       371
37        293
33        291
3000      286
733       274
30        272
78        236
941       215
7832      209
52        206
704       204
0         201
2078      188
617       159
1057      155
942       153
270       151
335       150
         ... 
24480       1
27294       1
21584       1
8931        1
27370       1
27240       1
27182       1
27386       1
970         1
13934       1
27272       1
188         1
3003        1
12613       1
2590        1
27288       1
2381        1
6400        1
508         1
540         1
27169       1
19190       1
14244       1
27185       1
5061        1
698         1
27352       1
1570        1
5648        1
3159        1
Length: 1054, dtype: int64

In [72]:
a = train_df[y_train == 1][sites].stack().value_counts().nlargest(30).index
for s in a:
    print(inv_site_dic[s])

i1.ytimg.com
s.youtube.com
www.youtube.com
www.facebook.com
www.google.fr
r4---sn-gxo5uxg-jqbe.googlevideo.com
r1---sn-gxo5uxg-jqbe.googlevideo.com
apis.google.com
s.ytimg.com
r2---sn-gxo5uxg-jqbe.googlevideo.com
www.google.com
s-static.ak.facebook.com
r3---sn-gxo5uxg-jqbe.googlevideo.com
twitter.com
static.ak.facebook.com
vk.com
translate.google.fr
platform.twitter.com
yt3.ggpht.com
mts0.google.com
www.info-jeunes.net
clients1.google.com
www.audienceinsights.net


KeyError: 0

In [38]:
%%time
feat_train['uniq_sites'] = train_df[sites].apply(lambda x: len(set(a for a in x.values if a != 0)), axis=1)\
               .values.reshape(len(train_df[sites]), 1)
feat_test['uniq_sites'] = test_df[sites].apply(lambda x: len(set(a for a in x.values if a != 0)), axis=1)\
               .values.reshape(len(test_df[sites]), 1)

scaler = StandardScaler()
feat_train['uniq_sites_scaled'] = scaler.fit_transform(feat_train['uniq_sites'].values.reshape(-1, 1))
feat_test['uniq_sites_scaled'] = scaler.transform(feat_test['uniq_sites'].values.reshape(-1, 1))

X_train_sparse_uniq = csr_matrix(hstack([X_train_sparse_new, 
                                        feat_train['uniq_sites_scaled'].values.reshape(-1, 1)]))
X_test_sparse_uniq = csr_matrix(hstack([X_test_sparse_new, 
                                       feat_test['uniq_sites_scaled'].values.reshape(-1, 1)]))

for C in np.logspace(-3, 1, 10):
    print(C, get_auc_lr_valid(X_train_sparse_uniq, y_train, C=C))



0.001 0.93522
0.00278255940221 0.96685
0.00774263682681 0.96806
0.0215443469003 0.96884
0.0599484250319 0.97126
0.16681005372 0.97307
0.464158883361 0.97408
1.29154966501 0.97436
3.5938136638 0.97388
10.0 0.97266
CPU times: user 25 s, sys: 32 ms, total: 25 s
Wall time: 24.9 s


In [39]:
%%time
logit = LogisticRegression(C=1.2, n_jobs=-1, random_state=17)
logit.fit(X_train_sparse_site, y_train)
y_pred = logit.predict_proba(X_test_sparse_site)[:, 1]

write_to_submission_file(y_pred, PATH_TO_DATA + "/submit/tfidf_yms_shs_ws_m_uniq.csv")

CPU times: user 3.4 s, sys: 8 ms, total: 3.4 s
Wall time: 3.4 s


In [46]:
logit_params={'scoring':'roc_auc','class_weight':'balanced',
             'Cs':np.logspace(-3, 1, 10),'n_jobs':-1, 'random_state':17}
logit = LogisticRegressionCV(**logit_params)
logit.fit(X_train_sparse_site, y_train)

LogisticRegressionCV(Cs=array([  1.00000e-03,   2.78256e-03,   7.74264e-03,   2.15443e-02,
         5.99484e-02,   1.66810e-01,   4.64159e-01,   1.29155e+00,
         3.59381e+00,   1.00000e+01]),
           class_weight='balanced', cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=-1, penalty='l2', random_state=17,
           refit=True, scoring='roc_auc', solver='lbfgs', tol=0.0001,
           verbose=0)

In [44]:
logit.scores_

{1: array([[ 0.82549158,  0.84968286,  0.87119884,  0.88443907,  0.89032856,
          0.89230763,  0.89352081,  0.89537402,  0.89789291,  0.90035543],
        [ 0.8597251 ,  0.88779918,  0.90753221,  0.91753303,  0.92073008,
          0.91990324,  0.91679022,  0.91270905,  0.90708727,  0.90108702],
        [ 0.94136112,  0.95111974,  0.95242123,  0.95089276,  0.94879406,
          0.94625014,  0.94372691,  0.94133113,  0.93943402,  0.93706618]])}