In [80]:
train_uniq_sites = set(train_df[sites].values.flatten())

train_sites = pd.DataFrame(index=train_df.index)
test_sites = pd.DataFrame(index=test_df.index)

for site in sites:
    # transform train sites
#     train_sites[site] = train_df[site].map(lambda x: inv_site_dic[x])
    train_sites[site] = train_df[site].map(lambda x: inv_site_dic[x].replace(".", "_").replace("-", "_"))
    # transform test sites
    test_sites[site] = test_df[site].map(lambda x: inv_site_dic[x].replace(".", "_").replace("-", "_") 
                                         if x in train_uniq_sites else "unknown_unknown")
    
len(train_uniq_sites), len(set(train_sites[sites].values.flatten())), len(set(test_sites[sites].values.flatten()))

(41602, 41600, 9088)

In [8]:
import pickle
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [9]:
PATH_TO_DATA = "../../data/kaggle_alice/"
!PATH_TO_DATA=../../data/kaggle_alice/

INP_TRAIN = "train_sessions.csv"
INP_TEST  = "test_sessions.csv"
SITE_DIC = "site_dic.pkl"
SAMPLE_SUBMIT = "sample_submission.csv"

!INP_TRAIN=train_sessions.csv
!INP_TEST=test_sessions.csv
!SITE_DIC=site_dic.pkl
!SAMPLE_SUBMIT=sample_submission.csv

In [10]:
times = ["time%s" % i for i in range(1, 11)]
sites = ["site%s" % i for i in range(1, 11)]

In [11]:
with open(PATH_TO_DATA + SITE_DIC, "rb") as inp_file:
    site_dic = pickle.load(inp_file)

inv_site_dic = {v: k for k, v in site_dic.items()}

In [13]:
train_df = pd.read_csv(PATH_TO_DATA + INP_TRAIN, 
                       index_col="session_id", 
                       parse_dates=times).sort_values(by="time1")
train_df[sites] = train_df[sites].fillna(0).astype("int")

test_df = pd.read_csv(PATH_TO_DATA + INP_TEST,
                       index_col="session_id", 
                       parse_dates=times)
test_df[sites] = test_df[sites].fillna(0).astype("int")

y_train = train_df["target"]
train_df.drop('target', axis=1, inplace=True)

In [22]:
train_to_text = train_df[sites].apply(
    lambda x: " ".join([str(a) for a in x.values if a != 0]), axis=1)\
               .values.reshape(len(train_df[sites]), 1)
test_to_text = test_df[sites].apply(
    lambda x: " ".join([str(a) for a in x.values if a != 0]), axis=1)\
               .values.reshape(len(test_df[sites]), 1)

In [24]:
pipeline = Pipeline([
    ("vectorize", CountVectorizer()),
    ("tfidf", TfidfTransformer())
])
pipeline.fit(train_to_text.ravel())

X_train_sparse = pipeline.transform(train_to_text.ravel())
X_test_sparse = pipeline.transform(test_to_text.ravel())

X_train_sparse.shape, X_test_sparse.shape

((253561, 41592), (82797, 41592))

In [105]:
train_sites_list = [" ".join(row.tolist()) for _, row in train_sites.iterrows()]
train_sites_list.append("unknown_unknown")
test_sites_list = [" ".join(row.tolist()) for _, row in test_sites.iterrows()]

In [57]:
train_sites.shape, test_sites.shape

((253561, 10), (82797, 10))

In [82]:
(len(train_sites_list), len(set(train_sites[sites].values.flatten()))), \
(len(test_sites_list), len(set(test_sites[sites].values.flatten())))

((253562, 41600), (82797, 9088))

In [104]:
#test
# train_sites_list = train_sites_list[:9] + [train_sites_list[-1]]
# test_sites_list = test_sites_list[:10]

In [106]:
cv = CountVectorizer()
X_train_cv = cv.fit_transform(train_sites_list)
X_train_cv = X_train_cv[:-1, :]
X_test_cv = cv.transform(test_sites_list)

In [107]:
X_train_cv.shape, X_test_cv.shape

((253561, 41601), (82797, 41601))

In [108]:
assert set(train_sites[sites].values.flatten()) - set(cv.vocabulary_) == set()

In [112]:
transformer = TfidfTransformer()
X_train_sparse = transformer.fit_transform(X_train_cv)
X_test_sparse = transformer.transform(X_test_cv)

In [113]:
X_train_sparse.shape, X_test_sparse.shape

((253561, 41601), (82797, 41601))

===== BASELINE =====

In [114]:
def get_auc_lr_valid(X, y, C=1.0, ratio=0.9, seed=17):
    '''
    X, y – выборка
    ratio – в каком отношении поделить выборку
    C, seed – коэф-т регуляризации и random_state 
              логистической регрессии
    '''
    train_len = int(ratio * X.shape[0])
    X_train = X[:train_len, :]
    X_valid = X[train_len:, :]
    y_train = y[:train_len]
    y_valid = y[train_len:]
    
    logit = LogisticRegression(C=C, n_jobs=-1, random_state=seed)
    
    logit.fit(X_train, y_train)
    
    valid_pred = logit.predict_proba(X_valid)[:, 1]
    
    return round(roc_auc_score(y_valid, valid_pred), 5)

In [147]:
for C in np.logspace(-3, 1, 10):
    print(C, get_auc_lr_valid(X_train_sparse, y_train, C=C))

0.001 0.80008
0.00278255940221 0.84407
0.00774263682681 0.88516
0.0215443469003 0.9061
0.0599484250319 0.91304
0.16681005372 0.91914
0.464158883361 0.925
1.29154966501 0.92758
3.5938136638 0.92605
10.0 0.92211


In [46]:
%%time
logit = LogisticRegression(n_jobs=-1, random_state=17)
logit.fit(X_train_sparse, y_train)
y_pred = logit.predict_proba(X_test_sparse)[:, 1]

CPU times: user 1.73 s, sys: 20 ms, total: 1.75 s
Wall time: 1.79 s


In [148]:
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV

lr = LogisticRegression(n_jobs=-1)
param_grid = {"C": np.logspace(-3, 1, 10)}
grid = GridSearchCV(lr, param_grid=param_grid, cv=5, scoring="roc_auc")
grid.fit(X_train_sparse, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([  1.00000e-03,   2.78256e-03,   7.74264e-03,   2.15443e-02,
         5.99484e-02,   1.66810e-01,   4.64159e-01,   1.29155e+00,
         3.59381e+00,   1.00000e+01])},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [149]:
grid.grid_scores_

[mean: 0.77461, std: 0.08813, params: {'C': 0.001},
 mean: 0.80174, std: 0.08661, params: {'C': 0.0027825594022071257},
 mean: 0.82517, std: 0.08042, params: {'C': 0.0077426368268112694},
 mean: 0.83886, std: 0.07485, params: {'C': 0.021544346900318832},
 mean: 0.84936, std: 0.06795, params: {'C': 0.059948425031894091},
 mean: 0.85842, std: 0.06070, params: {'C': 0.1668100537200059},
 mean: 0.86796, std: 0.05349, params: {'C': 0.46415888336127775},
 mean: 0.87743, std: 0.04696, params: {'C': 1.2915496650148828},
 mean: 0.88511, std: 0.04066, params: {'C': 3.5938136638046259},
 mean: 0.88825, std: 0.03610, params: {'C': 10.0}]

In [146]:
%%time
get_auc_lr_valid(X_train_sparse, y_train, C=0.1)

CPU times: user 944 ms, sys: 4 ms, total: 948 ms
Wall time: 955 ms


0.91635999999999995

In [18]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [None]:
write_to_submission_file(y_pred, PATH_TO_DATA + "/submit/simple_tfidf.csv")

===== MY ======

In [116]:
feat_train = pd.DataFrame(index=train_df.index)
feat_test = pd.DataFrame(index=test_df.index)

**year_month_scaled**

In [150]:
feat_train['year_month'] = train_df['time1'].apply(lambda ts: 100 * ts.year + ts.month)
feat_test['year_month'] = test_df['time1'].apply(lambda ts: 100 * ts.year + ts.month)

scaler = StandardScaler()
scaler.fit(feat_train['year_month'].values.reshape(-1, 1))

feat_train['year_month_scaled'] = scaler.transform(feat_train['year_month'].values.reshape(-1, 1))
feat_test['year_month_scaled'] = scaler.transform(feat_test['year_month'].values.reshape(-1, 1))

X_train_sparse_new = csr_matrix(hstack([X_train_sparse, 
                                        feat_train['year_month_scaled'].values.reshape(-1, 1)]))
X_test_sparse_new = csr_matrix(hstack([X_test_sparse, 
                                       feat_test['year_month_scaled'].values.reshape(-1, 1)]))



In [21]:
%%time
get_auc_lr_valid(X_train_sparse_new, y_train)

CPU times: user 1.83 s, sys: 0 ns, total: 1.83 s
Wall time: 1.85 s


0.92730999999999997

In [51]:
%%time
logit = LogisticRegression(n_jobs=-1, random_state=17)
logit.fit(X_train_sparse_new, y_train)
y_pred = logit.predict_proba(X_test_sparse_new)[:, 1]

CPU times: user 2.3 s, sys: 48 ms, total: 2.35 s
Wall time: 2.38 s


In [None]:
write_to_submission_file(y_pred, PATH_TO_DATA + "/submit/simple_tfidf_yms.csv")

**start_hour_scaled, weekday_scaled**

In [151]:
%%time
feat_train['start_hour'] = train_df['time1'].apply(lambda ts: ts.hour)
feat_test['start_hour'] = test_df['time1'].apply(lambda ts: ts.hour)

scaler = StandardScaler()
scaler.fit(feat_train['start_hour'].values.reshape(-1, 1))

feat_train['start_hour_scaled'] = scaler.transform(feat_train['start_hour'].values.reshape(-1, 1))
feat_test['start_hour_scaled'] = scaler.transform(feat_test['start_hour'].values.reshape(-1, 1))

X_train_sparse_new = csr_matrix(hstack([X_train_sparse_new, 
                                        feat_train['start_hour_scaled'].values.reshape(-1, 1)]))
X_test_sparse_new = csr_matrix(hstack([X_test_sparse_new, 
                                       feat_test['start_hour_scaled'].values.reshape(-1, 1)]))

print(get_auc_lr_valid(X_train_sparse_new, y_train))



0.96496
CPU times: user 5.04 s, sys: 128 ms, total: 5.17 s
Wall time: 5.17 s


In [152]:
%%time
feat_train['weekday'] = train_df['time1'].apply(lambda ts: ts.dayofweek)
feat_test['weekday'] = test_df['time1'].apply(lambda ts: ts.dayofweek)

scaler = StandardScaler()
scaler.fit(feat_train['weekday'].values.reshape(-1, 1))

feat_train['weekday_scaled'] = scaler.transform(feat_train['weekday'].values.reshape(-1, 1))
feat_test['weekday_scaled'] = scaler.transform(feat_test['weekday'].values.reshape(-1, 1))

X_train_sparse_new = csr_matrix(hstack([X_train_sparse_new, 
                                        feat_train['weekday_scaled'].values.reshape(-1, 1)]))
X_test_sparse_new = csr_matrix(hstack([X_test_sparse_new, 
                                       feat_test['weekday_scaled'].values.reshape(-1, 1)]))

print(get_auc_lr_valid(X_train_sparse_new, y_train))



0.97688
CPU times: user 5.28 s, sys: 368 ms, total: 5.65 s
Wall time: 5.64 s


In [157]:
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV

lr = LogisticRegression(n_jobs=-1)
param_grid = {"C": np.logspace(-3, 1, 10)}
grid = GridSearchCV(lr, param_grid=param_grid, cv=5, scoring="roc_auc")
grid.fit(X_train_sparse_new, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': array([  1.00000e-03,   2.78256e-03,   7.74264e-03,   2.15443e-02,
         5.99484e-02,   1.66810e-01,   4.64159e-01,   1.29155e+00,
         3.59381e+00,   1.00000e+01])},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [158]:
grid.grid_scores_

[mean: 0.82565, std: 0.10471, params: {'C': 0.001},
 mean: 0.82962, std: 0.11273, params: {'C': 0.0027825594022071257},
 mean: 0.83901, std: 0.11507, params: {'C': 0.0077426368268112694},
 mean: 0.85244, std: 0.11170, params: {'C': 0.021544346900318832},
 mean: 0.86887, std: 0.09993, params: {'C': 0.059948425031894091},
 mean: 0.88458, std: 0.08507, params: {'C': 0.1668100537200059},
 mean: 0.89793, std: 0.07335, params: {'C': 0.46415888336127775},
 mean: 0.90825, std: 0.06360, params: {'C': 1.2915496650148828},
 mean: 0.91585, std: 0.05516, params: {'C': 3.5938136638046259},
 mean: 0.92051, std: 0.04787, params: {'C': 10.0}]

In [159]:
for C in np.logspace(-3, 1, 10):
    print(C, get_auc_lr_valid(X_train_sparse_new, y_train, C=C))

0.001 0.96548
0.00278255940221 0.96817
0.00774263682681 0.96993
0.0215443469003 0.97157
0.0599484250319 0.97424
0.16681005372 0.97632
0.464158883361 0.97723
1.29154966501 0.97662
3.5938136638 0.97507
10.0 0.97285


In [25]:
%%time
logit = LogisticRegression(n_jobs=-1, random_state=17)
logit.fit(X_train_sparse_new, y_train)
y_pred = logit.predict_proba(X_test_sparse_new)[:, 1]

CPU times: user 2.15 s, sys: 16 ms, total: 2.17 s
Wall time: 2.19 s


In [26]:
write_to_submission_file(y_pred, PATH_TO_DATA + "/submit/tfidf_yms_shs_ws.csv")

In [153]:
feat_train['is_youtube'] = train_df['site1'].apply(lambda s: 1 if ("youtube" in s) or ("ytimg" in s) else 0)
feat_test['is_youtube'] = test_df['site1'].apply(lambda s: 1 if ("youtube" in s) or ("ytimg" in s) else 0)

X_train_sparse_new = csr_matrix(hstack([X_train_sparse_new,  
                                        feat_train['is_youtube'].values.reshape(-1, 1)]))
X_test_sparse_new = csr_matrix(hstack([X_test_sparse_new, 
                                       feat_test['is_youtube'].values.reshape(-1, 1)]))
print(get_auc_lr_valid(X_train_sparse_new, y_train))

TypeError: argument of type 'int' is not iterable

In [154]:
feat_train['is_social'] = train_df['site1'].apply(lambda s: 1 if ("facebook" in s) or ("vk_" in s) else 0)
feat_test['is_social'] = test_df['site1'].apply(lambda s: 1 if ("facebook" in s) or ("vk_" in s) else 0)

X_train_sparse_new = csr_matrix(hstack([X_train_sparse_new,  
                                        feat_train['is_social'].values.reshape(-1, 1)]))
X_test_sparse_new = csr_matrix(hstack([X_test_sparse_new, 
                                       feat_test['is_social'].values.reshape(-1, 1)]))
print(get_auc_lr_valid(X_train_sparse_new, y_train))

TypeError: argument of type 'int' is not iterable

In [29]:
feat_train['is_google'] = train_df['site1'].apply(lambda s: 1 if "google" in s else 0)
feat_test['is_google'] = test_df['site1'].apply(lambda s: 1 if "google" in s else 0)

X_train_sparse_new = csr_matrix(hstack([X_train_sparse_new,  
                                        feat_train['is_google'].values.reshape(-1, 1)]))
X_test_sparse_new = csr_matrix(hstack([X_test_sparse_new, 
                                       feat_test['is_google'].values.reshape(-1, 1)]))
print(get_auc_lr_valid(X_train_sparse_new, y_train))

0.9768


In [35]:
%%time
logit = LogisticRegressionCV(n_jobs=-1, random_state=17)
logit.fit(X_train_sparse_new, y_train)
y_pred = logit.predict_proba(X_test_sparse_new)[:, 1]

CPU times: user 964 ms, sys: 200 ms, total: 1.16 s
Wall time: 24.5 s


In [36]:
write_to_submission_file(y_pred, PATH_TO_DATA + "/submit/tfidf_yms_shs_ws_first_sites.csv")