In [2]:
import pickle
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [3]:
# Directory that holds the competition files. The trailing slash lets file
# names be appended with plain string concatenation.
PATH_TO_DATA = "../../data/kaggle_alice/"

# File names, relative to PATH_TO_DATA.
INP_TRAIN = "train_sessions.csv"
INP_TEST = "test_sessions.csv"
SITE_DIC = "site_dic.pkl"
SAMPLE_SUBMIT = "sample_submission.csv"

# The former `!NAME=value` lines were removed: every `!` command runs in its
# own short-lived shell, so those assignments were discarded immediately and
# never reached a later command. To use a value in a shell command,
# interpolate the Python variable instead, e.g. `!ls {PATH_TO_DATA}`.

In [None]:
def get_auc_lr_valid(X, y, C=1.0, ratio=0.7, seed=17):
    '''
    Fit a logistic regression on the head of the sample and score it on the tail.

    The first int(ratio * X.shape[0]) rows are used for training and the
    remaining rows for validation; the rows are not shuffled.

    X, y – the sample (feature matrix and target)
    ratio – fraction of the rows that goes into the training part
    C, seed – regularization coefficient and random_state
              of the logistic regression

    Returns the validation ROC AUC rounded to 5 decimal places.
    '''
    split_at = int(ratio * X.shape[0])
    head, tail = slice(None, split_at), slice(split_at, None)

    model = LogisticRegression(penalty='l2', C=C, n_jobs=-1, random_state=seed)
    model.fit(X[head, :], y[head])

    # Keep only the second probability column returned by predict_proba.
    tail_scores = model.predict_proba(X[tail, :])[:, 1]
    score = roc_auc_score(y[tail], tail_scores)

    return round(score, 5)

In [5]:
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    """Write predictions to ``out_file`` as a two-column CSV.

    predicted_labels -- array of predictions; its ``shape[0]`` gives the row count
    out_file         -- path or buffer handed to ``DataFrame.to_csv``
    target           -- header of the prediction column
    index_label      -- header of the index column

    Rows are numbered 1..N in the index column.
    """
    n_rows = predicted_labels.shape[0]
    row_ids = np.arange(1, n_rows + 1)
    submission = pd.DataFrame(predicted_labels, index=row_ids, columns=[target])
    submission.to_csv(out_file, index_label=index_label)

In [6]:
# Column names of the ten timestamp and ten site columns: time1..time10, site1..site10.
slot_numbers = range(1, 11)
times = ["time%s" % i for i in slot_numbers]
sites = ["site%s" % i for i in slot_numbers]

# NOTE: pickle.load can execute arbitrary code, so only load a trusted file.
site_dic_path = PATH_TO_DATA + SITE_DIC
with open(site_dic_path, "rb") as inp_file:
    site_dic = pickle.load(inp_file)

# Reverse lookup: each value of site_dic mapped back to its key.
inv_site_dic = dict((value, key) for key, value in site_dic.items())

In [7]:
# Both files are indexed by session_id and have their time columns parsed as dates.
read_options = dict(index_col="session_id", parse_dates=times)

# Training sessions are put in chronological order of their first timestamp.
train_df = pd.read_csv(PATH_TO_DATA + INP_TRAIN, **read_options)
train_df = train_df.sort_values(by="time1")

test_df = pd.read_csv(PATH_TO_DATA + INP_TEST, **read_options)

# Missing site ids become 0 so the site columns can be stored as integers.
for sessions in (train_df, test_df):
    sessions[sites] = sessions[sites].fillna(0).astype("int")

# Split the target off the training frame.
y_train = train_df["target"]
train_df = train_df.drop('target', axis=1)

In [8]:
def _session_to_text(row):
    """Join the non-zero site ids of one session row into a space-separated string."""
    return " ".join(str(site) for site in row.values if site != 0)


# One text "document" per session, stored as a single-column 2-D array.
train_to_text = train_df[sites].apply(_session_to_text, axis=1).values.reshape(-1, 1)
test_to_text = test_df[sites].apply(_session_to_text, axis=1).values.reshape(-1, 1)

In [9]:
# Token counts followed by TF-IDF weighting, fitted on the training sessions only.
# NOTE(review): CountVectorizer's default token_pattern keeps only tokens of two
# or more characters, so one-character site ids are dropped; confirm this is intended.
text_steps = [
    ("vectorize", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
]
pipeline = Pipeline(text_steps)

train_docs = train_to_text.ravel()
test_docs = test_to_text.ravel()
pipeline.fit(train_docs)

X_train_sparse = pipeline.transform(train_docs)
X_test_sparse = pipeline.transform(test_docs)

X_train_sparse.shape, X_test_sparse.shape

((253561, 41592), (82797, 41592))

In [10]:
def calc_auc(X_train_sparse, y_train):
    """Print the validation AUC for ten C values log-spaced from 1e-3 to 1e1.

    Each C is scored with get_auc_lr_valid; every line whose AUC equals the
    best one is prefixed with "--> ". Nothing is returned.
    """
    Cs = np.logspace(-3, 1, 10)
    Ms = [get_auc_lr_valid(X_train_sparse, y_train, C=C) for C in Cs]

    best = max(Ms)
    for C, m in zip(Cs, Ms):
        marker = "--> " if m == best else ""
        print(marker + "C: %s, auc: %s" % (C, m))

In [11]:
# Empty feature frames that share the row index of the train / test sessions.
feat_train, feat_test = (
    pd.DataFrame(index=sessions.index) for sessions in (train_df, test_df)
)

In [12]:
def lmbd(ts):
    """Encode a timestamp as the integer 100 * year + month (YYYYMM)."""
    return 100 * ts.year + ts.month


feat_train['year_month'] = train_df['time1'].apply(lmbd)
feat_test['year_month'] = test_df['time1'].apply(lmbd)

# Standardize with statistics learned from the training column only.
train_column = feat_train['year_month'].values.reshape(-1, 1)
test_column = feat_test['year_month'].values.reshape(-1, 1)
scaler = StandardScaler().fit(train_column)
feat_train['year_month_scaled'] = scaler.transform(train_column)
feat_test['year_month_scaled'] = scaler.transform(test_column)



In [35]:
def lmbd(ts):
    """Return the calendar year of a timestamp."""
    return ts.year


feat_train['year'] = train_df['time1'].apply(lmbd)
feat_test['year'] = test_df['time1'].apply(lmbd)

# Standardize with statistics learned from the training column only.
train_column = feat_train['year'].values.reshape(-1, 1)
test_column = feat_test['year'].values.reshape(-1, 1)
scaler = StandardScaler().fit(train_column)
feat_train['year_scaled'] = scaler.transform(train_column)
feat_test['year_scaled'] = scaler.transform(test_column)



In [36]:
def lmbd(ts):
    """Return the month number of a timestamp."""
    return ts.month


feat_train['month'] = train_df['time1'].apply(lmbd)
feat_test['month'] = test_df['time1'].apply(lmbd)

# Standardize with statistics learned from the training column only.
train_column = feat_train['month'].values.reshape(-1, 1)
test_column = feat_test['month'].values.reshape(-1, 1)
scaler = StandardScaler().fit(train_column)
feat_train['month_scaled'] = scaler.transform(train_column)
feat_test['month_scaled'] = scaler.transform(test_column)



In [13]:
def lmbd(ts):
    """Return the hour of day of a timestamp."""
    return ts.hour


# Hour at which the session started (hour of its first timestamp).
feat_train['start_hour'] = train_df['time1'].apply(lmbd)
feat_test['start_hour'] = test_df['time1'].apply(lmbd)

# Standardize with statistics learned from the training column only.
train_column = feat_train['start_hour'].values.reshape(-1, 1)
test_column = feat_test['start_hour'].values.reshape(-1, 1)
scaler = StandardScaler().fit(train_column)
feat_train['start_hour_scaled'] = scaler.transform(train_column)
feat_test['start_hour_scaled'] = scaler.transform(test_column)



In [14]:
def lmbd(ts):
    """Return the day-of-week number of a timestamp."""
    return ts.dayofweek


# Day of week on which the session started (taken from its first timestamp).
feat_train['weekday'] = train_df['time1'].apply(lmbd)
feat_test['weekday'] = test_df['time1'].apply(lmbd)

# Standardize with statistics learned from the training column only.
train_column = feat_train['weekday'].values.reshape(-1, 1)
test_column = feat_test['weekday'].values.reshape(-1, 1)
scaler = StandardScaler().fit(train_column)
feat_train['weekday_scaled'] = scaler.transform(train_column)
feat_test['weekday_scaled'] = scaler.transform(test_column)



In [40]:
# Binary part-of-day flags for the session start hour. Each window is
# half-open: after_hour < hour <= upto_hour.
day_parts = (('morning', 4, 11), ('work', 11, 18), ('eve', 18, 23))

for part, after_hour, upto_hour in day_parts:
    # The lambda is applied within the same iteration, so it sees this window's bounds.
    lmbd = lambda ts: int(after_hour < ts.hour <= upto_hour)
    feat_train[part] = train_df['time1'].apply(lmbd)
    feat_test[part] = test_df['time1'].apply(lmbd)

In [16]:
def lmbd(x):
    """Count the distinct non-zero site ids in one session row."""
    return len({site for site in x.values if site != 0})


feat_train['uniq_sites'] = train_df[sites].apply(lmbd, axis=1).values.reshape(-1, 1)
feat_test['uniq_sites'] = test_df[sites].apply(lmbd, axis=1).values.reshape(-1, 1)

# Standardize with statistics learned from the training column only.
train_column = feat_train['uniq_sites'].values.reshape(-1, 1)
test_column = feat_test['uniq_sites'].values.reshape(-1, 1)
scaler = StandardScaler().fit(train_column)
feat_train['uniq_sites_scaled'] = scaler.transform(train_column)
feat_test['uniq_sites_scaled'] = scaler.transform(test_column)



In [17]:
def _timespan_seconds(sessions):
    """Seconds between the earliest and latest timestamp of every session row.

    Uses vectorized row-wise reductions, which skip missing timestamps (NaT)
    by default. The previous row-by-row `apply` with the builtin max()/min()
    only worked because NaT happened never to be the first value compared,
    and it was far slower.
    """
    stamps = sessions[times]
    span = stamps.max(axis=1) - stamps.min(axis=1)
    return span.dt.total_seconds().values.reshape(-1, 1)


feat_train['session_timespan'] = _timespan_seconds(train_df)
feat_test['session_timespan'] = _timespan_seconds(test_df)

# Standardize with statistics learned from the training column only.
scaler = StandardScaler()
feat_train['session_timespan_scaled'] = scaler.fit_transform(feat_train['session_timespan'].values.reshape(-1, 1))
feat_test['session_timespan_scaled'] = scaler.transform(feat_test['session_timespan'].values.reshape(-1, 1))

In [18]:
def add_features(subset):
    """Append the named feature columns to the sparse train / test matrices.

    subset -- iterable of column names present in both feat_train and feat_test

    Returns (X_train_sparse_new, X_test_sparse_new): CSR matrices made of
    X_train_sparse / X_test_sparse followed by one extra column per name, in
    the order given. The module-level matrices are not modified.

    All columns are stacked in a single hstack call. The previous version
    rebuilt (and copied) the whole matrix once per added feature.
    """
    # Materialize once so a one-shot iterator can serve both train and test.
    features = list(subset)

    def _stack(base, feats):
        # Wrap every column as an (n_rows, 1) sparse block so hstack only sees sparse inputs.
        columns = [csr_matrix(feats[feature].values.reshape(-1, 1))
                   for feature in features]
        return hstack([base] + columns, format='csr')

    return _stack(X_train_sparse, feat_train), _stack(X_test_sparse, feat_test)

In [None]:
import itertools

# Candidate extra features; each raw feature is listed next to its scaled twin.
stuff = ['year_month', 'year_month_scaled', 'start_hour', 'start_hour_scaled',
         'weekday', 'weekday_scaled', 'morning', 'uniq_sites',
         'uniq_sites_scaled']

# Maps every evaluated feature subset (a tuple of names) to its validation AUC.
auc = {}
for L in range(1, len(stuff) + 1):
    for subset in itertools.combinations(stuff, L):
        # A raw name is a substring of "<name>_scaled", so a match count above
        # one means both versions of the same feature are in this subset.
        skip = any(
            sum(1 for s in subset if feature in s) > 1
            for feature in subset
        )
        if skip:
            print("- skipped", subset)
            continue

        X_train_sparse_new, X_test_sparse_new = add_features(subset)
        r = get_auc_lr_valid(X_train_sparse_new, y_train, C=0.5)
        auc[subset] = r
        print(subset, r)

In [None]:
# The 20 feature subsets with the highest validation AUC, best first.
t = sorted(auc.items(), key=lambda item: item[1], reverse=True)[:20]
for x in t:
    print(x)

In [None]:
# Sweep the regularization strength for one fixed feature subset.
subset = (
    'year_month_scaled',
    'start_hour_scaled',
    'weekday_scaled',
    'morning',
    'uniq_sites',
)
X_train_sparse_new, X_test_sparse_new = add_features(subset)

for C in np.logspace(-3, 2, 20):
    valid_auc = get_auc_lr_valid(X_train_sparse_new, y_train, C=C)
    print(C, valid_auc)

In [None]:
# How often each site id occurs in the training sessions with target == 1.
# This does not depend on n, so it is computed once before the loop.
target_site_counts = train_df[y_train == 1][sites].stack().value_counts()

for n in [5, 10, 20, 30]:
    in_top = "in_top_" + str(n)
    in_top_scaled = in_top + "_scaled"
    top_n_sites = target_site_counts.nlargest(n).index

    def lmbd(x):
        """Count the non-zero site ids of one session that are among the top n."""
        hits = 0
        for s in x.values:
            if s != 0 and s in top_n_sites:
                hits += 1
        return hits

    feat_train[in_top] = train_df[sites].apply(lmbd, axis=1).values.reshape(-1, 1)
    feat_test[in_top] = test_df[sites].apply(lmbd, axis=1).values.reshape(-1, 1)

    # Standardize with statistics learned from the training column only.
    train_column = feat_train[in_top].values.reshape(-1, 1)
    test_column = feat_test[in_top].values.reshape(-1, 1)
    scaler = StandardScaler().fit(train_column)
    feat_train[in_top_scaled] = scaler.transform(train_column)
    feat_test[in_top_scaled] = scaler.transform(test_column)

In [None]:
import itertools

# Try each in_top_N feature (raw and scaled) on top of a fixed base subset.
stuff = [
    'in_top_5', 'in_top_5_scaled',
    'in_top_10', 'in_top_10_scaled',
    'in_top_20', 'in_top_20_scaled',
    'in_top_30', 'in_top_30_scaled',
]
subset = (
    'year_month_scaled',
    'start_hour_scaled',
    'weekday_scaled',
    'morning',
    'uniq_sites',
)

# Maps each extended subset to its validation AUC.
auc1 = {}
for f in stuff:
    subset1 = subset + (f, )
    X_train_sparse_new, X_test_sparse_new = add_features(subset1)
    r = get_auc_lr_valid(X_train_sparse_new, y_train, C=0.5)
    auc1[subset1] = r
    print(subset1, r)

In [None]:
%%time
logit = LogisticRegression(C=3.6, n_jobs=-1, random_state=17)
logit.fit(X_train_sparse_top, y_train)
y_pred = logit.predict_proba(X_test_sparse_top)[:, 1]

write_to_submission_file(y_pred, PATH_TO_DATA + "/submit/tfidf_yms_shs_ws_m_uniq_top30_c3_6.csv")

In [None]:
# Preview the first rows of the training feature frame.
feat_train.head()

In [None]:
import itertools

# Try the raw and the scaled session timespan on top of a fixed base subset.
stuff = ["session_timespan", "session_timespan_scaled"]
subset = (
    'year_month_scaled',
    'start_hour_scaled',
    'weekday_scaled',
    'morning',
    'uniq_sites_scaled',
)

# Maps each extended subset to its validation AUC.
auc1 = {}
for f in stuff:
    subset1 = subset + (f, )
    X_train_sparse_new, X_test_sparse_new = add_features(subset1)
    r = get_auc_lr_valid(X_train_sparse_new, y_train, C=0.5)
    auc1[subset1] = r
    print(subset1, r)

In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm

def svc_param_selection(X, y):
    """Grid-search C for a linear-kernel SVC and return the best parameters.

    X, y -- training features and labels passed to GridSearchCV.fit

    Tries C in [0.01, 0.1, 1] with the 'linear' kernel and returns the
    ``best_params_`` dict of the fitted search.
    """
    param_grid = {'kernel': ('linear',), 'C': [0.01, 0.1, 1]}
    search = GridSearchCV(svm.SVC(), param_grid, n_jobs=-1, verbose=10)
    return search.fit(X, y).best_params_

In [26]:
# Sweep the regularization strength for the subset that includes the session timespan.
subset = (
    'year_month_scaled',
    'start_hour_scaled',
    'weekday_scaled',
    'morning',
    'uniq_sites_scaled',
    'session_timespan_scaled',
)
X_train_sparse_new, X_test_sparse_new = add_features(subset)

for C in np.logspace(-3, 2, 20):
    valid_auc = get_auc_lr_valid(X_train_sparse_new, y_train, C=C)
    print(C, valid_auc)

0.001 0.88176
0.00183298071083 0.94706
0.00335981828628 0.95509
0.00615848211066 0.95257
0.0112883789168 0.95157
0.0206913808111 0.95206
0.0379269019073 0.95282
0.0695192796178 0.95317
0.12742749857 0.95311
0.233572146909 0.95274
0.428133239872 0.95215
0.784759970351 0.95131
1.43844988829 0.95018
2.63665089873 0.94904
4.83293023857 0.94802
8.8586679041 0.94721
16.2377673919 0.94647
29.7635144163 0.9455
54.5559478117 0.94423
100.0 0.94266


In [43]:
# Names of the ten site ids that occur most often in sessions with target == 1.
target_sessions = train_df[y_train == 1]
target_site_counts = target_sessions[sites].stack().value_counts()
top_n_sites = target_site_counts.nlargest(10).index

for s in top_n_sites:
    site_name = inv_site_dic[s]
    print(site_name)

i1.ytimg.com
s.youtube.com
www.youtube.com
www.facebook.com
www.google.fr
r4---sn-gxo5uxg-jqbe.googlevideo.com
r1---sn-gxo5uxg-jqbe.googlevideo.com
apis.google.com
s.ytimg.com
r2---sn-gxo5uxg-jqbe.googlevideo.com


In [29]:
# How often each site id occurs in the training sessions with target == 1.
# This does not depend on n, so it is computed once before the loop.
target_site_counts = train_df[y_train == 1][sites].stack().value_counts()

# Binary in_top_N flags; these overwrite any in_top_N columns already in the
# feature frames. The matching *_scaled columns are not touched here.
for n in [5, 10, 20, 30]:
    in_top = "in_top_" + str(n)
    top_n_sites = target_site_counts.nlargest(n).index

    def lmbd(x):
        """Return 1 if any non-zero site id of the session is among the top n, else 0."""
        return int(any(s != 0 and s in top_n_sites for s in x.values))

    feat_train[in_top] = train_df[sites].apply(lmbd, axis=1).values.reshape(-1, 1)
    feat_test[in_top] = test_df[sites].apply(lmbd, axis=1).values.reshape(-1, 1)

In [32]:
# Try each in_top_N feature on top of a fixed base subset.
stuff = ['in_top_5', 'in_top_10', 'in_top_20', 'in_top_30']
subset = (
    'year_month_scaled',
    'start_hour_scaled',
    'weekday_scaled',
    'morning',
    'uniq_sites_scaled',
    'session_timespan_scaled',
)

# Maps each extended subset to its validation AUC.
auc1 = {}
for f in stuff:
    subset1 = subset + (f, )
    X_train_sparse_new, X_test_sparse_new = add_features(subset1)
    r = get_auc_lr_valid(X_train_sparse_new, y_train, C=0.1)
    auc1[subset1] = r
    print(subset1, r)

('year_month_scaled', 'start_hour_scaled', 'weekday_scaled', 'morning', 'uniq_sites_scaled', 'session_timespan_scaled', 'in_top_5') 0.95283
('year_month_scaled', 'start_hour_scaled', 'weekday_scaled', 'morning', 'uniq_sites_scaled', 'session_timespan_scaled', 'in_top_10') 0.95301
('year_month_scaled', 'start_hour_scaled', 'weekday_scaled', 'morning', 'uniq_sites_scaled', 'session_timespan_scaled', 'in_top_20') 0.95287
('year_month_scaled', 'start_hour_scaled', 'weekday_scaled', 'morning', 'uniq_sites_scaled', 'session_timespan_scaled', 'in_top_30') 0.95213


In [None]:
# NOTE(review): this cell recomputes the same session_timespan columns as an
# earlier cell of this notebook; consider keeping only one of the two.
def _timespan_seconds(sessions):
    """Seconds between the earliest and latest timestamp of every session row.

    Uses vectorized row-wise reductions, which skip missing timestamps (NaT)
    by default. The previous row-by-row `apply` with the builtin max()/min()
    only worked because NaT happened never to be the first value compared,
    and it was far slower.
    """
    stamps = sessions[times]
    span = stamps.max(axis=1) - stamps.min(axis=1)
    return span.dt.total_seconds().values.reshape(-1, 1)


feat_train['session_timespan'] = _timespan_seconds(train_df)
feat_test['session_timespan'] = _timespan_seconds(test_df)

# Standardize with statistics learned from the training column only.
scaler = StandardScaler()
feat_train['session_timespan_scaled'] = scaler.fit_transform(feat_train['session_timespan'].values.reshape(-1, 1))
feat_test['session_timespan_scaled'] = scaler.transform(feat_test['session_timespan'].values.reshape(-1, 1))

In [75]:
# Gaps between consecutive timestamps of every session, converted to seconds.
one_second = np.timedelta64(1, 's')
train_time_diff = np.diff(train_df[times], axis=1) / one_second
test_time_diff = np.diff(test_df[times], axis=1) / one_second

deltas = ["delta%s" % str(i) for i in range(1, 10)]
for i, delta in enumerate(deltas):
    # Column i of the diff arrays is the i-th gap; missing gaps are stored as 0.
    for feats, diffs in ((feat_train, train_time_diff), (feat_test, test_time_diff)):
        feats[delta] = diffs[:, i]
        feats[delta] = feats[delta].fillna(0)

    # Standardize with statistics learned from the training column only.
    train_column = feat_train[delta].values.reshape(-1, 1)
    test_column = feat_test[delta].values.reshape(-1, 1)
    scaler = StandardScaler().fit(train_column)
    feat_train[delta + '_scaled'] = scaler.transform(train_column)
    feat_test[delta + '_scaled'] = scaler.transform(test_column)

In [77]:
# Preview the first rows of the test feature frame.
feat_test.head()

Unnamed: 0_level_0,year_month,year_month_scaled,start_hour,start_hour_scaled,weekday,weekday_scaled,morning,uniq_sites,uniq_sites_scaled,session_timespan,...,delta9,delta1_scaled,delta2_scaled,delta3_scaled,delta4_scaled,delta5_scaled,delta6_scaled,delta7_scaled,delta8_scaled,delta9_scaled
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,201410,0.822948,11,-0.407823,5,1.682905,1,10,1.747312,7.0,...,0.0,-0.191229,-0.170968,-0.186631,-0.193139,-0.192809,-0.193165,-0.192025,-0.104099,-0.190791
2,201407,0.752287,11,-0.407823,3,0.441028,1,1,-1.858194,85.0,...,23.0,0.061235,-0.133105,-0.100195,-0.154907,-0.17962,-0.013667,-0.105958,-0.17618,0.150664
3,201412,0.870055,15,0.858234,4,1.061966,0,2,-1.457582,84.0,...,3.0,-0.181131,-0.170968,0.472445,-0.180395,-0.17962,-0.179357,-0.17768,0.01123,-0.146253
4,201411,0.846501,10,-0.724338,1,-0.80085,1,7,0.545477,4.0,...,0.0,-0.191229,-0.170968,-0.175826,-0.193139,-0.17962,-0.193165,-0.192025,-0.17618,-0.190791
5,201405,0.705179,15,0.858234,4,1.061966,0,8,0.946088,13.0,...,4.0,-0.181131,-0.170968,-0.121804,-0.180395,-0.192809,-0.193165,-0.192025,-0.190596,-0.131407


In [78]:
# Final feature set: time-of-day / calendar features, site features and the
# nine scaled gaps between consecutive timestamps.
base_features = (
    'year_month_scaled', 'start_hour_scaled', 'weekday_scaled',
    'morning', 'work', 'eve',
    'uniq_sites', 'in_top_10',
)
delta_features = tuple("delta%s_scaled" % str(i) for i in range(1, 10))
subset = base_features + delta_features

X_train_sparse_new, X_test_sparse_new = add_features(subset)

X_train_sparse_new.shape, X_test_sparse_new.shape

((253561, 41609), (82797, 41609))

In [82]:
# Coarse sweep of the regularization strength, one decade per step.
for C in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:
    valid_auc = get_auc_lr_valid(X_train_sparse_new, y_train, C=C)
    print(C, valid_auc)

0.001 0.5438
0.01 0.91725
0.1 0.95233
1 0.95259
10 0.95104
100 0.94867
1000 0.9412


In [83]:
%%time
logit = LogisticRegression(penalty="l2", C=1, n_jobs=-1, random_state=17)
logit.fit(X_train_sparse_new, y_train)
y_pred = logit.predict_proba(X_test_sparse_new)[:, 1]

write_to_submission_file(y_pred, PATH_TO_DATA + "/submit/tfidf_deltas.csv")

CPU times: user 8.51 s, sys: 0 ns, total: 8.51 s
Wall time: 8.52 s
