In [1]:
import os
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from collections import Counter
import gc

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.metrics import accuracy_score

#from vowpalwabbit import pyvw

https://www.kaggle.com/c/identify-me-if-you-can4/

In [2]:
PATH_TO_DATA = os.path.join('./', 'data', 'compete-catch_me_if_you_can')
ANSW = './answers'
PATH_TO_DATA

'./data/compete-catch_me_if_you_can'

In [3]:
sites = ['site' + str(i) for i in range(1, 11)]

In [4]:
def write_answer_to_file(answer, file_address):
    if isinstance(answer, list) or isinstance(answer, np.ndarray):
        with open(os.path.join(ANSW, file_address), 'w') as out_f:
            for idx, elmnt in enumerate(answer):
                if idx == 0:
                    out_f.write(str(elmnt))
                else:
                    out_f.write(' ' + str(elmnt))
    else:
        with open(os.path.join(ANSW, file_address), 'w') as out_f:
            out_f.write(str(answer))

In [5]:
def write_to_submission_file(predicted_labels, out_file,
                             target='user_id', index_label="session_id"):
    # turn predictions into data frame and save as csv file
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)

In [6]:
train_part_vw = os.path.join(PATH_TO_DATA, 'train_part.vw')
valid_vw      = os.path.join(PATH_TO_DATA, 'valid.vw')
train_full_vw = os.path.join(PATH_TO_DATA, 'train_full.vw')
test_vw       = os.path.join(PATH_TO_DATA, 'test.vw')
model         = os.path.join(PATH_TO_DATA, 'vw_model.vw')
pred          = os.path.join(PATH_TO_DATA, 'vw_pred.csv')

## Часть 2. Применение Vowpal Wabbit к данным по посещению сайтов

In [7]:
train_df_400 = pd.read_csv(os.path.join(PATH_TO_DATA,'train_sessions_400users.csv'), 
                           index_col='session_id')

test_df_400 = pd.read_csv(os.path.join(PATH_TO_DATA,'test_sessions_400users.csv'), 
                           index_col='session_id')

train_df_400.shape, test_df_400.shape, train_df_400['user_id'].nunique()

((182793, 21), (46473, 20), 400)

In [8]:
for el in sites:
    train_df_400[el] = train_df_400[el].fillna(0).astype('int')
    test_df_400[el] = test_df_400[el].fillna(0).astype('int')

Переводим user_id в диапазон 0-400 для работы vw с возможностью востановления

In [9]:
y = train_df_400['user_id']
class_encoder = LabelEncoder()
y_for_vw = class_encoder.fit_transform(y) + 1

Подготовим csr матрицу

In [10]:
train_test_df_400 = pd.concat([train_df_400, test_df_400])
train_test_df_400_sites = train_test_df_400[['site%d' % i for i in range(1, 11)]].fillna(0).astype('int')
train_test_df_400_sites.reset_index(inplace=True)
train_test_df_400_sites.shape

(229266, 11)

In [11]:
%%time
row_csr  = []
data_csr = []
col_csr  = []

for idx in range(train_test_df_400_sites.shape[0]):
    for el in Counter(train_test_df_400_sites.loc[idx, :].values[1:]).most_common():
        row_csr.append(idx)
        data_csr.append(el[1])
        col_csr.append(el[0])

CPU times: user 29.2 s, sys: 63.1 ms, total: 29.2 s
Wall time: 28.8 s


In [12]:
tmp_sparse = csr_matrix((data_csr, (row_csr, col_csr)), shape=(max(row_csr)+1, max(col_csr)+1))
X_train_sparse = tmp_sparse[:train_df_400.shape[0], :]
X_test_sparse  = tmp_sparse[train_df_400.shape[0]:, :]
X_train_sparse.shape, X_test_sparse.shape

((182793, 36657), (46473, 36657))

In [13]:
del train_test_df_400_sites
del tmp_sparse
gc.collect()

22

In [14]:
#gc.collect()

Разобьем на тестовую и проверочную

In [15]:
train_share = int(.7 * train_df_400.shape[0])
train_df_part = train_df_400[sites].iloc[:train_share, :]
valid_df = train_df_400[sites].iloc[train_share:, :]
X_train_part_sparse = X_train_sparse[:train_share, :]
X_valid_sparse = X_train_sparse[train_share:, :]

In [16]:
y_train_part = y[:train_share]
y_valid = y[train_share:]
y_train_part_for_vw = y_for_vw[:train_share]
y_valid_for_vw = y_for_vw[train_share:]

Подготовим данные в формате vowpal wabbit

In [17]:
def arrays_to_vw(X, y=None, train=True, out_file='tmp.vw'):
    
    if (train == False):
        y = [1]*X.shape[0]
    
    with open(out_file, 'w') as fd:
        for idx in range(X.shape[0]):
            fd.write( str(y[idx]) + ' | '+ ( ' '.join([str(int(el)) for el in X[idx]]) ) + '\n')
    
    pass

In [18]:
arrays_to_vw(train_df_part.values, y_train_part_for_vw, out_file = train_part_vw)
arrays_to_vw(valid_df.values, y_valid_for_vw, out_file = valid_vw)
arrays_to_vw(train_df_400[sites].values, y_for_vw, out_file = train_full_vw)
arrays_to_vw(test_df_400[sites].values, train=False, out_file = test_vw)

In [22]:
!head -3 $PATH_TO_DATA/train_part.vw

262 | 23713 23720 23713 23713 23720 23713 23713 23713 23713 23713
82 | 8726 8725 665 8727 45 8725 45 5320 5320 5320
16 | 303 19 303 303 303 303 303 309 303 303


In [23]:
!head -3  $PATH_TO_DATA/valid.vw

4 | 7 923 923 923 11 924 7 924 838 7
160 | 91 198 11 11 302 91 668 311 310 91
312 | 27085 848 118 118 118 118 11 118 118 118


In [24]:
!head -3  $PATH_TO_DATA/train_full.vw

262 | 23713 23720 23713 23713 23720 23713 23713 23713 23713 23713
82 | 8726 8725 665 8727 45 8725 45 5320 5320 5320
16 | 303 19 303 303 303 303 303 309 303 303


In [25]:
!head -3  $PATH_TO_DATA/test.vw

1 | 9 304 308 307 91 308 312 300 305 309
1 | 838 504 68 11 838 11 838 886 27 305
1 | 190 192 8 189 191 189 190 2375 192 8


Обучим Vowpal Wabbit на части выборки

In [26]:
!vw $PATH_TO_DATA/train_part.vw  --oaa 400 --passes 3 -c -k -b 26 --random_seed 17 -f $PATH_TO_DATA/model_part.vw

final_regressor = ./data/compete-catch_me_if_you_can/model_part.vw
Num weight bits = 26
learning rate = 0.5
initial_t = 0
power_t = 0.5
decay_learning_rate = 1
creating cache_file = ./data/compete-catch_me_if_you_can/train_part.vw.cache
Reading datafile = ./data/compete-catch_me_if_you_can/train_part.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0      262        1       11
1.000000 1.000000            2            2.0       82      262       11
1.000000 1.000000            4            4.0      241      262       11
1.000000 1.000000            8            8.0      352      262       11
1.000000 1.000000           16           16.0      135       16       11
1.000000 1.000000           32           32.0       71      112       11
0.968750 0.937500           64           64.0      358      231       11
0.976562 0.984375     

In [30]:
!vw -d $PATH_TO_DATA/valid.vw -i $PATH_TO_DATA/model_part.vw -t -p $PATH_TO_DATA/pred_part.csv

only testing
predictions = ./data/compete-catch_me_if_you_can/pred_part.csv
Num weight bits = 26
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = ./data/compete-catch_me_if_you_can/valid.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0        4      188       11
1.000000 1.000000            2            2.0      160      220       11
0.750000 0.500000            4            4.0      143      143       11
0.750000 0.750000            8            8.0      247      247       11
0.687500 0.625000           16           16.0      341       30       11
0.593750 0.500000           32           32.0      237      237       11
0.609375 0.625000           64           64.0      178      178       11
0.640625 0.671875          128          128.0      132      228       11
0.656250 0.671875          256 

In [31]:
pred_part = pd.read_csv(os.path.join(PATH_TO_DATA, 'pred_part.csv'), header=None)

In [32]:
answ6_1 = round(accuracy_score(y_valid_for_vw, pred_part), 3)
#answ6_1 = accuracy_score(y_valid_for_vw, pred_part)
answ6_1

0.345

In [33]:
write_answer_to_file(answ6_1, 'answer6_1.txt')

Теперь посмотрим на SGD и LogisticRegression

In [34]:
SGD_logit = SGDClassifier(loss='log', n_jobs= -1, max_iter = 3, random_state = 17)
logit = LogisticRegression(random_state = 17, n_jobs = -1)

In [67]:
%t%ime
logit.fit(X_train_part_sparse, y_train_part)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 6.91 µs


  " = {}.".format(effective_n_jobs(self.n_jobs)))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=-1, penalty='l2', random_state=17,
                   solver='warn', tol=0.0001, verbose=0, warm_start=False)

In [69]:
pred_part_logit = logit.predict(X_valid_sparse)

In [70]:
%%time
SGD_logit.fit(X_train_part_sparse, y_train_part)

CPU times: user 1min 6s, sys: 34.3 s, total: 1min 40s
Wall time: 25.8 s




SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=3,
              n_iter_no_change=5, n_jobs=-1, penalty='l2', power_t=0.5,
              random_state=17, shuffle=True, tol=0.001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [71]:
pred_part_sgd_logit = SGD_logit.predict(X_valid_sparse)

In [73]:
answ6_2 = round(accuracy_score(y_valid, pred_part_sgd_logit), 3)
answ6_2

0.29

In [77]:
write_answer_to_file(answ6_2, 'answer6_2.txt')

In [74]:
answ6_3 = round(accuracy_score(y_valid, pred_part_logit), 3)
answ6_3

0.365

In [78]:
write_answer_to_file(answ6_3, 'answer6_3.txt')

## 3. Валидация по тестовой выборке (Public Leaderboard)

In [35]:
!vw $PATH_TO_DATA/train_full.vw  --oaa 400 --passes 3 -c -k -b 26 --random_seed 17 -f $PATH_TO_DATA/model.vw

final_regressor = ./data/compete-catch_me_if_you_can/model.vw
Num weight bits = 26
learning rate = 0.5
initial_t = 0
power_t = 0.5
decay_learning_rate = 1
creating cache_file = ./data/compete-catch_me_if_you_can/train_full.vw.cache
Reading datafile = ./data/compete-catch_me_if_you_can/train_full.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0      262        1       11
1.000000 1.000000            2            2.0       82      262       11
1.000000 1.000000            4            4.0      241      262       11
1.000000 1.000000            8            8.0      352      262       11
1.000000 1.000000           16           16.0      135       16       11
1.000000 1.000000           32           32.0       71      112       11
0.968750 0.937500           64           64.0      358      231       11
0.976562 0.984375          

In [36]:
!vw -d $PATH_TO_DATA/test.vw -i $PATH_TO_DATA/model.vw -t -p $PATH_TO_DATA/pred.csv

only testing
predictions = ./data/compete-catch_me_if_you_can/pred.csv
Num weight bits = 26
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = ./data/compete-catch_me_if_you_can/test.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0        1       90       11
1.000000 1.000000            2            2.0        1       21       11
1.000000 1.000000            4            4.0        1      265       11
1.000000 1.000000            8            8.0        1      137       11
1.000000 1.000000           16           16.0        1      273       11
1.000000 1.000000           32           32.0        1      384       11
1.000000 1.000000           64           64.0        1      139       11
1.000000 1.000000          128          128.0        1       85       11
1.000000 1.000000          256       

Возвращаем метки к первоначальным

In [37]:
vw_pred = pd.read_csv(os.path.join(PATH_TO_DATA, 'pred.csv'), header=None)
vw_pred = vw_pred - 1
vw_pred = class_encoder.inverse_transform(vw_pred.values)
min(vw_pred), max(vw_pred)

  y = column_or_1d(y, warn=True)


(1, 997)

In [38]:
write_to_submission_file(vw_pred, os.path.join(PATH_TO_DATA, 'vw_400_users.csv'))

In [122]:
vw_liderboard = 0.18768

In [123]:
answ6_4 = vw_liderboard

In [124]:
write_answer_to_file(answ6_4, 'answer6_4.txt')

In [31]:
SGD_logit = SGDClassifier(loss='log', n_jobs= -1, random_state = 17)

In [32]:
%%time
SGD_logit.fit(X_train_sparse, y)

Wall time: 29.6 s


SGDClassifier(loss='log', n_jobs=-1, random_state=17)

In [33]:
pred_sgd_400 = SGD_logit.predict(X_test_sparse)
min(pred_sgd_400), max(pred_sgd_400)

(1, 997)

In [34]:
write_to_submission_file(pred_sgd_400, os.path.join(PATH_TO_DATA, 'sgd_400_users.csv'))

In [35]:
answ6_5 = 0.18247

In [36]:
write_answer_to_file(answ6_5, 'answer6_5.txt')

In [37]:
logit = LogisticRegression(random_state = 17, n_jobs = -1)

In [38]:
%%time
logit.fit(X_train_sparse, y)

Wall time: 9min 57s


LogisticRegression(n_jobs=-1, random_state=17)

In [39]:
pred_log_400 = logit.predict(X_test_sparse)
min(pred_log_400), max(pred_log_400)

(1, 997)

In [40]:
write_to_submission_file(pred_log_400, os.path.join(PATH_TO_DATA, 'log_400_users.csv'))

In [41]:
answ6_6 = 0.19448

In [42]:
write_answer_to_file(answ6_6, 'answer6_6.txt')