In [179]:
from __future__ import division

import numpy as np
import pandas as pd
import os
import codecs

from sklearn.metrics import mean_absolute_error
from subprocess import call
from itertools import product
from tqdm import tqdm_notebook

from custom_vw import holdout_cv_vw

# Show up to 200 characters per cell when pandas renders a DataFrame.
pd.options.display.max_colwidth = 200

# Cross Validation

In [185]:
# Vowpal Wabbit input files and the working directory handed to holdout_cv_vw.
train_file_vw = '../../data/data.hw8/habr_train.vw'
test_file_vw = '../../data/data.hw8/habr_test.vw'
to_dir = 'tmp'

# Hold-out validation settings: metric, data length and held-out share.
scoring = mean_absolute_error
train_length = 120000
# True division here relies on `from __future__ import division` under Python 2.
test_fraction = 1 / 5
train_size = int((1 - test_fraction) * train_length)
# NOTE(review): random_state is not passed to anything in the visible cells.
random_state = 42

!head -2 ../../data/data.hw8/tmp/train.vw

0.693147 |title самопроизвольное разлогинивание |xtags логин login |domain habrahabr.ru |author @ptitov |hubs Хабрахабр |num content_len:0.0 month:7 hour:1 |link:0 |user:0 |video:0 |image:0 |weekday Sat |holiday:1 |description меня такое ощущение что logout время времени происходит самопроизвольно несмотря что чекбокс про логине включен возможно это происходит при
1.098612 |title stand-along cообщества против сообществ в рамках социальных сетей |xtags сообщества интернет-сообщество социальные сети нишевой бренд |domain geektimes.ru |author @AlexBruce |hubs Чёрная дыра |num content_len:0.0 month:7 hour:14 |link:1 |user:0 |video:0 |image:0 |weekday Sat |holiday:1 |description вот тут подумал смотря скажем комби зачем надо создавать социальную сеть чтобы потом там формировать сообщества ведь сразу возникает вопрос откуда


```
Features:
    title: exact title. Modification: converted to lowercase
    xtags: exact tags, space-separated
    domain: exact domain
    author: exact author (nickname)
    hubs: exact hubs
    num: content_len (in millions, rounded to the second decimal), month and hour
    
    link: number of links in content
    user: number of links to users in content
    video: number of videos in content
    image: number of images in content
    weekday: string weekday
    holiday: binary, weekend or not
    description: truncated content, same transform
    
#     flow: none for everyone? ignore it <- useless, removed    
#     content: content to lower, keep only 3+ long words <- useless, removed    
#     month: string month <- useless, removed
```

In [183]:
# Re-run this cell every time the .vw file format changes: make_sets=True asks
# holdout_cv_vw to create the validation sets again.
holdout_cv_vw(train_file_vw=train_file_vw,
              train_size=train_size,
              to_dir=to_dir,
              scoring=scoring,
              make_sets=True,
              mute=False)

# Grid Search

In [306]:
# PARAMETERS TO TUNE
# Each entry is (vw flag name, [values to try]). The Cartesian product of all
# value lists is expanded into one command-line string per model. A flag name
# may be listed more than once (e.g. several 'quadratic' rows), and an
# empty-string value drops that flag from the combination.
#
# NOTE(review): vw addresses namespaces by their first character, so 'h'
# matches both `hubs` and `holiday`, and 'd' matches both `domain` and
# `description` -- confirm against the Vowpal Wabbit documentation before
# relying on the per-namespace toggles below.
params = [
    ('passes',        [3]),
    # NOTE(review): a bare number presumably generates n-grams for every
    # namespace; 't2' would restrict bigrams to title -- confirm.
    ('ngram',         [2]),

    ('learning_rate', [0.3]),
    ('power_t',       [0.17]),
    ('bit_precision', [28]),

#     ('ignore',        ['t']), # title
#     ('ignore',        ['x']), # tags
#     ('ignore',        ['d']), # domain (same letter as description)
#     ('ignore',        ['a']), # author
#     ('ignore',        ['h']), # hubs (same letter as holiday)
#     ('ignore',        ['n']), # num

#     ('ignore',        ['l']), # link
#     ('ignore',        ['u']), # user
#     ('ignore',        ['v']), # video
#     ('ignore',        ['i']), # image
#     ('ignore',        ['w']), # weekday (string)
#     ('ignore',        ['h']), # weekend or not (same letter as hubs)
#     ('ignore',        ['d']), # description (same letter as domain)

#    ('quadratic',        ['vi']),
    ('quadratic',     ['nw']),
]

# Free-text description of the current feature set; written next to each score in the log.
note = 'title.lower link_int user_int video_int image_int weekday_str weekend_bin desc'

param_names = [p[0] for p in params]
param_combos = list(product(*[p[1] for p in params]))
# One "--flag value --flag value ..." string per combination, skipping flags
# whose value is the empty string.
param_list = [
    ' '.join('--' + flag + ' ' + str(value)
             for flag, value in zip(param_names, combo)
             if value != '')
    for combo in param_combos
]
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(str(len(param_list)) + ' models to train')

1 models to train


In [307]:
# !vw -i ../../data/data.hw8/tmp/model.vw -t -d ../../data/data.hw8/tmp/train.vw -p ../../data/data.hw8/tmp/train_p.txt

In [308]:
# with open('../../data/data.hw8/tmp/train_y.txt') as f:
#     train_y = [float(label) for label in f.readlines()]

# with open('../../data/data.hw8/tmp/train_y.txt') as f:
#     train_p = [float(label) for label in f.readlines()]
# len(train_p), len(train_y)

In [309]:
# !vw -c -k -d ../../data/data.hw8/tmp/train.vw -f ../../data/data.hw8/tmp/model.vw --passes 3 --ngram 2 --learning_rate 0.3 --power_t 0.4 --bit_precision 29 --ignore  --ignore m --ignore  --l1 0 --l2 0

In [310]:
# Evaluate every parameter string with holdout_cv_vw (reusing the existing
# validation sets) and append the scores to the log file.
for i, train_params in enumerate(tqdm_notebook(param_list, total=len(param_list))):
    train_score, valid_score = holdout_cv_vw(train_file_vw=train_file_vw,
                                             train_size=train_size,
                                             train_params=train_params,
                                             to_dir=to_dir,
                                             scoring=scoring,
                                             make_sets=False,
                                             mute=True)
    # One line per model: train score, valid score, parameters, note.
    log = ', '.join(["{0:.4f}".format(train_score),
                     "{0:.4f}".format(valid_score),
                     train_params,
                     note]) + '\n'
    # Append mode keeps the results of earlier runs in the file.
    with open('../../data/data.hw8/log.txt', 'a') as f:
        f.write(log)




In [311]:
# Load every logged run. The log lines are written with ', ' between fields,
# so skipinitialspace=True keeps the blank after each comma out of the
# 'params' and 'note' strings.
log = pd.read_csv('../../data/data.hw8/log.txt',
                  names=['train score', 'valid score', 'params', 'note'],
                  index_col=False,
                  skipinitialspace=True)
# Ascending sort: lowest validation score first, ties ordered by train score.
log = log.sort_values(by=['valid score', 'train score'])  # .reset_index(drop=True)
log.head(10)

Unnamed: 0,train score,valid score,params,note
1,0.6775,0.8431,--passes 3 --ngram 2 --learning_rate 0.3 --power_t 0.17 --bit_precision 28 --quadratic nw,title.lower link_int user_int video_int image_int weekday_str weekend_bin desc
4,0.6775,0.8431,--passes 3 --ngram 2 --learning_rate 0.3 --power_t 0.17 --bit_precision 28 --quadratic vi --quadratic nw,title.lower link_int user_int video_int image_int weekday_str weekend_bin desc
3,0.6839,0.8438,--passes 3 --ngram 2 --learning_rate 0.3 --power_t 0.17 --bit_precision 28 --quadratic wh --quadratic nw,title.lower link_int user_int video_int image_int weekday_str weekend_bin desc
2,0.7419,0.848,--passes 3 --ngram 2 --learning_rate 0.3 --power_t 0.17 --bit_precision 28 --quadratic nh --quadratic nw,title.lower link_int user_int video_int image_int weekday_str weekend_bin desc
0,0.6534,0.8497,--passes 3 --ngram 2 --learning_rate 0.3 --power_t 0.17 --bit_precision 28 --quadratic nn,title.lower link_int user_int video_int image_int weekday_str weekend_bin desc


In [312]:
# Retrain vw on the full training file with the top row of the sorted log,
# predict the test file and save a submission named after the validation score.
best_params = log.iloc[0]['params']
valid_score = log.iloc[0]['valid score']

command = 'vw -c -k -d ../../data/data.hw8/habr_train.vw -f ../../data/data.hw8/model.vw ' + best_params
print(command)
# Stop on a non-zero exit code; otherwise a failed run would silently reuse a
# model or prediction file left over from an earlier execution.
if call(command.split()) != 0:
    raise RuntimeError('vw training failed: ' + command)

command = 'vw -i ../../data/data.hw8/model.vw -t -d ../../data/data.hw8/habr_test.vw -p ' + \
              '../../data/data.hw8/tmp.txt'
print(command)
if call(command.split()) != 0:
    raise RuntimeError('vw prediction failed: ' + command)

# The prediction file is read as one float per line.
with open('../../data/data.hw8/tmp.txt') as f:
    pred = [float(label) for label in f]

# Assigning a list of the wrong length raises, which guards against a
# prediction file that does not match the sample submission.
sub = pd.read_csv('../../data/data.hw8/sample_submission.csv', index_col='url')
sub['target'] = pred

name = "{0:.4f}".format(valid_score) + ' ' + best_params + '.csv'
sub.to_csv('../../data/data.hw8/' + name)

vw -c -k -d ../../data/data.hw8/habr_train.vw -f ../../data/data.hw8/model.vw  --passes 3 --ngram 2 --learning_rate 0.3 --power_t 0.17 --bit_precision 28 --quadratic nw
vw -i ../../data/data.hw8/model.vw -t -d ../../data/data.hw8/habr_test.vw -p ../../data/data.hw8/tmp.txt


Выбор того, как валидировать модель, остается за Вами. Проще всего, конечно, сделать отложенную выборку. Бенчмарк, который Вы видите в соревновании (**vw_baseline.csv**) и который надо побить, получен с Vowpal Wabbit, 3 проходами по выборке (не забываем удалять кэш), биграммами и настроенными гиперпараметрами `bits`, `learning_rate` и `power_t`. 