In [356]:
from __future__ import division

import numpy as np
import pandas as pd
import os
import codecs

from sklearn.metrics import mean_absolute_error
from subprocess import call
from itertools import product
from tqdm import tqdm_notebook

from custom_vw import holdout_cv_vw

# raise the column display width to 200 characters so long text cells
# (e.g. the vw option strings in the log table) are shown in full
pd.set_option('display.max_colwidth', 200)

# Cross Validation

In [357]:
# Input files in Vowpal Wabbit format, and the working directory name that is
# handed to holdout_cv_vw.
train_file_vw = '../../data/data.hw8/habr_train.vw'
test_file_vw = '../../data/data.hw8/habr_test.vw'
to_dir = 'tmp'

# Hold-out split sizes. `1/5` is a true division here because of the
# `from __future__ import division` import at the top of the notebook.
# NOTE(review): train_length is presumably the number of examples in
# habr_train.vw -- confirm against the file.
train_length = 120000
test_fraction = 1/5
train_size = int(train_length * (1-test_fraction))

# metric applied to the predictions; seed kept with the other settings
# NOTE(review): random_state is not passed to holdout_cv_vw anywhere in this notebook
scoring = mean_absolute_error
random_state = 42

# Re-create the validation sets; rerun this cell whenever the vw file format
# changes. The call's return value is the cell output.
holdout_cv_vw(
    train_file_vw = train_file_vw,
    train_size = train_size,
    to_dir = to_dir,
    scoring = scoring,
    make_sets = True,
    mute = False,
)

vw -c -k -d ../../data/data.hw8/tmp/train.vw -f ../../data/data.hw8/tmp/model.vw 
vw -i ../../data/data.hw8/tmp/model.vw -t -d ../../data/data.hw8/tmp/train.vw -p ../../data/data.hw8/tmp/train_p.txt
vw -i ../../data/data.hw8/tmp/model.vw -t -d ../../data/data.hw8/tmp/valid.vw -p ../../data/data.hw8/tmp/valid_p.txt


(0.9313918272291668, 0.89263090245833332)

In [358]:
# print the first two lines of the validation file to eyeball the vw example format
!head -2 ../../data/data.hw8/tmp/valid.vw

1.386294 |title прямая текстовая трансляция с macworld 2009 |xtags apple macworld 2009 прямая трансляция |domain geektimes.ru |author @alfsoft |hubs IT-компании |num content_len:0.02 month:1 hour:18 |mynum link:5 user:0 image:78 video:0 day:1 weekday:1 weekend:0 year:2009 |weekday Tue |description здравствуйте друзья это стало хабрахабре уже традицией сегодня этом топике проводится текстовая трансляция macworld conference expo 2009
0.693147 |title и снова о подставках для ноутбука |xtags подставка кулер Akasa LIBRA |domain geektimes.ru |author @Tylerskald |hubs Железо |num content_len:0.0 month:1 hour:19 |mynum link:6 user:0 image:2 video:0 day:1 weekday:1 weekend:0 year:2009 |weekday Tue |description компания akasa одной специализаций которой является выпуск разного рода внешних кулеров примочек для ноутбуков анонсировала выпуск новых ноутбучных


```
Features:
    title: exact title. Modification: to lower()
    xtags: exact tags, space-separated
    domain: exact domain
    author: exact author (nickname)
    hubs: exact hubs
    num: content_len, in millions, rounded to third decimal,
         month
         hour
    mynum: link: number of links in content
           user: number of links to users in content
           image: number of images in content
           video: number of videos in content
           day: day of month, int
           weekday: weekday, int
           weekend: binary, weekend or not
    weekday: weekday, str   
    description: truncated content, same transform
    
#     flow: none for everyone? ignore it <- useless, removed    
#     content: content to lower, keep only 3+ long words <- useless, removed    
#     month: string month <- useless, removed
```

# Grid Search

In [360]:
# PARAMETERS TO TUNE
# Each entry is (vw option name, list of candidate values). The grid is the
# cartesian product of all candidate lists, so every extra value multiplies
# the number of models to train. The same option name may appear more than once.
#
# NOTE(review): vw's --ignore selects namespaces by their first character, so
# e.g. 'd' would drop both `domain` and `description` in the current file
# format, and the l/u/i/v entries match no namespace in the sample shown
# above -- confirm before re-enabling any of the commented-out lines.
params = [
    ('passes',        [3]),
    ('ngram',         [2]), # title

    ('learning_rate', [0.3]),
    ('power_t',       [0.17]),
    ('bit_precision', [28]),

#     ('ignore',        ['t']), # title
#     ('ignore',        ['x']), # tags
#     ('ignore',        ['d']), # domain
#     ('ignore',        ['a']), # author
#     ('ignore',        ['h']), # hubs
#     ('ignore',        ['m']), # mynum

#     ('ignore',        ['l']), # link
#     ('ignore',        ['u']), # user
#     ('ignore',        ['i']), # image
#     ('ignore',        ['v']), # video

#     ('ignore',        ['w']), # weekday (string)
#     ('ignore',        ['h']), # weekend or not
#     ('ignore',        ['d']), # description

#    ('quadratic',        ['vi']),
#    ('quadratic',        ['nw']),

#     ('l1',             ['0.000001']),
#     ('l2',             ['0.000001']),
]

# free-text description of the current feature set; stored with every log line
note = 'title.lower link_int user_int video_int image_int weekday_str weekend_bin desc'

param_names = [p[0] for p in params]
param_combos = list(product(*[p[1] for p in params]))

# Turn every combination into one vw option string, e.g. "--passes 3 --ngram 2".
# An empty-string value means "leave this option out" for that combination.
param_list = []
for param_combo in param_combos:
    flags = ['--' + name + ' ' + str(value)
             for name, value in zip(param_names, param_combo)
             if value != '']
    param_list.append(' '.join(flags))

# parenthesised single-argument form prints the same text on Python 2 and 3
print(str(len(param_list)) + ' models to train')

1 models to train


In [361]:
# !vw -i ../../data/data.hw8/tmp/model.vw -t -d ../../data/data.hw8/tmp/train.vw -p ../../data/data.hw8/tmp/train_p.txt

In [362]:
# Disabled sanity check: load the training labels and the training predictions
# and compare how many of each there are.
# NOTE(review): the second block below reads train_y.txt again for train_p; it
# presumably should read train_p.txt (the -p output of the vw command in this
# notebook) -- confirm before re-enabling.
# with open('../../data/data.hw8/tmp/train_y.txt') as f:
#     train_y = [float(label) for label in f.readlines()]

# with open('../../data/data.hw8/tmp/train_y.txt') as f:
#     train_p = [float(label) for label in f.readlines()]
# len(train_p), len(train_y)

In [363]:
# !vw -c -k -d ../../data/data.hw8/tmp/train.vw -f ../../data/data.hw8/tmp/model.vw --passes 3 --ngram 2 --learning_rate 0.3 --power_t 0.4 --bit_precision 29 --ignore  --ignore m --ignore  --l1 0 --l2 0

In [364]:
# Score one model per option string in param_list and append one
# comma-separated line per run to the shared log file.
# make_sets = False: presumably reuses the split files created by the earlier
# make_sets = True call instead of rebuilding them -- confirm in custom_vw.
for train_params in tqdm_notebook(param_list, total = len(param_list)):

    train_score, valid_score = holdout_cv_vw(
        train_file_vw = train_file_vw,
        train_size = train_size,
        train_params = train_params,
        to_dir = to_dir,
        scoring = scoring,
        make_sets = False,
        mute = True)

    # record layout: train score, valid score, vw options, feature-set note
    log_line = '{0:.4f}, {1:.4f}, {2}, {3}\n'.format(
        train_score, valid_score, train_params, note)

    # append inside the loop so finished runs are kept if the search is interrupted
    with open('../../data/data.hw8/log.txt', 'a') as f:
        f.write(log_line)




In [366]:
# Load every logged run and rank them: smallest validation score first, ties
# broken by train score. The log lines are written without a header row, so the
# column names are supplied here. The index is not reset, so it still shows each
# run's position in the log file.
log_columns = ['train score', 'valid score', 'params', 'note']
log = (
    pd.read_csv('../../data/data.hw8/log.txt', names = log_columns, index_col=False)
      .sort_values(by = ['valid score', 'train score'])
)
log.head(10)

Unnamed: 0,train score,valid score,params,note
17,0.6515,0.8264,--passes 3 --ngram 2 --learning_rate 0.3 --power_t 0.17 --bit_precision 28,title.lower link_int user_int video_int image_int weekday_str weekend_bin desc
0,0.6915,0.8431,--passes 3 --ngram 2 --learning_rate 0.3 --power_t 0.17 --bit_precision 28,title.lower link_int user_int video_int image_int weekday_str weekend_bin desc
16,0.9887,0.8486,--passes 3 --ngram 2 --learning_rate 0.3 --power_t 0.17 --bit_precision 28 --l1 0.000001 --l2 0.000001,title.lower link_int user_int video_int image_int weekday_str weekend_bin desc
3,0.799,0.8495,--passes 3 --ngram 2 --learning_rate 0.4 --power_t 0.2 --bit_precision 28,title.lower link_int user_int video_int image_int weekday_str weekend_bin desc
1,0.8266,0.8497,--passes 3 --ngram 2 --learning_rate 0.3 --power_t 0.17 --bit_precision 28,title.lower link_int user_int video_int image_int weekday_str weekend_bin desc
7,0.7818,0.8524,--passes 3 --ngram 2 --learning_rate 0.8 --power_t 0.2 --bit_precision 28,title.lower link_int user_int video_int image_int weekday_str weekend_bin desc
11,0.7786,0.8542,--passes 3 --ngram 2 --learning_rate 1.2 --power_t 0.2 --bit_precision 28,title.lower link_int user_int video_int image_int weekday_str weekend_bin desc
4,0.6641,0.8652,--passes 3 --ngram 2 --learning_rate 0.4 --power_t 0.4 --bit_precision 28,title.lower link_int user_int video_int image_int weekday_str weekend_bin desc
2,0.8588,0.8667,--passes 3 --ngram 2 --learning_rate 0.3 --power_t 0.17 --bit_precision 28 --ignore m,title.lower link_int user_int video_int image_int weekday_str weekend_bin desc
8,0.6453,0.8698,--passes 3 --ngram 2 --learning_rate 0.8 --power_t 0.4 --bit_precision 28,title.lower link_int user_int video_int image_int weekday_str weekend_bin desc


In [367]:
def _run_vw(command):
    """Echo a vw command line, run it, and raise if it exits with a non-zero status.

    Without the status check a failed vw run would go unnoticed and the cell
    would carry on with a stale or missing output file.
    """
    print(command)
    status = call(command.split())
    if status != 0:
        raise RuntimeError('vw exited with status {0}: {1}'.format(status, command))


# `log` is sorted by validation score, so its first row is the best run so far
best_params = log.iloc[0]['params']
valid_score = log.iloc[0]['valid score']

# refit on the full training file with the best options (-k discards the old cache)
_run_vw('vw -c -k -d ../../data/data.hw8/habr_train.vw -f ../../data/data.hw8/model.vw ' + best_params)

# predict the test file with the freshly saved model
_run_vw('vw -i ../../data/data.hw8/model.vw -t -d ../../data/data.hw8/habr_test.vw -p ' +
        '../../data/data.hw8/tmp.txt')

# the prediction file is parsed as one float per line
with open('../../data/data.hw8/tmp.txt') as f:
    pred = [float(line) for line in f]

# fill the sample submission with the predictions; pandas raises if the
# number of predictions does not match the number of rows
sub = pd.read_csv('../../data/data.hw8/sample_submission.csv', index_col='url')
sub['target'] = pred

# name the submission after its validation score and vw options
name = "{0:.4f}".format(valid_score) + ' ' + best_params + '.csv'
sub.to_csv('../../data/data.hw8/' + name)

vw -c -k -d ../../data/data.hw8/habr_train.vw -f ../../data/data.hw8/model.vw  --passes 3 --ngram 2 --learning_rate 0.3 --power_t 0.17 --bit_precision 28
vw -i ../../data/data.hw8/model.vw -t -d ../../data/data.hw8/habr_test.vw -p ../../data/data.hw8/tmp.txt


Выбор того, как валидировать модель, остается за Вами. Проще всего, конечно, сделать отложенную выборку. Бенчмарк, который Вы видите в соревновании (**vw_baseline.csv**) и который надо побить, получен с Vowpal Wabbit, 3 проходами по выборке (не забываем удалять кэш), биграммами и настроенными гиперпараметрами `bits`, `learning_rate` и `power_t`. 