In [911]:
%config IPCompleter.greedy=True
import string
import re
import json
from collections import defaultdict, Counter
import numpy as np
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import make_scorer
from pymystem3 import Mystem
from catboost import CatBoostRegressor
from nltk.stem.snowball import SnowballStemmer
from tqdm import tqdm_notebook as tqdm

In [977]:
def json_read(filename):
    with open(filename, 'r') as inf:
        res = json.load(inf)
    return res

def json_dump(obj, filename, ea=False, indent=4):
    with open(filename, 'w') as ouf:
        json.dump(obj, ouf, ensure_ascii=ea, indent=indent)

#### Read train data

In [39]:
with open('texts_train.txt', 'r') as inf:
    train_data = list(inf)

In [160]:
with open('scores_train.txt', 'r') as inf:
    train_scores = []
    for line in inf:
        train_scores.append(int(line.strip()))

#### Tokenizers

In [7]:
word_tokenizer = CountVectorizer(lowercase=False, token_pattern=r'\w+').build_analyzer()

In [99]:
def punct_preprocessor(s):
    s = s.replace('=', ':').replace(';', ':').replace('-', '')
    s = s.replace(':)', '))')\
         .replace(':(', '((')\
         .replace('):', '((')\
         .replace('(:', '))')\
         .replace(':', '')
    syms = ['(', ')', '!', '?']
    for i in range(len(syms)):
        for j in range(len(syms)):
            if i != j:
                s = s.replace(f'{syms[i]}{syms[j]}', f'{syms[i]} {syms[j]}')
    s = re.sub(r'\){3,}', r')))', s)
    s = re.sub(r'!{3,}', r'!!!', s)
    s = re.sub(r'\?{3,}', r'???', s)
    s = re.sub(r'\({3,}', r'(((', s)
    s = re.sub(r'-{2,}', r'-', s)
    return s

punct_tokenizer = CountVectorizer(preprocessor=punct_preprocessor, token_pattern=r'[!()\-:;=?]{2,}').build_analyzer()

#### Preprocessor

In [726]:
mystem = Mystem()
snowball_stemmer = SnowballStemmer('russian')

def text_preprocessor(text):
    word_tokens = word_tokenizer(text)
    punct_tokens = punct_tokenizer(text)
    word_tokens_lemmas = mystem.lemmatize(' '.join(word_tokens))[:-1]
    word_tokens_stems = [snowball_stemmer.stem(word) for word in word_tokens_lemmas]
    return ' '.join([''.join(word_tokens_stems), ' '.join(punct_tokens)]).lower()

In [729]:
preprocessed_train_data = [
    text_preprocessor(text) for text in tqdm(train_data)
]

HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))




In [727]:
text_preprocessor('Классно)))))))))) С этого начался мой Фрай))))')

'классн с эт начина мо фра ))) )))'

In [723]:
[stem.stem(word) for word in preprocessed_train_data[3729].split()]

['литературн',
 'группов',
 'портрет',
 'черт',
 'центральн',
 'фигур',
 'особен',
 'отчетлив',
 'показыва',
 'в',
 'отношен',
 'с',
 'человек',
 'выписыва',
 'вокруг',
 'он',
 'качествен',
 'детальн',
 'прорисовыва',
 'картин',
 'стар',
 'школ']

In [730]:
preprocessed_train_data[3729]

'литературн группов портрет черт центральн фигур особен отчетлив показыва в отношен с человек выписыва вокруг он качествен детальн прорисовыва картин стар школ '

#### Cross validation

In [221]:
def sent_score(y_true, y_pred):
    y_true = np.array(y_true)
    y_prec = np.array(get_scores(y_pred))
    return np.sqrt(np.mean(np.square(y_true - y_pred)))

scorer = make_scorer(sent_score, greater_is_better=False)

def cross_validate_model(model, X, y):
    return np.mean(np.abs(cross_val_score(model, X, y, cv=5, scoring=scorer)))

#### Read test data

In [148]:
def read_test_data():
    data = []
    with open('dataset_40757_1.txt', 'r') as inf:
        for line in inf:
            data.append(line.strip())
    return data

#### Convert output to scores

In [143]:
def get_scores(result):
    scores = []
    for r in np.rint(result):
        cur = int(r)
        if cur < 1:
            cur = 1
        if cur > 10:
            cur = 10
        scores.append(cur)
    return scores

#### Dump scores

In [146]:
def dump_scores(scores):
    with open('output.txt', 'w') as ouf:
        for score in scores:
            print(score, file=ouf)

### Simple tf-idf vectorizer + linear regression

In [324]:
vectorizer = TfidfVectorizer(token_pattern=r'\S+', max_features=1000)

In [325]:
tfidf_matrix_train = vectorizer.fit_transform(preprocessed_train_data)
tfidf_matrix_train.shape

(20000, 1000)

In [326]:
regressor = LinearRegression()

In [320]:
cross_validate_model(regressor, tfidf_matrix_train, train_scores)

2.041714823660871

In [327]:
regressor.fit(tfidf_matrix_train, train_scores)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [328]:
test_data = read_test_data()
test_data = [text_preprocessor(text) for text in test_data]
tfidf_matrix_test = vectorizer.transform(test_data)
result = regressor.predict(tfidf_matrix_test)

In [329]:
scores = get_scores(result)
dump_scores(scores)

### Simple tf-idf vectorizer + SGD regressor

In [789]:
vectorizer = TfidfVectorizer(token_pattern=r'\S{2,}', ngram_range=(1, 1), min_df=10)

In [790]:
tfidf_matrix_train = vectorizer.fit_transform(preprocessed_train_data)
tfidf_matrix_train.shape

(20000, 6432)

In [791]:
regressor = SGDRegressor(penalty='l1', alpha=0.0001, max_iter=10000)

In [343]:
cross_validate_model(regressor, tfidf_matrix_train, train_scores)

1.9641034047945822

In [792]:
regressor.fit(tfidf_matrix_train, train_scores)

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='squared_loss', max_iter=10000,
             n_iter_no_change=5, penalty='l1', power_t=0.25, random_state=None,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
             warm_start=False)

In [793]:
word_by_id = {v: k for k, v in vectorizer.vocabulary_.items()}

In [794]:
new_vocabulary = []
for i, coef in enumerate(regressor.coef_):
    if abs(coef) > 0.01:
        print(word_by_id[i], coef)
        new_vocabulary.append(word_by_id[i])
len(new_vocabulary)

!! 1.5237939704452588
!!! 2.104789907748668
(( -0.37219548559118953
((( -0.46336879763531785
)) 2.712122218197689
))) 2.274251053150691
10 2.4484865604693464
100 0.5931945847299954
14 0.15196783921443624
15 -0.2646040726113992
1996 0.04140864030496109
1997 0.013571579247583345
20 0.22713723904013822
2001 0.046687469989382537
2005 -0.019445378559934064
2007 -0.13580231291672043
22 0.10054571682447103
25 0.013543017713181527
250 -0.028634924975166925
500 0.15658656408589078
70 0.07647167401858436
80 0.027818303029534332
90 -0.09417729184978073
99 0.01434169792774093
??? -0.3803663325700031
afisha -0.14977615224040092
articl -0.010093936187328525
article -0.04912735386499395
aspx -0.07870806335749246
com 0.10755143186731543
doc -0.052937364800440034
docsid -0.07870806335749246
dvd 0.021106713571860013
echo -0.030206126427122868
end 0.07519990916491531
film -0.9728391046743017
happy 0.32750411733919416
html -0.1622568784035577
http -1.0727021502851257
imdb 0.17453209690069324
imho 0.399417

земн 0.17680180377954463
зерка 0.06763589492169074
зим 0.02714148259899526
злоб 0.1339633983062935
злободневн 0.21486084851221535
злод -0.07992293169617944
знак 0.03389546615591855
знаков 0.22088303433756806
знакомств 0.2452781210272972
знаменит 0.2362215959980136
знан 0.09629622609594811
знат 0.8055498198735795
знаток 0.04307987070863777
знач 0.029517268059498572
золот 0.1714132687930371
золушк 0.23524490637839335
зомб -0.3010331610046831
зон 0.021289186627456217
зрелищ -0.33998298340790845
зрелищн 0.11532579952263973
зрен 0.28431034574565917
зрител -1.1018188709280372
зуб -0.07497148777679749
зюскинд -0.026433994280720755
ива -0.10848896824802866
иван 0.010791153338498912
игор -0.26229672357234324
игр 0.5505910904515072
игра -0.21165824949853287
иде -0.12902864229670402
идеа 0.27613203656475915
идеальн 1.4359977307180505
идиотизм -0.40487283738921104
идиотск -0.38076646214231424
идт 0.6484199305985816
из 0.22998856218700925
избавля -0.07654073944194476
избива -0.04859491312473742
изв

непрост 0.35389926811463646
неравнодушн 0.011133510628442887
нервн 0.18393526906225882
нереальн 0.3709192671122057
неровн -0.05193269276979786
несимпатичн -0.1306180099144533
нескольк 0.2845371301963341
несмешн -0.4134341744544048
несмотр 0.48145316417357426
несовместим 0.017067808458826945
несомнен 0.3878012892467878
неспешн 0.22813750763890225
нест -0.15668377353251872
нестандартн 1.028716634459507
несуразн -0.10998027092397217
несусветн -0.1987785528659376
нет -0.3433959940012017
нетерпен 0.3953055437343597
нетороплив -0.061165114206941214
нетривиальн 0.09915762838132931
неубедительн -0.4893638645996827
неудачн -0.24816071873579587
неудачник -0.24513684674374725
неуловим 0.03806070442845606
нехил -0.0718246247982332
нехорош -0.07750102455816935
неч -1.1700865119848896
нечеловеческ 0.20686013607338813
нечист 0.0507928776455378
нечт 0.3058060962990851
неясн -0.05551646019356271
ни -2.1026960365215843
нибуд -0.030791103858594723
нигд 0.28877420879036486
ниж -0.1965214952892679
ник 0.03

прокат -0.34791041239927656
прокатчик -0.018197519791419155
пром -0.07403253888401978
пронзительн 1.6318974456763764
пронизыва 0.3929290195889218
проника 0.43886232307246803
проникновен 0.2872316308508266
пропаганд -0.03574321185760916
пропива 0.04544853999832919
пропитыва 0.22538154548170894
пропуска 0.09484270529058764
прорисовыва 0.01903761137372163
пророк 0.11304808269071527
прос -0.14322949614464683
прослез 0.06678652463722365
прослушива 0.22466714376467453
просматрива 0.44698306493457485
просмотр 0.47406577210285966
прост 1.0950960543678112
проститутк -0.2285894842564172
простот 0.2083052069982648
простра -0.03132730978212167
пространств 0.2509496877689533
противн -1.26829777277005
противник 0.05310752204141957
противоположн -0.019252189216754284
протяжен -0.21545189644912863
професс 0.0678100537691097
профессиона -0.019458834204589142
профессиональн -0.07694557555641499
проход 0.07968296081413416
проч -0.15170095022138955
прочитыва 1.5179250532218893
прочтен 0.808887979972439
пр

трюк 0.02928493014268429
туалет -0.14651562755974498
туп -2.72712348843872
тупост -0.22725636564678256
тщательн 0.2279433417218104
ты 1.6373217634526136
тьма -0.08598808660139196
тьфу -0.04722491533367503
тяг 0.08430449603883872
тягомотин -0.24692532680805976
тягомотн -0.06587754978095978
тягостн -0.018032265315280096
тяжел 0.24935045200032258
тянут -0.18501663480965766
уайльд 0.31322682495056337
убедительн 0.5247915571609769
убежда -0.022142767574686296
убежден 0.04828277647486212
убива -0.6825545771117643
убийств 0.06108752439049566
убийствен 0.07304632963225423
убийц -0.5651488393763561
убира 0.2284090181780747
убог -0.5567813212224114
убожеств -0.6701759585363638
убойн 0.30473547910861376
ув -0.30187496953986026
уважа -0.09139291778999929
уважен -0.0808471478838079
уверен 0.11594510643468116
увидет 0.3708346853218698
увлека 0.4883909068078277
увлекательн 1.6566378483111057
увод -0.04043264886786576
увольня -0.1696627306742959
угожда -0.023894071019392376
угол 0.20531832732957797
уд

3024

In [1042]:
sign_vocabulary = json_read('sign_vocab.txt')
l1_vocabulary = json_read('strong_vocab.txt')

In [1043]:
vectorizer = TfidfVectorizer(token_pattern=r'\S{2,}', ngram_range=(1, 2), vocabulary=strong_vocabulary)
tfidf_matrix_train = vectorizer.fit_transform(preprocessed_train_data)
tfidf_matrix_train.shape

(20000, 3392)

In [1020]:
svm_regressor = SVR(kernel='linear')
svm_regressor.fit(tfidf_matrix_train, train_scores)



SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='poly', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [1016]:
for i, coef in enumerate(svm_regressor.coef_.toarray().flatten()):
    print(sign_vocabulary[i], coef)

AttributeError: coef_ is only available when using a linear kernel

In [834]:
train_predict = svm_regressor.predict(tfidf_matrix_train)

In [839]:
train_predict = train_predict.reshape(-1, 1)

In [1024]:
tfidf_matrix_train_train = tfidf_matrix_train[:16000, :]
# tfidf_matrix_train_train = hstack([tfidf_matrix_train_train, train_predict[:16000, :]])
tfidf_matrix_train_validate = tfidf_matrix_train[16000:, :]
# tfidf_matrix_train_validate = hstack([tfidf_matrix_train_validate, train_predict[16000:, :]])
train_scores_train = train_scores[:16000]
train_scores_validate = train_scores[16000:]

In [1054]:
cb_regressor = CatBoostRegressor(iterations=2000, learning_rate=0.02, depth=6)
# eval_set = [(tfidf_matrix_train_validate, train_scores_validate)]
cb_regressor.fit(tfidf_matrix_train, train_scores)

0:	learn: 2.2449959	total: 470ms	remaining: 15m 40s
1:	learn: 2.2407578	total: 685ms	remaining: 11m 24s
2:	learn: 2.2372582	total: 889ms	remaining: 9m 51s
3:	learn: 2.2337891	total: 1.11s	remaining: 9m 13s
4:	learn: 2.2297221	total: 1.33s	remaining: 8m 52s
5:	learn: 2.2259994	total: 1.56s	remaining: 8m 38s
6:	learn: 2.2225412	total: 1.78s	remaining: 8m 28s
7:	learn: 2.2191529	total: 1.99s	remaining: 8m 15s
8:	learn: 2.2156974	total: 2.19s	remaining: 8m 4s
9:	learn: 2.2125052	total: 2.38s	remaining: 7m 54s
10:	learn: 2.2095013	total: 2.57s	remaining: 7m 45s
11:	learn: 2.2063756	total: 2.77s	remaining: 7m 39s
12:	learn: 2.2032142	total: 2.97s	remaining: 7m 34s
13:	learn: 2.2002463	total: 3.17s	remaining: 7m 30s
14:	learn: 2.1974879	total: 3.37s	remaining: 7m 26s
15:	learn: 2.1944775	total: 3.56s	remaining: 7m 21s
16:	learn: 2.1920633	total: 3.76s	remaining: 7m 19s
17:	learn: 2.1893048	total: 3.95s	remaining: 7m 15s
18:	learn: 2.1866912	total: 4.15s	remaining: 7m 12s
19:	learn: 2.1842265	

158:	learn: 2.0059752	total: 33.8s	remaining: 6m 31s
159:	learn: 2.0053055	total: 34s	remaining: 6m 31s
160:	learn: 2.0044734	total: 34.2s	remaining: 6m 30s
161:	learn: 2.0036584	total: 34.4s	remaining: 6m 30s
162:	learn: 2.0029686	total: 34.6s	remaining: 6m 29s
163:	learn: 2.0023130	total: 34.8s	remaining: 6m 29s
164:	learn: 2.0015642	total: 35s	remaining: 6m 29s
165:	learn: 2.0008376	total: 35.2s	remaining: 6m 28s
166:	learn: 2.0002140	total: 35.4s	remaining: 6m 28s
167:	learn: 1.9995118	total: 35.6s	remaining: 6m 28s
168:	learn: 1.9988128	total: 35.8s	remaining: 6m 27s
169:	learn: 1.9980830	total: 36s	remaining: 6m 27s
170:	learn: 1.9974472	total: 36.2s	remaining: 6m 27s
171:	learn: 1.9966741	total: 36.4s	remaining: 6m 27s
172:	learn: 1.9959122	total: 36.6s	remaining: 6m 26s
173:	learn: 1.9951792	total: 36.8s	remaining: 6m 26s
174:	learn: 1.9946465	total: 37s	remaining: 6m 26s
175:	learn: 1.9938956	total: 37.2s	remaining: 6m 25s
176:	learn: 1.9932963	total: 37.4s	remaining: 6m 25s
1

315:	learn: 1.9214200	total: 1m 4s	remaining: 5m 44s
316:	learn: 1.9209799	total: 1m 4s	remaining: 5m 44s
317:	learn: 1.9205586	total: 1m 5s	remaining: 5m 43s
318:	learn: 1.9201849	total: 1m 5s	remaining: 5m 43s
319:	learn: 1.9197681	total: 1m 5s	remaining: 5m 43s
320:	learn: 1.9193465	total: 1m 5s	remaining: 5m 43s
321:	learn: 1.9188723	total: 1m 5s	remaining: 5m 42s
322:	learn: 1.9183548	total: 1m 6s	remaining: 5m 42s
323:	learn: 1.9179221	total: 1m 6s	remaining: 5m 42s
324:	learn: 1.9175469	total: 1m 6s	remaining: 5m 42s
325:	learn: 1.9170840	total: 1m 6s	remaining: 5m 41s
326:	learn: 1.9167617	total: 1m 6s	remaining: 5m 41s
327:	learn: 1.9163501	total: 1m 6s	remaining: 5m 41s
328:	learn: 1.9159246	total: 1m 7s	remaining: 5m 41s
329:	learn: 1.9155284	total: 1m 7s	remaining: 5m 41s
330:	learn: 1.9151231	total: 1m 7s	remaining: 5m 41s
331:	learn: 1.9147007	total: 1m 7s	remaining: 5m 40s
332:	learn: 1.9142994	total: 1m 8s	remaining: 5m 40s
333:	learn: 1.9139102	total: 1m 8s	remaining: 

469:	learn: 1.8650860	total: 1m 36s	remaining: 5m 12s
470:	learn: 1.8648081	total: 1m 36s	remaining: 5m 12s
471:	learn: 1.8644476	total: 1m 36s	remaining: 5m 12s
472:	learn: 1.8640819	total: 1m 36s	remaining: 5m 12s
473:	learn: 1.8637153	total: 1m 36s	remaining: 5m 11s
474:	learn: 1.8634112	total: 1m 37s	remaining: 5m 11s
475:	learn: 1.8630705	total: 1m 37s	remaining: 5m 11s
476:	learn: 1.8627222	total: 1m 37s	remaining: 5m 11s
477:	learn: 1.8623809	total: 1m 37s	remaining: 5m 10s
478:	learn: 1.8620598	total: 1m 37s	remaining: 5m 10s
479:	learn: 1.8616941	total: 1m 38s	remaining: 5m 10s
480:	learn: 1.8613885	total: 1m 38s	remaining: 5m 10s
481:	learn: 1.8610438	total: 1m 38s	remaining: 5m 9s
482:	learn: 1.8607268	total: 1m 38s	remaining: 5m 9s
483:	learn: 1.8603839	total: 1m 38s	remaining: 5m 9s
484:	learn: 1.8600352	total: 1m 38s	remaining: 5m 9s
485:	learn: 1.8597510	total: 1m 39s	remaining: 5m 8s
486:	learn: 1.8594407	total: 1m 39s	remaining: 5m 8s
487:	learn: 1.8591191	total: 1m 39

623:	learn: 1.8160207	total: 2m 6s	remaining: 4m 38s
624:	learn: 1.8157091	total: 2m 6s	remaining: 4m 38s
625:	learn: 1.8154506	total: 2m 6s	remaining: 4m 37s
626:	learn: 1.8151831	total: 2m 6s	remaining: 4m 37s
627:	learn: 1.8148658	total: 2m 6s	remaining: 4m 37s
628:	learn: 1.8145943	total: 2m 7s	remaining: 4m 37s
629:	learn: 1.8142940	total: 2m 7s	remaining: 4m 36s
630:	learn: 1.8139456	total: 2m 7s	remaining: 4m 36s
631:	learn: 1.8136621	total: 2m 7s	remaining: 4m 36s
632:	learn: 1.8133539	total: 2m 7s	remaining: 4m 36s
633:	learn: 1.8130589	total: 2m 8s	remaining: 4m 36s
634:	learn: 1.8127010	total: 2m 8s	remaining: 4m 35s
635:	learn: 1.8124112	total: 2m 8s	remaining: 4m 35s
636:	learn: 1.8120650	total: 2m 8s	remaining: 4m 35s
637:	learn: 1.8117201	total: 2m 8s	remaining: 4m 35s
638:	learn: 1.8114110	total: 2m 9s	remaining: 4m 34s
639:	learn: 1.8110973	total: 2m 9s	remaining: 4m 34s
640:	learn: 1.8107529	total: 2m 9s	remaining: 4m 34s
641:	learn: 1.8104706	total: 2m 9s	remaining: 

777:	learn: 1.7734436	total: 2m 36s	remaining: 4m 5s
778:	learn: 1.7732952	total: 2m 36s	remaining: 4m 5s
779:	learn: 1.7730213	total: 2m 36s	remaining: 4m 5s
780:	learn: 1.7727560	total: 2m 36s	remaining: 4m 4s
781:	learn: 1.7724666	total: 2m 37s	remaining: 4m 4s
782:	learn: 1.7722294	total: 2m 37s	remaining: 4m 4s
783:	learn: 1.7720748	total: 2m 37s	remaining: 4m 4s
784:	learn: 1.7717834	total: 2m 37s	remaining: 4m 4s
785:	learn: 1.7715210	total: 2m 37s	remaining: 4m 3s
786:	learn: 1.7712209	total: 2m 38s	remaining: 4m 3s
787:	learn: 1.7709782	total: 2m 38s	remaining: 4m 3s
788:	learn: 1.7708427	total: 2m 38s	remaining: 4m 3s
789:	learn: 1.7705794	total: 2m 38s	remaining: 4m 2s
790:	learn: 1.7703658	total: 2m 38s	remaining: 4m 2s
791:	learn: 1.7701906	total: 2m 39s	remaining: 4m 2s
792:	learn: 1.7699476	total: 2m 39s	remaining: 4m 2s
793:	learn: 1.7696950	total: 2m 39s	remaining: 4m 2s
794:	learn: 1.7693809	total: 2m 39s	remaining: 4m 1s
795:	learn: 1.7692428	total: 2m 39s	remaining:

932:	learn: 1.7387330	total: 3m 7s	remaining: 3m 34s
933:	learn: 1.7385278	total: 3m 7s	remaining: 3m 34s
934:	learn: 1.7383081	total: 3m 7s	remaining: 3m 33s
935:	learn: 1.7381514	total: 3m 8s	remaining: 3m 33s
936:	learn: 1.7379801	total: 3m 8s	remaining: 3m 33s
937:	learn: 1.7377429	total: 3m 8s	remaining: 3m 33s
938:	learn: 1.7375363	total: 3m 8s	remaining: 3m 33s
939:	learn: 1.7372953	total: 3m 8s	remaining: 3m 32s
940:	learn: 1.7370652	total: 3m 8s	remaining: 3m 32s
941:	learn: 1.7367123	total: 3m 9s	remaining: 3m 32s
942:	learn: 1.7364969	total: 3m 9s	remaining: 3m 32s
943:	learn: 1.7362570	total: 3m 9s	remaining: 3m 32s
944:	learn: 1.7360394	total: 3m 9s	remaining: 3m 31s
945:	learn: 1.7359022	total: 3m 10s	remaining: 3m 31s
946:	learn: 1.7356758	total: 3m 10s	remaining: 3m 31s
947:	learn: 1.7354649	total: 3m 10s	remaining: 3m 31s
948:	learn: 1.7352870	total: 3m 10s	remaining: 3m 31s
949:	learn: 1.7351720	total: 3m 10s	remaining: 3m 30s
950:	learn: 1.7349249	total: 3m 11s	remai

1083:	learn: 1.7096013	total: 3m 41s	remaining: 3m 7s
1084:	learn: 1.7094342	total: 3m 41s	remaining: 3m 7s
1085:	learn: 1.7092084	total: 3m 42s	remaining: 3m 6s
1086:	learn: 1.7090398	total: 3m 42s	remaining: 3m 6s
1087:	learn: 1.7088475	total: 3m 42s	remaining: 3m 6s
1088:	learn: 1.7086205	total: 3m 42s	remaining: 3m 6s
1089:	learn: 1.7084186	total: 3m 42s	remaining: 3m 6s
1090:	learn: 1.7081828	total: 3m 43s	remaining: 3m 5s
1091:	learn: 1.7081014	total: 3m 43s	remaining: 3m 5s
1092:	learn: 1.7078646	total: 3m 43s	remaining: 3m 5s
1093:	learn: 1.7077502	total: 3m 43s	remaining: 3m 5s
1094:	learn: 1.7076238	total: 3m 43s	remaining: 3m 5s
1095:	learn: 1.7075095	total: 3m 44s	remaining: 3m 4s
1096:	learn: 1.7073318	total: 3m 44s	remaining: 3m 4s
1097:	learn: 1.7071899	total: 3m 44s	remaining: 3m 4s
1098:	learn: 1.7069880	total: 3m 44s	remaining: 3m 4s
1099:	learn: 1.7067653	total: 3m 44s	remaining: 3m 3s
1100:	learn: 1.7066162	total: 3m 45s	remaining: 3m 3s
1101:	learn: 1.7064315	total

1234:	learn: 1.6842860	total: 4m 11s	remaining: 2m 36s
1235:	learn: 1.6840737	total: 4m 12s	remaining: 2m 35s
1236:	learn: 1.6839309	total: 4m 12s	remaining: 2m 35s
1237:	learn: 1.6837796	total: 4m 12s	remaining: 2m 35s
1238:	learn: 1.6836134	total: 4m 12s	remaining: 2m 35s
1239:	learn: 1.6834271	total: 4m 12s	remaining: 2m 35s
1240:	learn: 1.6832678	total: 4m 13s	remaining: 2m 34s
1241:	learn: 1.6830850	total: 4m 13s	remaining: 2m 34s
1242:	learn: 1.6828690	total: 4m 13s	remaining: 2m 34s
1243:	learn: 1.6826584	total: 4m 13s	remaining: 2m 34s
1244:	learn: 1.6825280	total: 4m 13s	remaining: 2m 33s
1245:	learn: 1.6823609	total: 4m 14s	remaining: 2m 33s
1246:	learn: 1.6822284	total: 4m 14s	remaining: 2m 33s
1247:	learn: 1.6821255	total: 4m 14s	remaining: 2m 33s
1248:	learn: 1.6819983	total: 4m 14s	remaining: 2m 33s
1249:	learn: 1.6818428	total: 4m 14s	remaining: 2m 32s
1250:	learn: 1.6816942	total: 4m 15s	remaining: 2m 32s
1251:	learn: 1.6815211	total: 4m 15s	remaining: 2m 32s
1252:	lear

1384:	learn: 1.6618229	total: 4m 44s	remaining: 2m 6s
1385:	learn: 1.6616557	total: 4m 44s	remaining: 2m 6s
1386:	learn: 1.6615117	total: 4m 44s	remaining: 2m 5s
1387:	learn: 1.6613583	total: 4m 44s	remaining: 2m 5s
1388:	learn: 1.6612302	total: 4m 45s	remaining: 2m 5s
1389:	learn: 1.6610701	total: 4m 45s	remaining: 2m 5s
1390:	learn: 1.6609117	total: 4m 45s	remaining: 2m 5s
1391:	learn: 1.6607639	total: 4m 45s	remaining: 2m 4s
1392:	learn: 1.6605812	total: 4m 45s	remaining: 2m 4s
1393:	learn: 1.6604221	total: 4m 46s	remaining: 2m 4s
1394:	learn: 1.6602772	total: 4m 46s	remaining: 2m 4s
1395:	learn: 1.6601129	total: 4m 46s	remaining: 2m 3s
1396:	learn: 1.6599636	total: 4m 46s	remaining: 2m 3s
1397:	learn: 1.6598492	total: 4m 46s	remaining: 2m 3s
1398:	learn: 1.6596873	total: 4m 47s	remaining: 2m 3s
1399:	learn: 1.6595532	total: 4m 47s	remaining: 2m 3s
1400:	learn: 1.6593950	total: 4m 47s	remaining: 2m 2s
1401:	learn: 1.6592363	total: 4m 47s	remaining: 2m 2s
1402:	learn: 1.6590685	total

1535:	learn: 1.6419115	total: 5m 24s	remaining: 1m 38s
1536:	learn: 1.6418725	total: 5m 24s	remaining: 1m 37s
1537:	learn: 1.6417173	total: 5m 25s	remaining: 1m 37s
1538:	learn: 1.6415620	total: 5m 25s	remaining: 1m 37s
1539:	learn: 1.6414282	total: 5m 25s	remaining: 1m 37s
1540:	learn: 1.6413314	total: 5m 25s	remaining: 1m 37s
1541:	learn: 1.6412113	total: 5m 25s	remaining: 1m 36s
1542:	learn: 1.6411098	total: 5m 26s	remaining: 1m 36s
1543:	learn: 1.6409767	total: 5m 26s	remaining: 1m 36s
1544:	learn: 1.6408423	total: 5m 26s	remaining: 1m 36s
1545:	learn: 1.6407007	total: 5m 26s	remaining: 1m 35s
1546:	learn: 1.6405708	total: 5m 26s	remaining: 1m 35s
1547:	learn: 1.6404025	total: 5m 27s	remaining: 1m 35s
1548:	learn: 1.6402321	total: 5m 27s	remaining: 1m 35s
1549:	learn: 1.6401397	total: 5m 27s	remaining: 1m 35s
1550:	learn: 1.6399819	total: 5m 27s	remaining: 1m 34s
1551:	learn: 1.6398202	total: 5m 28s	remaining: 1m 34s
1552:	learn: 1.6396803	total: 5m 28s	remaining: 1m 34s
1553:	lear

1685:	learn: 1.6235555	total: 6m 2s	remaining: 1m 7s
1686:	learn: 1.6234480	total: 6m 2s	remaining: 1m 7s
1687:	learn: 1.6233676	total: 6m 2s	remaining: 1m 6s
1688:	learn: 1.6233278	total: 6m 2s	remaining: 1m 6s
1689:	learn: 1.6232389	total: 6m 2s	remaining: 1m 6s
1690:	learn: 1.6231358	total: 6m 3s	remaining: 1m 6s
1691:	learn: 1.6229596	total: 6m 3s	remaining: 1m 6s
1692:	learn: 1.6228122	total: 6m 3s	remaining: 1m 5s
1693:	learn: 1.6226730	total: 6m 3s	remaining: 1m 5s
1694:	learn: 1.6225324	total: 6m 3s	remaining: 1m 5s
1695:	learn: 1.6223653	total: 6m 4s	remaining: 1m 5s
1696:	learn: 1.6222381	total: 6m 4s	remaining: 1m 5s
1697:	learn: 1.6221640	total: 6m 4s	remaining: 1m 4s
1698:	learn: 1.6220021	total: 6m 4s	remaining: 1m 4s
1699:	learn: 1.6218571	total: 6m 5s	remaining: 1m 4s
1700:	learn: 1.6216736	total: 6m 5s	remaining: 1m 4s
1701:	learn: 1.6215212	total: 6m 5s	remaining: 1m 3s
1702:	learn: 1.6213748	total: 6m 5s	remaining: 1m 3s
1703:	learn: 1.6212738	total: 6m 5s	remaining:

1839:	learn: 1.6062455	total: 6m 36s	remaining: 34.5s
1840:	learn: 1.6061705	total: 6m 36s	remaining: 34.3s
1841:	learn: 1.6060029	total: 6m 36s	remaining: 34s
1842:	learn: 1.6059465	total: 6m 37s	remaining: 33.8s
1843:	learn: 1.6058371	total: 6m 37s	remaining: 33.6s
1844:	learn: 1.6057195	total: 6m 37s	remaining: 33.4s
1845:	learn: 1.6056828	total: 6m 37s	remaining: 33.2s
1846:	learn: 1.6055761	total: 6m 37s	remaining: 33s
1847:	learn: 1.6054489	total: 6m 38s	remaining: 32.7s
1848:	learn: 1.6053966	total: 6m 38s	remaining: 32.5s
1849:	learn: 1.6052953	total: 6m 38s	remaining: 32.3s
1850:	learn: 1.6051437	total: 6m 38s	remaining: 32.1s
1851:	learn: 1.6051081	total: 6m 38s	remaining: 31.9s
1852:	learn: 1.6050722	total: 6m 39s	remaining: 31.7s
1853:	learn: 1.6049901	total: 6m 39s	remaining: 31.5s
1854:	learn: 1.6049057	total: 6m 39s	remaining: 31.2s
1855:	learn: 1.6048122	total: 6m 39s	remaining: 31s
1856:	learn: 1.6047167	total: 6m 40s	remaining: 30.8s
1857:	learn: 1.6046797	total: 6m 4

1993:	learn: 1.5903716	total: 7m 11s	remaining: 1.3s
1994:	learn: 1.5902499	total: 7m 11s	remaining: 1.08s
1995:	learn: 1.5901052	total: 7m 11s	remaining: 865ms
1996:	learn: 1.5900003	total: 7m 11s	remaining: 649ms
1997:	learn: 1.5899646	total: 7m 11s	remaining: 432ms
1998:	learn: 1.5899290	total: 7m 12s	remaining: 216ms
1999:	learn: 1.5898952	total: 7m 12s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x1a3c22d470>

In [1028]:
mystem = Mystem()

In [1064]:
test_data = read_test_data()
test_data = [text_preprocessor(text) for text in test_data]

In [1065]:
tfidf_matrix_test = vectorizer.transform(test_data)
result = cb_regressor.predict(tfidf_matrix_test)

In [1066]:
scores = get_scores(result)
dump_scores(scores)

### SVM + CatBoost

In [1139]:
with open('strong_vocab.txt', 'r') as inf:
    strong_vocabulary = json.load(inf)

In [1140]:
vectorizer = TfidfVectorizer(token_pattern=r'\S{2,}', ngram_range=(1, 2), vocabulary=strong_vocabulary)
tfidf_matrix_train = vectorizer.fit_transform(preprocessed_train_data)
tfidf_matrix_train.shape

(20000, 3392)

In [1141]:
tfidf_matrix_train_train, tfidf_matrix_train_validate, train_scores_train, train_scores_validate = train_test_split(
    tfidf_matrix_train,
    train_scores,
    test_size=0.2
)

In [1142]:
svm_regressor = SVR(kernel='linear')
svm_regressor.fit(tfidf_matrix_train_train, train_scores_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [1143]:
svm_predict_train = svm_regressor.predict(tfidf_matrix_train_train)
svm_predict_validate = svm_regressor.predict(tfidf_matrix_train_validate)

In [1144]:
cb_tfidf_matrix_train_train = hstack([tfidf_matrix_train_train, svm_predict_train.reshape(-1, 1)])
cb_tfidf_matrix_train_validate = hstack([tfidf_matrix_train_validate, svm_predict_validate.reshape(-1, 1)])

In [1145]:
cb_regressor = CatBoostRegressor(iterations=3000, learning_rate=0.02, depth=6)
eval_set = [(tfidf_matrix_train_validate, train_scores_validate)]
cb_regressor.fit(tfidf_matrix_train_train, train_scores_train, eval_set=eval_set)

0:	learn: 2.2422121	test: 2.2575413	best: 2.2575413 (0)	total: 591ms	remaining: 29m 31s
1:	learn: 2.2384454	test: 2.2544458	best: 2.2544458 (1)	total: 866ms	remaining: 21m 37s
2:	learn: 2.2349621	test: 2.2516555	best: 2.2516555 (2)	total: 1.14s	remaining: 19m 3s
3:	learn: 2.2316053	test: 2.2489052	best: 2.2489052 (3)	total: 1.32s	remaining: 16m 32s
4:	learn: 2.2281870	test: 2.2458963	best: 2.2458963 (4)	total: 1.63s	remaining: 16m 14s
5:	learn: 2.2245589	test: 2.2424586	best: 2.2424586 (5)	total: 1.86s	remaining: 15m 27s
6:	learn: 2.2212703	test: 2.2396079	best: 2.2396079 (6)	total: 2.08s	remaining: 14m 48s
7:	learn: 2.2178382	test: 2.2367215	best: 2.2367215 (7)	total: 2.25s	remaining: 14m 1s
8:	learn: 2.2144125	test: 2.2333728	best: 2.2333728 (8)	total: 2.42s	remaining: 13m 25s
9:	learn: 2.2111969	test: 2.2303838	best: 2.2303838 (9)	total: 2.6s	remaining: 12m 56s
10:	learn: 2.2083364	test: 2.2279042	best: 2.2279042 (10)	total: 2.76s	remaining: 12m 31s
11:	learn: 2.2051540	test: 2.2247

93:	learn: 2.0612630	test: 2.0979257	best: 2.0979257 (93)	total: 17.3s	remaining: 8m 55s
94:	learn: 2.0601270	test: 2.0968920	best: 2.0968920 (94)	total: 17.5s	remaining: 8m 55s
95:	learn: 2.0589579	test: 2.0960553	best: 2.0960553 (95)	total: 17.7s	remaining: 8m 54s
96:	learn: 2.0577651	test: 2.0951253	best: 2.0951253 (96)	total: 17.8s	remaining: 8m 53s
97:	learn: 2.0566544	test: 2.0941795	best: 2.0941795 (97)	total: 18s	remaining: 8m 53s
98:	learn: 2.0555117	test: 2.0930417	best: 2.0930417 (98)	total: 18.2s	remaining: 8m 52s
99:	learn: 2.0543751	test: 2.0919565	best: 2.0919565 (99)	total: 18.3s	remaining: 8m 51s
100:	learn: 2.0531584	test: 2.0908522	best: 2.0908522 (100)	total: 18.5s	remaining: 8m 51s
101:	learn: 2.0522181	test: 2.0902380	best: 2.0902380 (101)	total: 18.7s	remaining: 8m 51s
102:	learn: 2.0510968	test: 2.0893162	best: 2.0893162 (102)	total: 18.9s	remaining: 8m 50s
103:	learn: 2.0500732	test: 2.0883520	best: 2.0883520 (103)	total: 19s	remaining: 8m 49s
104:	learn: 2.049

185:	learn: 1.9826325	test: 2.0337885	best: 2.0337885 (185)	total: 33.1s	remaining: 8m 20s
186:	learn: 1.9821023	test: 2.0332424	best: 2.0332424 (186)	total: 33.3s	remaining: 8m 20s
187:	learn: 1.9814565	test: 2.0328071	best: 2.0328071 (187)	total: 33.4s	remaining: 8m 20s
188:	learn: 1.9807379	test: 2.0321509	best: 2.0321509 (188)	total: 33.6s	remaining: 8m 19s
189:	learn: 1.9800448	test: 2.0315283	best: 2.0315283 (189)	total: 33.8s	remaining: 8m 19s
190:	learn: 1.9793589	test: 2.0309669	best: 2.0309669 (190)	total: 33.9s	remaining: 8m 19s
191:	learn: 1.9787926	test: 2.0304112	best: 2.0304112 (191)	total: 34.1s	remaining: 8m 19s
192:	learn: 1.9781285	test: 2.0301630	best: 2.0301630 (192)	total: 34.3s	remaining: 8m 18s
193:	learn: 1.9774241	test: 2.0293718	best: 2.0293718 (193)	total: 34.5s	remaining: 8m 18s
194:	learn: 1.9767772	test: 2.0288471	best: 2.0288471 (194)	total: 34.6s	remaining: 8m 18s
195:	learn: 1.9760882	test: 2.0283255	best: 2.0283255 (195)	total: 34.8s	remaining: 8m 17s

276:	learn: 1.9308400	test: 1.9931962	best: 1.9931962 (276)	total: 49s	remaining: 8m 1s
277:	learn: 1.9302977	test: 1.9926682	best: 1.9926682 (277)	total: 49.2s	remaining: 8m 1s
278:	learn: 1.9298731	test: 1.9924172	best: 1.9924172 (278)	total: 49.4s	remaining: 8m 1s
279:	learn: 1.9294664	test: 1.9922404	best: 1.9922404 (279)	total: 49.5s	remaining: 8m 1s
280:	learn: 1.9290765	test: 1.9919602	best: 1.9919602 (280)	total: 49.7s	remaining: 8m 1s
281:	learn: 1.9285994	test: 1.9916314	best: 1.9916314 (281)	total: 49.9s	remaining: 8m
282:	learn: 1.9281697	test: 1.9913332	best: 1.9913332 (282)	total: 50.1s	remaining: 8m
283:	learn: 1.9277168	test: 1.9910362	best: 1.9910362 (283)	total: 50.2s	remaining: 8m
284:	learn: 1.9272715	test: 1.9905686	best: 1.9905686 (284)	total: 50.4s	remaining: 8m
285:	learn: 1.9267580	test: 1.9901485	best: 1.9901485 (285)	total: 50.6s	remaining: 7m 59s
286:	learn: 1.9263822	test: 1.9898477	best: 1.9898477 (286)	total: 50.7s	remaining: 7m 59s
287:	learn: 1.9258126	

367:	learn: 1.8914608	test: 1.9621189	best: 1.9621189 (367)	total: 1m 5s	remaining: 7m 46s
368:	learn: 1.8910769	test: 1.9618420	best: 1.9618420 (368)	total: 1m 5s	remaining: 7m 45s
369:	learn: 1.8907227	test: 1.9614553	best: 1.9614553 (369)	total: 1m 5s	remaining: 7m 45s
370:	learn: 1.8903887	test: 1.9611895	best: 1.9611895 (370)	total: 1m 5s	remaining: 7m 45s
371:	learn: 1.8899559	test: 1.9608331	best: 1.9608331 (371)	total: 1m 5s	remaining: 7m 45s
372:	learn: 1.8896059	test: 1.9604718	best: 1.9604718 (372)	total: 1m 6s	remaining: 7m 45s
373:	learn: 1.8892281	test: 1.9602448	best: 1.9602448 (373)	total: 1m 6s	remaining: 7m 44s
374:	learn: 1.8888135	test: 1.9601283	best: 1.9601283 (374)	total: 1m 6s	remaining: 7m 44s
375:	learn: 1.8884505	test: 1.9597653	best: 1.9597653 (375)	total: 1m 6s	remaining: 7m 44s
376:	learn: 1.8880348	test: 1.9595401	best: 1.9595401 (376)	total: 1m 6s	remaining: 7m 44s
377:	learn: 1.8877081	test: 1.9592080	best: 1.9592080 (377)	total: 1m 7s	remaining: 7m 45s

457:	learn: 1.8592584	test: 1.9361937	best: 1.9361937 (457)	total: 1m 23s	remaining: 7m 42s
458:	learn: 1.8589171	test: 1.9359856	best: 1.9359856 (458)	total: 1m 23s	remaining: 7m 42s
459:	learn: 1.8585610	test: 1.9357465	best: 1.9357465 (459)	total: 1m 23s	remaining: 7m 42s
460:	learn: 1.8582348	test: 1.9354284	best: 1.9354284 (460)	total: 1m 23s	remaining: 7m 42s
461:	learn: 1.8579204	test: 1.9352373	best: 1.9352373 (461)	total: 1m 24s	remaining: 7m 41s
462:	learn: 1.8575769	test: 1.9349375	best: 1.9349375 (462)	total: 1m 24s	remaining: 7m 41s
463:	learn: 1.8571788	test: 1.9346171	best: 1.9346171 (463)	total: 1m 24s	remaining: 7m 41s
464:	learn: 1.8568773	test: 1.9345281	best: 1.9345281 (464)	total: 1m 24s	remaining: 7m 41s
465:	learn: 1.8565179	test: 1.9343404	best: 1.9343404 (465)	total: 1m 24s	remaining: 7m 41s
466:	learn: 1.8562335	test: 1.9341509	best: 1.9341509 (466)	total: 1m 25s	remaining: 7m 42s
467:	learn: 1.8558533	test: 1.9338548	best: 1.9338548 (467)	total: 1m 25s	remain

548:	learn: 1.8286431	test: 1.9141941	best: 1.9141941 (548)	total: 1m 39s	remaining: 7m 24s
549:	learn: 1.8283386	test: 1.9138523	best: 1.9138523 (549)	total: 1m 39s	remaining: 7m 24s
550:	learn: 1.8279927	test: 1.9136016	best: 1.9136016 (550)	total: 1m 39s	remaining: 7m 24s
551:	learn: 1.8276742	test: 1.9133478	best: 1.9133478 (551)	total: 1m 40s	remaining: 7m 24s
552:	learn: 1.8273188	test: 1.9130461	best: 1.9130461 (552)	total: 1m 40s	remaining: 7m 23s
553:	learn: 1.8269720	test: 1.9127926	best: 1.9127926 (553)	total: 1m 40s	remaining: 7m 23s
554:	learn: 1.8266305	test: 1.9126173	best: 1.9126173 (554)	total: 1m 40s	remaining: 7m 23s
555:	learn: 1.8263298	test: 1.9123462	best: 1.9123462 (555)	total: 1m 40s	remaining: 7m 23s
556:	learn: 1.8259848	test: 1.9120584	best: 1.9120584 (556)	total: 1m 41s	remaining: 7m 23s
557:	learn: 1.8256303	test: 1.9119273	best: 1.9119273 (557)	total: 1m 41s	remaining: 7m 22s
558:	learn: 1.8253081	test: 1.9118472	best: 1.9118472 (558)	total: 1m 41s	remain

638:	learn: 1.7995104	test: 1.8948956	best: 1.8948956 (638)	total: 1m 54s	remaining: 7m 4s
639:	learn: 1.7991516	test: 1.8948252	best: 1.8948252 (639)	total: 1m 54s	remaining: 7m 4s
640:	learn: 1.7989905	test: 1.8947964	best: 1.8947964 (640)	total: 1m 55s	remaining: 7m 3s
641:	learn: 1.7985918	test: 1.8946804	best: 1.8946804 (641)	total: 1m 55s	remaining: 7m 3s
642:	learn: 1.7983040	test: 1.8945848	best: 1.8945848 (642)	total: 1m 55s	remaining: 7m 3s
643:	learn: 1.7979964	test: 1.8943094	best: 1.8943094 (643)	total: 1m 55s	remaining: 7m 3s
644:	learn: 1.7976557	test: 1.8941173	best: 1.8941173 (644)	total: 1m 55s	remaining: 7m 3s
645:	learn: 1.7973306	test: 1.8937942	best: 1.8937942 (645)	total: 1m 56s	remaining: 7m 2s
646:	learn: 1.7969092	test: 1.8934953	best: 1.8934953 (646)	total: 1m 56s	remaining: 7m 2s
647:	learn: 1.7965963	test: 1.8934003	best: 1.8934003 (647)	total: 1m 56s	remaining: 7m 2s
648:	learn: 1.7962653	test: 1.8932538	best: 1.8932538 (648)	total: 1m 56s	remaining: 7m 2s

729:	learn: 1.7731578	test: 1.8804207	best: 1.8804207 (729)	total: 2m 11s	remaining: 6m 48s
730:	learn: 1.7729842	test: 1.8804036	best: 1.8804036 (730)	total: 2m 11s	remaining: 6m 48s
731:	learn: 1.7726737	test: 1.8801375	best: 1.8801375 (731)	total: 2m 11s	remaining: 6m 48s
732:	learn: 1.7723772	test: 1.8799493	best: 1.8799493 (732)	total: 2m 11s	remaining: 6m 48s
733:	learn: 1.7720991	test: 1.8798643	best: 1.8798643 (733)	total: 2m 12s	remaining: 6m 47s
734:	learn: 1.7718133	test: 1.8796826	best: 1.8796826 (734)	total: 2m 12s	remaining: 6m 47s
735:	learn: 1.7715454	test: 1.8795591	best: 1.8795591 (735)	total: 2m 12s	remaining: 6m 47s
736:	learn: 1.7713494	test: 1.8795014	best: 1.8795014 (736)	total: 2m 12s	remaining: 6m 47s
737:	learn: 1.7710369	test: 1.8792963	best: 1.8792963 (737)	total: 2m 12s	remaining: 6m 47s
738:	learn: 1.7707060	test: 1.8789521	best: 1.8789521 (738)	total: 2m 13s	remaining: 6m 46s
739:	learn: 1.7704873	test: 1.8788633	best: 1.8788633 (739)	total: 2m 13s	remain

819:	learn: 1.7494927	test: 1.8679695	best: 1.8679695 (819)	total: 2m 26s	remaining: 6m 30s
820:	learn: 1.7492114	test: 1.8678874	best: 1.8678874 (820)	total: 2m 27s	remaining: 6m 30s
821:	learn: 1.7489198	test: 1.8677348	best: 1.8677348 (821)	total: 2m 27s	remaining: 6m 30s
822:	learn: 1.7487429	test: 1.8676250	best: 1.8676250 (822)	total: 2m 27s	remaining: 6m 30s
823:	learn: 1.7484982	test: 1.8674221	best: 1.8674221 (823)	total: 2m 27s	remaining: 6m 29s
824:	learn: 1.7483879	test: 1.8672954	best: 1.8672954 (824)	total: 2m 27s	remaining: 6m 29s
825:	learn: 1.7481551	test: 1.8671261	best: 1.8671261 (825)	total: 2m 27s	remaining: 6m 29s
826:	learn: 1.7478982	test: 1.8669640	best: 1.8669640 (826)	total: 2m 28s	remaining: 6m 29s
827:	learn: 1.7476850	test: 1.8668149	best: 1.8668149 (827)	total: 2m 28s	remaining: 6m 29s
828:	learn: 1.7475250	test: 1.8667375	best: 1.8667375 (828)	total: 2m 28s	remaining: 6m 28s
829:	learn: 1.7472662	test: 1.8665165	best: 1.8665165 (829)	total: 2m 28s	remain

910:	learn: 1.7283939	test: 1.8575904	best: 1.8575904 (910)	total: 2m 44s	remaining: 6m 17s
911:	learn: 1.7282029	test: 1.8575516	best: 1.8575516 (911)	total: 2m 44s	remaining: 6m 17s
912:	learn: 1.7279573	test: 1.8573698	best: 1.8573698 (912)	total: 2m 44s	remaining: 6m 17s
913:	learn: 1.7277146	test: 1.8573081	best: 1.8573081 (913)	total: 2m 45s	remaining: 6m 16s
914:	learn: 1.7274374	test: 1.8572638	best: 1.8572638 (914)	total: 2m 45s	remaining: 6m 16s
915:	learn: 1.7271384	test: 1.8571001	best: 1.8571001 (915)	total: 2m 45s	remaining: 6m 16s
916:	learn: 1.7269223	test: 1.8569399	best: 1.8569399 (916)	total: 2m 45s	remaining: 6m 16s
917:	learn: 1.7266399	test: 1.8568028	best: 1.8568028 (917)	total: 2m 45s	remaining: 6m 16s
918:	learn: 1.7263677	test: 1.8566508	best: 1.8566508 (918)	total: 2m 45s	remaining: 6m 15s
919:	learn: 1.7261290	test: 1.8565355	best: 1.8565355 (919)	total: 2m 46s	remaining: 6m 15s
920:	learn: 1.7258884	test: 1.8565497	best: 1.8565355 (919)	total: 2m 46s	remain

1000:	learn: 1.7081628	test: 1.8481424	best: 1.8481424 (1000)	total: 3m	remaining: 6m 1s
1001:	learn: 1.7080668	test: 1.8480817	best: 1.8480817 (1001)	total: 3m 1s	remaining: 6m 1s
1002:	learn: 1.7078323	test: 1.8479926	best: 1.8479926 (1002)	total: 3m 1s	remaining: 6m
1003:	learn: 1.7076796	test: 1.8479328	best: 1.8479328 (1003)	total: 3m 1s	remaining: 6m
1004:	learn: 1.7074799	test: 1.8478266	best: 1.8478266 (1004)	total: 3m 1s	remaining: 6m
1005:	learn: 1.7072865	test: 1.8477625	best: 1.8477625 (1005)	total: 3m 1s	remaining: 6m
1006:	learn: 1.7070720	test: 1.8477258	best: 1.8477258 (1006)	total: 3m 1s	remaining: 6m
1007:	learn: 1.7068469	test: 1.8476038	best: 1.8476038 (1007)	total: 3m 2s	remaining: 5m 59s
1008:	learn: 1.7066947	test: 1.8475683	best: 1.8475683 (1008)	total: 3m 2s	remaining: 5m 59s
1009:	learn: 1.7064780	test: 1.8474098	best: 1.8474098 (1009)	total: 3m 2s	remaining: 5m 59s
1010:	learn: 1.7063195	test: 1.8473332	best: 1.8473332 (1010)	total: 3m 2s	remaining: 5m 59s
10

1090:	learn: 1.6901952	test: 1.8410824	best: 1.8410824 (1090)	total: 3m 15s	remaining: 5m 42s
1091:	learn: 1.6899978	test: 1.8410459	best: 1.8410459 (1091)	total: 3m 15s	remaining: 5m 42s
1092:	learn: 1.6897804	test: 1.8409235	best: 1.8409235 (1092)	total: 3m 15s	remaining: 5m 41s
1093:	learn: 1.6896843	test: 1.8408139	best: 1.8408139 (1093)	total: 3m 16s	remaining: 5m 41s
1094:	learn: 1.6894561	test: 1.8406772	best: 1.8406772 (1094)	total: 3m 16s	remaining: 5m 41s
1095:	learn: 1.6893100	test: 1.8405985	best: 1.8405985 (1095)	total: 3m 16s	remaining: 5m 41s
1096:	learn: 1.6892559	test: 1.8405932	best: 1.8405932 (1096)	total: 3m 16s	remaining: 5m 40s
1097:	learn: 1.6891697	test: 1.8405349	best: 1.8405349 (1097)	total: 3m 16s	remaining: 5m 40s
1098:	learn: 1.6890201	test: 1.8404679	best: 1.8404679 (1098)	total: 3m 16s	remaining: 5m 40s
1099:	learn: 1.6887632	test: 1.8403595	best: 1.8403595 (1099)	total: 3m 17s	remaining: 5m 40s
1100:	learn: 1.6885949	test: 1.8402981	best: 1.8402981 (1100

1178:	learn: 1.6746936	test: 1.8344801	best: 1.8344801 (1178)	total: 3m 30s	remaining: 5m 25s
1179:	learn: 1.6745263	test: 1.8344887	best: 1.8344801 (1178)	total: 3m 30s	remaining: 5m 24s
1180:	learn: 1.6743238	test: 1.8343925	best: 1.8343925 (1180)	total: 3m 30s	remaining: 5m 24s
1181:	learn: 1.6742228	test: 1.8343303	best: 1.8343303 (1181)	total: 3m 31s	remaining: 5m 24s
1182:	learn: 1.6741239	test: 1.8342898	best: 1.8342898 (1182)	total: 3m 31s	remaining: 5m 24s
1183:	learn: 1.6739359	test: 1.8342889	best: 1.8342889 (1183)	total: 3m 31s	remaining: 5m 24s
1184:	learn: 1.6737659	test: 1.8341935	best: 1.8341935 (1184)	total: 3m 31s	remaining: 5m 23s
1185:	learn: 1.6736801	test: 1.8341933	best: 1.8341933 (1185)	total: 3m 31s	remaining: 5m 23s
1186:	learn: 1.6735585	test: 1.8341953	best: 1.8341933 (1185)	total: 3m 31s	remaining: 5m 23s
1187:	learn: 1.6733868	test: 1.8340664	best: 1.8340664 (1187)	total: 3m 31s	remaining: 5m 23s
1188:	learn: 1.6732086	test: 1.8339546	best: 1.8339546 (1188

1267:	learn: 1.6604819	test: 1.8297078	best: 1.8297078 (1267)	total: 3m 47s	remaining: 5m 11s
1268:	learn: 1.6602355	test: 1.8296036	best: 1.8296036 (1268)	total: 3m 48s	remaining: 5m 11s
1269:	learn: 1.6601402	test: 1.8295462	best: 1.8295462 (1269)	total: 3m 48s	remaining: 5m 10s
1270:	learn: 1.6599722	test: 1.8295472	best: 1.8295462 (1269)	total: 3m 48s	remaining: 5m 10s
1271:	learn: 1.6597614	test: 1.8294827	best: 1.8294827 (1271)	total: 3m 48s	remaining: 5m 10s
1272:	learn: 1.6596101	test: 1.8294417	best: 1.8294417 (1272)	total: 3m 48s	remaining: 5m 10s
1273:	learn: 1.6594539	test: 1.8294176	best: 1.8294176 (1273)	total: 3m 49s	remaining: 5m 10s
1274:	learn: 1.6592926	test: 1.8293960	best: 1.8293960 (1274)	total: 3m 49s	remaining: 5m 10s
1275:	learn: 1.6590939	test: 1.8292933	best: 1.8292933 (1275)	total: 3m 49s	remaining: 5m 10s
1276:	learn: 1.6588831	test: 1.8291898	best: 1.8291898 (1276)	total: 3m 49s	remaining: 5m 9s
1277:	learn: 1.6588010	test: 1.8291121	best: 1.8291121 (1277)

1357:	learn: 1.6461938	test: 1.8248818	best: 1.8248818 (1357)	total: 4m 4s	remaining: 4m 55s
1358:	learn: 1.6460269	test: 1.8248172	best: 1.8248172 (1358)	total: 4m 4s	remaining: 4m 55s
1359:	learn: 1.6458080	test: 1.8246925	best: 1.8246925 (1359)	total: 4m 5s	remaining: 4m 55s
1360:	learn: 1.6456014	test: 1.8245795	best: 1.8245795 (1360)	total: 4m 5s	remaining: 4m 55s
1361:	learn: 1.6454481	test: 1.8245554	best: 1.8245554 (1361)	total: 4m 5s	remaining: 4m 55s
1362:	learn: 1.6453759	test: 1.8245280	best: 1.8245280 (1362)	total: 4m 5s	remaining: 4m 54s
1363:	learn: 1.6452068	test: 1.8244637	best: 1.8244637 (1363)	total: 4m 5s	remaining: 4m 54s
1364:	learn: 1.6450860	test: 1.8244675	best: 1.8244637 (1363)	total: 4m 5s	remaining: 4m 54s
1365:	learn: 1.6448807	test: 1.8244242	best: 1.8244242 (1365)	total: 4m 6s	remaining: 4m 54s
1366:	learn: 1.6447954	test: 1.8244036	best: 1.8244036 (1366)	total: 4m 6s	remaining: 4m 54s
1367:	learn: 1.6446092	test: 1.8242812	best: 1.8242812 (1367)	total: 4

1446:	learn: 1.6326603	test: 1.8206353	best: 1.8206297 (1444)	total: 4m 19s	remaining: 4m 38s
1447:	learn: 1.6324874	test: 1.8206067	best: 1.8206067 (1447)	total: 4m 19s	remaining: 4m 38s
1448:	learn: 1.6323011	test: 1.8205434	best: 1.8205434 (1448)	total: 4m 19s	remaining: 4m 37s
1449:	learn: 1.6321622	test: 1.8205135	best: 1.8205135 (1449)	total: 4m 19s	remaining: 4m 37s
1450:	learn: 1.6319562	test: 1.8204020	best: 1.8204020 (1450)	total: 4m 19s	remaining: 4m 37s
1451:	learn: 1.6318884	test: 1.8203890	best: 1.8203890 (1451)	total: 4m 20s	remaining: 4m 37s
1452:	learn: 1.6317359	test: 1.8204173	best: 1.8203890 (1451)	total: 4m 20s	remaining: 4m 37s
1453:	learn: 1.6315797	test: 1.8203964	best: 1.8203890 (1451)	total: 4m 20s	remaining: 4m 36s
1454:	learn: 1.6315302	test: 1.8204022	best: 1.8203890 (1451)	total: 4m 20s	remaining: 4m 36s
1455:	learn: 1.6314429	test: 1.8203743	best: 1.8203743 (1455)	total: 4m 20s	remaining: 4m 36s
1456:	learn: 1.6312746	test: 1.8203114	best: 1.8203114 (1456

1534:	learn: 1.6201717	test: 1.8168747	best: 1.8168747 (1534)	total: 4m 33s	remaining: 4m 20s
1535:	learn: 1.6200091	test: 1.8168721	best: 1.8168721 (1535)	total: 4m 33s	remaining: 4m 20s
1536:	learn: 1.6198656	test: 1.8168741	best: 1.8168721 (1535)	total: 4m 33s	remaining: 4m 20s
1537:	learn: 1.6196709	test: 1.8167913	best: 1.8167913 (1537)	total: 4m 33s	remaining: 4m 20s
1538:	learn: 1.6195452	test: 1.8167536	best: 1.8167536 (1538)	total: 4m 34s	remaining: 4m 20s
1539:	learn: 1.6194731	test: 1.8166836	best: 1.8166836 (1539)	total: 4m 34s	remaining: 4m 20s
1540:	learn: 1.6193102	test: 1.8166168	best: 1.8166168 (1540)	total: 4m 34s	remaining: 4m 19s
1541:	learn: 1.6192028	test: 1.8166146	best: 1.8166146 (1541)	total: 4m 34s	remaining: 4m 19s
1542:	learn: 1.6191567	test: 1.8166117	best: 1.8166117 (1542)	total: 4m 34s	remaining: 4m 19s
1543:	learn: 1.6189715	test: 1.8165260	best: 1.8165260 (1543)	total: 4m 34s	remaining: 4m 19s
1544:	learn: 1.6189027	test: 1.8165267	best: 1.8165260 (1543

1622:	learn: 1.6083269	test: 1.8141787	best: 1.8141787 (1622)	total: 4m 47s	remaining: 4m 4s
1623:	learn: 1.6082734	test: 1.8141543	best: 1.8141543 (1623)	total: 4m 47s	remaining: 4m 3s
1624:	learn: 1.6081450	test: 1.8140564	best: 1.8140564 (1624)	total: 4m 47s	remaining: 4m 3s
1625:	learn: 1.6080523	test: 1.8139953	best: 1.8139953 (1625)	total: 4m 48s	remaining: 4m 3s
1626:	learn: 1.6079152	test: 1.8139748	best: 1.8139748 (1626)	total: 4m 48s	remaining: 4m 3s
1627:	learn: 1.6077473	test: 1.8139215	best: 1.8139215 (1627)	total: 4m 48s	remaining: 4m 3s
1628:	learn: 1.6075815	test: 1.8139279	best: 1.8139215 (1627)	total: 4m 48s	remaining: 4m 2s
1629:	learn: 1.6074042	test: 1.8139290	best: 1.8139215 (1627)	total: 4m 48s	remaining: 4m 2s
1630:	learn: 1.6072527	test: 1.8139118	best: 1.8139118 (1630)	total: 4m 48s	remaining: 4m 2s
1631:	learn: 1.6071705	test: 1.8139073	best: 1.8139073 (1631)	total: 4m 49s	remaining: 4m 2s
1632:	learn: 1.6069638	test: 1.8138322	best: 1.8138322 (1632)	total: 4

1710:	learn: 1.5971833	test: 1.8113571	best: 1.8113571 (1710)	total: 5m 2s	remaining: 3m 47s
1711:	learn: 1.5971452	test: 1.8113552	best: 1.8113552 (1711)	total: 5m 2s	remaining: 3m 47s
1712:	learn: 1.5970675	test: 1.8113213	best: 1.8113213 (1712)	total: 5m 2s	remaining: 3m 47s
1713:	learn: 1.5968965	test: 1.8112861	best: 1.8112861 (1713)	total: 5m 2s	remaining: 3m 47s
1714:	learn: 1.5967494	test: 1.8111685	best: 1.8111685 (1714)	total: 5m 3s	remaining: 3m 47s
1715:	learn: 1.5966190	test: 1.8111834	best: 1.8111685 (1714)	total: 5m 3s	remaining: 3m 46s
1716:	learn: 1.5965450	test: 1.8112076	best: 1.8111685 (1714)	total: 5m 3s	remaining: 3m 46s
1717:	learn: 1.5964818	test: 1.8112045	best: 1.8111685 (1714)	total: 5m 3s	remaining: 3m 46s
1718:	learn: 1.5963708	test: 1.8111904	best: 1.8111685 (1714)	total: 5m 3s	remaining: 3m 46s
1719:	learn: 1.5962629	test: 1.8111416	best: 1.8111416 (1719)	total: 5m 3s	remaining: 3m 46s
1720:	learn: 1.5961237	test: 1.8111217	best: 1.8111217 (1720)	total: 5

1798:	learn: 1.5865760	test: 1.8084233	best: 1.8084233 (1798)	total: 5m 16s	remaining: 3m 31s
1799:	learn: 1.5864240	test: 1.8083984	best: 1.8083984 (1799)	total: 5m 16s	remaining: 3m 31s
1800:	learn: 1.5863263	test: 1.8084390	best: 1.8083984 (1799)	total: 5m 16s	remaining: 3m 31s
1801:	learn: 1.5862840	test: 1.8084552	best: 1.8083984 (1799)	total: 5m 17s	remaining: 3m 30s
1802:	learn: 1.5860775	test: 1.8084282	best: 1.8083984 (1799)	total: 5m 17s	remaining: 3m 30s
1803:	learn: 1.5860028	test: 1.8084212	best: 1.8083984 (1799)	total: 5m 17s	remaining: 3m 30s
1804:	learn: 1.5858739	test: 1.8083907	best: 1.8083907 (1804)	total: 5m 17s	remaining: 3m 30s
1805:	learn: 1.5857209	test: 1.8083875	best: 1.8083875 (1805)	total: 5m 17s	remaining: 3m 30s
1806:	learn: 1.5855580	test: 1.8083174	best: 1.8083174 (1806)	total: 5m 17s	remaining: 3m 29s
1807:	learn: 1.5854676	test: 1.8083405	best: 1.8083174 (1806)	total: 5m 18s	remaining: 3m 29s
1808:	learn: 1.5853690	test: 1.8083237	best: 1.8083174 (1806

1886:	learn: 1.5756207	test: 1.8055094	best: 1.8055094 (1886)	total: 5m 30s	remaining: 3m 15s
1887:	learn: 1.5754278	test: 1.8054429	best: 1.8054429 (1887)	total: 5m 31s	remaining: 3m 14s
1888:	learn: 1.5753371	test: 1.8054143	best: 1.8054143 (1888)	total: 5m 31s	remaining: 3m 14s
1889:	learn: 1.5751869	test: 1.8053552	best: 1.8053552 (1889)	total: 5m 31s	remaining: 3m 14s
1890:	learn: 1.5750480	test: 1.8052845	best: 1.8052845 (1890)	total: 5m 31s	remaining: 3m 14s
1891:	learn: 1.5749322	test: 1.8052853	best: 1.8052845 (1890)	total: 5m 31s	remaining: 3m 14s
1892:	learn: 1.5748759	test: 1.8052984	best: 1.8052845 (1890)	total: 5m 31s	remaining: 3m 14s
1893:	learn: 1.5747940	test: 1.8052861	best: 1.8052845 (1890)	total: 5m 32s	remaining: 3m 13s
1894:	learn: 1.5747517	test: 1.8052930	best: 1.8052845 (1890)	total: 5m 32s	remaining: 3m 13s
1895:	learn: 1.5745833	test: 1.8052588	best: 1.8052588 (1895)	total: 5m 32s	remaining: 3m 13s
1896:	learn: 1.5745427	test: 1.8052587	best: 1.8052587 (1896

1974:	learn: 1.5663414	test: 1.8039680	best: 1.8039480 (1971)	total: 5m 44s	remaining: 2m 59s
1975:	learn: 1.5662520	test: 1.8039500	best: 1.8039480 (1971)	total: 5m 45s	remaining: 2m 58s
1976:	learn: 1.5661718	test: 1.8039118	best: 1.8039118 (1976)	total: 5m 45s	remaining: 2m 58s
1977:	learn: 1.5660050	test: 1.8037961	best: 1.8037961 (1977)	total: 5m 45s	remaining: 2m 58s
1978:	learn: 1.5659492	test: 1.8037894	best: 1.8037894 (1978)	total: 5m 45s	remaining: 2m 58s
1979:	learn: 1.5658664	test: 1.8037835	best: 1.8037835 (1979)	total: 5m 45s	remaining: 2m 58s
1980:	learn: 1.5658257	test: 1.8038002	best: 1.8037835 (1979)	total: 5m 45s	remaining: 2m 57s
1981:	learn: 1.5656664	test: 1.8036516	best: 1.8036516 (1981)	total: 5m 46s	remaining: 2m 57s
1982:	learn: 1.5654667	test: 1.8036049	best: 1.8036049 (1982)	total: 5m 46s	remaining: 2m 57s
1983:	learn: 1.5653785	test: 1.8035937	best: 1.8035937 (1983)	total: 5m 46s	remaining: 2m 57s
1984:	learn: 1.5653378	test: 1.8035936	best: 1.8035936 (1984

2063:	learn: 1.5563755	test: 1.8016607	best: 1.8016607 (2063)	total: 5m 59s	remaining: 2m 42s
2064:	learn: 1.5563204	test: 1.8016489	best: 1.8016489 (2064)	total: 5m 59s	remaining: 2m 42s
2065:	learn: 1.5562602	test: 1.8016325	best: 1.8016325 (2065)	total: 5m 59s	remaining: 2m 42s
2066:	learn: 1.5561208	test: 1.8016401	best: 1.8016325 (2065)	total: 5m 59s	remaining: 2m 42s
2067:	learn: 1.5559076	test: 1.8015839	best: 1.8015839 (2067)	total: 6m	remaining: 2m 42s
2068:	learn: 1.5558440	test: 1.8015948	best: 1.8015839 (2067)	total: 6m	remaining: 2m 42s
2069:	learn: 1.5557013	test: 1.8015910	best: 1.8015839 (2067)	total: 6m	remaining: 2m 41s
2070:	learn: 1.5555374	test: 1.8015943	best: 1.8015839 (2067)	total: 6m	remaining: 2m 41s
2071:	learn: 1.5554996	test: 1.8015969	best: 1.8015839 (2067)	total: 6m	remaining: 2m 41s
2072:	learn: 1.5553725	test: 1.8016374	best: 1.8015839 (2067)	total: 6m	remaining: 2m 41s
2073:	learn: 1.5553167	test: 1.8016371	best: 1.8015839 (2067)	total: 6m 1s	remaining

2151:	learn: 1.5471028	test: 1.7994606	best: 1.7994469 (2150)	total: 6m 14s	remaining: 2m 27s
2152:	learn: 1.5469760	test: 1.7994896	best: 1.7994469 (2150)	total: 6m 14s	remaining: 2m 27s
2153:	learn: 1.5468423	test: 1.7993595	best: 1.7993595 (2153)	total: 6m 14s	remaining: 2m 27s
2154:	learn: 1.5467069	test: 1.7993825	best: 1.7993595 (2153)	total: 6m 14s	remaining: 2m 26s
2155:	learn: 1.5466561	test: 1.7993793	best: 1.7993595 (2153)	total: 6m 15s	remaining: 2m 26s
2156:	learn: 1.5465600	test: 1.7993012	best: 1.7993012 (2156)	total: 6m 15s	remaining: 2m 26s
2157:	learn: 1.5464330	test: 1.7992450	best: 1.7992450 (2157)	total: 6m 15s	remaining: 2m 26s
2158:	learn: 1.5463254	test: 1.7992376	best: 1.7992376 (2158)	total: 6m 15s	remaining: 2m 26s
2159:	learn: 1.5462219	test: 1.7992168	best: 1.7992168 (2159)	total: 6m 15s	remaining: 2m 26s
2160:	learn: 1.5460678	test: 1.7991445	best: 1.7991445 (2160)	total: 6m 15s	remaining: 2m 25s
2161:	learn: 1.5460289	test: 1.7991435	best: 1.7991435 (2161

2240:	learn: 1.5379332	test: 1.7976305	best: 1.7976147 (2238)	total: 6m 28s	remaining: 2m 11s
2241:	learn: 1.5378975	test: 1.7976303	best: 1.7976147 (2238)	total: 6m 29s	remaining: 2m 11s
2242:	learn: 1.5378061	test: 1.7976160	best: 1.7976147 (2238)	total: 6m 29s	remaining: 2m 11s
2243:	learn: 1.5376866	test: 1.7976079	best: 1.7976079 (2243)	total: 6m 29s	remaining: 2m 11s
2244:	learn: 1.5376485	test: 1.7976171	best: 1.7976079 (2243)	total: 6m 29s	remaining: 2m 11s
2245:	learn: 1.5375329	test: 1.7975763	best: 1.7975763 (2245)	total: 6m 29s	remaining: 2m 10s
2246:	learn: 1.5374169	test: 1.7975960	best: 1.7975763 (2245)	total: 6m 29s	remaining: 2m 10s
2247:	learn: 1.5373809	test: 1.7976133	best: 1.7975763 (2245)	total: 6m 30s	remaining: 2m 10s
2248:	learn: 1.5373286	test: 1.7975980	best: 1.7975763 (2245)	total: 6m 30s	remaining: 2m 10s
2249:	learn: 1.5372915	test: 1.7975978	best: 1.7975763 (2245)	total: 6m 30s	remaining: 2m 10s
2250:	learn: 1.5372282	test: 1.7975835	best: 1.7975763 (2245

2328:	learn: 1.5286855	test: 1.7963028	best: 1.7962931 (2327)	total: 6m 47s	remaining: 1m 57s
2329:	learn: 1.5285580	test: 1.7962630	best: 1.7962630 (2329)	total: 6m 47s	remaining: 1m 57s
2330:	learn: 1.5284580	test: 1.7962117	best: 1.7962117 (2330)	total: 6m 47s	remaining: 1m 56s
2331:	learn: 1.5283906	test: 1.7962114	best: 1.7962114 (2331)	total: 6m 47s	remaining: 1m 56s
2332:	learn: 1.5282283	test: 1.7961929	best: 1.7961929 (2332)	total: 6m 47s	remaining: 1m 56s
2333:	learn: 1.5281308	test: 1.7962061	best: 1.7961929 (2332)	total: 6m 47s	remaining: 1m 56s
2334:	learn: 1.5280963	test: 1.7962059	best: 1.7961929 (2332)	total: 6m 47s	remaining: 1m 56s
2335:	learn: 1.5279284	test: 1.7961454	best: 1.7961454 (2335)	total: 6m 48s	remaining: 1m 56s
2336:	learn: 1.5278348	test: 1.7961557	best: 1.7961454 (2335)	total: 6m 48s	remaining: 1m 55s
2337:	learn: 1.5278005	test: 1.7961556	best: 1.7961454 (2335)	total: 6m 48s	remaining: 1m 55s
2338:	learn: 1.5276685	test: 1.7961342	best: 1.7961342 (2338

2416:	learn: 1.5206558	test: 1.7947246	best: 1.7947246 (2416)	total: 7m 2s	remaining: 1m 41s
2417:	learn: 1.5206194	test: 1.7947400	best: 1.7947246 (2416)	total: 7m 2s	remaining: 1m 41s
2418:	learn: 1.5205544	test: 1.7947009	best: 1.7947009 (2418)	total: 7m 3s	remaining: 1m 41s
2419:	learn: 1.5204453	test: 1.7946821	best: 1.7946821 (2419)	total: 7m 3s	remaining: 1m 41s
2420:	learn: 1.5203493	test: 1.7947418	best: 1.7946821 (2419)	total: 7m 3s	remaining: 1m 41s
2421:	learn: 1.5202486	test: 1.7947221	best: 1.7946821 (2419)	total: 7m 3s	remaining: 1m 41s
2422:	learn: 1.5201532	test: 1.7947047	best: 1.7946821 (2419)	total: 7m 3s	remaining: 1m 40s
2423:	learn: 1.5200483	test: 1.7946786	best: 1.7946786 (2423)	total: 7m 3s	remaining: 1m 40s
2424:	learn: 1.5199879	test: 1.7946889	best: 1.7946786 (2423)	total: 7m 3s	remaining: 1m 40s
2425:	learn: 1.5198702	test: 1.7945915	best: 1.7945915 (2425)	total: 7m 4s	remaining: 1m 40s
2426:	learn: 1.5198005	test: 1.7945750	best: 1.7945750 (2426)	total: 7

2505:	learn: 1.5125469	test: 1.7930732	best: 1.7930732 (2505)	total: 7m 20s	remaining: 1m 26s
2506:	learn: 1.5124201	test: 1.7929678	best: 1.7929678 (2506)	total: 7m 20s	remaining: 1m 26s
2507:	learn: 1.5123887	test: 1.7929792	best: 1.7929678 (2506)	total: 7m 20s	remaining: 1m 26s
2508:	learn: 1.5122685	test: 1.7929545	best: 1.7929545 (2508)	total: 7m 20s	remaining: 1m 26s
2509:	learn: 1.5122333	test: 1.7929732	best: 1.7929545 (2508)	total: 7m 20s	remaining: 1m 26s
2510:	learn: 1.5121978	test: 1.7929731	best: 1.7929545 (2508)	total: 7m 20s	remaining: 1m 25s
2511:	learn: 1.5121505	test: 1.7929701	best: 1.7929545 (2508)	total: 7m 21s	remaining: 1m 25s
2512:	learn: 1.5120219	test: 1.7929757	best: 1.7929545 (2508)	total: 7m 21s	remaining: 1m 25s
2513:	learn: 1.5119605	test: 1.7929830	best: 1.7929545 (2508)	total: 7m 21s	remaining: 1m 25s
2514:	learn: 1.5118356	test: 1.7929921	best: 1.7929545 (2508)	total: 7m 21s	remaining: 1m 25s
2515:	learn: 1.5117237	test: 1.7930109	best: 1.7929545 (2508

2594:	learn: 1.5045353	test: 1.7917129	best: 1.7917129 (2594)	total: 7m 35s	remaining: 1m 11s
2595:	learn: 1.5045013	test: 1.7917134	best: 1.7917129 (2594)	total: 7m 35s	remaining: 1m 10s
2596:	learn: 1.5043925	test: 1.7916926	best: 1.7916926 (2596)	total: 7m 36s	remaining: 1m 10s
2597:	learn: 1.5043476	test: 1.7916485	best: 1.7916485 (2597)	total: 7m 36s	remaining: 1m 10s
2598:	learn: 1.5043128	test: 1.7916647	best: 1.7916485 (2597)	total: 7m 36s	remaining: 1m 10s
2599:	learn: 1.5042128	test: 1.7916320	best: 1.7916320 (2599)	total: 7m 36s	remaining: 1m 10s
2600:	learn: 1.5041825	test: 1.7916318	best: 1.7916318 (2600)	total: 7m 36s	remaining: 1m 10s
2601:	learn: 1.5040608	test: 1.7916342	best: 1.7916318 (2600)	total: 7m 36s	remaining: 1m 9s
2602:	learn: 1.5039453	test: 1.7915421	best: 1.7915421 (2602)	total: 7m 37s	remaining: 1m 9s
2603:	learn: 1.5037998	test: 1.7915310	best: 1.7915310 (2603)	total: 7m 37s	remaining: 1m 9s
2604:	learn: 1.5036984	test: 1.7914525	best: 1.7914525 (2604)	t

2683:	learn: 1.4972733	test: 1.7909252	best: 1.7908512 (2680)	total: 7m 55s	remaining: 56s
2684:	learn: 1.4971508	test: 1.7908641	best: 1.7908512 (2680)	total: 7m 55s	remaining: 55.8s
2685:	learn: 1.4970914	test: 1.7908486	best: 1.7908486 (2685)	total: 7m 55s	remaining: 55.6s
2686:	learn: 1.4970089	test: 1.7908315	best: 1.7908315 (2686)	total: 7m 55s	remaining: 55.4s
2687:	learn: 1.4969570	test: 1.7908084	best: 1.7908084 (2687)	total: 7m 56s	remaining: 55.3s
2688:	learn: 1.4969144	test: 1.7907858	best: 1.7907858 (2688)	total: 7m 56s	remaining: 55.1s
2689:	learn: 1.4967762	test: 1.7907364	best: 1.7907364 (2689)	total: 7m 56s	remaining: 54.9s
2690:	learn: 1.4967421	test: 1.7907362	best: 1.7907362 (2690)	total: 7m 56s	remaining: 54.7s
2691:	learn: 1.4966935	test: 1.7907360	best: 1.7907360 (2691)	total: 7m 56s	remaining: 54.6s
2692:	learn: 1.4966170	test: 1.7907512	best: 1.7907360 (2691)	total: 7m 57s	remaining: 54.4s
2693:	learn: 1.4965752	test: 1.7907438	best: 1.7907360 (2691)	total: 7m 

2773:	learn: 1.4899292	test: 1.7894430	best: 1.7894430 (2773)	total: 8m 15s	remaining: 40.4s
2774:	learn: 1.4898597	test: 1.7893917	best: 1.7893917 (2774)	total: 8m 15s	remaining: 40.2s
2775:	learn: 1.4898264	test: 1.7893916	best: 1.7893916 (2775)	total: 8m 16s	remaining: 40s
2776:	learn: 1.4897438	test: 1.7894164	best: 1.7893916 (2775)	total: 8m 16s	remaining: 39.9s
2777:	learn: 1.4897178	test: 1.7894163	best: 1.7893916 (2775)	total: 8m 16s	remaining: 39.7s
2778:	learn: 1.4896845	test: 1.7894524	best: 1.7893916 (2775)	total: 8m 16s	remaining: 39.5s
2779:	learn: 1.4896541	test: 1.7894557	best: 1.7893916 (2775)	total: 8m 16s	remaining: 39.3s
2780:	learn: 1.4895149	test: 1.7893686	best: 1.7893686 (2780)	total: 8m 17s	remaining: 39.1s
2781:	learn: 1.4894844	test: 1.7893685	best: 1.7893685 (2781)	total: 8m 17s	remaining: 39s
2782:	learn: 1.4893552	test: 1.7894131	best: 1.7893685 (2781)	total: 8m 17s	remaining: 38.8s
2783:	learn: 1.4893267	test: 1.7894018	best: 1.7893685 (2781)	total: 8m 17

2863:	learn: 1.4819460	test: 1.7886838	best: 1.7886838 (2863)	total: 8m 32s	remaining: 24.3s
2864:	learn: 1.4818751	test: 1.7886680	best: 1.7886680 (2864)	total: 8m 32s	remaining: 24.2s
2865:	learn: 1.4818468	test: 1.7886679	best: 1.7886679 (2865)	total: 8m 33s	remaining: 24s
2866:	learn: 1.4817699	test: 1.7886855	best: 1.7886679 (2865)	total: 8m 33s	remaining: 23.8s
2867:	learn: 1.4817261	test: 1.7886380	best: 1.7886380 (2867)	total: 8m 33s	remaining: 23.6s
2868:	learn: 1.4816467	test: 1.7885846	best: 1.7885846 (2868)	total: 8m 33s	remaining: 23.5s
2869:	learn: 1.4815328	test: 1.7885485	best: 1.7885485 (2869)	total: 8m 33s	remaining: 23.3s
2870:	learn: 1.4813945	test: 1.7885836	best: 1.7885485 (2869)	total: 8m 33s	remaining: 23.1s
2871:	learn: 1.4813631	test: 1.7885841	best: 1.7885485 (2869)	total: 8m 34s	remaining: 22.9s
2872:	learn: 1.4813226	test: 1.7885574	best: 1.7885485 (2869)	total: 8m 34s	remaining: 22.7s
2873:	learn: 1.4811951	test: 1.7885331	best: 1.7885331 (2873)	total: 8m 

2952:	learn: 1.4751468	test: 1.7875571	best: 1.7875571 (2952)	total: 8m 49s	remaining: 8.42s
2953:	learn: 1.4750480	test: 1.7875221	best: 1.7875221 (2953)	total: 8m 49s	remaining: 8.24s
2954:	learn: 1.4749785	test: 1.7875332	best: 1.7875221 (2953)	total: 8m 49s	remaining: 8.06s
2955:	learn: 1.4749283	test: 1.7875126	best: 1.7875126 (2955)	total: 8m 49s	remaining: 7.88s
2956:	learn: 1.4748190	test: 1.7874282	best: 1.7874282 (2956)	total: 8m 49s	remaining: 7.7s
2957:	learn: 1.4747622	test: 1.7874127	best: 1.7874127 (2957)	total: 8m 49s	remaining: 7.52s
2958:	learn: 1.4746187	test: 1.7874147	best: 1.7874127 (2957)	total: 8m 50s	remaining: 7.34s
2959:	learn: 1.4745075	test: 1.7873924	best: 1.7873924 (2959)	total: 8m 50s	remaining: 7.17s
2960:	learn: 1.4743663	test: 1.7873247	best: 1.7873247 (2960)	total: 8m 50s	remaining: 6.99s
2961:	learn: 1.4742263	test: 1.7872762	best: 1.7872762 (2961)	total: 8m 50s	remaining: 6.81s
2962:	learn: 1.4741013	test: 1.7872623	best: 1.7872623 (2962)	total: 8m

<catboost.core.CatBoostRegressor at 0x1a3815a278>

In [1166]:
test_data = read_test_data()
test_data = [text_preprocessor(text) for text in test_data]

In [1167]:
tfidf_matrix_test = vectorizer.transform(test_data)
svm_result = svm_regressor.predict(tfidf_matrix_test)
svm_result = svm_result.reshape(-1, 1)

In [1168]:
cb_tfidf_matrix_test = hstack([tfidf_matrix_test, svm_result])
result = cb_regressor.predict(cb_tfidf_matrix_test)

In [1169]:
scores = get_scores(result)
dump_scores(scores)

### SVM ensemble

In [1170]:
l1_vocabulary = json_read('strong_vocab.txt')
vectorizer = TfidfVectorizer(token_pattern=r'\S{2,}', ngram_range=(1, 2), vocabulary=l1_vocabulary)
tfidf_matrix_train = vectorizer.fit_transform(preprocessed_train_data)
tfidf_matrix_train.shape

(20000, 3392)

In [1171]:
svm_regressors = []
for i in tqdm(range(20)):
    ids = np.random.choice(20000, size=18000)
    svm_regressors.append(SVR(kernel='linear'))
    svm_regressors[i].fit(tfidf_matrix_train[ids], np.array(train_scores)[ids])

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))




In [1073]:
sgd_regressor = SGDRegressor()

In [1076]:
predicted = []
for i in tqdm(range(5)):
    predicted.append(svm_regressors[i].predict(tfidf_matrix_train[16000:]).reshape(-1, 1))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




In [1077]:
predicted_train = np.hstack(predicted)

In [1079]:
sgd_regressor.fit(predicted_train, train_scores[16000:])

SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='squared_loss', max_iter=1000,
             n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=None,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0,
             warm_start=False)

In [1194]:
test_data = read_test_data()
test_data = [text_preprocessor(text) for text in test_data]

In [1195]:
tfidf_matrix_test = vectorizer.transform(test_data)
result = np.zeros(1000)
for i in tqdm(range(10)):
    result += svm_regressors[i].predict(tfidf_matrix_test) / 10

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [1196]:
scores = get_scores(result)
dump_scores(scores)

### Simple tf-idf vectorizer + linear SVM

In [618]:
vectorizer = TfidfVectorizer(token_pattern=r'\S+', min_df=10)
tfidf_matrix_train = vectorizer.fit_transform(preprocessed_train_data)
tfidf_matrix_train.shape

(20000, 6943)

In [619]:
regressor = SVR(kernel='linear')

In [332]:
regressor.fit(tfidf_matrix_train, train_scores)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [333]:
test_data = read_test_data()
test_data = [text_preprocessor(text) for text in test_data]

In [334]:
tfidf_matrix_test = vectorizer.transform(test_data)
result = regressor.predict(tfidf_matrix_test)

In [335]:
scores = get_scores(result)
dump_scores(scores)

#### Punctuation exploration

In [97]:
spec_symbols = defaultdict(int)
for sent in train_data:
    for symbol in punct_tokenizer(sent):
        spec_symbols[symbol] += 1

In [98]:
for s, c in sorted(spec_symbols.items(), key=lambda x: x[1], reverse=True):
    print(s, c)

)) 1583
!!! 948
))) 640
!! 232
(( 163
??? 93
((( 44
?? 29


#### Significance

In [939]:
helper_vectorizer = CountVectorizer(token_pattern=r'\S{2,}', ngram_range=(1, 2), min_df=10)
helper_vectorizer.fit(preprocessed_train_data)
sign_vocab = {v: k for k, v in helper_vectorizer.vocabulary_.items()}

In [940]:
cntr = Counter(train_scores)
cntr

Counter({6: 1061,
         7: 1872,
         10: 5450,
         5: 1269,
         9: 4787,
         8: 3469,
         3: 580,
         4: 806,
         1: 358,
         2: 348})

In [965]:
stratified_data = defaultdict(list)
for i, sent in enumerate(preprocessed_train_data):
    score = train_scores[i]
    stratified_data[score].append(sent)

In [967]:
significance = np.zeros((len(sign_vocab),))

for label in tqdm(range(1, 11)):
    cur_av_tf = np.sum(
        helper_vectorizer.transform(stratified_data[label]).toarray().astype('float64'), 
        axis=0
    ) / cntr[label]
    mpl = label - 5 if label >= 6 else label - 6
    cur_av_tf *= mpl
    significance += cur_av_tf

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [968]:
words_sign = {}
for i, sign in enumerate(significance):
    words_sign[sign_vocab[i]] = sign

In [982]:
sorted_sign = sorted(words_sign.items(), key=lambda p: abs(p[1]), reverse=True)
significance_vocabulary = [word for word, _ in sorted_sign[:4000]]

In [983]:
json_dump(significance_vocabulary, 'sign_vocab.txt')

In [951]:
np.argmax(s1[8000:])
s1[12911]

0.5195530726256983

In [958]:
sign_vocab[12911]

'что'

In [714]:
stem = SnowballStemmer('russian')

In [721]:
mystem.lemmatize('')

' '

In [849]:
a = np.zeros((4, 3))
b = np.zeros((4, 1))

In [851]:
np.vstack([a, b])

ValueError: all the input array dimensions except for the concatenation axis must match exactly