# Yandex Algorithm 2018 ML Track
## #27 on the final leaderboard

https://contest.yandex.ru/algorithm2018/contest/7914/standings/

TLDR: fastText + commoncrawl + lightgbm

In [1]:
import csv

import numpy as np
import pandas as pd

In [3]:
# The TSV files have no header row, so columns get integer labels (0, 1, 2, ...).
# csv.QUOTE_NONE (numeric value 3) makes the parser treat quote characters in
# the text as ordinary characters instead of field delimiters.
train = pd.read_csv('datasets/yandex/train.tsv', sep='\t', quoting=csv.QUOTE_NONE, header=None)
test = pd.read_csv('datasets/yandex/final.tsv', sep='\t', quoting=csv.QUOTE_NONE, header=None)

In [4]:
# Replace every missing value with an empty string in both frames.
train = train.fillna('')
test = test.fillna('')

In [5]:
# Dump columns 1, 2, 3 and 5 (the columns later embedded with fastText) to
# one-value-per-line files: t<col>.txt for train, f<col>.txt for test.
#
# header=False is passed explicitly: the default for Series.to_csv became True
# in pandas 0.24, which would prepend a header line to each file and shift
# every sentence vector by one row relative to the rows of train/test.
#
# NOTE(review): default CSV quoting is left untouched, so values containing a
# comma or a double quote are still written wrapped in quotes, as before.
for col in (1, 2, 3, 5):
    train[col].to_csv('datasets/yandex/t{}.txt'.format(col), index=False, header=False)
    test[col].to_csv('datasets/yandex/f{}.txt'.format(col), index=False, header=False)

In [6]:
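# The exported files are then processed outside the notebook with the fastText CLI,
# using cc.ru.300.bin.gz from https://fasttext.cc/docs/en/crawl-vectors.html
# (the t*.txt train files need the same treatment to produce the t*.cc files loaded later).
#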
#$ ./fasttext print-sentence-vectors cc.ru.300.bin < f1.txt > f1.cc
#$ ./fasttext print-sentence-vectors cc.ru.300.bin < f2.txt > f2.cc
#$ ./fasttext print-sentence-vectors cc.ru.300.bin < f3.txt > f3.cc
#$ ./fasttext print-sentence-vectors cc.ru.300.bin < f5.txt > f5.cc

In [8]:
%%time
# Load the space-separated sentence-vector files: one frame per exported text
# column, train (t*) first and then test (f*).
# NOTE(review): the shape check shows 301 columns for 300-dimensional vectors;
# presumably the extra column comes from a trailing separator — TODO confirm.
t1, t2, t3, t5 = (
    pd.read_csv('datasets/yandex/t{}.cc'.format(col), sep=' ', header=None)
    for col in (1, 2, 3, 5)
)

te1, te2, te3, te5 = (
    pd.read_csv('datasets/yandex/f{}.cc'.format(col), sep=' ', header=None)
    for col in (1, 2, 3, 5)
)

CPU times: user 49.3 s, sys: 1.64 s, total: 51 s
Wall time: 51.1 s


In [9]:
# Sanity check: display (rows, columns) of two train vector tables and one test table.
t1.shape, t2.shape, te1.shape

((97533, 301), (97533, 301), (104834, 301))

In [10]:
# Place the four per-column vector tables side by side, so each row holds the
# vectors of text columns 1, 2, 3 and 5 in that order.
X_train = np.concatenate([t1, t2, t3, t5], axis=1)
X_test = np.concatenate([te1, te2, te3, te5], axis=1)

In [11]:
def rank2num(st):
    """Convert a textual label into a numeric grade.

    Returns 2 for 'good', 1 for 'neutral' and 0 for any other value.
    """
    if st == 'good':
        return 2
    if st == 'neutral':
        return 1
    return 0

In [12]:
# Numeric grade from the label in column 6 (good=2, neutral=1, anything else=0),
# multiplied by column 7 (presumably the label confidence — TODO confirm).
train['rank'] = train[6].map(rank2num)
train['target'] = train['rank'].mul(train[7])
y_train = train['target']

In [13]:
# Hold out the last 25% of the rows, in file order (no shuffling), for validation.
train_part_size = int(len(train['target']) * 0.75)
X_train_part, X_valid = X_train[:train_part_size], X_train[train_part_size:]
y_train_part, y_valid = y_train.iloc[:train_part_size], y_train.iloc[train_part_size:]

In [64]:
from lightgbm import LGBMRegressor

# Gradient-boosted regressor; only the number of trees is set explicitly.
reg = LGBMRegressor(n_estimators=720)

# n_estimators tried - score noted for that setting
# (presumably the leaderboard score scaled by 1e5 — TODO confirm):
# 612 - 86132
# 700 - 86261
# 720 - 86263

# Fit on the training part of the split; the held-out rows are scored in the next cells.
reg.fit(X_train_part, y_train_part)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=720,
       n_jobs=-1, num_leaves=31, objective=None, random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)

In [65]:
%%time
# Predict targets for the held-out validation rows.
reg_pred = reg.predict(X_valid)

CPU times: user 2.64 s, sys: 124 ms, total: 2.77 s
Wall time: 971 ms


In [66]:
%%time
from sklearn.metrics import mean_absolute_error

# The metric from the task statement tracked the real leaderboard no better than this one, so MAE was kept.

valid_mae = mean_absolute_error(y_valid, reg_pred)
print(valid_mae)

# Value noted from an earlier run (labelled "fasttext"): 0.700861025602

0.699003289824
CPU times: user 4.87 ms, sys: 13 µs, total: 4.89 ms
Wall time: 1.48 ms


In [67]:
%%time
# Refit the same estimator on the complete training set (train part + validation part).
reg.fit(X_train, y_train)

CPU times: user 17min 24s, sys: 2.19 s, total: 17min 26s
Wall time: 4min 50s


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=720,
       n_jobs=-1, num_leaves=31, objective=None, random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)

In [68]:
%%time
# Despite the name, y_test holds the model's predictions for the test rows, not ground truth.
y_test = reg.predict(X_test)

CPU times: user 10.5 s, sys: 446 ms, total: 11 s
Wall time: 3.52 s


In [69]:
# One row per test reply. The prediction is negated so that an ascending sort
# on 'rank' puts the highest-scored reply first.
sub = pd.DataFrame({
    'context_id': test[0],
    'reply_id': test[4],
    'rank': -y_test,
})
sub.head()

Unnamed: 0,context_id,reply_id,rank
0,4909294510,0,-0.728061
1,4909294510,1,-1.29362
2,4909294510,2,-1.189235
3,4909294510,3,-0.82046
4,4909294510,4,-1.001064


In [70]:
# Order replies within each context by ascending 'rank' (i.e. descending
# prediction), then drop the helper column so only the two ids remain.
submission = (
    sub.sort_values(by=['context_id', 'rank'])
       .drop(columns=['rank'])
)
submission.head()

Unnamed: 0,context_id,reply_id
1,4909294510,1
2,4909294510,2
4,4909294510,4
3,4909294510,3
5,4909294510,5


In [71]:
# Sanity check: the submission frame should have exactly one row per test row.
test.shape, sub.shape

((104834, 6), (104834, 3))

In [72]:
# Write the ordered (context_id, reply_id) pairs without header or index.
# Note the delimiter is a single space even though the file extension is .tsv.
submission.to_csv('yandex-final-720.tsv',header=None, index=False, sep=' ')