### Load

In [2]:
import csv
import pandas as pd, numpy as np

In [3]:
# Both files are tab-separated text without a header row.
# Quoting is disabled so that every tab is a field separator and quote
# characters inside the text stay literal.  The earlier quotechar=' ' setting
# turned the space into a quote character, so any field that begins with a
# space was parsed as a quoted field.
TSV_KWARGS = dict(sep='\t', quoting=csv.QUOTE_NONE, header=None)

# Columns shared by the train and the public (test) file, in file order.
BASE_COLUMNS = ['context_id', 'context_2', 'context_1', 'context_0', 'reply_id', 'reply']

df = pd.read_csv('data/train.tsv', **TSV_KWARGS)
df.columns = BASE_COLUMNS + ['label', 'confidence']
test = pd.read_csv('data/public.tsv', **TSV_KWARGS)
test.columns = BASE_COLUMNS

In [4]:
# Replace missing values with empty strings, in place, in both frames.
for frame in (df, test):
    frame.fillna('', inplace=True)

### Preprocessing

#### Labeling target

In [5]:
def label_enc(x, reverse=False):
    """Convert between text labels and integer codes.

    Forward (reverse == False): 'bad' -> 0, 'neutral' -> 1, anything else -> 2.
    Backward (otherwise):       0 -> 'bad', 1 -> 'neutral', anything else -> 'good'.
    """
    if reverse == False:  # equality test kept on purpose (not `not reverse`)
        return 0 if x == 'bad' else (1 if x == 'neutral' else 2)
    return 'bad' if x == 0 else ('neutral' if x == 1 else 'good')

In [6]:
# Encode the text labels as integers with label_enc.
# The dtype guard makes the cell safe to re-run: label_enc sends every value
# other than 'bad'/'neutral' to 2, so applying it to already-encoded integers
# would collapse the whole column to class 2.
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].apply(label_enc)

#### FastText

In [7]:
import fastText, re

In [8]:
# Path of the fastText binary model file that is loaded below.
ft_model_path = "./fastText/cc.ru.300.bin"
ft_model = fastText.load_model(ft_model_path)

In [9]:
def pre(s):
    """Return `s` with every character not matched by \\w replaced by a space."""
    non_word = re.compile(r'[^\w]')
    return non_word.sub(' ', s)

In [10]:
def Vect(df):
    """Build one feature matrix from the four text columns of `df`.

    The columns context_2, context_1, context_0 and reply are each cleaned
    with `pre` and embedded with `ft_model.get_sentence_vector`; the rows of
    each column are stacked vertically and the four resulting matrices are
    joined horizontally in that column order.
    """
    text_columns = ('context_2', 'context_1', 'context_0', 'reply')
    per_column = [
        np.vstack(df[name].apply(pre).apply(ft_model.get_sentence_vector))
        for name in text_columns
    ]
    return np.hstack(per_column)

In [11]:
%time
Train = Vect(df)
Test = Vect(test)
del ft_model

CPU times: user 1 µs, sys: 3 µs, total: 4 µs
Wall time: 7.63 µs


#### Train/test split

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Hold out 35% of the rows; the fixed random_state keeps the split reproducible.
split = train_test_split(Train, df['label'], test_size=0.35, random_state=42)
X_train, X_test, y_train, y_test = split

#### Scorer (NDCG)

In [14]:
def dcg_at_k(r):
    '''
        Discounted cumulative gain of a ranked list.

        r : sequence of numbers
            Relevance values in ranked order (first element = top position).

        Returns the sum of r[i] / log2(i + 2) over all positions i,
        or 0. when r is empty.
    '''
    # np.asfarray was removed in NumPy 2.0; asarray with a float dtype
    # performs the same conversion.
    r = np.asarray(r, dtype=float)
    if r.size:
        # Position i (0-based) is discounted by log2(i + 2): 1, log2(3), 2, ...
        return np.sum(r / np.log2(np.arange(2, r.size + 2)))
    return 0.


def ndcg_at_k(r):
    '''
        Normalised DCG: dcg_at_k(r) divided by the DCG of the same values
        sorted in descending order.

        r : sequence of numbers
            Relevance values in ranked order.

        Returns 0. when the descending-order DCG is zero.
    '''
    ideal = dcg_at_k(sorted(r, reverse=True))
    return dcg_at_k(r) / ideal if ideal else 0.

### Model

In [17]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [16]:
# Classifier over the encoded labels (0 = bad, 1 = neutral, 2 = good).
# The objective is stated as 'multi:softprob' because that is what is actually
# trained: the repr printed after fitting reports objective='multi:softprob',
# not the previously requested 'rank:pairwise'.
model = XGBClassifier(objective='multi:softprob', max_depth=5, n_jobs=-1)

In [18]:
# Fit on the full training matrix and all labels (not on the X_train hold-out split).
model.fit(X=Train, y=df['label'])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [24]:
# Score every reply by its expected label, sum over classes of P(class) * class.
# With the encoding 0 = bad, 1 = neutral, 2 = good the score grows with the
# predicted quality.  Taking only the largest class probability ignored which
# class it belonged to, so a confident 'bad' scored like a confident 'good'.
class_proba = model.predict_proba(Test)
proba = list(class_proba.dot(model.classes_))

In [26]:
# Attach the per-reply model score to the test frame as column 'confidence'.
test = test.assign(confidence=proba)

In [27]:
# Order the replies by context_id, then by confidence (both ascending), keep
# only the id pair and write it space-separated without the index.
# NOTE(review): ascending order puts the lowest-scored reply first within each
# context — confirm this is the order the submission is expected to have.
ranked = test.sort_values(by=['context_id', 'confidence'])
submission = ranked[['context_id', 'reply_id']]
submission.to_csv('subm.csv', encoding='utf-8', sep=' ', index=False)