In [1]:
import pandas as pd, numpy as np

In [2]:
# Load the labelled training set: tab-separated, no header row.
# Quoting is switched off entirely (quoting=3 is csv.QUOTE_NONE), the same way the
# public test file is read. The previous `quotechar=' '` made the space character a
# quote character, so any field that started with a space was parsed as a quoted
# field and could lose characters or swallow the following tab separators.
train = pd.read_csv('train.tsv', sep='\t', quoting=3, header=None)
# Missing cells become empty strings so the text columns can be tokenized/vectorized.
train.fillna('', inplace=True)

In [3]:
# Load the public test set: tab-separated, no header row, quoting disabled
# (quoting=3 is csv.QUOTE_NONE), UTF-8 encoded.
_test_read_options = dict(sep='\t', quoting=3, header=None, encoding="utf-8")
try:
    # Current pandas spelling of "silently drop malformed rows".
    test = pd.read_csv('public.tsv', on_bad_lines='skip', **_test_read_options)
except TypeError:
    # Older pandas releases do not accept `on_bad_lines`; fall back to the
    # legacy flag, which has the same effect there.
    test = pd.read_csv('public.tsv', error_bad_lines=False, **_test_read_options)
# Missing cells become empty strings so the text columns can be tokenized/vectorized.
test.fillna('', inplace=True)

In [4]:
# Name the eight positional columns of the training frame.
train.columns = [
    'context_id',
    'context_2',
    'context_1',
    'context_0',
    'reply_id',
    'reply',
    'label',
    'confidence',
]

In [5]:
# Name the six positional columns of the test frame (no label / confidence here).
test.columns = [
    'context_id',
    'context_2',
    'context_1',
    'context_0',
    'reply_id',
    'reply',
]

In [6]:
import pymorphy2 as morphy
from pymorphy2.tokenizers import simple_word_tokenize
# Single module-level analyzer instance; count_poses calls analyzer.tag(...) on every token.
analyzer = morphy.MorphAnalyzer()

def count_poses(string):
    """Count part-of-speech occurrences in a piece of text.

    The text is split with ``simple_word_tokenize`` and every token is tagged
    with the module-level ``analyzer``; only the first tag returned for a
    token is used.

    Args:
        string: Text to analyse. An empty string yields all-zero counts.

    Returns:
        dict mapping each tracked category to its token count. Keys are
        'NOUN', 'ADJ', 'VERB', 'ADVB', 'PRTF', 'NPRO', 'NUMR', 'GRND',
        'NUMB' and 'LATN'. 'ADJ' merges ADJF/ADJS, 'VERB' merges VERB/INFN
        and 'PRTF' merges PRTF/PRTS. Tokens of any other category are ignored.
    """
    interesting = {
        'NOUN': 0,
        'ADJ': 0,
        'VERB': 0,
        'ADVB': 0,
        'PRTF': 0,
        'NPRO': 0,
        'NUMR': 0,
        'GRND': 0,
        'NUMB': 0,
        'LATN': 0
    }
    # Raw tags that are folded into a shared bucket.
    merged = {
        'ADJF': 'ADJ',
        'ADJS': 'ADJ',
        'INFN': 'VERB',
        'PRTS': 'PRTF',
    }
    for token in simple_word_tokenize(string):
        tag = analyzer.tag(token)[0]
        pos = tag.POS
        if pos is None:
            # NOTE(review): per the pymorphy2 tag documentation, numbers and
            # latin tokens carry NUMB / LATN as plain grammemes and have no
            # POS, so they have to be looked up on the tag itself - confirm
            # against the installed pymorphy2 version.
            for marker in ('NUMB', 'LATN'):
                if marker in tag.grammemes:
                    interesting[marker] += 1
                    break
            continue
        # The old test `pos == 'PRTF' or 'PRTS'` was always true, so every
        # token that was not an adjective or verb was counted as a participle
        # and the remaining categories stayed at zero.
        bucket = merged.get(pos, pos)
        if bucket in interesting:
            interesting[bucket] += 1
    return interesting

In [None]:
# Preview the first five training rows.
train.head(n=5)

In [None]:
def encode_labels_as_numbers(label):
    """Translate a textual label into its numeric code.

    'bad' -> 0, 'neutral' -> 1, 'good' -> 2; anything else gives None.
    """
    for code, name in enumerate(('bad', 'neutral', 'good')):
        if label == name:
            return code
    return None

def decode_labels_as_strings(label):
    """Translate a numeric label code back into its textual name.

    0 -> 'bad', 1 -> 'neutral', 2 -> 'good'; anything else gives None.
    """
    for code, name in enumerate(('bad', 'neutral', 'good')):
        if label == code:
            return name
    return None

In [None]:
def rank2num(st):
    """Map a textual rank to a number: 'good' -> 2, 'neutral' -> 1, everything else -> 0."""
    if st == 'good':
        return 2
    return 1 if st == 'neutral' else 0


In [7]:
# Add per-category token counts for the reply and the latest context turn of
# every training row. For each text column the count dicts are parked in a
# scratch column, fanned out into one numeric column per category
# ("<text column>_<category>"), and the scratch column is dropped again.
for text_column in ('reply', 'context_0'):
    scratch_column = 'pymorphy_' + text_column
    train[scratch_column] = train[text_column].apply(count_poses)
    for pos_name in ('NOUN', 'ADJ', 'VERB', 'ADVB', 'PRTF', 'NPRO', 'NUMR', 'GRND', 'NUMB'):
        # `key=pos_name` freezes the current category inside the lambda.
        train[text_column + '_' + pos_name] = train[scratch_column].apply(
            lambda counts, key=pos_name: counts[key]
        )
    train.drop(scratch_column, axis=1, inplace=True)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse as sps

In [9]:
# Add per-category token counts for the reply and the latest context turn of
# every test row. For each text column the count dicts are parked in a
# scratch column, fanned out into one numeric column per category
# ("<text column>_<category>"), and the scratch column is dropped again.
for text_column in ('reply', 'context_0'):
    scratch_column = 'pymorphy_' + text_column
    test[scratch_column] = test[text_column].apply(count_poses)
    for pos_name in ('NOUN', 'ADJ', 'VERB', 'ADVB', 'PRTF', 'NPRO', 'NUMR', 'GRND', 'NUMB'):
        # `key=pos_name` freezes the current category inside the lambda.
        test[text_column + '_' + pos_name] = test[scratch_column].apply(
            lambda counts, key=pos_name: counts[key]
        )
    test.drop(scratch_column, axis=1, inplace=True)


In [10]:
# Names of the count columns: all reply_* columns first, then all context_0_* columns.
pos_columns = [
    prefix + '_' + pos_name
    for prefix in ('reply', 'context_0')
    for pos_name in ('NOUN', 'ADJ', 'VERB', 'ADVB', 'PRTF', 'NPRO', 'NUMR', 'GRND', 'NUMB')
]
# The same column selection is taken from both frames.
pos_train = train[pos_columns]
pos_test = test[pos_columns]

In [11]:
# Corpus for fitting the vocabulary: every training text column stacked end to end.
# pd.concat replaces the chained Series.append calls (append is no longer available
# in current pandas; concat yields the same stacked Series on every version).
content = pd.concat([train['context_0'], train['context_1'], train['context_2'], train['reply']])

# One shared TF-IDF vocabulary, fitted on training text only.
vectorizer = TfidfVectorizer().fit(content)

# Each text column is vectorized separately; the `_t` suffix marks the test matrices.
context_0 = vectorizer.transform(train['context_0'])
context_0_t = vectorizer.transform(test['context_0'])

context_1 = vectorizer.transform(train['context_1'])
context_1_t = vectorizer.transform(test['context_1'])

context_2 = vectorizer.transform(train['context_2'])
context_2_t = vectorizer.transform(test['context_2'])

reply = vectorizer.transform(train['reply'])
reply_t = vectorizer.transform(test['reply'])

# Side-by-side concatenation: four vocabulary-sized column groups per row.
X = sps.hstack((context_0, context_1, context_2, reply))
X_test = sps.hstack((context_0_t, context_1_t, context_2_t, reply_t))

# y = train['target']

In [12]:
# Show the dimensions of the train and test feature matrices side by side.
(X.shape, X_test.shape)

((97533, 163732), (9968, 163732))

In [None]:
# Disabled alternative target (rank2num(label) * confidence), kept for reference:
# train['rank'] = train['label'].apply(rank2num)
# train['target'] = train['rank'] * train['confidence']
train.head(5)

In [None]:
# NOTE(review): read top to bottom, the 'target' column is only assigned further down
# in this notebook (train['target'] = ...), so on a fresh kernel this line presumably
# raises KeyError. `y` is not referenced by the cells visible here; the model is fit
# on train['target'] directly.
y = train['target']

In [13]:
from xgboost import XGBRegressor



In [14]:
map_label = {'bad': 0, 'neutral': 1, 'good': 2}

# Encode the textual labels as numbers. Values that are not keys of map_label
# (including codes produced by an earlier run of this cell) are passed through
# unchanged, so re-running the cell no longer turns every label into NaN and
# silently zeroes all targets.
train['label'] = train['label'].map(lambda value: map_label.get(value, value))

# Regression target derived from label and annotator confidence:
#   bad     -> 1 - confidence
#   neutral -> confidence
#   good    -> 2 * confidence
# Rows with any other label keep the default 0.0. The column starts out as float
# because the values written below are floats.
train['target'] = 0.0
is_bad = train['label'] == 0
is_neutral = train['label'] == 1
is_good = train['label'] == 2
train.loc[is_bad, 'target'] = 1 - train.loc[is_bad, 'confidence']
train.loc[is_neutral, 'target'] = train.loc[is_neutral, 'confidence']
train.loc[is_good, 'target'] = 2 * train.loc[is_good, 'confidence']

In [15]:
# Gradient-boosted model with the pairwise ranking objective and trees up to depth 7.
model = XGBRegressor(
    max_depth=7,
    objective='rank:pairwise',
)

In [16]:
from sklearn.decomposition import PCA, TruncatedSVD
# Reduce the sparse TF-IDF + count features to 120 dense components.
# random_state is pinned so that repeated runs produce the same components
# (and therefore the same model inputs and predictions).
svd = TruncatedSVD(n_components=120, random_state=0)

# NOTE(review): X and X_test are rebound in place below, so this cell is meant to be
# run exactly once after the TF-IDF cell; a second run would stack the count columns
# again and feed the already reduced X_test back in.
X = sps.hstack((X, pos_train))
X_transformed = svd.fit_transform(X)

# The test matrix goes through the projection fitted on the training matrix.
X_test = sps.hstack((X_test, pos_test))
X_test = svd.transform(X_test)

In [17]:
# Train on the SVD-reduced features against the confidence-weighted target.
model.fit(X=X_transformed, y=train['target'])

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=7,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='rank:pairwise', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [18]:
# Score every candidate reply of the test set.
test['target'] = model.predict(X_test)

# Order rows by context_id and then by score, both descending, and keep only the
# two id columns for the submission.
sub = test.sort_values(['context_id', 'target'], ascending=False).loc[:, ['context_id', 'reply_id']]

# Tab-separated output without header or index column.
sub.to_csv('xgboost.tsv', sep='\t', header=False, index=False)