# Import

In [1]:
from collections import Counter
from itertools import combinations
import pandas as pd
import numpy as np
import re
import string
from tqdm import tqdm_notebook
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score , GroupKFold
from xgboost import XGBRegressor

from tqdm._tqdm_notebook import tqdm_notebook

tqdm_notebook.pandas()

pd.set_option('display.max_columns' , 999)

# Simple functions applicable to the source data

In [2]:
def vowels_count(x):
    '''
        x : string
            string for get vowels count
    '''
    return len(re.findall(vowels, x))


def consonant_count(x):
    '''
        x : string
            string for get consonant count
    '''
    return len(re.findall(consonant, x))


def divide_vov_by_cons(x):
    '''
        x : string
            string for get ratio of vowels to consonants
    '''
    return vowels_count(x)/(consonant_count(x) + 0.001)


def count_word(x):
    '''
        x : string
            string for get count of word in string
    '''
    return len(x.split(' '))

func = [len, vowels_count, consonant_count, divide_vov_by_cons, count_word]
func_name = ['len', 'vowels_count', 'consonant_count','divide_vov_by_cons', 'count_word']

# Functions for calculating metrics

In [3]:
def dcg_at_k(r):
    '''
        r : int
            Assigned label
    '''
    r = np.asfarray(r)
    if r.size:
        return np.sum(r / np.log2(np.arange(2, r.size + 2)))
    return 0.


def ndcg_at_k(r):
    '''
        r : int
            Assigned label
    '''
    dcg_max = dcg_at_k(sorted(r, reverse=True))
    if not dcg_max:
        return 0.
    return dcg_at_k(r) / dcg_max

# The validation function

In [4]:
def valid(estimator, X, y, columns , n_folds = 5):
    '''
        estimator : estimator object implementing ‘fit’ and ‘predict’
            The object to use to fit the data.
        X : dataframe
            Data for predict y
        y : array , pandas.Series
            label for predict
        columns : list
            Features that will be used in the prediction
    '''
    result = []
    for tr_ind, vl_ind in GroupKFold(n_folds).split(X, groups=X['context_id']):
        predict = X.loc[vl_ind, ['context_id', 'reply_id', 'label']].copy()
        estimator.fit(X.loc[tr_ind, columns], y.loc[tr_ind])
        predict['score'] = estimator.predict(X.loc[vl_ind, columns])

        sub = predict.sort_values(by=['context_id', 'score'], 
                                  ascending=False)[['context_id', 'label']]

        res = sub.groupby('context_id')['label'].apply(ndcg_at_k).mean()
        result.append(res)
    return np.array(result)*100000

# Function for data preprocessing

In [5]:
def preprocess_feature(X):
    '''
        X : dataframe
            Dataset for preprocess
    '''
    X = X.copy()
    X['context_2_notnull'] = X['context_2'].notnull().astype('int8')
    X['context_1_notnull'] = X['context_1'].notnull().astype('int8')

    X.fillna('', inplace=True)
    X['is_duplicate_reply'] = X['reply'].duplicated(keep=False).astype('int8')

    return X

# Simple feature

In [6]:
def drop_punctuation(x):
    '''
        x : string
            String from which the punctuation is removed
    '''
    return re.sub('[%s]' % string.punctuation, ' ', x)

def simple_feature(X):
    '''
        X : dataframe
            Dataset for get first feature
    '''
    for col in ['reply', 'context_2', 'context_1', 'context_0']:
        X[col] = X[col].apply(drop_punctuation)

    for col in ['context_2', 'context_1', 'context_0', 'reply']:
        for f, name in zip(func, func_name):
            X[col + '_' + name] = X[col].apply(f)

    return X

# Read data

In [7]:
train_col_names = ['context_id','context_2','context_1','context_0','reply_id','reply','label','confidence']
public_col_names = ['context_id','context_2','context_1','context_0','reply_id','reply']


train = pd.read_csv('data/train.tsv', sep='\t' , quotechar=' ', header=None , names = train_col_names)
public = pd.read_csv('data/public.tsv', sep='\t', quotechar=' ', header=None , names = public_col_names)

# Get target

In [8]:
map_label = {'bad': 0, 'neutral': 1, 'good': 2}

train['label'] = train['label'].map(map_label)

train['target'] = 0
train.loc[train['label'] == 0, 'target'] = 1 - train.loc[train['label'] == 0, 'confidence']
train.loc[train['label'] == 1, 'target'] = train.loc[train['label'] == 1, 'confidence']
train.loc[train['label'] == 2, 'target'] = 2*train.loc[train['label'] == 2, 'confidence']

# Preprocess and feature generation

In [9]:
vowels = '[аеиоуыэюя]'
consonant = '[бвгджзйклмнпрстфхцчшщъьа]'
alphabet = vowels[:-1] + consonant[1:]


train = preprocess_feature(train)
public = preprocess_feature(public)

train = simple_feature(train)
public = simple_feature(public)

# Validate

In [10]:
to_drop = ['context_id', 'context_2', 'context_1', 'context_0', 'reply_id',
       'reply', 'label', 'confidence' , 'target']

columns = train.columns.drop(to_drop)

model = XGBRegressor(objective='rank:pairwise' , max_depth=5)

In [11]:
score = valid(model , X = train , y = train['target'] , columns=columns , n_folds=3)
print (score.mean() , score.std())

82955.0125798502 390.8996378980851


# Predict

In [12]:
public['target'] = model.fit(train[columns] , train['target']).predict(public[columns])

sub = public.sort_values(by=['context_id', 'target'], 
                                  ascending=False)[['context_id', 'reply_id']]
    
sub.to_csv('sub.tsv' , sep='\t' , header=False , index=False)