In [35]:
from collections import Counter
from itertools import combinations
import pandas as pd
import numpy as np
import re
import string
from tqdm import tqdm_notebook
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score, GroupKFold
from xgboost import XGBRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

from tqdm._tqdm_notebook import tqdm_notebook
from sklearn.linear_model import LinearRegression, Ridge, Lasso

tqdm_notebook.pandas()

pd.set_option('display.max_columns' , 999)
pd.options.display.max_colwidth = 300

import fastText

In [2]:
df_train = pd.read_csv('data/train.tsv', sep='\t', quotechar=' ', quoting=3, header=None, encoding='utf-8')
df_train.columns = ['context_id', 'context_2', 'context_1', 'context_0', 'reply_id', 'reply', 'label', 'confidence']
# df_test = pd.read_csv('data/public.tsv', sep='\t', quotechar=' ', quoting=3, header=None, encoding='utf-8')
df_test = pd.read_csv('data/final.tsv', sep='\t', quotechar=' ', quoting=3, header=None, encoding='utf-8')
df_test.columns = ['context_id', 'context_2', 'context_1', 'context_0', 'reply_id', 'reply']

In [3]:
print(df_train.shape)
df_train.head(n=7)

(97533, 8)


Unnamed: 0,context_id,context_2,context_1,context_0,reply_id,reply,label,confidence
0,22579918886,"кликни на меня а потом на надпись "" видео - зв...","о , я тебя вижу .","ладно , повесь трубку .",0,не могу .,good,0.875352
1,22579918886,"кликни на меня а потом на надпись "" видео - зв...","о , я тебя вижу .","ладно , повесь трубку .",1,"нет , звонить буду я .",neutral,0.900968
2,22579918886,"кликни на меня а потом на надпись "" видео - зв...","о , я тебя вижу .","ладно , повесь трубку .",2,"слушай , я не мог уйти .",bad,0.88432
3,22579918886,"кликни на меня а потом на надпись "" видео - зв...","о , я тебя вижу .","ладно , повесь трубку .",3,я не прекращу звонить .,good,0.98253
4,22579918886,"кликни на меня а потом на надпись "" видео - зв...","о , я тебя вижу .","ладно , повесь трубку .",4,я звоню им .,good,0.838054
5,22579918886,"кликни на меня а потом на надпись "" видео - зв...","о , я тебя вижу .","ладно , повесь трубку .",5,просто повесь трубку .,bad,0.955718
6,50117448291,бывало и получше .,"слушайте , мы с женой . . . совсем заблудились .","я пытаюсь добраться до юджина , но , кажется ,...",0,едете ?,bad,0.909115


In [4]:
print(df_test.shape)
df_test.head(n=7)

(104834, 6)


Unnamed: 0,context_id,context_2,context_1,context_0,reply_id,reply
0,4909294510,,,нет . . . у тебя на лице написано - нет .,0,тогда я попытался с двумя другими женщинами и ...
1,4909294510,,,нет . . . у тебя на лице написано - нет .,1,"я улыбаюсь потому . . . потому что , описывая ..."
2,4909294510,,,нет . . . у тебя на лице написано - нет .,2,"это , так сказать , соответствует уровню моей ..."
3,4909294510,,,нет . . . у тебя на лице написано - нет .,3,я врач .
4,4909294510,,,нет . . . у тебя на лице написано - нет .,4,не обращайте на меня внимания !
5,4909294510,,,нет . . . у тебя на лице написано - нет .,5,извините .
6,15805100619,,сейчас непросто думать .,весь напичкан лекарствами .,0,успокоит желудок .


In [5]:
df_train.fillna('', inplace=True)
df_test.fillna('', inplace=True)

# FastText

In [6]:
ft_model = fastText.load_model('./data/cc.ru.300.bin')

In [7]:
def fasttext_feature(df):
    return (df['context_2'] + ' ' + df['context_1'] + ' ' + df['context_0'] + ' ' + df['reply']).apply(
        lambda r: ft_model.get_sentence_vector(r)
    )

column_names = ['ft_vec_{}'.format(i) for i in range(300)]
X_train = pd.DataFrame(np.vstack(fasttext_feature(df_train)), columns=column_names)
X_test  = pd.DataFrame(np.vstack(fasttext_feature(df_test)),  columns=column_names)

In [8]:
X_train.head()

Unnamed: 0,ft_vec_0,ft_vec_1,ft_vec_2,ft_vec_3,ft_vec_4,ft_vec_5,ft_vec_6,ft_vec_7,ft_vec_8,ft_vec_9,ft_vec_10,ft_vec_11,ft_vec_12,ft_vec_13,ft_vec_14,ft_vec_15,ft_vec_16,ft_vec_17,ft_vec_18,ft_vec_19,ft_vec_20,ft_vec_21,ft_vec_22,ft_vec_23,ft_vec_24,ft_vec_25,ft_vec_26,ft_vec_27,ft_vec_28,ft_vec_29,ft_vec_30,ft_vec_31,ft_vec_32,ft_vec_33,ft_vec_34,ft_vec_35,ft_vec_36,ft_vec_37,ft_vec_38,ft_vec_39,ft_vec_40,ft_vec_41,ft_vec_42,ft_vec_43,ft_vec_44,ft_vec_45,ft_vec_46,ft_vec_47,ft_vec_48,ft_vec_49,ft_vec_50,ft_vec_51,ft_vec_52,ft_vec_53,ft_vec_54,ft_vec_55,ft_vec_56,ft_vec_57,ft_vec_58,ft_vec_59,ft_vec_60,ft_vec_61,ft_vec_62,ft_vec_63,ft_vec_64,ft_vec_65,ft_vec_66,ft_vec_67,ft_vec_68,ft_vec_69,ft_vec_70,ft_vec_71,ft_vec_72,ft_vec_73,ft_vec_74,ft_vec_75,ft_vec_76,ft_vec_77,ft_vec_78,ft_vec_79,ft_vec_80,ft_vec_81,ft_vec_82,ft_vec_83,ft_vec_84,ft_vec_85,ft_vec_86,ft_vec_87,ft_vec_88,ft_vec_89,ft_vec_90,ft_vec_91,ft_vec_92,ft_vec_93,ft_vec_94,ft_vec_95,ft_vec_96,ft_vec_97,ft_vec_98,ft_vec_99,ft_vec_100,ft_vec_101,ft_vec_102,ft_vec_103,ft_vec_104,ft_vec_105,ft_vec_106,ft_vec_107,ft_vec_108,ft_vec_109,ft_vec_110,ft_vec_111,ft_vec_112,ft_vec_113,ft_vec_114,ft_vec_115,ft_vec_116,ft_vec_117,ft_vec_118,ft_vec_119,ft_vec_120,ft_vec_121,ft_vec_122,ft_vec_123,ft_vec_124,ft_vec_125,ft_vec_126,ft_vec_127,ft_vec_128,ft_vec_129,ft_vec_130,ft_vec_131,ft_vec_132,ft_vec_133,ft_vec_134,ft_vec_135,ft_vec_136,ft_vec_137,ft_vec_138,ft_vec_139,ft_vec_140,ft_vec_141,ft_vec_142,ft_vec_143,ft_vec_144,ft_vec_145,ft_vec_146,ft_vec_147,ft_vec_148,ft_vec_149,ft_vec_150,ft_vec_151,ft_vec_152,ft_vec_153,ft_vec_154,ft_vec_155,ft_vec_156,ft_vec_157,ft_vec_158,ft_vec_159,ft_vec_160,ft_vec_161,ft_vec_162,ft_vec_163,ft_vec_164,ft_vec_165,ft_vec_166,ft_vec_167,ft_vec_168,ft_vec_169,ft_vec_170,ft_vec_171,ft_vec_172,ft_vec_173,ft_vec_174,ft_vec_175,ft_vec_176,ft_vec_177,ft_vec_178,ft_vec_179,ft_vec_180,ft_vec_181,ft_vec_182,ft_vec_183,ft_vec_184,ft_vec_185,ft_vec_186,ft_vec_187,ft_vec_188,ft_vec_189,ft_vec_190,ft_vec_191,ft_vec_192,ft_vec_193,ft_vec_194,ft_vec_195,ft_vec_196,ft_vec_197,ft_vec_198,ft_vec_199,ft_vec_200,ft_vec_201,ft_vec_202,ft_vec_203,ft_vec_204,ft_vec_205,ft_vec_206,ft_vec_207,ft_vec_208,ft_vec_209,ft_vec_210,ft_vec_211,ft_vec_212,ft_vec_213,ft_vec_214,ft_vec_215,ft_vec_216,ft_vec_217,ft_vec_218,ft_vec_219,ft_vec_220,ft_vec_221,ft_vec_222,ft_vec_223,ft_vec_224,ft_vec_225,ft_vec_226,ft_vec_227,ft_vec_228,ft_vec_229,ft_vec_230,ft_vec_231,ft_vec_232,ft_vec_233,ft_vec_234,ft_vec_235,ft_vec_236,ft_vec_237,ft_vec_238,ft_vec_239,ft_vec_240,ft_vec_241,ft_vec_242,ft_vec_243,ft_vec_244,ft_vec_245,ft_vec_246,ft_vec_247,ft_vec_248,ft_vec_249,ft_vec_250,ft_vec_251,ft_vec_252,ft_vec_253,ft_vec_254,ft_vec_255,ft_vec_256,ft_vec_257,ft_vec_258,ft_vec_259,ft_vec_260,ft_vec_261,ft_vec_262,ft_vec_263,ft_vec_264,ft_vec_265,ft_vec_266,ft_vec_267,ft_vec_268,ft_vec_269,ft_vec_270,ft_vec_271,ft_vec_272,ft_vec_273,ft_vec_274,ft_vec_275,ft_vec_276,ft_vec_277,ft_vec_278,ft_vec_279,ft_vec_280,ft_vec_281,ft_vec_282,ft_vec_283,ft_vec_284,ft_vec_285,ft_vec_286,ft_vec_287,ft_vec_288,ft_vec_289,ft_vec_290,ft_vec_291,ft_vec_292,ft_vec_293,ft_vec_294,ft_vec_295,ft_vec_296,ft_vec_297,ft_vec_298,ft_vec_299
0,0.017006,-0.034744,0.006785,-0.001589,-0.000538,0.004771,-0.016154,0.004372,-0.02621,-0.019262,0.002816,-0.003943,0.007769,0.003796,-0.017581,-0.020314,-0.01345,0.003917,-0.026513,0.002963,-0.005929,-0.001393,0.001359,0.026505,-0.013441,-0.008334,0.008402,0.033696,0.006745,0.003382,-0.015411,-0.004139,-0.005801,0.026962,-0.004942,-0.015525,0.003415,-0.306534,-0.023072,-0.005365,-0.016269,-0.008223,-0.004202,-0.021127,-0.021455,0.015516,-0.001456,0.016348,-0.011353,-0.005339,-0.000371,-0.013523,0.000366,-0.00134,-0.011223,-0.011227,-0.000396,0.002062,-0.042276,0.012103,-0.130811,-0.012966,0.02052,-0.008974,-0.005352,-0.007379,-0.003888,-0.002084,0.000176,-0.00715,-0.010111,0.010237,0.020073,-0.022171,-0.005727,0.006169,-0.014142,-0.021247,0.004208,0.009417,0.005247,0.020148,0.005188,-0.015143,-0.007667,-0.005906,0.011659,-0.030654,0.013179,0.002999,-0.025164,0.005919,0.003805,0.02701,0.00617,-0.002481,0.022149,0.00516,-0.007737,-0.006852,-0.025896,0.007271,-0.031287,0.012182,-0.003527,-0.016936,-0.030101,0.012009,0.00576,0.00884,0.022732,0.019283,0.020061,-0.005984,0.006277,0.01497,-0.005526,0.020266,0.010317,0.003029,0.008549,-0.002233,0.037159,-0.002237,0.010355,-0.000414,-0.002136,-0.013688,-0.002126,0.007183,0.018911,-0.010343,0.00459,-0.006213,-0.001387,-0.0059,0.013924,0.012624,-0.025142,0.007533,0.009448,0.003546,0.045312,0.01776,-0.044217,-7.5e-05,0.036257,0.076109,0.005965,-0.0083,-0.006248,0.001449,-0.017018,0.006499,-0.000421,0.018044,-0.017401,-0.005034,0.005432,-0.007479,-0.000585,0.0219,-0.01711,-0.022421,-0.015783,0.008137,-0.002116,0.007834,0.000716,0.012539,-0.01468,0.016543,0.006321,0.009983,-0.05513,-0.004646,-0.022277,-0.015003,0.02587,0.030474,-0.003492,-0.078183,0.00408,-0.020222,-0.014423,-0.001981,0.009637,0.002084,-0.002822,0.007181,-0.045375,0.000356,-0.004929,0.018391,0.003895,-0.000321,0.041389,0.004135,0.002047,-0.002158,-0.016128,0.018415,-0.001623,-0.015352,-0.014923,0.006149,0.00322,0.017156,-0.01228,-0.029824,0.02505,-0.020363,-0.00786,-0.018273,-0.004061,-0.000892,-0.015386,0.025924,-0.035209,0.027806,0.006814,-0.006054,-0.0027,-0.004081,0.012842,-0.007086,-0.00861,0.010758,0.016476,-0.058985,-0.00538,0.034787,-0.003073,0.029826,0.018863,-0.018312,0.018291,-0.025602,-0.005894,-0.023101,0.009164,0.041173,0.016274,-0.019938,-0.011989,0.011937,0.009141,0.001159,0.012404,-0.000592,-0.026291,0.017418,-0.0156,-0.057316,0.008761,-0.014248,-0.032617,0.004016,-0.007097,0.000662,-0.017317,0.002168,-0.011718,-0.030618,-0.024083,0.036993,0.003904475,-0.000847,-0.016503,-0.003308,0.047435,0.002616,0.160604,-0.020432,0.00803,-0.019937,-0.023147,-0.015289,0.001726,0.003913,0.006034,0.004852,-0.002654,0.002026,0.013954,0.008379,-0.003536,0.026021,-0.008296,0.004204,-0.023171,-0.021833,0.005263,-0.03254,-0.06518,0.010849,-0.008606,0.029214,-0.017494,-0.041679
1,0.019021,-0.038385,0.004946,-0.001751,0.002781,0.001333,-0.021151,0.003749,-0.02629,-0.022307,0.007709,-0.002427,0.009355,0.005442,-0.01783,-0.016448,-0.014782,0.001949,-0.022715,0.001575,-0.008362,-0.001707,-0.00379,0.033461,-0.017163,-0.007481,0.004991,0.039207,0.005774,0.001102,-0.01882,-0.005465,-0.006878,0.024222,-0.005224,-0.014054,0.007665,-0.301112,-0.029435,-0.003414,-0.016222,-0.016557,-0.004032,-0.021373,-0.026762,0.013349,-0.001868,0.007404,-0.006028,0.003291,-4.3e-05,-0.009226,0.006059,-0.003609,-0.01895,-0.015126,0.000712,0.009338,-0.044963,0.008452,-0.130233,-0.011521,0.022649,-0.001376,-0.007375,-0.002977,-0.002882,-0.003966,0.001174,-0.010103,-0.013468,0.010577,0.017456,-0.020555,-0.009071,0.002844,-0.022572,-0.019439,0.007905,0.004481,0.002432,0.023732,0.004398,-0.014851,-0.00833,-0.000317,0.011459,-0.026764,0.013784,0.007656,-0.025496,0.008147,0.002908,0.026201,0.002695,-0.002536,0.022437,0.013626,-0.01188,-0.00189,-0.026895,0.004644,-0.033201,0.012521,-0.00211,-0.015802,-0.023128,0.007939,0.003158,0.010595,0.021692,0.01754,0.018031,-0.007776,0.007499,0.012846,-0.003521,0.019712,0.001217,0.003294,0.015431,0.004871,0.040936,0.007202,0.010965,0.003454,-0.006213,-0.014492,-0.004325,0.005699,0.018535,-0.009303,0.004107,-0.005886,0.000897,-0.008532,0.012522,0.014962,-0.026623,0.005456,0.014658,0.009774,0.046992,0.015991,-0.046296,-0.001005,0.039247,0.07048,0.007523,-0.008794,0.000149,-0.000265,-0.022262,0.003254,-0.001114,0.017924,-0.01102,-0.000949,0.002391,-0.007283,-0.002216,0.024252,-0.014869,-0.024102,-0.009355,0.006446,-0.012307,0.016799,0.002027,0.013399,-0.009452,0.014359,0.008055,0.006636,-0.053396,-0.003662,-0.022037,-0.013624,0.027641,0.02717,-0.001288,-0.079408,0.002223,-0.023374,-0.014862,-0.005705,0.009024,0.003818,-0.003375,0.006199,-0.048065,0.002218,-0.000214,0.017932,0.003881,-0.002504,0.041738,0.007499,-0.00219,-0.001572,-0.01612,0.020206,-0.000474,-0.0181,-0.01727,0.009046,0.001145,0.013648,-0.003814,-0.029787,0.023675,-0.024135,-0.011933,-0.01398,-0.000935,-0.001972,-0.018494,0.019796,-0.037344,0.023081,0.009985,-0.001315,-0.003963,-0.000554,0.009144,-0.007473,-0.006383,0.009129,0.019195,-0.062581,-0.00335,0.033525,-0.00437,0.026268,0.018837,-0.014822,0.01939,-0.024653,-0.002868,-0.02619,0.008529,0.039409,0.0184,-0.018261,-0.015195,0.009691,0.010687,-0.002374,0.009195,0.001247,-0.020987,0.011031,-0.018187,-0.056739,0.007387,-0.010698,-0.031512,0.003902,-0.003815,0.002051,-0.025796,0.006482,-0.014389,-0.031027,-0.019935,0.036105,-8.596612e-07,0.001659,-0.013297,-0.00465,0.047141,0.000568,0.159519,-0.022473,0.009222,-0.01949,-0.023613,-0.018424,0.004271,0.003514,0.009427,0.006021,0.002013,-0.001091,0.012698,0.012384,-0.001788,0.021871,-0.009894,-0.001933,-0.02387,-0.022513,0.003574,-0.032606,-0.060522,0.005689,-0.00734,0.02358,-0.028921,-0.043198
2,0.016084,-0.035228,0.008171,-0.002742,-0.002678,0.007532,-0.014213,-0.00152,-0.030775,-0.0171,0.006904,0.002459,0.009928,2e-05,-0.015156,-0.01367,-0.013211,0.000609,-0.022672,0.002265,-0.006392,-0.004385,-0.005565,0.026944,-0.009453,-0.003559,0.00736,0.027973,0.004019,-0.000378,-0.014929,-0.006596,-0.011111,0.02652,-0.005072,-0.016942,0.005608,-0.305401,-0.02627,-0.006727,-0.014131,-0.011064,-0.004873,-0.01731,-0.026085,0.016906,0.002466,0.014009,-0.006575,-0.002481,-0.002485,-0.010427,0.002093,0.000407,-0.011974,-0.014065,0.001234,0.002118,-0.044991,0.011769,-0.12684,-0.005656,0.022487,-0.005106,-0.005941,-0.00195,-0.004742,-0.000665,-0.002168,-0.006264,-0.006928,0.009253,0.022712,-0.024054,-0.003343,0.004273,-0.012374,-0.019618,0.00529,0.006979,0.003691,0.020454,0.001705,-0.009372,-0.008797,2.2e-05,0.014512,-0.028747,0.017111,0.005335,-0.021737,0.005745,0.004145,0.025638,0.015517,-0.004215,0.018332,0.007357,-0.020849,-0.009123,-0.026913,0.01413,-0.033008,0.009485,-0.002507,-0.018128,-0.025434,0.01588,0.003691,0.006583,0.021417,0.018193,0.015217,-0.003432,0.008842,0.018101,-0.006886,0.021364,0.011301,-8.9e-05,0.003778,-0.003118,0.040201,-0.00135,0.009628,0.000785,-0.00163,-0.013551,0.000785,0.006497,0.020844,-0.004677,0.000792,-0.004962,0.000539,-0.001473,0.011052,0.014408,-0.025467,0.009402,0.002355,0.003496,0.044129,0.022545,-0.044125,0.002345,0.040593,0.069851,0.003681,-0.006528,-0.013298,0.006415,-0.02312,0.001566,-0.004253,0.016487,-0.002236,-0.002086,0.005849,-0.00693,-0.008653,0.020559,-0.016768,-0.022965,-0.01172,0.011881,0.001399,0.014539,-0.001172,0.01336,-0.013721,0.019775,0.007,0.005789,-0.052559,-0.002971,-0.022672,-0.016718,0.0286,0.023921,-0.002887,-0.086834,0.003946,-0.020093,-0.013805,-0.00133,0.006657,0.003429,-0.000546,0.004893,-0.044284,0.001757,-0.003189,0.015701,0.006235,-8e-06,0.040675,0.00281,-0.000217,-6.2e-05,-0.01652,0.018921,-0.005047,-0.010401,-0.015887,0.003907,-0.001838,0.02064,-0.010964,-0.030479,0.012842,-0.02071,-0.01278,-0.013898,-0.003431,-0.001217,-0.009875,0.023969,-0.030686,0.028381,0.005751,-0.008792,-0.001901,-0.004455,0.012392,-0.01362,-0.007988,0.005804,0.013563,-0.065194,-0.002491,0.032624,0.000697,0.030437,0.020945,-0.021527,0.015322,-0.026306,-0.005053,-0.020299,0.007941,0.038801,0.011294,-0.019602,-0.009981,0.011833,0.010488,-0.001423,0.007395,0.002611,-0.011855,0.010763,-0.017187,-0.048274,0.005602,-0.014123,-0.038178,0.002874,-0.001994,0.007803,-0.022012,0.000824,-0.013412,-0.03072,-0.023267,0.037247,0.006871325,-0.00308,-0.016298,-0.007488,0.044572,0.001555,0.159614,-0.020962,0.005111,-0.019426,-0.02045,-0.017304,0.0027,0.00725,0.006624,0.000594,-0.001113,0.003047,0.015484,0.014885,-0.000747,0.022679,-0.011021,0.002582,-0.026506,-0.020621,0.005013,-0.031119,-0.062513,0.010992,-0.008315,0.031021,-0.017787,-0.043509
3,0.021578,-0.034345,0.006115,-0.002616,0.003054,0.003902,-0.016581,0.00135,-0.028114,-0.022057,2.6e-05,-0.000854,0.007219,0.003237,-0.015361,-0.015769,-0.017495,0.001139,-0.023943,0.000801,-0.003986,-0.005533,0.001265,0.031506,-0.013633,-0.010045,0.004539,0.036409,0.00269,0.002006,-0.017671,-0.002529,-0.010656,0.030003,-0.00626,-0.014243,0.01097,-0.298412,-0.024593,-0.006828,-0.016945,-0.014604,-0.005897,-0.024992,-0.019722,0.011787,0.002989,0.009994,-0.010161,-0.003697,-0.004931,-0.01142,0.003125,-0.003677,-0.017826,-0.013076,-0.00309,0.012749,-0.044204,0.010347,-0.126958,-0.004636,0.024162,-0.004364,-0.007451,-0.006525,-0.0048,-0.0032,0.002963,-0.008917,-0.009445,0.010745,0.018892,-0.020733,-0.003933,0.00167,-0.017875,-0.018162,0.010239,0.005349,0.000762,0.022182,0.004936,-0.009614,-0.007359,0.000594,0.012722,-0.028333,0.013788,0.003999,-0.023333,0.0066,0.000519,0.027438,0.008535,-0.004,0.019265,0.01098,-0.008808,-0.002638,-0.027292,0.012564,-0.030357,0.013938,-0.007344,-0.01572,-0.027443,0.007859,0.010614,0.011356,0.024535,0.017713,0.016616,-0.006149,0.00846,0.01539,-0.003471,0.023049,0.001566,0.002752,0.012789,0.003421,0.039629,0.00492,0.005075,0.004007,-0.001497,-0.015858,-0.002538,0.007795,0.018493,-0.010704,0.000605,-0.005401,0.002184,-0.007091,0.012102,0.0144,-0.024764,0.004073,0.010367,0.007269,0.03942,0.017664,-0.041472,4.3e-05,0.033382,0.068956,0.007324,-0.008964,-0.006322,-0.0025,-0.017136,0.000994,-0.003737,0.017365,-0.008068,0.000552,0.004605,-0.008681,-0.001654,0.019939,-0.014302,-0.022409,-0.006826,0.006573,-0.005959,0.013356,0.00196,0.012569,-0.007208,0.016195,0.009321,0.008404,-0.054552,-0.000171,-0.024876,-0.013099,0.020042,0.027137,-0.001006,-0.080338,0.00274,-0.023316,-0.011285,-0.006622,0.006382,0.001941,-0.0033,0.005623,-0.047225,0.000104,-0.002569,0.018035,0.002633,-0.002786,0.038617,0.003793,-0.004353,-0.0039,-0.017035,0.02065,-0.007606,-0.017694,-0.018863,0.009364,-0.000808,0.016162,-0.013158,-0.030672,0.021319,-0.021764,-0.011479,-0.018763,-0.003813,-0.002217,-0.023609,0.023406,-0.034092,0.025318,0.006017,-0.001647,-0.002403,0.000553,0.009821,-0.005946,-0.004036,0.009032,0.022098,-0.0642,-0.00086,0.030289,-0.003233,0.030441,0.02113,-0.015505,0.021811,-0.025458,-0.004844,-0.0251,0.006284,0.04281,0.017351,-0.021187,-0.00967,0.010719,0.012128,-0.003338,0.008028,0.00378,-0.022383,0.013051,-0.018991,-0.056798,0.008903,-0.012155,-0.03349,0.002173,-0.001747,0.000233,-0.026485,0.006716,-0.009558,-0.027323,-0.017439,0.035287,0.002363026,0.000565,-0.015063,-0.007956,0.048865,-0.0003,0.164257,-0.020282,0.006163,-0.020347,-0.023339,-0.01774,0.005522,0.00262,0.004748,0.005451,0.000563,0.000105,0.016828,0.011852,-0.003854,0.020634,-0.011402,0.001694,-0.024416,-0.020345,0.002023,-0.035167,-0.062145,0.011246,-0.004974,0.023922,-0.022738,-0.036738
4,0.017599,-0.035226,0.002638,-0.00276,0.000406,0.002578,-0.014829,0.001294,-0.024681,-0.019856,0.001191,0.004037,0.01184,0.006206,-0.015637,-0.021706,-0.016247,0.004111,-0.020068,-0.000271,-0.005508,-0.003325,0.000887,0.028516,-0.009259,-0.012022,0.003348,0.03711,0.008312,-0.000739,-0.020498,-0.006137,-0.003972,0.034488,-0.011239,-0.011255,-0.003861,-0.298995,-0.030221,-0.007108,-0.016646,-0.012015,-0.004739,-0.015413,-0.022588,0.021115,0.000784,0.016851,-0.014236,0.001732,-0.006038,-0.010231,0.003538,-0.006806,-0.015375,-0.014158,0.003664,0.010781,-0.044512,0.013307,-0.127045,-0.009368,0.022605,-0.011968,-0.007813,-0.007738,-0.002436,-0.007095,0.001855,-0.007376,-0.009973,0.005211,0.021946,-0.018153,-0.012745,0.004023,-0.016895,-0.022109,0.004257,0.006692,0.003074,0.026705,0.00718,-0.015944,-0.012676,-0.001096,0.010015,-0.032065,0.009577,0.000671,-0.025993,0.004641,0.002435,0.025368,0.004128,-0.002344,0.025865,0.009302,-0.008828,-0.00433,-0.02365,0.004171,-0.029235,0.010024,-0.003339,-0.016577,-0.029046,0.005392,0.010909,0.007603,0.022355,0.014738,0.020345,-0.007246,0.011173,0.0163,-0.003227,0.019259,0.005282,0.006701,0.013103,-0.006693,0.0398,0.001653,0.012397,-0.000169,0.000446,-0.017641,-0.001769,0.007421,0.017952,-0.009183,-0.000141,-0.009923,-0.004016,-0.004552,0.013994,0.018077,-0.033847,0.005253,0.011582,0.003871,0.041094,0.016597,-0.03864,-0.002314,0.03282,0.072011,0.004479,-0.011025,-0.008663,0.000363,-0.019528,0.007519,0.000378,0.019429,-0.013554,0.002781,-0.000389,-0.006712,-0.000758,0.026614,-0.013231,-0.02235,-0.010515,-0.00134,-0.0033,0.0152,0.001356,0.013706,-0.010367,0.015946,0.006418,0.017674,-0.054938,-0.00293,-0.029576,-0.017582,0.029019,0.024754,0.002462,-0.073418,0.004228,-0.021087,-0.011337,-0.00181,0.010313,0.003922,0.002989,0.003996,-0.048213,-0.002524,-0.002038,0.018858,0.004307,-0.000516,0.034721,0.002439,0.000473,-0.002888,-0.014492,0.024507,-0.008147,-0.015256,-0.015975,0.010322,0.007212,0.016246,-0.015856,-0.026233,0.025967,-0.021053,-0.015102,-0.018865,-0.0016,-0.003769,-0.021103,0.017767,-0.033828,0.021402,0.004239,-0.002903,0.003483,-0.000137,0.011535,-0.00647,-0.00332,0.010452,0.019483,-0.062608,0.003597,0.029971,-0.009043,0.031322,0.021085,-0.014022,0.019157,-0.024304,-0.009495,-0.023149,0.006366,0.039612,0.020659,-0.021187,-0.011114,0.012552,0.012267,-0.003012,0.013237,0.002554,-0.02767,0.016228,-0.01927,-0.056983,0.005801,-0.014964,-0.024842,0.003337,-0.006502,-0.001013,-0.019193,0.007623,-0.006459,-0.028773,-0.021464,0.036633,-0.001716019,-0.001749,-0.013418,-0.00836,0.045839,0.000215,0.165874,-0.020029,0.009266,-0.016687,-0.030362,-0.01348,0.00949,0.00075,0.008221,0.008903,0.00164,-0.000179,0.017568,0.012592,-0.006158,0.019824,-0.009126,0.001351,-0.019622,-0.020984,0.001588,-0.032397,-0.066057,0.010786,-0.004952,0.023401,-0.019162,-0.037292


# Simple Features

In [9]:
def preprocess_feature(X):
    '''
        X : dataframe
            Dataset for preprocess
    '''
    X = X.copy()
    X['context_2_notnull'] = X['context_2'].notnull().astype('int8')
    X['context_1_notnull'] = X['context_1'].notnull().astype('int8')

    X.fillna('', inplace=True)
    X['is_duplicate_reply'] = X['reply'].duplicated(keep=False).astype('int8')

    return X

In [10]:
def vowels_count(x):
    '''
        x : string
            string for get vowels count
    '''
    return len(re.findall(vowels, x))


def consonant_count(x):
    '''
        x : string
            string for get consonant count
    '''
    return len(re.findall(consonant, x))


def divide_vov_by_cons(x):
    '''
        x : string
            string for get ratio of vowels to consonants
    '''
    return vowels_count(x)/(consonant_count(x) + 0.001)


def count_word(x):
    '''
        x : string
            string for get count of word in string
    '''
    return len(x.split(' '))

func = [len, vowels_count, consonant_count, divide_vov_by_cons, count_word]
func_name = ['len', 'vowels_count', 'consonant_count','divide_vov_by_cons', 'count_word']

In [11]:
def drop_punctuation(x):
    '''
        x : string
            String from which the punctuation is removed
    '''
    return re.sub('[%s]' % string.punctuation, ' ', x)

def simple_feature(X):
    '''
        X : dataframe
            Dataset for get first feature
    '''
    for col in ['reply', 'context_2', 'context_1', 'context_0']:
        X[col] = X[col].apply(drop_punctuation)

    for col in ['context_2', 'context_1', 'context_0', 'reply']:
        for f, name in zip(func, func_name):
            X[col + '_' + name] = X[col].apply(f)

    return X

In [12]:
vowels = '[аеиоуыэюя]'
consonant = '[бвгджзйклмнпрстфхцчшщъь]'
alphabet = vowels[:-1] + consonant[1:]

X_train1 = simple_feature(preprocess_feature(df_train))
X_test1  = simple_feature(preprocess_feature(df_test))

In [13]:
X_test1.head()

Unnamed: 0,context_id,context_2,context_1,context_0,reply_id,reply,context_2_notnull,context_1_notnull,is_duplicate_reply,context_2_len,context_2_vowels_count,context_2_consonant_count,context_2_divide_vov_by_cons,context_2_count_word,context_1_len,context_1_vowels_count,context_1_consonant_count,context_1_divide_vov_by_cons,context_1_count_word,context_0_len,context_0_vowels_count,context_0_consonant_count,context_0_divide_vov_by_cons,context_0_count_word,reply_len,reply_vowels_count,reply_consonant_count,reply_divide_vov_by_cons,reply_count_word
0,4909294510,,,нет у тебя на лице написано нет,0,тогда я попытался с двумя другими женщинами и ...,1,1,1,0,0,0,0.0,1,0,0,0,0.0,1,41,12,13,0.923006,17,71,28,30,0.933302,14
1,4909294510,,,нет у тебя на лице написано нет,1,я улыбаюсь потому потому что описывая ...,1,1,0,0,0,0,0.0,1,0,0,0,0.0,1,41,12,13,0.923006,17,93,34,34,0.999971,26
2,4909294510,,,нет у тебя на лице написано нет,2,это так сказать соответствует уровню моей ...,1,1,0,0,0,0,0.0,1,0,0,0,0.0,1,41,12,13,0.923006,17,56,18,26,0.692281,13
3,4909294510,,,нет у тебя на лице написано нет,3,я врач,1,1,0,0,0,0,0.0,1,0,0,0,0.0,1,41,12,13,0.923006,17,8,2,3,0.666445,4
4,4909294510,,,нет у тебя на лице написано нет,4,не обращайте на меня внимания,1,1,0,0,0,0,0.0,1,0,0,0,0.0,1,41,12,13,0.923006,17,31,12,13,0.923006,7


In [14]:
print(set(X_train1.columns[10:]) ^ set(X_test1.columns[6:]))
feature_columns = list(X_train1.columns[10:])

{'context_1_notnull', 'context_2_notnull'}


In [15]:
X_train2 = pd.concat([
    X_train1[feature_columns],
    X_train
], axis=1)

X_test2  = pd.concat([
    X_test1[feature_columns],
    X_test
], axis=1)

# TF-IDF

In [39]:
df_train['context_all'] = \
    df_train['context_2'] + ' ' + \
    df_train['context_1'] + ' ' + \
    df_train['context_0'] + ' ' + \
    df_train['reply']

df_test['context_all'] = \
    df_test['context_2'] + ' ' + \
    df_test['context_1'] + ' ' + \
    df_test['context_0'] + ' ' + \
    df_test['reply']

content = df_train['context_all'].append(df_test['context_all'])
content.head()

0                   кликни на меня а потом на надпись " видео - звонок " . о , я тебя вижу . ладно , повесь трубку . не могу .
1      кликни на меня а потом на надпись " видео - звонок " . о , я тебя вижу . ладно , повесь трубку . нет , звонить буду я .
2    кликни на меня а потом на надпись " видео - звонок " . о , я тебя вижу . ладно , повесь трубку . слушай , я не мог уйти .
3     кликни на меня а потом на надпись " видео - звонок " . о , я тебя вижу . ладно , повесь трубку . я не прекращу звонить .
4                кликни на меня а потом на надпись " видео - звонок " . о , я тебя вижу . ладно , повесь трубку . я звоню им .
Name: context_all, dtype: object

In [40]:
model_tfidf = TfidfVectorizer().fit(content)

In [None]:
model_tfidf

# Target

In [16]:
map_label = {'bad': 0, 'neutral': 1, 'good': 2}

df_train['label_map'] = df_train['label'].map(map_label)

df_train['target'] = 0
df_train.loc[df_train['label_map'] == 0, 'target'] = 1 - df_train.loc[df_train['label_map'] == 0, 'confidence']
df_train.loc[df_train['label_map'] == 1, 'target'] = df_train.loc[df_train['label_map'] == 1, 'confidence']
df_train.loc[df_train['label_map'] == 2, 'target'] = 2 * df_train.loc[df_train['label_map'] == 2, 'confidence']

y_train = df_train['target']

# Train and Predict

## XGBoost - 85192.00

In [17]:
model = XGBRegressor(objective='rank:pairwise' , max_depth=5)
model.fit(X_train2, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='rank:pairwise', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [18]:
df_test['target'] = model.predict(X_test2)

In [19]:
submission = \
    df_test.sort_values(by=['context_id', 'target'], ascending=False)[['context_id', 'reply_id']]
    
submission.to_csv('myltsev_submission__xgboost.tsv', sep='\t', header=False , index=False)

## LogisticRegression

In [20]:
model_ridge = Ridge()
model_ridge.fit(X_train2, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [21]:
submission = pd.DataFrame()

submission['context_id'] = df_test['context_id']
submission['reply_id'] = df_test['reply_id']
submission['rank'] = -model.predict(X_test2)

submission.sort_values(by=['context_id', 'rank'], inplace=True)

In [22]:
submission.head()

Unnamed: 0,context_id,reply_id,rank
1,4909294510,1,-0.641196
2,4909294510,2,-0.592023
4,4909294510,4,-0.521804
0,4909294510,0,-0.430454
5,4909294510,5,-0.12304


In [23]:
submission.to_csv('myltsev_submission__xgboost__logreg_ridge.tsv',header=None, index=False, sep=' ', columns=['context_id', 'reply_id'])

In [None]:
def fetch_sentences(column_name: str):
    return df_train[~pd.isnull(df_train[column_name])][column_name].values

sentences = set(fetch_sentences('context_2')) | \
            set(fetch_sentences('context_1')) | \
            set(fetch_sentences('context_0'))
sentences

In [None]:
with open('data/sentences.txt', 'w') as f:
    for sentence in sentences:
        f.write(sentence + '\n')

In [None]:
def context2vec(df, context_name):
    print('handling', context_name)
    with open('data/sentences_temp.txt', 'w') as f:
        for sent in df[context_name].values:
            f.write(str(sent) + '\n')
    ! ../fastText/build/fasttext print-sentence-vectors data/cc.ru.300.bin < data/sentences_temp.txt > data/sentences_temp_vec.txt
    df[context_name + '_vec'] = \
        pd.read_csv('data/sentences_temp_vec.txt', header=None)[0] \
            .apply(lambda s: np.fromstring(s, dtype=float, sep=' '))
    ! rm data/sentences_temp.txt data/sentences_temp_vec.txt
    df[context_name + '_vec'] = \
        np.where(df[context_name].isnull(), df[context_name], df[context_name + '_vec'])

context2vec(df_train, 'context_2')
context2vec(df_train, 'context_1')
context2vec(df_train, 'context_0')
context2vec(df_train, 'reply')

context2vec(df_test, 'context_2')
context2vec(df_test, 'context_1')
context2vec(df_test, 'context_0')
context2vec(df_test, 'reply')

In [None]:
df_train.sample(n=5)

In [None]:
df_test.sample(n=5)

In [None]:
def label2num(st):
    if st == 'good':
        return 3
    elif st == 'neutral':
        return 2
    else:
        return 1

df_train['label_num'] = df_train['label'].apply(label2num)

In [None]:
X_train = df_train[['context_2_vec', 'context_1_vec', 'context_0_vec', 'reply_vec']]
X_train.sample(n=5)

In [None]:
y_train = df_train['confidence'] * df_train['label_num']
y_train.sample(n=5)

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso

reg = Ridge()
reg.fit(X_train, y_train)