In [None]:
import re
import random
import logging
import gensim
import pymorphy2
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm_notebook
from matplotlib import pyplot as plt
from gensim.models.word2vec import Word2Vec
from gensim.models.fasttext import FastText
from gensim.models.word2vec import LineSentence
from gensim.models import KeyedVectors
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from sklearn.metrics.pairwise import cosine_similarity
from IPython.core.interactiveshell import InteractiveShell
from laserembeddings import Laser
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import warnings
# Silence every warning for the rest of the session (library deprecation notices included).
warnings.filterwarnings('ignore')
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (30, 30)
# Render up to 100 characters per DataFrame cell.
pd.set_option('display.max_colwidth', 100)

%matplotlib inline

## Data Loading

In [None]:
# NOTE(review): `df_train` is not created by any earlier cell in the visible part of this
# notebook, so on a fresh kernel (Restart & Run All) this line raises NameError.
# Presumably the frame came from a since-removed cell — confirm, then drop or move this cell.
df_train.to_csv('two_chats_df_train.csv', index=False)

In [None]:
# Load the training frame from the CSV snapshot, then show its shape and first rows.
df_train = pd.read_csv('two_chats_df_train.csv')
df_train.shape
df_train.head()

## Creating Test Dataset

In [None]:
def get_false_samples(df_input, n, seed=None):
    """Pick ``n`` rows of ``df_input`` at random, without replacement.

    Args:
        df_input: DataFrame to sample from; rows are selected by index label.
        n: Number of index labels to draw.
        seed: Optional seed. When given, the draw uses a private
            ``random.Random(seed)`` and is fully reproducible. When ``None``
            (default) the module-level ``random`` state is used, so calling
            ``random.seed(...)`` beforehand also makes the draw repeatable.

    Returns:
        DataFrame holding the sampled rows, in sampled order.

    Raises:
        ValueError: If ``n`` is negative or larger than the number of rows.
    """
    rng = random if seed is None else random.Random(seed)
    return df_input.loc[rng.sample(list(df_input.index), n)]

In [None]:
def get_df_test(df_input, n, seed=None, source_col='NContext'):
    """Build the ranking test set: every row gets ``n`` randomly drawn distractor texts.

    Args:
        df_input: DataFrame with at least the 'NContext' and 'NResponse' columns.
        n: Number of distractors to draw for each row.
        seed: Optional. When given, ``random.seed(seed)`` is called once before
            sampling so the whole test set is reproducible.
        source_col: Column the distractor texts are read from. Defaults to
            'NContext', which is what the notebook has always used.
            NOTE(review): distractors that compete with a true *response* would
            normally come from 'NResponse' — confirm which column is intended.

    Returns:
        A copy of ``df_input`` reduced to ['NContext', 'NResponse', 'False_Samples'],
        where 'False_Samples' holds an array of ``n`` texts per row. Rows are drawn
        from the whole frame, so a row's own text can appear among its distractors.
    """
    if seed is not None:
        random.seed(seed)
    df_output = df_input.copy()
    # The iterated values are not used; the column only drives the progress bar
    # and yields one draw per row.
    df_output['False_Samples'] = [
        get_false_samples(df_output, n)[source_col].values
        for _ in tqdm_notebook(df_output['NContext'])
    ]
    return df_output[['NContext', 'NResponse', 'False_Samples']]

In [None]:
# Build the evaluation frame with 9 randomly drawn distractors per row.
df_test = get_df_test(df_train, 9)

In [None]:
# Candidate list per row: the true response first (position 0), then the distractors.
# The two columns are paired positionally with zip(); a label lookup such as
# df_test['NResponse'][x] is only correct while the index is exactly 0..len-1.
df_test['With_False_Samples'] = [
    [response] + list(false_samples)
    for response, false_samples in zip(df_test['NResponse'], df_test['False_Samples'])
]

In [None]:
# Peek at the last five rows only. Note that tail(5) is not tail(-5): a negative
# argument returns every row except the first five, i.e. almost the whole frame.
df_test.tail(5).values
df_test.shape

## Learning Models

### Word2Vec

In [None]:
# Sentence boundary: one whitespace character (the class `[\s+\n+]` also accepts a
# literal '+') plus any further whitespace, located after one of … . ? ! and before a
# word character, a quote, «, a <p>/</p> tag or a dash that leads a word.
# The two negative lookbehinds reject a boundary right after
# "<space><word char><digit>." or "<word char>.<word char>.".
# NOTE(review): those lookbehinds look like guards for enumerations/abbreviations — confirm.
sentence_split_by_dot_pattern = r'(?<!\s\w\d\.)(?<!\w\.\w\.)(?<=[….?!])[\s+\n+]\s*(?=[\w\"\'«]|<\/?\s*p\s*>|-(?=\s*\w+))'
# Extended pattern: also splits on whitespace in front of a <p>/<br> tag (unless it is
# preceded by a terminator, whitespace and </p>) and on a terminator that is directly
# followed by such a tag; in that last case the terminator itself is consumed by the split.
sentence_split_pattern = sentence_split_by_dot_pattern \
    + r'|(?<![….!?]\s<\/p>)[\s+\n+]\s*(?=<\s*p\s*>|<\s*br\s*\/?>)|[….?!](?=<\/?\s*p\s*>|<br\s*\/?>)'
# A "word": a run of word characters or '/', where '-', "'" and '.' are accepted only
# between two word characters; '_' is excluded.
word_split_pattern = r"(?P<word>(?:(?!_)(?:[\w/]|(?<=\w)[-'.](?=\w)))+)"
sentence_split_regexp = re.compile(sentence_split_pattern, flags=re.UNICODE)
# NOTE(review): word_split_regexp is not referenced in the visible part of the notebook.
word_split_regexp = re.compile(word_split_pattern, flags=re.UNICODE)


def split_text_into_part(text, part_regexp):
    """Cut ``text`` at every match of ``part_regexp`` and return the pieces.

    Args:
        text: String to split.
        part_regexp: Compiled regular expression marking the cut points.

    Returns:
        A new list with the pieces exactly as ``part_regexp.split`` yields them
        (including empty strings and, if the pattern has groups, the group values).
    """
    pieces = part_regexp.split(text)
    return list(pieces)


def split_text_into_sentences(text):
    """Return the sentence-level pieces of ``text``.

    Delegates to ``split_text_into_part`` using the module-level
    ``sentence_split_regexp`` as the boundary pattern.
    """
    sentence_pieces = split_text_into_part(text=text, part_regexp=sentence_split_regexp)
    return sentence_pieces

In [None]:
# Shared morphological analyzer; normalizePymorphy_sentences parses every new token with it.
morph2 = pymorphy2.MorphAnalyzer()

# Coarse POS names for a subset of POS tags: both adjective forms and both participle
# forms map to ADJ, verbs and gerunds map to VERB.
conv_pos2 = {'ADJF': 'ADJ', 'ADJS': 'ADJ', 'ADV': 'ADV', 'NOUN': 'NOUN',
             'VERB': 'VERB', 'PRTF': 'ADJ', 'PRTS': 'ADJ', 'GRND': 'VERB'}

# Tokens listed here are skipped by normalizePymorphy_sentences without being parsed.
nones2 = {}
# Cache of already normalized tokens, read and filled by normalizePymorphy_sentences.
tmp_dict2 = {}

def normalizePymorphy_sentences(text, need_pos=False):
    """Split ``text`` into sentences and lemmatize each of them with pymorphy2.

    Tokens are Latin/Cyrillic words (optionally hyphenated in the middle) and runs of
    digits. Every token is replaced by the normal form of its first parse; tokens
    whose normal form is a single character are dropped. No stop-word filtering is
    applied.

    Args:
        text: Raw text to process.
        need_pos: When True, each lemma gets a ``_<POS>`` suffix looked up in
            ``conv_pos2``. POS tags missing from that mapping (including ``None``)
            are rendered as ``X``.

    Returns:
        List of sentences, each a list of normalized words. Sentences that end up
        with fewer than two words are left out.

    Side effects:
        Fills the module-level caches ``tmp_dict2`` (accepted tokens) and
        ``nones2`` (rejected tokens) so repeated tokens are not parsed again.
    """
    token_pattern = r'[A-Za-zА-Яа-яЁё]+\-[A-Za-zА-Яа-яЁё]+|[A-Za-zА-Яа-яЁё]+|[0-9]+'
    output = []
    for sentence in split_text_into_sentences(text):
        words = []
        for t in re.findall(token_pattern, sentence):
            # Tagged and untagged results live under different keys, so a call with
            # need_pos=True never receives a word cached by a need_pos=False call.
            cache_key = (t, True) if need_pos else t
            if cache_key in tmp_dict2:
                words.append(tmp_dict2[cache_key])
                continue
            if t in nones2:
                continue

            best_parse = morph2.parse(t)[0]
            lemma = best_parse.normal_form
            if len(lemma) <= 1:
                # Rejection depends on the token alone, so remember it.
                nones2[t] = ""
                continue

            if need_pos:
                word = lemma + "_" + conv_pos2.get(best_parse.tag.POS, 'X')
            else:
                word = lemma
            words.append(word)
            tmp_dict2[cache_key] = word

        # A one-word "sentence" carries no context for the embedding models.
        if len(words) > 1:
            output.append(words)

    return output

In [None]:
# Lemmatize every non-null 'Context' text and gather all sentences into one corpus.
# NOTE(review): `df_positive` is not created anywhere in the visible notebook —
# presumably it is (or should be) the frame loaded as `df_train`; confirm.
all_sentences = []
for text in tqdm_notebook(df_positive['Context'].dropna()):
    # Skip cells that hold the number 0 — presumably a placeholder for "no text"; confirm.
    if text == 0:
        continue
    all_sentences.extend(normalizePymorphy_sentences(text))

In [None]:
# Fit a collocation model on the corpus and wrap it in the Phraser helper.
phrases = Phrases(sentences=all_sentences, min_count=5, threshold=10)
bigram = Phraser(phrases)

# Rewrite the corpus in place, passing every sentence through the bigram model.
# NOTE: not idempotent — re-running this cell feeds already transformed
# sentences back into a newly fitted model.
for index, sentence in enumerate(all_sentences):
    all_sentences[index] = bigram[sentence]

In [None]:
# Word2Vec hyper-parameters.
num_features = 300     # dimensionality of the word vectors
min_word_count = 5     # words seen fewer times than this are ignored
num_workers = 8        # training threads; 8 is the value the model was actually built with
context = 5            # context window size
downsampling = 1e-3    # down-sampling threshold for frequent words

# sg=1 selects skip-gram. The thread count now comes from `num_workers`
# instead of a separate hard-coded literal.
w2v_model = Word2Vec(all_sentences, workers=num_workers, size=num_features,
                     min_count=min_word_count, window=context, sample=downsampling, sg=1)

In [None]:
def shuffle_corpus(sentences):
    """Return a new list holding the items of ``sentences`` in random order.

    The input is left untouched; the permutation is produced by the module-level
    ``random`` generator.
    """
    reordered = [*sentences]
    random.shuffle(reordered)
    return reordered

In [None]:
# Update the existing vocabulary (update=True) from a shuffled copy of the corpus.
w2v_model.build_vocab(sentences=shuffle_corpus(all_sentences), update=True)

In [None]:
# Run five training epochs over a freshly shuffled copy of the corpus.
w2v_model.train(sentences=shuffle_corpus(all_sentences),
                epochs=5, total_examples=w2v_model.corpus_count)

In [None]:
# Precompute L2-normalized vectors; replace=True overwrites the raw vectors with them.
# NOTE(review): presumably the model should not be trained further after this — confirm.
w2v_model.init_sims(replace=True)

In [None]:
# Corpus size in words, as recorded by the model.
w2v_model.corpus_total_words

In [None]:
# Sanity check: nearest neighbours of a brand name. The query goes through `.wv`,
# the same accessor the FastText cell uses; the model-level `most_similar`
# shortcut is deprecated in gensim.
w2v_model.wv.most_similar('кодак')

### TF-IDF

In [None]:
# Word -> vector lookup built from the trained Word2Vec vocabulary.
# NOTE(review): `w2v_dict` is not referenced again in the visible part of the notebook.
w2v_dict = dict(zip(w2v_model.wv.index2word, w2v_model.wv.syn0))
# TF-IDF fitted on the normalized contexts with row normalization switched off (norm=None).
# In the visible cells only its `idf_` and `vocabulary_` are read, to build IDF word weights.
tfidf = TfidfVectorizer(norm=None).fit(df_train['NContext'].values)

### FastText

In [None]:
# FastText hyper-parameters; the values mirror the ones used for Word2Vec above.
embedding_size = 300   # vector dimensionality
window_size = 5        # context window
min_word = 5           # minimum word frequency
down_sampling = 1e-3   # down-sampling threshold for frequent words

In [None]:
# Train FastText (sg=1 selects skip-gram) on the same lemmatized, bigram-merged corpus.
fastText_model = FastText(all_sentences,
                          size=embedding_size,
                          window=window_size,
                          min_count=min_word,
                          sample=down_sampling,
                          sg=1,
                          workers=8)

In [None]:
# Sanity check: ten nearest neighbours of the same brand name queried from Word2Vec.
fastText_model.wv.most_similar(['кодак'], topn=10)

## Modeling

### Baseline Model: TF-IDF

In [None]:
def evaluate_recall(y, y_test, k=1):
    """Recall@k: share of examples whose true label is among the first ``k`` predictions.

    Args:
        y: Sequence of rankings, one per example; each ranking lists candidate
            indices ordered from best to worst.
        y_test: Sequence of true labels, aligned with ``y``.
        k: Number of leading predictions that count as a hit.

    Returns:
        Fraction of hits as a float in [0, 1]; ``0.0`` when ``y`` is empty
        (instead of failing with ZeroDivisionError).
    """
    num_examples = len(y)
    if num_examples == 0:
        return 0.0
    num_correct = sum(
        1 for predictions, label in zip(y, y_test) if label in predictions[:k]
    )
    return num_correct / float(num_examples)

In [None]:
class TFIDFPredictor:
    """Baseline ranker: orders candidate utterances by TF-IDF similarity to the context."""

    def __init__(self):
        """Create an unfitted vectorizer; call ``fit`` before ``predict``."""
        self.vectorizer = TfidfVectorizer()

    def fit(self, data):
        """Fit a fresh TF-IDF vectorizer.

        Args:
            data: pandas Series of texts (its ``.values`` are used).
        """
        self.vectorizer = TfidfVectorizer().fit(data.values)

    def predict(self, context, utterances):
        """Rank ``utterances`` by similarity to ``context``.

        Args:
            context: Context text.
            utterances: Sequence of candidate texts.

        Returns:
            1-D array of candidate indices, most similar first. Ties come out in
            descending index order because an ascending argsort is reversed.
        """
        vector_context = self.vectorizer.transform([context])
        vector_uttr = self.vectorizer.transform(utterances)
        # TF-IDF rows produced with the default settings have unit length, so cosine
        # similarity equals the plain dot product (what linear_kernel computes
        # pairwise); skipping the norm computation of cosine_similarity is faster.
        # The sparse matrices' own .dot is used because np.dot is not sparse-aware.
        result = vector_uttr.dot(vector_context.T).toarray().ravel()
        return np.argsort(result, axis=0)[::-1]

In [None]:
tfidf_model = TFIDFPredictor()
tfidf_model.fit(df_train['NContext'])
# One ranking per test row. Context and candidate list are paired by position and the
# candidate column is addressed by name rather than by the positional `.iloc[x, 3]`.
y = [tfidf_model.predict(context, candidates)
     for context, candidates in tqdm_notebook(
         zip(df_test['NContext'], df_test['With_False_Samples']), total=len(df_test))]

In [None]:
# The true response is stored first in 'With_False_Samples', so every true label is 0.
y_test = np.zeros(df_test.shape[0])
# The "10" in the printed label is the candidate-list size (1 true response + 9 distractors).
for n in [1, 2, 5, 10]:
    print("Recall @ ({}, 10): {:g}".format(n, evaluate_recall(y, y_test, n)))

### Model with Word2Vec

In [None]:
def word_averaging(wv, words):
    """Embed a token sequence as the unit-length mean of its normalized word vectors.

    Args:
        wv: Trained gensim model exposing ``.wv`` (with ``vocab``, ``syn0norm`` and
            ``init_sims``) and ``vector_size``.
        words: Iterable of tokens. Items that already are numpy arrays are used as
            vectors directly; tokens missing from the vocabulary are ignored.

    Returns:
        1-D float32 unit vector, or an all-zero vector of length ``wv.vector_size``
        when no item of ``words`` could be embedded.
    """
    keyed_vectors = wv.wv
    # The normalized matrix only exists once init_sims() has run; trigger it here
    # rather than depending on an earlier notebook cell having done so.
    if getattr(keyed_vectors, 'syn0norm', None) is None:
        keyed_vectors.init_sims()
    normed = keyed_vectors.syn0norm
    vocab = keyed_vectors.vocab

    mean = []
    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in vocab:
            mean.append(normed[vocab[word].index])

    if not mean:
        return np.zeros(wv.vector_size,)

    return gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)

def word_averaging_list(wv, text_list):
    """Embed every tokenized text with ``word_averaging`` and stack the results.

    Returns:
        2-D array with one row per item of ``text_list``.
    """
    rows = []
    for post in text_list:
        rows.append(word_averaging(wv, post))
    return np.vstack(rows)

In [None]:
class W2VPredictor:
    """Ranks candidate utterances by cosine similarity of averaged Word2Vec embeddings."""

    def __init__(self, model=None):
        """
        Args:
            model: Word2Vec model used for embedding. When ``None`` (default) the
                notebook-level ``w2v_model`` is looked up at prediction time.
        """
        self.model = model

    def fit(self, data):
        """No-op: the embedding model is trained elsewhere; kept for a uniform interface."""

    def predict(self, context, utterances):
        """Rank ``utterances`` by similarity to ``context``.

        Both texts are split on whitespace and embedded with ``word_averaging_list``.

        Returns:
            1-D array of candidate indices, most similar first. Ties come out in
            descending index order because an ascending argsort is reversed.
        """
        model = w2v_model if self.model is None else self.model
        vector_context = word_averaging_list(model, [context.split()])
        vector_uttr = word_averaging_list(model, [x.split() for x in utterances])
        result = cosine_similarity(vector_context, vector_uttr)
        result = np.asarray(result).flatten()
        return np.flip(np.argsort(result))

In [None]:
# Word2Vec-based ranker.
w2vec_model = W2VPredictor()

In [None]:
# One ranking per test row. Context and candidate list are paired by position and the
# candidate column is addressed by name rather than by the positional `.iloc[x, 3]`.
y = [w2vec_model.predict(context, candidates)
     for context, candidates in tqdm_notebook(
         zip(df_test['NContext'], df_test['With_False_Samples']), total=len(df_test))]

In [None]:
# Recall@k of the Word2Vec ranker; reuses the `y_test` labels defined for the baseline.
for n in [1, 2, 5, 10]:
    print("Recall @ ({}, 10): {:g}".format(n, evaluate_recall(y, y_test, n)))

### Model with TF-IDF & Word2Vec

In [None]:
from collections import defaultdict

# IDF weight for every word of the TF-IDF vocabulary. A word the vectorizer has never
# seen falls back to the largest IDF in the model.
# Note that looking up a missing key in a defaultdict also stores it.
max_idf = max(tfidf.idf_)
weights = defaultdict(
    lambda: max_idf,
    {w: tfidf.idf_[i] for w, i in tqdm_notebook(tfidf.vocabulary_.items())},
)

In [None]:
class WeightedW2VPredictor:
    """Ranks candidates by cosine similarity of IDF-weighted mean word vectors."""

    def __init__(self, tfidf, num_features, weights):
        """
        Args:
            tfidf: Fitted TF-IDF vectorizer; only its ``idf_`` is read here.
            num_features: Dimensionality of the word vectors.
            weights: Mapping word -> weight, indexed with ``[]`` for every embedded word.
        """
        self.max_idf = max(tfidf.idf_)
        self.weights = weights
        self.dim = num_features

    def fit(self, tfidf, data):
        """No-op: all state is supplied to the constructor; kept for a uniform interface."""

    def _embed(self, vectors, text):
        """Weighted mean vector of the whitespace tokens of ``text`` known to ``vectors``.

        Returns a zero vector of length ``self.dim`` when no token is known.
        """
        weighted = [vectors[w] * self.weights[w] for w in text.split() if w in vectors]
        if not weighted:
            return np.zeros(self.dim)
        return np.mean(weighted, axis=0)

    def predict(self, w2v_model, context, utterances):
        """Rank ``utterances`` by similarity to ``context``.

        Args:
            w2v_model: Trained gensim model (its ``.wv`` is used) or any object that
                supports ``word in obj`` and ``obj[word]``.
            context: Context text.
            utterances: Sequence of candidate texts.

        Returns:
            1-D array of candidate indices, most similar first. Ties come out in
            descending index order because an ascending argsort is reversed.
        """
        # Prefer the keyed vectors: indexing the model object itself is deprecated.
        vectors = getattr(w2v_model, 'wv', w2v_model)
        vector_context = self._embed(vectors, context).reshape(1, -1)
        vector_uttr = np.array([self._embed(vectors, words) for words in utterances])
        result = cosine_similarity(vector_context, vector_uttr)
        result = np.asarray(result).flatten()

        return np.flip(np.argsort(result))

In [None]:
# IDF-weighted Word2Vec ranker, using the `weights` mapping built above.
w_w2v_model = WeightedW2VPredictor(tfidf, num_features, weights)

In [None]:
# One ranking per test row. Context and candidate list are paired by position and the
# candidate column is addressed by name rather than by the positional `.iloc[x, 3]`.
y = [w_w2v_model.predict(w2v_model, context, candidates)
     for context, candidates in tqdm_notebook(
         zip(df_test['NContext'], df_test['With_False_Samples']), total=len(df_test))]

In [None]:
# Recall@k of the IDF-weighted Word2Vec ranker; reuses the `y_test` labels defined for the baseline.
for n in [1, 2, 5, 10]:
    print("Recall @ ({}, 10): {:g}".format(n, evaluate_recall(y, y_test, n)))

### Model with FastText

In [None]:
class FTPredictor:
    """Ranks candidate utterances by cosine similarity of averaged FastText embeddings."""

    def __init__(self, model=None):
        """
        Args:
            model: FastText model used for embedding. When ``None`` (default) the
                notebook-level ``fastText_model`` is looked up at prediction time.
        """
        self.model = model

    def fit(self, data):
        """No-op: the embedding model is trained elsewhere; kept for a uniform interface."""

    def predict(self, context, utterances):
        """Rank ``utterances`` by similarity to ``context``.

        Both texts are split on whitespace and embedded with ``word_averaging_list``.

        Returns:
            1-D array of candidate indices, most similar first. Ties come out in
            descending index order because an ascending argsort is reversed.
        """
        model = fastText_model if self.model is None else self.model
        vector_context = word_averaging_list(model, [context.split()])
        vector_uttr = word_averaging_list(model, [x.split() for x in utterances])
        result = cosine_similarity(vector_context, vector_uttr)
        result = np.asarray(result).flatten()
        return np.flip(np.argsort(result))

In [None]:
# FastText-based ranker.
fastT_model = FTPredictor()

In [None]:
# One ranking per test row. Context and candidate list are paired by position and the
# candidate column is addressed by name rather than by the positional `.iloc[x, 3]`.
y = [fastT_model.predict(context, candidates)
     for context, candidates in tqdm_notebook(
         zip(df_test['NContext'], df_test['With_False_Samples']), total=len(df_test))]

In [None]:
# Recall@k of the FastText ranker; reuses the `y_test` labels defined for the baseline.
for n in [1, 2, 5, 10]:
    print("Recall @ ({}, 10): {:g}".format(n, evaluate_recall(y, y_test, n)))

### Model with LASER embeddings

In [None]:
class LASERPredictor:
    """Ranks candidate utterances by cosine similarity of LASER sentence embeddings."""

    def __init__(self, lang='ru'):
        """
        Args:
            lang: Language code passed to ``embed_sentences``; defaults to 'ru'.
        """
        self.vectorizer = Laser()
        self.lang = lang

    def fit(self, data):
        """No-op, kept for a uniform interface.

        This class never stored anything from ``data``: embedding the whole
        training set and discarding the result only cost time.
        """

    def predict(self, context, utterances):
        """Rank ``utterances`` by similarity to ``context``.

        Returns:
            1-D array of candidate indices, most similar first. Ties come out in
            descending index order because an ascending argsort is reversed.
        """
        vector_context = self.vectorizer.embed_sentences([context], lang=self.lang)
        vector_uttr = self.vectorizer.embed_sentences(list(utterances), lang=self.lang)
        result = cosine_similarity(vector_context, vector_uttr)
        result = np.asarray(result).flatten()
        return np.flip(np.argsort(result))

In [None]:
# LASER-based ranker.
# NOTE(review): fit() receives the raw 'Context' column while the predictions below
# are made on the normalized 'NContext' texts — confirm which form is intended.
laser_model = LASERPredictor()
laser_model.fit(df_train['Context'])

In [None]:
# One ranking per test row. Context and candidate list are paired by position and the
# candidate column is addressed by name rather than by the positional `.iloc[x, 3]`.
y = [laser_model.predict(context, candidates)
     for context, candidates in tqdm_notebook(
         zip(df_test['NContext'], df_test['With_False_Samples']), total=len(df_test))]

In [None]:
# Recall@k of the LASER ranker; reuses the `y_test` labels defined for the baseline.
for n in [1, 2, 5, 10]:
    print("Recall @ ({}, 10): {:g}".format(n, evaluate_recall(y, y_test, n)))