---------------------

In [0]:
import os
import warnings

import numpy as np
import pandas as pd
import telebot
from IPython.core.interactiveshell import InteractiveShell
from laserembeddings import Laser
from sklearn.base import BaseEstimator
from sklearn.neighbors import BallTree
from sklearn.neighbors import KDTree
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import normalize
from sklearn.utils import shuffle

warnings.filterwarnings('ignore')
InteractiveShell.ast_node_interactivity = "all"

---------

# Data Loading

In [0]:
# Load both dialogue corpora and merge them into one shuffled training frame.
allSet = pd.read_csv('two_chats_df_train.csv')
kino = pd.read_csv('kino.csv', sep=',')
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat stacks the rows the same way and works on every pandas version.
bigData = pd.concat([allSet, kino])
bigData = shuffle(bigData)
# Shuffling keeps the original (duplicated) index labels, so rebuild a clean 0..n-1 index.
bigData = bigData.reset_index(drop=True)

# LASER embeddings

In [0]:
class myVectorizer_Laser(object):
    """Sentence vectorizer backed by LASER, exposing a fit/transform interface.

    Parameters
    ----------
    lang : str, default='ru'
        Language code forwarded to ``Laser.embed_sentences`` for every sentence.
    """
    def __init__(self, lang='ru'):
        self.lang = lang

    def fit(self, X, y=None):
        """Create the LASER encoder.

        ``X`` and ``y`` are not used; ``y`` is accepted (and optional) so the
        object can be fitted as a step of a scikit-learn ``Pipeline``, which
        calls ``fit(X, y)`` on its steps.

        Returns
        -------
        self
        """
        self.laser = Laser()
        return self

    def transform(self, X):
        """Embed the sentences in ``X`` and L2-normalise each embedding row.

        Requires ``fit`` to have been called first (it creates ``self.laser``).
        """
        embeddings = self.laser.embed_sentences(X, lang=self.lang)
        return normalize(embeddings, norm='l2')

In [6]:
# Build the LASER vectorizer and embed every training context into one matrix.
contexts = bigData['Context']
vectorizer = myVectorizer_Laser()
vectorizer.fit(contexts)
Martix = vectorizer.transform(contexts)

<__main__.myVectorizer_Laser at 0x7fd3f794c438>

In [7]:
# Sanity check: display the dimensions of the embedding matrix built from the contexts.
Martix.shape

(104670, 1024)

--------

# Ranking & Pipeline

In [0]:
def softmax(x):
    """Turn distances into probabilities with a softmax over the *negated* input.

    Smaller values of ``x`` (closer neighbours) receive larger probabilities.
    Normalisation runs along the first axis, so a 1-D input sums to 1.

    Parameters
    ----------
    x : array-like
        Distances (any sequence convertible to a float array).

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``x``; an empty input yields an empty array.
    """
    x = np.asarray(x, dtype=float)
    if x.size == 0:
        return x
    # Subtracting the minimum leaves the result mathematically unchanged but
    # keeps the largest exponent at exp(0) = 1, so large distances no longer
    # underflow every term to 0 and produce 0/0 = NaN.
    shifted = x - x.min(axis=0)
    proba = np.exp(-shifted)
    return proba / proba.sum(axis=0)

In [0]:
class Neighbors(BaseEstimator):
    """Retrieval-based responder built on a KD-tree over context embeddings.

    For each query the ``k`` nearest stored contexts are retrieved and one of
    them is sampled, with closer neighbours being more likely; the response
    attached to the sampled context is returned.

    Parameters
    ----------
    k : int, default=9
        Number of nearest neighbours retrieved per query.
    radius : float, default=0.01
        Multiplier applied to the neighbour distances before they are turned
        into sampling probabilities. Larger values concentrate the probability
        on the closest neighbours; values near 0 make the choice almost uniform.
    verbose : bool, default=True
        When true, ``predict`` prints the distances, the neighbour indices and
        the candidate responses.
    """
    def __init__(self, k=9, radius = 0.01, verbose=True):
        self.k = k
        self.radius = radius
        self.verbose = verbose

    def fit(self, X, y):
        """Index the embeddings ``X`` and remember the aligned responses ``y``.

        Returns
        -------
        self
        """
        self.tree_ = KDTree(X, metric='euclidean')
        self.y_ = np.array(y)
        return self

    def predict(self, X, random_state = None):
        """Sample one response per query row of ``X``.

        Parameters
        ----------
        X : array-like
            Query embeddings, one row per query.
        random_state : None, int or numpy.random.RandomState, default=None
            ``None`` draws from the global NumPy generator; an int seeds a
            private generator so the sampled answers are reproducible.

        Returns
        -------
        numpy.ndarray
            One sampled response per query, in query order.
        """
        dist, ind = self.tree_.query(X, return_distance = True, k = self.k)

        if random_state is None:
            rng = np.random
        elif isinstance(random_state, np.random.RandomState):
            rng = random_state
        else:
            rng = np.random.RandomState(random_state)

        if self.verbose:
            print('measure:',dist)
            print('--------')
            print('indexes of relevant answers:',ind)
            print('relevant answers', self.y_[ind])

        # For every query pick one neighbour index, weighting by scaled distance.
        result = [rng.choice(i, p = softmax(d * self.radius))
                  for d, i in zip(dist, ind)]

        # Return the sampled responses rather than the None that print() yields.
        return self.y_[result]

In [0]:
# Fit the neighbour model: context embeddings are the index, responses are the labels.
responses = bigData['Response']
ns = Neighbors()
ns.fit(X=Martix, y=responses)

In [0]:
# Chain the already-fitted vectorizer and neighbour model so that raw text
# can be handed straight to pipe.predict.
pipe = make_pipeline(vectorizer, ns)

In [31]:
# Smoke test: run one sample question through the whole pipeline.
pipe.predict(['какой фотоаппарат лучше купить?'])

measure: [[0.56291434 0.7309298  0.7309298  0.7309298  0.7343525  0.73694703
  0.73994138 0.74614302 0.75308905]]
--------
indexes of relevant answers: [[65646 57979 80509 35009 28500 88009 82628 65676 38355]]
relevant answers [['однозначно кэнон но если ты извращенец бери зенит'
  'Но я не про засветки' 'ф80' 'Canon FTb' 'Espio 160' 'ю8'
  'canon eos 30' 'никон фг20' 'Toyo Field 45A II']]


-------

# Demo Telegram Bot

In [0]:
# Read the bot token from the environment so the secret is never stored in
# (or committed with) the notebook.
token = os.environ.get('TELEGRAM_BOT_TOKEN')
if not token:
    raise RuntimeError(
        'Set the TELEGRAM_BOT_TOKEN environment variable to your Telegram bot token'
    )
continueNPlus1=True
bot = telebot.TeleBot(token)

In [0]:
@bot.message_handler(commands=['start'])
def start_message(message):
    """Answer the /start command with a fixed greeting in the originating chat."""
    chat_id = message.chat.id
    greeting = 'привет, хозяева! привет все!'
    bot.send_message(chat_id, greeting)

In [0]:
@bot.message_handler(func=lambda message: True)
def eho_all(message):
    """Reply to any incoming message with the answer predicted by the pipeline."""
    answers = pipe.predict([message.text])
    # predict receives a one-element batch; if it yields nothing there is
    # no text to send, so skip the reply instead of passing None to the API.
    if answers is None or len(answers) == 0:
        return
    # Unwrap the single prediction into a plain string for Telegram.
    bot.reply_to(message, str(answers[0]))

In [0]:
# Start fetching updates from Telegram and dispatching them to the handlers
# registered above. NOTE(review): this call is expected to block the cell
# until interrupted — confirm against the telebot documentation.
bot.polling()