In [1]:
import os
import string
import annoy
import codecs
import pickle

from pymorphy2 import MorphAnalyzer
from stop_words import get_stop_words
from gensim.models import Word2Vec

import numpy as np
from tqdm.notebook import tqdm
import pandas as pd

import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer

# Обучение классификатора «товарный запрос vs. болталка»

In [2]:
df = pd.read_csv('data/ProductsDataset.csv')
df.head()

Unnamed: 0,title,descrirption,product_id,category_id,subcategory_id,properties,image_links
0,Юбка детская ORBY,"Новая, не носили ни разу. В реале красивей чем...",58e3cfe6132ca50e053f5f82,22.0,2211,"{'detskie_razmer_rost': '81-86 (1,5 года)'}",http://cache3.youla.io/files/images/360_360/58...
1,Ботильоны,"Новые,привезены из Чехии ,указан размер 40,но ...",5667531b2b7f8d127d838c34,9.0,902,"{'zhenskaya_odezhda_tzvet': 'Зеленый', 'visota...",http://cache3.youla.io/files/images/360_360/5b...
2,Брюки,Размер 40-42. Брюки почти новые - не знаю как ...,59534826aaab284cba337e06,9.0,906,{'zhenskaya_odezhda_dzhinsy_bryuki_tip': 'Брюк...,http://cache3.youla.io/files/images/360_360/59...
3,Продам детские шапки,"Продам шапки,кажда 200р.Розовая и белая проданны.",57de544096ad842e26de8027,22.0,2217,"{'detskie_pol': 'Девочкам', 'detskaya_odezhda_...",http://cache3.youla.io/files/images/360_360/57...
4,Блузка,"Темно-синяя, 42 размер,состояние отличное,как ...",5ad4d2626c86cb168d212022,9.0,907,"{'zhenskaya_odezhda_tzvet': 'Синий', 'zhenskay...",http://cache3.youla.io/files/images/360_360/5a...


In [3]:
data = pd.DataFrame(pd.concat([df['title'], df['descrirption']]), columns = ['title']).dropna().reset_index(drop=True)

data['target'] = 1

In [4]:
question = None
answer = False
written = False

question_list = []


with codecs.open('data/Otvety.txt', 'r', 'utf-8') as fin:
    for line in tqdm(fin):
        
        if line.startswith("---"):
            question = None
            answer = False
            continue
            
        if answer == True:
            continue
            
        if question is None:
            question = line.strip()
            question_list.append(question)
            answer = True
            continue

0it [00:00, ?it/s]

In [5]:
data_1 = pd.DataFrame(question_list, columns = ['title'])[1:].reset_index(drop=True)
data_1['target'] = 0

In [6]:
data_concat = pd.concat([data, data_1])
data_concat

Unnamed: 0,title,target
0,Юбка детская ORBY,1
1,Ботильоны,1
2,Брюки,1
3,Продам детские шапки,1
4,Блузка,1
...,...,...
1163416,Между словами ПРЕЗИДЕНТ и РЕЗИДЕНТ есть что ли...,0
1163417,"""Если это мое, то оно никуда от меня не денетс...",0
1163418,А Вы халяву любите или совесть имеете???) .,0
1163419,Так много разных гороскопов кто-нибудь может п...,0


In [7]:
data_concat['target'].value_counts(normalize=True)

0    0.943948
1    0.056052
Name: target, dtype: float64

In [8]:
from pymorphy2 import MorphAnalyzer
from string import punctuation

exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in exclude]
    return " ".join(txt)

In [9]:
data_concat['title_new'] = data_concat['title'].apply(preprocess_text)
data_concat

Unnamed: 0,title,target,title_new
0,Юбка детская ORBY,1,юбка детский orby
1,Ботильоны,1,ботильон
2,Брюки,1,брюки
3,Продам детские шапки,1,продать детский шапка
4,Блузка,1,блузка
...,...,...,...
1163416,Между словами ПРЕЗИДЕНТ и РЕЗИДЕНТ есть что ли...,0,между слово президент и резидент есть что либо...
1163417,"""Если это мое, то оно никуда от меня не денетс...",0,если это мой то оно никуда от я не деться
1163418,А Вы халяву любите или совесть имеете???) .,0,а вы халява любить или совесть иметь
1163419,Так много разных гороскопов кто-нибудь может п...,0,так много разный гороскоп ктонибыть мочь посов...


In [10]:
from sklearn.model_selection import train_test_split

X = data_concat['title_new']
y = data_concat['target']

x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2, stratify=y, shuffle=True, random_state=42)

In [11]:
print(y_train.value_counts(normalize=True))
print(y.value_counts(normalize=True))

0    0.943947
1    0.056053
Name: target, dtype: float64
0    0.943948
1    0.056052
Name: target, dtype: float64


In [12]:
count_vect = CountVectorizer().fit(x_train.values)

xtrain_count = count_vect.transform(x_train)
xvalid_count = count_vect.transform(x_valid)

tfidf_vec = TfidfVectorizer().fit(x_train.values)
xtrain_tfidf = tfidf_vec.transform(x_train)
xvalid_tfidf = tfidf_vec.transform(x_valid)

tfidf_ngrams = TfidfVectorizer(ngram_range=(1,3)).fit(x_train.values)
xtrain_tfidf_ngram = tfidf_ngrams.transform(x_train)
xvalid_tfidf_ngram = tfidf_ngrams.transform(x_valid)

In [13]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid):
    classifier.fit(feature_vector_train, label)
    prediction = classifier.predict(feature_vector_valid)
    
    return metrics.accuracy_score(prediction, y_valid)

In [14]:
from sklearn import naive_bayes, linear_model, ensemble, metrics, svm

In [15]:
# Naive Bayes на Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, y_train, xvalid_count)
print("NB, Count Vectors: ", accuracy)

# Naive Bayes на Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, y_train, xvalid_tfidf)
print("NB, WordLevel TF-IDF: ", accuracy)

# Naive Bayes на Ngram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, y_train, xvalid_tfidf_ngram)
print("NB, N-Gram Vectors: ", accuracy)

NB, Count Vectors:  0.9908438876763678
NB, WordLevel TF-IDF:  0.9757365051804854
NB, N-Gram Vectors:  0.9675459022644847


In [16]:
# Linear Classifier на Count Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, y_train, xvalid_count)
print("LR, Count Vectors: ", accuracy)

# Linear Classifier на Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, y_train, xvalid_tfidf)
print("LR, WordLevel TF-IDF: ", accuracy)

# Linear Classifier на Ngram Level TF IDF Vectors
# accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, y_train, xvalid_tfidf_ngram)
# print("LR, N-Gram Vectors: ", accuracy)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LR, Count Vectors:  0.9939310837234586
LR, WordLevel TF-IDF:  0.9935010669284631


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# SVM на Count Vectors
accuracy = train_model(svm.SVC(), xtrain_count, y_train, xvalid_count)
print("SVM, N-Gram Vectors: ", accuracy)

# SVM на Word Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf, y_train, xvalid_tfidf)
print("SVM, N-Gram Vectors: ", accuracy)

# SVM на Ngram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram, y_train, xvalid_tfidf_ngram)
print("SVM, N-Gram Vectors: ", accuracy)

In [20]:
best_model = linear_model.LogisticRegression()
best_model.fit(xtrain_count, y_train) 

with open('model.pkl', 'wb') as output:
    pickle.dump(best_model, output)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


→ Реализован поиск похожих товаров в контентной части бота

* Все названия товаров свёрнуты в векторное представление Word2Vec (предобученном или обученном на исходном датасете).
* Построен индекс по названиям документов.
* Для товарных запросов реализован поиск в индексе (запрос также оборачивается Word2Vec, происходит проход в индекс).

→ Реализована болталка

* Все вопросы из датасета свёрнуты Word2Vec в векторное представление.
* Построен индекс по вопросам.
* На запрос в болталку происходит поиск ближайшего вопроса и возвращается ответ на этот вопрос.

# Реализация поиска похожих товаров в контентной части бота 

In [21]:
exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text_new(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in exclude]
    return txt

In [22]:
sentences = df['title'].apply(preprocess_text_new)

In [23]:
model_w2v = Word2Vec(sentences = sentences.values, vector_size = 100, window = 3)
model_w2v.save('w2v_product_model')

In [24]:
index = annoy.AnnoyIndex(100, 'angular')

index_map = {}
counter = 0

for i in range(df.shape[0]):
    
    n_w2v = 0
    answer = df.iloc[i, 2]
    index_map[counter] = answer
    
    vector = np.zeros(100)
    question = preprocess_text_new(df.iloc[i, 0])
    for word in question:
        if word in model_w2v.wv:
            vector += model_w2v.wv[word]
            n_w2v += 1
            
    if n_w2v > 0:
        vector = vector / n_w2v
    index.add_item(counter, vector)
    
    counter += 1
    
index.build(10)
index.save('speaker_2.ann')

True

# Реализация болталки

In [40]:
sentences = []

morpher = MorphAnalyzer()
sw = set(get_stop_words("ru"))
exclude = set(string.punctuation)
c = 0

with codecs.open("data/Otvety.txt", "r", "utf-8") as fin:
    for line in tqdm(fin):
        spls = preprocess_text_new(line)
        sentences.append(spls)
        c += 1
        if c > 500000:
            break

0it [00:00, ?it/s]

In [41]:
sentences = [i for i in sentences if len(i) > 2]
talk_model = Word2Vec(sentences=sentences, vector_size=100, min_count=1, window=5)
talk_model.save("w2v_talk_model")

In [42]:
talk_index = annoy.AnnoyIndex(100 ,'angular')

talk_index_map = {}
counter = 0

with codecs.open("prepared_answers.txt", "r", "utf-8") as f:
    for line in tqdm(f):
        n_w2v = 0
        spls = line.split("\t")
        talk_index_map[counter] = spls[1]
        question = preprocess_text_new(spls[0])
        vector = np.zeros(100)
        for word in question:
            if word in talk_model.wv:
                vector += talk_model.wv[word]
                n_w2v += 1
        if n_w2v > 0:
            vector = vector / n_w2v
        talk_index.add_item(counter, vector)
            
        counter += 1

talk_index.build(10)
talk_index.save('speaker.ann')

0it [00:00, ?it/s]

True

# Реализуем логику поиска похожих товаров по продуктовому запросу

In [43]:
def get_answer(question):
    if best_model.predict(count_vect.transform([preprocess_text(question)]))[0] == 1:
        preprocessed_question = preprocess_text_new(question)
        n_w2v = 0
        vector = np.zeros(100)
        for word in preprocessed_question:
            if word in model_w2v.wv:
                vector += model_w2v.wv[word]
                n_w2v += 1
        if n_w2v > 0:
            vector = vector / n_w2v
        answer_index = index.get_nns_by_vector(vector, 1)
        return index_map[answer_index[0]]
        
    else:
        preprocessed_question = preprocess_text_new(question)
        n_w2v = 0
        vector = np.zeros(100)
        for word in preprocessed_question:
            if word in talk_model.wv:
                vector += talk_model.wv[word]
                n_w2v += 1
        if n_w2v > 0:
            vector = vector / n_w2v
        answer_index = talk_index.get_nns_by_vector(vector, 1)
        return talk_index_map[answer_index[0]]

In [44]:
assert(get_answer('Юбка детская ORBY').startswith('58e3cfe6132ca50e053f5f82'))
assert(not get_answer('Где ключи от танка?').startswith('5'))