In [1]:
# Installs pymystem3, which provides the Mystem class imported in the next cell.
# NOTE(review): the version is not pinned, so a fresh run may pull a different release.
!pip install pymystem3

[33mYou are using pip version 9.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [1]:
from flask import Flask
from flask import render_template, request, url_for
from pymystem3 import Mystem
import string
from gensim import matutils
import numpy as np
from math import log
from collections import defaultdict
from nltk import word_tokenize
from nltk.corpus import stopwords
import pickle
from gensim.models.doc2vec import Doc2Vec
from gensim.models import Word2Vec

In [2]:
# Module-level settings.
# NOTE(review): the request handler reads `amount`, `stops` and the search method
# from the query string into its own locals, so these three values look unused
# in the visible code — confirm before relying on them as defaults.
stops = True
amount = 10
search_method = 'inverted_index'
russian_stopwords = set(stopwords.words('russian'))

# Load the precomputed artefacts from the working directory. The print() calls
# after each step are progress markers only.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles from a trusted source.

# Per-document metadata; every value has a 'len' entry (used for avgdl below).
with open('info_data.pickle', 'rb') as f:
    info_data = pickle.load(f)
print('1')
# Word-count index used when stop words are removed from the query.
with open('word_count_del.pickle', 'rb') as f:
    word_count_del = pickle.load(f)
print('2')
# Word-count index used when stop words are kept.
with open('word_count_not_del.pickle', 'rb') as f:
    word_count_not_del = pickle.load(f)
print('3')
# Vector data used when stop words are removed from the query.
with open('vec_data_del.pickle', 'rb') as f:
    vec_data_del = pickle.load(f)
print('4')
# Vector data used when stop words are kept.
with open('vec_data_not_del.pickle', 'rb') as f:
    vec_data_not_del = pickle.load(f)
print('5')

# Mean of the 'len' field over all documents; passed to the BM25 scoring as avgdl.
avgdl = np.mean([i['len'] for i in info_data.values()])
print('6')
# Model whose .wv lookup supplies word vectors for the word2vec search.
model_w2v = Word2Vec.load('araneum_none_fasttextcbow_300_5_2018.model')
print('7')
# Model whose infer_vector() supplies query vectors for the doc2vec search.
model_d2v = Doc2Vec.load('model_d2v')
print('8')

1
2
3
4
5
6
7
8


In [13]:
def preprocessing(input_text, stopwords={}, del_stopwords=True, del_digit=True):
    """Tokenize, normalize and lemmatize a text.

    Each token from ``word_tokenize`` is lower-cased and stripped of leading and
    trailing punctuation; tokens that become empty are dropped. The first item
    returned by the module-level ``mystem.lemmatize`` is taken as the lemma.

    :param input_text: text to process
    :param stopwords: container of lemmas to drop when ``del_stopwords`` is true
    :param del_stopwords: drop lemmas found in ``stopwords``
    :param del_digit: drop lemmas for which ``str.isdigit()`` is true
    :return: list of the remaining lemmas, in order of appearance
    """
    strip_chars = string.punctuation+'»«–…'
    tokens = (raw.lower().strip(strip_chars) for raw in word_tokenize(input_text))
    lemmas = [mystem.lemmatize(token)[0] for token in tokens if token]

    def keep(lemma):
        # A lemma survives unless an enabled filter rejects it.
        if del_stopwords and lemma in stopwords:
            return False
        if del_digit and lemma.isdigit():
            return False
        return True

    return [lemma for lemma in lemmas if keep(lemma)]

In [14]:

def get_w2v_vectors(lemmas, model):
    """Build a single unit-length vector for a list of lemmas.

    The vectors of all lemmas that ``model.wv`` can resolve are summed and the
    sum is normalized with ``matutils.unitvec``.

    :param lemmas: iterable of lemma strings
    :param model: object exposing word vectors through ``model.wv[lemma]``
    :return: list of vector components; a list of 300 zeros when no lemma
        could be resolved (the length 300 is hard-coded here)
    """
    lemmas_vectors = []
    for lemma in lemmas:
        try:
            lemmas_vectors.append(model.wv[lemma])
        except KeyError:
            # The lemma has no vector in the model: leave it out of the sum.
            # Only the lookup miss is tolerated; any other error (broken model,
            # KeyboardInterrupt, ...) is allowed to propagate.
            continue

    if not lemmas_vectors:
        return [0] * 300

    doc_vec = sum(lemmas_vectors)
    normalized_vec = matutils.unitvec(doc_vec)
    return list(normalized_vec)


In [15]:
def similarity(vec1, vec2):
    """Return the dot product of two vectors (``np.dot``)."""
    return np.dot(vec1, vec2)


def culc_sim_score(all_data, vec, model_type):
    """Compute, for every document id, the best similarity of its parts to ``vec``.

    :param all_data: iterable of dicts, each with an 'id' key and a vector under
        'w2v_vec' and/or 'd2v_vec'
    :param vec: query vector
    :param model_type: 'word2v' to compare against 'w2v_vec', 'doc2v' to compare
        against 'd2v_vec'
    :return: ``defaultdict(float)`` mapping id -> maximum similarity over all
        parts that share the id
    :raises ValueError: if ``model_type`` is neither 'word2v' nor 'doc2v'
        (checked up front, so it is raised even when ``all_data`` is empty)
    """
    if model_type == 'word2v':
        vec_key = 'w2v_vec'
    elif model_type == 'doc2v':
        vec_key = 'd2v_vec'
    else:
        raise ValueError

    answer = defaultdict(float)  # id : best score seen so far

    for part in all_data:
        doc_id = part['id']
        sim = similarity(part[vec_key], vec)

        # Use a membership test to detect "no score yet". Comparing against the
        # defaultdict's 0.0 default would treat a genuine best score of exactly
        # 0.0 as unset and let a later negative similarity overwrite it.
        if doc_id not in answer or sim > answer[doc_id]:
            answer[doc_id] = sim

    return answer

In [16]:
def search_w2v(string, model, info_data, vec_data, stopwords={}, amount=10, del_stop=True):
    """Rank documents against a query using summed word vectors.

    This is a generator, so the type check and all scoring run when iteration
    starts, not when the function is called.

    :param string: query text
    :param model: model passed on to ``get_w2v_vectors``
    :param info_data: mapping id -> document info, used to build the results
    :param vec_data: document parts passed on to ``culc_sim_score``
    :param stopwords: stop-word container forwarded to ``preprocessing``
    :param amount: maximum number of results to yield
    :param del_stop: whether ``preprocessing`` removes stop words
    :return: yields ``(id, info_data[id], score)`` in descending score order
    :raises ValueError: if ``string`` is not a ``str``
    """
    if not isinstance(string, str):
        raise ValueError('enter correct data')

    query_lemmas = preprocessing(string, stopwords=stopwords, del_stopwords=del_stop, del_digit=True)
    query_vec = get_w2v_vectors(query_lemmas, model)
    scores = culc_sim_score(vec_data, query_vec, 'word2v')

    ranked = list(scores.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    for position, (doc_id, score) in enumerate(ranked):
        if position >= amount:
            return
        yield (doc_id, info_data[doc_id], score)


In [17]:
def get_d2v_vectors(text, model):
    """Return the document vector that ``model.infer_vector`` produces for ``text``."""
    inferred = model.infer_vector(text)
    return inferred


def search_d2v(string, model, info_data, vec_data, stopwords={}, del_stop=False, amount=10):
    """Rank documents against a query using an inferred document vector.

    This is a generator, so the type check and all scoring run when iteration
    starts, not when the function is called.

    :param string: query text
    :param model: model passed on to ``get_d2v_vectors``
    :param info_data: mapping id -> document info, used to build the results
    :param vec_data: document parts passed on to ``culc_sim_score``
    :param stopwords: stop-word container forwarded to ``preprocessing``
    :param del_stop: whether ``preprocessing`` removes stop words
    :param amount: maximum number of results to yield
    :return: yields ``(id, info_data[id], score)`` in descending score order
    :raises ValueError: if ``string`` is not a ``str``
    """
    if not isinstance(string, str):
        raise ValueError('enter correct data')

    query_lemmas = preprocessing(string, stopwords=stopwords, del_stopwords=del_stop, del_digit=True)
    query_vec = get_d2v_vectors(query_lemmas, model)
    scores = culc_sim_score(vec_data, query_vec, 'doc2v')

    ranked = list(scores.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    for position, (doc_id, score) in enumerate(ranked):
        if position >= amount:
            return
        yield (doc_id, info_data[doc_id], score)

In [18]:
def score_BM25(qf, dl, avgdl, k1, b, N, n):
    """
    Compute the BM25-style score of one query word for one document.

    The term frequency used here is the relative frequency ``qf / dl``.

    :param qf: number of occurrences of the word in the document
    :param dl: document length; a length of 0 is treated as 1
    :param avgdl: average document length
    :param k1: term-frequency scaling parameter of the formula
    :param b: length-normalization parameter of the formula
    :param N: first count in the idf term (the caller passes the collection size)
    :param n: second count in the idf term (the caller passes the number of
        documents recorded for the word)
    :return: score
    """
    doc_len = dl if dl != 0 else 1

    term_freq = qf / doc_len
    inverse_doc_freq = log((N - n + 0.5) / (n + 0.5))
    numerator = (k1 + 1) * term_freq
    denominator = term_freq + k1*(1 - b + b*(doc_len / avgdl))

    return (numerator / denominator) * inverse_doc_freq


def compute_sim(words, avgdl, doc, info_data, word_count, N):
    """
    Sum the BM25 scores of all query words for one document.

    Uses fixed parameters k1 = 2.0 and b = 0.75. Words that have no entry (or
    an empty entry) in ``word_count`` contribute nothing.

    :param words: query lemmas
    :param avgdl: average document length
    :param doc: id of the document being scored
    :param info_data: mapping id -> document info with a 'len' entry
    :param word_count: mapping word -> {doc id: number of occurrences}
    :param N: collection size passed on to ``score_BM25``
    :return: score (0 when no query word is present in ``word_count``)
    """

    k1 = 2.0
    b = 0.75
    ans = 0

    for word in words:
        # .get() instead of word_count[word]: indexing a defaultdict would
        # insert an empty entry for every unseen query word (mutating the
        # shared index on each request), and a plain dict would raise KeyError.
        postings = word_count.get(word)
        if not postings:
            continue

        qf = postings.get(doc, 0)
        dl = info_data[doc]['len']
        n = len(postings)
        ans += score_BM25(qf, dl, avgdl, k1, b, N, n)

    return ans


In [19]:
def get_search_result(text, avgdl, info_data, word_count, stopwords={}, del_stop=True, amount=10):
    """
    Score every document in the collection against the query and yield the best.

    This is a generator, so the type check and all scoring run when iteration
    starts, not when the function is called.

    :param text: query text
    :param avgdl: average document length
    :param info_data: mapping id -> document info; its keys are the documents scored
    :param word_count: word-count index passed on to ``compute_sim``
    :param stopwords: stop-word container forwarded to ``preprocessing``
    :param del_stop: whether ``preprocessing`` removes stop words
    :param amount: maximum number of results to yield
    :return: yields ``(id, info_data[id], score)`` in descending score order
    :raises ValueError: if ``text`` is not a ``str``
    """

    if not isinstance(text, str):
        raise ValueError

    query_lemmas = preprocessing(text, stopwords=stopwords, del_stopwords=del_stop, del_digit=True)
    collection_size = len(info_data)

    scores = {
        doc_id: compute_sim(query_lemmas, avgdl, doc_id, info_data, word_count, collection_size)
        for doc_id in info_data
    }

    ranked = list(scores.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    for position, (doc_id, score) in enumerate(ranked):
        if position >= amount:
            return
        yield (doc_id, info_data[doc_id], score)

In [21]:
def search(string, search_method, avgdl, model_w2v, model_d2v, info_data, vec_data_del, vec_data_not_del, word_count_del, word_count_not_del, amount=10, del_stop=True, stopwords={}):
    """Dispatch a query to the selected search back end.

    :param string: query text
    :param search_method: 'inverted_index', 'word2vec' or 'doc2vec'
    :param avgdl: average document length (used by 'inverted_index')
    :param model_w2v: model used by 'word2vec'
    :param model_d2v: model used by 'doc2vec'
    :param info_data: mapping id -> document info
    :param vec_data_del: vector data used when stop words are removed
    :param vec_data_not_del: vector data used when stop words are kept
    :param word_count_del: word-count index used when stop words are removed
    :param word_count_not_del: word-count index used when stop words are kept
    :param amount: maximum number of results
    :param del_stop: stop words are removed when this is ``True`` or the string
        'True' (the form the web handler passes); any other value keeps them
    :param stopwords: stop-word container forwarded to the back end
    :return: the generator produced by the selected back end
    :raises ValueError: if ``search_method`` is not one of the known methods
    """
    # Accept the boolean as well as the text 'True'. Comparing only against the
    # string made the default del_stop=True select the keep-stop-words branch.
    remove_stops = del_stop in (True, 'True')

    word_count = word_count_del if remove_stops else word_count_not_del
    vec_data = vec_data_del if remove_stops else vec_data_not_del

    if search_method == 'inverted_index':
        return get_search_result(string, avgdl, info_data, word_count, stopwords=stopwords, del_stop=remove_stops, amount=amount)

    if search_method == 'word2vec':
        return search_w2v(string, model_w2v, info_data, vec_data, stopwords=stopwords, amount=amount, del_stop=remove_stops)

    if search_method == 'doc2vec':
        return search_d2v(string, model_d2v, info_data, vec_data, stopwords=stopwords, amount=amount, del_stop=remove_stops)

    # Fail with a clear error instead of an UnboundLocalError on the result variable.
    raise ValueError('unknown search method: {!r}'.format(search_method))



In [None]:

# Shared Mystem instance; preprocessing() reads this module-level name when it lemmatizes.
mystem = Mystem()
# Flask application object that the route decorators attach to.
app = Flask(__name__)


@app.route('/', methods=['GET'])
def main_page():
    """Serve the search form, or the results page when query parameters are present.

    Query parameters read: 'words' (query text), 'amount' (maximum number of
    results), 'stops' (stop-word flag, forwarded as text) and 'model' (search
    method name).
    """
    main_url = url_for('main_page')
    if request.args:
        query = request.args['words']
        # type=int makes a missing or non-numeric value fall back to the default
        # instead of raising ValueError from int().
        amount = request.args.get('amount', default=10, type=int)
        stops = request.args['stops']
        search_method = request.args['model']

        result = search(query, search_method, avgdl, model_w2v, model_d2v, info_data, vec_data_del, vec_data_not_del, word_count_del, word_count_not_del, amount=amount, del_stop=stops, stopwords=russian_stopwords)
        # Hand the template the URL string; previously the view function object
        # itself was passed and the computed URL went unused.
        # NOTE(review): result.html is not visible here — confirm it expects a URL.
        return render_template('result.html', name=result, main_page=main_url)

    return render_template('index.html')


@app.route('/result', methods=['GET'])
def result():
    """Render result.html without passing any template variables."""
    # The URL previously computed here was never used, so the lookup was dropped.
    return render_template('result.html')

# Start Flask's built-in server; this call blocks the cell until the server stops.
# NOTE(review): host='0.0.0.0' listens on all network interfaces, and Flask's own
# startup output recommends a production WSGI server for production use.
if __name__ == '__main__':
    app.run(debug=False, host='0.0.0.0', port=5000)


 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://0.0.0.0:5000/ (Press CTRL+C to quit)
92.242.59.6 - - [23/Oct/2018 12:16:10] "GET / HTTP/1.1" 200 -
92.242.59.6 - - [23/Oct/2018 12:16:20] "GET /?words=%D0%BA%D1%83%D0%BF%D0%B8%D1%82%D1%8C+%D0%B0%D0%B9%D1%84%D0%BE%D0%BD&amount=5&stops=True&model=inverted_index HTTP/1.1" 200 -
  if np.issubdtype(vec.dtype, np.int):
92.242.59.6 - - [23/Oct/2018 12:17:03] "GET /?words=%D0%BA%D1%83%D0%BF%D0%B8%D1%82%D1%8C+%D0%B0%D0%B9%D1%84%D0%BE%D0%BD&amount=5&stops=True&model=word2vec HTTP/1.1" 200 -
92.242.59.6 - - [23/Oct/2018 12:17:18] "GET /?words=%D0%BA%D1%83%D0%BF%D0%B8%D1%82%D1%8C+%D0%B0%D0%B9%D1%84%D0%BE%D0%BD&amount=5&stops=True&model=doc2vec HTTP/1.1" 200 -
92.242.59.6 - - [23/Oct/2018 12:18:05] "GET /?words=%D0%BA%D1%83%D0%BF%D0%B8%D1%82%D1%8C+%D0%B0%D0%B9%D1%84%D0%BE%D0%BD&amount=5&stops=True&model=doc2vec HTTP/1.1" 200 -
92.242.59.6 - - [23/Oct/2018 12:18:37] "GET /?words=%D0%BA%D1%83%D0%BF%D0%B8%D1%82%D1%8C+%D0%B0%D0%B9%D1%84%D0%BE%D0%BD&amount=5&stops=True&model=word2ve

In [52]:
# Debugging leftover: checks whether anything answers on local port 5000.
!wget 127.0.0.1:5000

--2018-10-23 09:04:15--  http://127.0.0.1:5000/
Connecting to 127.0.0.1:5000... failed: Connection refused.


In [9]:
# Debugging leftover: tries to install lsof and list what is bound to port 5000.
# NOTE(review): sudo asks for a password, which a notebook cell cannot supply
# interactively; the recorded output shows lsof was still not found afterwards.
!sudo apt-get install lsof
!lsof -i :5000

[sudo] password for jovyan: 
/bin/sh: 1: lsof: not found
