In [1]:
%config IPCompleter.greedy=True
import re
import json
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
from sklearn.feature_extraction.text import CountVectorizer
import requests
import pandas as pd
from sklearn import preprocessing
import numpy as np
import nltk
from tqdm.notebook import tqdm

In [2]:
# Shared Elasticsearch client used by the functions in this notebook.
# It targets localhost:9200; the host entry also carries 'timeout': 360 and
# 'maxsize': 25 (presumably seconds and connection-pool size — confirm against
# the installed elasticsearch-py version).
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'timeout': 360, 'maxsize': 25}])

In [3]:
# Start from a fresh, empty 'myandex' index. The existence check replaces a
# bare `except:` around create, so connection problems, permission errors or
# a Ctrl-C surface as they are instead of being mistaken for "index already
# exists" and answered with a delete.
if es.indices.exists(index='myandex'):
    es.indices.delete(index='myandex')
es.indices.create(index='myandex')

## Create index

In [4]:
def recreate_index(name_idx, index='myandex'):
    """
    Drop the index (when it exists) and create it again from a definition.

    input:
        name_idx - index definition passed as the body of the create call
                   (despite the name this is the settings/mappings dict,
                   not an index name)
        index - name of the index to recreate
    """
    # Deleting a missing index raises, so only delete what actually exists.
    if es.indices.exists(index=index):
        es.indices.delete(index=index)
    es.indices.create(index=index, body=name_idx)

## Tokenizer

In [5]:
def tokenizer(text, bigrams=False):
    """
    Split text into tokens with one of the analyzers of the 'myandex' index.

    input:
        text - string to analyze
        bigrams - if True use 'bigram_analyzer', otherwise 'my_analyzer'
    output:
        list of token strings in the order Elasticsearch returned them
    """
    analyzer_name = 'bigram_analyzer' if bigrams else 'my_analyzer'
    request = {'analyzer': analyzer_name, 'text': text}
    response = es.indices.analyze(index='myandex', body=request)
    return [entry['token'] for entry in response['tokens']]

## Indexing documents

In [6]:
def create_es_action(index, doc_id, document):
    """Build one bulk action that stores `document` under `doc_id` in `index`."""
    action = dict(_index=index, _id=doc_id, _source=document)
    return action

In [7]:
def es_actions_generator(path, idx):
    """
    Yield one bulk action per document stored in a JSON file.

    input:
        path - path to a JSON file containing a list of documents
        idx - offset added to each document's position to form its id
    output:
        generator of actions built by create_es_action for the 'myandex' index
    """
    # Binary mode lets json detect the UTF-8/16/32 encoding itself instead of
    # depending on the platform's default text encoding.
    with open(path, 'rb') as inf:
        documents = json.load(inf)
    # The file is fully parsed above, so it is closed before the first yield
    # rather than staying open for as long as the consumer keeps iterating.
    for i, doc in enumerate(documents):
        yield create_es_action('myandex', i + idx, doc)

In [8]:
def indexing_documents(path, idx=0):
    """
    Bulk-index every document of a JSON file into Elasticsearch.

    input:
        path - path to the JSON file handed to es_actions_generator
        idx - id offset handed to es_actions_generator
    Results that the bulk helper reports as not ok are printed.
    """
    actions = es_actions_generator(path, idx)
    outcomes = parallel_bulk(es, actions, queue_size=4, thread_count=4, chunk_size=1000)
    for succeeded, info in outcomes:
        if succeeded:
            continue
        print(info)
    

## Load data

In [10]:
# Articles that drive the queries: run_all iterates over this frame and reads
# its 'title' and 'content' columns.
articles = pd.read_csv('../compare/articles_main.csv')
# Candidate table. No active code in the visible cells reads it; the popularity
# re-ranking that would use its 'score' column is disabled.
titles = pd.read_csv('../compare/idioms.csv')

## Query

In [11]:
def search(query, i, *args):
    """
    Run a query against the 'myandex' index and return the formatted hits.

    input:
        query - request body for Elasticsearch
        i - forwarded unchanged to pretty_print_result
        *args - accepted but not used
    output:
        the value pretty_print_result builds from the response (the request
        asks for up to 3000 hits)
    """
    raw_response = es.search(index='myandex', body=query, size=3000)
    return pretty_print_result(raw_response, i)
                        
def pretty_print_result(search_result, i):
    """
    Flatten an Elasticsearch response into a list of result dicts.

    input:
        search_result - search response; hits are read from ['hits']['hits']
        i - not used by the function
    output:
        list of {'id', 'score', 'title', 'type'} dicts in hit order, where
        'title' is "<title> (<type>)"
    """
    formatted = []
    for hit in search_result['hits']['hits']:
        source = hit['_source']
        label = source['title'] + ' ' + '(' + source['type'] + ')'
        formatted.append({
            'id': hit['_id'],
            'score': hit['_score'],
            'title': label,
            'type': source['type'],
        })
    return formatted
    
                  
def get_doc_by_id(doc_id):
    """Fetch a document from the 'myandex' index by id and return its '_source'."""
    document = es.get(index='myandex', id=doc_id)
    return document['_source']

In [12]:
def make_query(fields, text):
    """
    Build a multi_match request body.

    input:
        fields - list of field names to search in
        text - query text
    output:
        dict of the form {"query": {"multi_match": {...}}}
    """
    multi_match = {"fields": fields, "query": text}
    return {"query": {"multi_match": multi_match}}

## Weighted score

In [13]:
def scale_score(lst):
    """
    Min-max scale the scores of a result list and multiply them by 10.

    input:
        lst - list of result dicts, each with a numeric 'score'
    output:
        1-D numpy array with one scaled score per item (same order);
        an empty array for an empty list
    """
    if len(lst) == 0:
        # MinMaxScaler refuses to fit on zero samples, so answer directly.
        return np.array([], dtype=float)
    scores = np.array([item['score'] for item in lst]).reshape(-1, 1)
    scaled = preprocessing.MinMaxScaler().fit_transform(scores)[:, 0]
    return scaled * 10

In [14]:
def weight_score(lst, scores, popularity, score_weight=0.8, popularity_weight=0.2):
    """
    Blend relevance scores with document popularity.

    input:
        lst - list of result dicts; int(item['id']) is the key used to look
              up the popularity of each item
        scores - relevance score per item of lst, in the same order
        popularity - popularity values indexable by document id
        score_weight - weight of the relevance score
        popularity_weight - weight of the popularity value
    output:
        list with score_weight * score + popularity_weight * popularity,
        one value per item of lst
    """
    result = []
    for position, item in enumerate(lst):
        doc_id = int(item['id'])
        blended = score_weight * scores[position] + popularity_weight * popularity[doc_id]
        result.append(blended)
    return result

In [15]:
def _count_verbs(title):
    """Count the words of a result title that NLTK tags as verbs (VB, VBD, ...)."""
    text = re.sub(r'[^\w\s]', '', title)
    # The last word is dropped — presumably the "(type)" suffix that
    # pretty_print_result appends to every title.
    words = nltk.tokenize.word_tokenize(text)[:-1]
    # NOTE(review): every word is tagged on its own; tagging the whole title
    # at once would give the tagger context but would change the results.
    return sum(1 for word in words if nltk.pos_tag([word])[0][1].startswith('VB'))


def filters(lst, filter_len, filter_type, filter_verbs, min_len=3, max_len=10, min_verbs=1, limit=5):
    """
    Keep the first `limit` results that pass every enabled filter.

    Each filter is applied only when its flag is True, so any combination of
    flags works (all flags False keeps the first `limit` rows unchanged).

    input:
        lst - result from elasticsearch (list of dicts with 'title' and 'type')
        filter_len - if True: filter length by min_len and max_len
        filter_type - if True: filter by type (book or film)
        filter_verbs - if True: filter by min_verbs
        min_len, max_len - bounds on the number of title tokens; the check
            uses min_len + 1 and max_len + 1 (presumably to allow for the
            type suffix inside the title — confirm)
        min_verbs - minimal number of verbs in the title
        limit - stop after this many accepted results
    output:
        list of accepted rows in their original order (at most `limit`)
    """
    result = []
    for row in lst:
        if len(result) >= limit:
            break
        title = row['title']
        # Cheapest check first; the length check needs an Elasticsearch
        # analyze call and the verb check runs the POS tagger.
        if filter_type and row['type'] not in ('film', 'book'):
            continue
        if filter_len:
            n_tokens = len(tokenizer(title, bigrams=False))
            if not (min_len + 1 <= n_tokens <= max_len + 1):
                continue
        if filter_verbs and _count_verbs(title) < min_verbs:
            continue
        result.append(row)
    return result

In [16]:
def sorting(lst):
    """
    Order search results by the score of their first candidate.

    input:
        lst - list of dicts, each with a 'result' list of scored candidates
    output:
        new list: items that have candidates, sorted by the score of their
        first candidate (highest first), followed by the items without any
    """
    with_hits = [entry for entry in lst if len(entry['result'])]
    without_hits = [entry for entry in lst if not len(entry['result'])]

    def first_score(entry):
        return entry['result'][0]['score']

    return sorted(with_hits, key=first_score, reverse=True) + without_hits

## Search

In [31]:
def _article_query_text(item, query):
    """Return the article text used as the query: one column, or title + content for 'all'."""
    columns = ['title', 'content'] if query == 'all' else [query]
    # Missing cells (NaN/None) become empty strings so concatenation and
    # slicing never fail on a float.
    parts = ['' if pd.isna(item[column]) else str(item[column]) for column in columns]
    return ' '.join(parts)


def run_all(dataset, path, summary_name):
    """
    Index a dataset with every analyzer configuration and, for every article,
    retrieve the best-matching candidate titles.

    input: 
        dataset - dataset name ('idioms'/'movies'/...); selects the index
                  definition ../index/<dataset>/<method>_idx2.json
        path - path to dataset (JSON file with the documents to index)
        summary_name - list of document fields with summary/lead/...; searched
                       together with 'title' when the document option is 'all'
    output:
        pd.DataFrame() with one row per (article, candidate) pair and the
        columns method, query_field, doc_field, article, candidate, score, type
    """
    # Only the unigram index is evaluated; 'bigram' and 'bigram_stop' are the
    # other configurations this experiment was written for.
    methods = ['unigram']
    # (article part used as query text, document field(s) searched)
    options = [('title', 'title'), ('title', 'all'), ('all', 'title'), ('all', 'all')]
    max_query_chars = 7000  # query text is cut to this many characters
    top_n = 5               # candidates kept per article

    records = []

    print('TOTAL PROGRESS:')
    for method in tqdm(methods, total=len(methods)):
        print('BEGING METHOD:', method)
        # Binary mode lets json detect the file encoding on its own.
        with open(f'../index/{dataset}/{method}_idx2.json', 'rb') as inf:
            index = json.load(inf)
        recreate_index(index)

        print('Indexing document....')
        indexing_documents(path, idx=0)
        # Freshly indexed documents only become searchable after a refresh;
        # forcing one here makes the very first query see the full index.
        es.indices.refresh(index='myandex')
        print('Indexing done')

        print('Search begins...')
        for j, (query, docs) in tqdm(enumerate(options), total=len(options)):
            print('Option', j + 1)
            if docs == 'all':
                fields = ['title']
                fields.extend(summary_name)
            else:
                fields = [docs]

            rows = tqdm(articles.iterrows(), total=articles.shape[0])
            for position, (_, item) in enumerate(rows):
                text = _article_query_text(item, query)

                top = []
                if text.strip():
                    top = search(make_query(fields, text[:max_query_chars]), position)
                if top:
                    # Popularity re-ranking (scale_score / weight_score) is
                    # disabled; candidates keep their Elasticsearch score.
                    top = filters(top, True, False, True)

                # One record per candidate, labelled with the article that
                # produced it and emitted exactly once.
                for candidate in top[:top_n]:
                    records.append({
                        'method': method,
                        'query_field': query,
                        'doc_field': docs,
                        'article': item['title'],
                        'candidate': candidate['title'],
                        'score': candidate['score'],
                        # NOTE(review): constant label regardless of the
                        # dataset or candidate['type'] — confirm it is intended.
                        'type': 'idiom',
                    })
    return pd.DataFrame(records)

In [32]:
# Version tag. Nothing in the visible cells reads it — presumably used when
# naming the written results (confirm against the cells that follow).
VERSION = 3

## Write result:

In [1]:
# Run the whole pipeline on the idioms dataset; 'overview' is the summary field.
final = run_all(
    dataset='idioms',
    path='../compare/idioms.json',
    summary_name=['overview'],
)