In [1]:
import elasticsearch as es

In [3]:
import os

In [4]:
import glob

In [5]:
from tqdm import tqdm_notebook as tqdm

In [2]:
ec = es.Elasticsearch(hosts=['localhost'])

In [3]:
ec.info()

{'cluster_name': 'elasticsearch',
 'cluster_uuid': 'PK0wP8RZTDmwz3nJ4WTUiw',
 'name': 'WFfEHU-',
 'tagline': 'You Know, for Search',
 'version': {'build_date': '2017-10-06T20:33:39.012Z',
  'build_hash': '1a2f265',
  'build_snapshot': False,
  'lucene_version': '6.6.1',
  'number': '5.6.3'}}

In [6]:
def split_filename(fn):
    splitted = fn.split('/')
    versioned_id = splitted[-1].replace('.tex', '')
    category = splitted[-2]
    clean_id = versioned_id.split('v')[0]
    return clean_id, category, versioned_id

In [7]:
def ensure_utf8(line):
    if type(line) is str:
        return line
    else:
        return line.decode('utf-8')

In [8]:
def extract_abstract_and_title(lines):
    abstract = [];
    in_abstract = False
    title = ''
    for line in lines:
        # line = ensure_utf8(line)
        if 'end{abstract}' in line:
            in_abstract = False
        if in_abstract:
            abstract.append(line)
        elif r'\title{' in line:
            title = line.replace(r'\title{', '').replace('}', '')
        if 'begin{abstract}' in line:
            in_abstract = True
    return ' '.join(abstract), title

In [9]:
abstracts = []
for fn in tqdm(glob.glob('data/*/*.tex')):
    with open(fn, 'rt') as f:
        try:
            abstract, title = extract_abstract_and_title(f.readlines())
            if not abstract.strip():
                continue
        except UnicodeDecodeError:
            continue
    clean_id, category, versioned_id = split_filename(fn)
    abstracts.append(
    {
        'clean_id': clean_id,
        'category': category,
        'versioned_id': versioned_id,
        'file': fn,
        'abstract': abstract,
        'title': title if title else category + '/' + clean_id
    }
    )

A Jupyter Widget




In [85]:
len(abstracts)

148840

In [59]:
def ops_iter(abstracts):
    for abstract in abstracts:
        yield {
            '_op_type': 'index',
            '_index': 'abstracts',
            '_type': 'abstract',
            '_id': abstract['clean_id'],
            '_source': abstract
        }

In [89]:
# index documents into elasticsearch
for _ in streaming_bulk(ec, ops_iter(tqdm(abstracts)), chunk_size=10000):
    pass

A Jupyter Widget

In [13]:
# prepare test queries
import numpy as np
import pandas as pd
queries = []
p=[0.5, 0.25, 0.125, 0.1, 0.025]
for a in tqdm(abstracts):
    words = a['abstract'].split()
    good_words = [word for word in words if all(map(str.isalpha, word))]
    N = len(good_words)
    if N > 50:
        for _ in range(10):
            wcnt = np.random.choice(range(1, 6), size=1, p=p)[0]
            w = []
            for _ in range(wcnt):
                w.append(good_words[np.random.randint(0, N)])
            queries.append('%20'.join(w))
pd.DataFrame({'query': queries}).to_csv('test_queries.csv', index=False)

A Jupyter Widget


