In [None]:
import json
import os

from itertools import chain

In [None]:
# Load the JSON export stored at the same path that the
# "Save files going into ES to SURFdrive" cell of this notebook writes to.
# NOTE: `documents` is rebuilt from scratch in the flattening cell further down.
with open('/Users/matthijm/surfdrive/POL/scraped/elasticsearch-documents.json') as stream:
    documents = json.load(stream)

In [None]:
# Number of entries in the loaded export.
len(documents)

In [None]:
# Collections to load; each name is a sub-directory of DATA_ROOT.
COLLECTION_NAMES = [
    'figshare',
    'hbovpk',
    'leraar24',
    'stimuleringsmaatregel',
    'wur',
    'wwmhbo',
]

# Relative directory holding one output folder per collection.
DATA_ROOT = os.path.join(*['data', 'freeze-1', 'data', 'output'])

In [None]:
def load_data(collection_name):
    """Load all items of one collection and tag them with the collection name.

    Reads ``<DATA_ROOT>/<collection_name>/with_text.json`` and returns its
    entries as a list. Every entry gets a ``collection_name`` key set to the
    given name.

    Raises whatever ``open`` / ``json.load`` raise when the file is missing
    or is not valid JSON.
    """
    path = os.path.join(DATA_ROOT, collection_name, 'with_text.json')
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(path, encoding='utf-8') as stream:
        items = list(json.load(stream))
    for item in items:
        item['collection_name'] = collection_name
    return items

In [None]:
# Concatenate the items of all collections, in COLLECTION_NAMES order.
items = []
for name in COLLECTION_NAMES:
    items.extend(load_data(name))

In [None]:
# Total number of items across all collections.
len(items)

## Count #documents/items

In [None]:
import pandas as pd

# How many items have 0, 1, 2, ... attached documents.
documents_per_item = pd.Series([len(item['documents']) for item in items])
documents_per_item.value_counts()

# Flatten items to documents
* Skip documents with empty `text` field
* Add `collection_name` text field
* Rename `content_type` to `mime_type` if necessary
* Add item keywords to document keywords
* Add `item_id` for reference purposes
* Add `item_url` for reference purposes

In [None]:
# Total number of documents over all items (before any filtering).
sum(len(item['documents']) for item in items)

In [None]:
# Short labels (used in plots and in the Elasticsearch selection) per MIME type.
HUMANIZED_MIME_TYPES = {
    'application/pdf': 'pdf',
    'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'powerp.',
    'application/vnd.ms-powerpoint': 'powerp.',
    'application/msword': 'word',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'word',
    'application/rtf': 'word',
    'text/plain': 'word',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'excel',
    'text/html': 'html',
    'video': 'video',
    'image': 'image',
    'application/zip': 'zip',
    'audio/mpeg': 'audio',
    'application/octet-stream': 'other'
}

# (substring, canonical key) pairs, applied in this order before the lookup.
_MIME_FAMILIES = (
    ('html', 'text/html'),
    ('video', 'video'),
    ('image', 'image'),
)


def humanize_mime_type(mime_type):
    """Return the short label for a MIME type string.

    Any value containing 'html', 'video' or 'image' is first collapsed onto
    the matching canonical key, so variants such as 'text/html; charset=utf-8'
    or 'video/mp4' resolve without their own table entry.

    A MIME type that is not in HUMANIZED_MIME_TYPES is labelled 'other'
    instead of raising KeyError, so one unexpected type cannot abort a
    whole flattening run.
    """
    for marker, canonical in _MIME_FAMILIES:
        if marker in mime_type:
            mime_type = canonical

    return HUMANIZED_MIME_TYPES.get(mime_type, 'other')

In [None]:
# Flatten the item -> documents nesting into one list of documents and copy
# the item-level metadata onto every document.
documents = []

for item in items:
    for document in item['documents']:
        document['collection_name'] = item['collection_name']

        # HTML documents in the leraar24 collection are relabelled as video.
        # The MIME type can be empty (handled just below), so check it is set
        # before the substring test; `'html' in None` would raise TypeError.
        mime_type = document['mime_type']
        if document['collection_name'] == 'leraar24' and mime_type and 'html' in mime_type:
            document['mime_type'] = 'video'

        if not document['mime_type']:
            # Try to infer mime type
            if 'youtube' in document['url']:
                document['mime_type'] = 'video'
            elif 'wurtv' in document['url']:
                document['mime_type'] = 'video'
            else:
                # No MIME type and no known video host: drop the document.
                print('ignoring', document['collection_name'], document['id'])
                continue

        document['humanized_mime_type'] = humanize_mime_type(document['mime_type'])
        # Item keywords replace whatever `keywords` value the document had.
        document['keywords'] = item.get('keywords', [])
        document['item_id'] = item['id']
        document['item_url'] = item['url']
        documents.append(document)

In [None]:
# Number of documents kept after flattening.
len(documents)

In [None]:
# Inspect one flattened document to see which fields are available.
documents[0]

## Additional filters
* Verify language is set
* Verify text is not null
* Drop English-language video documents

In [None]:
# 1. Keep only documents with a detected language.
documents = [doc for doc in documents if doc['language'] is not None]
print(len(documents))

# 2. Keep only documents that have text.
documents = [doc for doc in documents if doc['text'] is not None]
print(len(documents))

# 3. Drop documents that are both video and English.
documents = [
    doc for doc in documents
    if doc['humanized_mime_type'] != 'video' or doc['language'] != 'en'
]
len(documents)

In [None]:
# De-duplicate on document id; the first occurrence wins and order is preserved.
ids = set()
unique_documents = []
for d in documents:
    doc_id = d['id']
    if doc_id not in ids:
        ids.add(doc_id)
        unique_documents.append(d)
len(unique_documents)

In [None]:
# From here on `documents` refers to the de-duplicated list.
documents = unique_documents

## Statistics for unfiltered corpus
Below are some plots for the unfiltered corpus, that is: the corpus before we selected the documents that should go into Elasticsearch.

In [None]:
# Plotting libraries plus notebook display settings.
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Global matplotlib defaults for the figures in this notebook
# (individual cells may still pass their own figsize).
import matplotlib as mpl
mpl.rcParams['font.size'] = 18
mpl.rcParams['figure.figsize'] = (10, 6)

In [None]:
def shorten_collection_name(name):
    """Return the display name of a collection.

    'stimuleringsmaatregel' is abbreviated to 'stim. reg.'; every other
    name is returned unchanged.
    """
    return 'stim. reg.' if name == 'stimuleringsmaatregel' else name

In [None]:
# One row per document with the three attributes the plots below need.
df = pd.DataFrame([
    dict(
        mime_type=doc['humanized_mime_type'],
        collection_name=shorten_collection_name(doc['collection_name']),
        language=doc['language'],
    )
    for doc in documents
])

collection_sizes = df['collection_name'].value_counts()
print(collection_sizes)
# Total number of documents in the unfiltered corpus (used in the plot title).
num_documents = collection_sizes.sum()

In [None]:
plt.figure(figsize=(10, 7))
# Name the counts column explicitly. The column name produced by
# value_counts().to_frame() differs between pandas versions ('collection_name'
# before 2.0, 'count' from 2.0 on), so renaming it by its old name is fragile.
corpus_counts = df['collection_name'].value_counts().to_frame('#documents')
sns.barplot(data=corpus_counts, x=corpus_counts.index, y='#documents')
plt.title('#Documents per corpus - raw (total: {})'.format(num_documents))
# plt.savefig('/Users/matthijm/surfdrive/POL/scraped/raw-document-overview.png', dpi=150, bbox_inches='tight');

In [None]:
# Number of documents per humanized MIME type.
df['mime_type'].value_counts()

In [None]:
plt.figure(figsize=(10, 7))
# Name the counts column explicitly. The column name produced by
# value_counts().to_frame() differs between pandas versions ('mime_type'
# before 2.0, 'count' from 2.0 on), so renaming it by its old name is fragile.
mime_counts = df['mime_type'].value_counts().to_frame('#documents')
sns.barplot(data=mime_counts, x=mime_counts.index, y='#documents')
plt.title('#Documents per MIME type - raw');
plt.savefig('/Users/matthijm/surfdrive/POL/scraped/raw-mime-types-all.png', dpi=150, bbox_inches='tight');

In [None]:
# One bar chart per collection, stacked vertically in a single figure.
fig, axes = plt.subplots(
    nrows=len(COLLECTION_NAMES),
    ncols=1,
    figsize=(10, len(COLLECTION_NAMES) * 7)
)

for collection_name, ax in zip(COLLECTION_NAMES, axes):
    # df stores the shortened collection names, so shorten before filtering.
    collection_name = shorten_collection_name(collection_name)
    in_collection = df['collection_name'] == collection_name
    # Name the counts column explicitly: the default column name of
    # value_counts().to_frame() changed in pandas 2.0 ('mime_type' -> 'count').
    mime_counts = df.loc[in_collection, 'mime_type'].value_counts().to_frame('#documents')
    sns.barplot(data=mime_counts, x=mime_counts.index, y='#documents', ax=ax)
    ax.set_title('#Documents per MIME type for \'{}\' - raw'.format(collection_name));

plt.savefig('/Users/matthijm/surfdrive/POL/scraped/raw-mime-types-per-collection.png', dpi=150, bbox_inches='tight');

In [None]:
# Number of documents per detected language.
df['language'].value_counts()

In [None]:
plt.figure(figsize=(10, 7))
# Name the counts column explicitly. The column name produced by
# value_counts().to_frame() differs between pandas versions ('language'
# before 2.0, 'count' from 2.0 on), so renaming it by its old name is fragile.
language_counts = df['language'].value_counts().to_frame('#documents')
sns.barplot(data=language_counts, x=language_counts.index, y='#documents')
plt.title('#Documents per language - raw');
plt.savefig('/Users/matthijm/surfdrive/POL/scraped/raw-languages-all.png', dpi=150, bbox_inches='tight');

## Subset data for Elasticsearch ingest
* Select only video transcripts, Word & Powerpoint documents and PDFs
* Select only Dutch and English documents

In [None]:
# Step 1: keep only the document kinds that go into Elasticsearch.
es_documents = [
    doc for doc in documents
    if doc['humanized_mime_type'] in ['video', 'word', 'powerp.', 'pdf']
]
print(len(es_documents))

# Step 2: keep only Dutch and English documents.
es_documents = [doc for doc in es_documents if doc['language'] in ['nl', 'en']]
len(es_documents)

In [None]:
# Rebuild `df`, this time from the Elasticsearch selection only.
es_records = []
for es_document in es_documents:
    es_records.append({
        'mime_type': es_document['humanized_mime_type'],
        'collection_name': shorten_collection_name(es_document['collection_name']),
        'language': es_document['language'],
    })
df = pd.DataFrame(es_records)

# NOTE: from here on `num_documents` is the per-language counts Series;
# later cells call .sum() on it to get the total.
num_documents = df['language'].value_counts()
print(num_documents)
num_documents.sum()

In [None]:
# MIME types of the 'wur' documents that were selected for Elasticsearch.
df.loc[df['collection_name'] == 'wur', 'mime_type']

## Save files going into ES to SURFdrive

In [None]:
# Write the selected documents as indented JSON.
# NOTE: this is the same path the notebook reads from in its first data cell,
# so running this cell overwrites that file.
with open('/Users/matthijm/surfdrive/POL/scraped/elasticsearch-documents.json', 'wt') as stream:
    json.dump(es_documents, stream, indent=2)

## Statistics for documents going into Elasticsearch

In [None]:
plt.figure(figsize=(10, 7))
# Name the counts column explicitly. The column name produced by
# value_counts().to_frame() differs between pandas versions ('collection_name'
# before 2.0, 'count' from 2.0 on), so renaming it by its old name is fragile.
corpus_counts = df['collection_name'].value_counts().to_frame('#documents')
sns.barplot(data=corpus_counts, x=corpus_counts.index, y='#documents')
# num_documents is the per-language counts Series here; its sum is the total.
plt.title('#Documents per collection - ES (total: {})'.format(num_documents.sum()));

# plt.savefig('/Users/matthijm/surfdrive/POL/scraped/es-document-overview.png', dpi=150, bbox_inches='tight');

## MIME type overview per collection

In [None]:
plt.figure(figsize=(10, 7))
# Name the counts column explicitly. The column name produced by
# value_counts().to_frame() differs between pandas versions ('mime_type'
# before 2.0, 'count' from 2.0 on), so renaming it by its old name is fragile.
mime_counts = df['mime_type'].value_counts().to_frame('#documents')
sns.barplot(data=mime_counts, x=mime_counts.index, y='#documents')
plt.title('#Documents per MIME type - ES');
plt.savefig('/Users/matthijm/surfdrive/POL/scraped/es-mime-types-all.png', dpi=150, bbox_inches='tight');

In [None]:
# One bar chart per collection, stacked vertically in a single figure.
fig, axes = plt.subplots(
    nrows=len(COLLECTION_NAMES),
    ncols=1,
    figsize=(10, len(COLLECTION_NAMES) * 7)
)

for collection_name, ax in zip(COLLECTION_NAMES, axes):
    # df stores the shortened collection names, so shorten before filtering.
    collection_name = shorten_collection_name(collection_name)
    in_collection = df['collection_name'] == collection_name
    # Name the counts column explicitly: the default column name of
    # value_counts().to_frame() changed in pandas 2.0 ('mime_type' -> 'count').
    mime_counts = df.loc[in_collection, 'mime_type'].value_counts().to_frame('#documents')
    sns.barplot(data=mime_counts, x=mime_counts.index, y='#documents', ax=ax)
    ax.set_title('#Documents per MIME type for \'{}\' - ES'.format(collection_name));

plt.savefig('/Users/matthijm/surfdrive/POL/scraped/es-mime-types-per-collection.png', dpi=150, bbox_inches='tight');

## List indices for verification

In [None]:
import json

# Connection details are kept out of the notebook, in es-credentials.json
# (keys: url, username, password).
with open('es-credentials.json') as stream:
    credentials = json.load(stream)

# Base URL of the Elasticsearch server.
URL = credentials['url']
# (username, password) pair passed as `auth=` to every request.
AUTH = tuple(credentials[key] for key in ('username', 'password'))

In [None]:
import requests

# Ask Elasticsearch for its index overview and show the plain-text table.
indices_response = requests.get('{}/{}'.format(URL, '_cat/indices'), auth=AUTH)
print(indices_response.text)

## Create new index for freeze-1

In [None]:
# The document text is stored per language so each language gets its own analyzer.
text_mapping = {
    'type': 'object',
    'properties': {
        'en': {'type': 'text', 'analyzer': 'english'},
        'nl': {'type': 'text', 'analyzer': 'dutch'},
    },
}

# All remaining fields are mapped as plain 'text'.
properties = {'title': {'type': 'text'}, 'text': text_mapping}
for field_name in (
    'url',
    'keywords',
    'mime_type',
    'humanized_mime_type',
    'item_id',
    'item_url',
    'collection_name',
):
    properties[field_name] = {'type': 'text'}

# Create the index; the cell output is the raw response body.
requests.put(
    '{}/{}'.format(URL, 'freeze-1'),
    json={'mappings': {'_doc': {'properties': properties}}},
    auth=AUTH
).text

## Delete index (CAREFUL)

In [None]:
import requests

# WARNING: deletes the whole 'freeze-1' index.
# NOTE(review): this cell sits between the "create index" cell and the ingest
# cell, so running the notebook top to bottom removes the index (and the
# mapping defined above) right before the documents are ingested. Skip this
# cell unless the index really has to be dropped.
requests.delete('{}/{}'.format(URL, 'freeze-1'), auth=AUTH)

## Ingest into ES

In [None]:
import re
import copy
from progressbar import ProgressBar

# (document id, HTTP status code) for every PUT that did not succeed.
failed_ingests = []

for doc in ProgressBar()(es_documents):
    # Collapse every run of whitespace into a single space.
    clean_text = re.sub(r'\s+', ' ', doc['text'])

    # Work on a copy so the in-memory documents keep their raw text.
    es_doc = copy.deepcopy(doc)
    language = es_doc['language']
    if language not in ('nl', 'en'):
        raise ValueError('this should not happen: unexpected language {!r}'.format(language))
    # Store the text under its language key (text.nl / text.en).
    es_doc['text'] = {language: clean_text}

    url = '{}/freeze-1/_doc/{}'.format(URL, doc['id'])
    response = requests.put(url, auth=AUTH, json=es_doc)
    # Record failures instead of ignoring the response, but keep ingesting the rest.
    if not response.ok:
        failed_ingests.append((doc['id'], response.status_code))

if failed_ingests:
    print('{} of {} documents failed to ingest, first failures: {}'.format(
        len(failed_ingests), len(es_documents), failed_ingests[:10]
    ))

In [None]:
# Spot check: fetch the first selected document back from the 'freeze-1' index.
requests.get('{}/{}/_doc/{}'.format(URL, 'freeze-1', es_documents[0]['id']), auth=AUTH).text

## List indices for verification

In [None]:
import requests

# List the indices again, now that the ingest has run.
indices_response = requests.get('{}/{}'.format(URL, '_cat/indices'), auth=AUTH)
print(indices_response.text)

## Do a test query

In [None]:
# Search the English text field for 'gene' and return the first 10 hits.
search_body = {
    "from": 0,
    "size": 10,
    'query': {
        'multi_match': {
            'query': 'gene',
            'fields': ['text.en'],
        },
    },
}

requests.get(
    '{}/{}'.format(URL, 'freeze-1/_search'),
    json=search_body,
    auth=AUTH
).json()

In [None]:
# NOTE(review): this requests the mapping of an index named 'test', not the
# 'freeze-1' index used by the other cells of this notebook. Confirm that is intended.
requests.get('{}/{}'.format(URL, 'test/_mapping'), auth=AUTH).json()