# Assignment 3, Indexing

In this notebook you will index DBpedia (see the sub-collections listed under `https://github.com/uis-dat640-fall2019/admin/tree/master/assignments/assignment-3#data`). 

Make sure you specify the index settings, analyzer, and fields appropriately to support the models to be implemented in subsequent notebooks.

Note: you'll need to build a positional index. Use a single shard to make sure you're getting the right term statistics.

Be sure to use both markdown cells with section headings and explanations, as well as writing readable code, to make it clear what your intention is each step of the way through the code. 

In [2]:
# fix jupyter notebook narrow style
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import os
import subprocess
import json

from pprint import pprint
from tqdm import tqdm
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from elasticsearch import RequestError

In [2]:
def get_n_lines(filename):
    """Return the number of lines in ``filename``.

    The count is done in pure Python rather than by shelling out to
    ``wc -l``, so it does not require an external ``wc`` binary. A final
    line that is not terminated by a newline is counted as a line as well.

    Args:
        filename: path of the file whose lines are counted.

    Returns:
        int: the number of lines (0 for an empty file).
    """
    n_lines = 0
    last_byte = b'\n'
    with open(filename, 'rb') as f:
        # Read fixed-size chunks so arbitrarily large files never have to fit in memory.
        for chunk in iter(lambda: f.read(1 << 20), b''):
            n_lines += chunk.count(b'\n')
            last_byte = chunk[-1:]
    # An unterminated final line has no '\n' of its own but is still a line.
    if last_byte != b'\n':
        n_lines += 1
    return n_lines

# Initialize Elasticsearch index

In [3]:
# Connect with the client's default connection settings (no arguments given)
# and display the cluster info as this cell's output.
es = Elasticsearch()
es.info()

{'name': 'HoG',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': 'Zle62SN2R0-CJz5T3d07Og',
 'version': {'number': '7.4.2',
  'build_flavor': 'default',
  'build_type': 'deb',
  'build_hash': '2f90bbf7b93631e52bafb59b3b049cb44ec25e96',
  'build_date': '2019-10-28T20:40:44.881551Z',
  'build_snapshot': False,
  'lucene_version': '8.2.0',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

In [4]:
INDEX_NAME = 'collection_v2'


def _analyzed_text_field():
    """Return the mapping shared by every analysed field of the index.

    Each call returns a fresh dict: a text field analysed with
    ``english_analyzer`` whose term vectors are stored with positions.
    """
    return {
        'type': 'text',
        'term_vector': 'with_positions',
        'analyzer': 'english_analyzer',
    }


# Token filters referenced by name from the custom analyzer below.
_TOKEN_FILTERS = {
    'stemming_english_minimal': {'type': 'stemmer', 'name': 'minimal_english'},
    'english_stop': {'type': 'stop', 'stopwords': '_english_'},
    # Tokens produced by URIs and language tags that carry no content.
    'dbpedia_stop': {'type': 'stop', 'stopwords': ['en', 'http', 'www', 'org']},
}

# Custom analyzer: standard tokenizer followed by the filter chain, applied in this order.
_ANALYZERS = {
    'english_analyzer': {
        'type': 'custom',
        'tokenizer': 'standard',
        'filter': [
            'lowercase',
            'asciifolding',
            'dbpedia_stop',
            'english_stop',
            'stemming_english_minimal',
        ],
    },
}

INDEX_SETTINGS = {
    'settings': {
        # Single shard, no replicas.
        'index': {'number_of_shards': 1, 'number_of_replicas': 0},
        'analysis': {'analyzer': _ANALYZERS, 'filter': _TOKEN_FILTERS},
    },
    'mappings': {
        'properties': {
            'names': _analyzed_text_field(),
            'categories': _analyzed_text_field(),
            'similar_entity_names': _analyzed_text_field(),
            'attributes': _analyzed_text_field(),
            'related_entity_names': _analyzed_text_field(),
            # URIs are split on whitespace only; this field declares no term_vector option.
            'related_entity_uri': {'type': 'text', 'analyzer': 'whitespace'},
            'catch_all': _analyzed_text_field(),
        },
    },
}

In [None]:
# Create the index with the custom analyzer and mappings when it does not
# exist yet, so that a fresh "Restart & Run All" does not index documents into
# an index that lacks INDEX_SETTINGS. An existing index is left untouched; to
# rebuild from scratch, call es.indices.delete(index=INDEX_NAME) first
# (destructive) and re-run this cell.
if not es.indices.exists(index=INDEX_NAME):
    es.indices.create(index=INDEX_NAME, body=INDEX_SETTINGS)

# Display the indices (and their aliases) currently known to the cluster.
es.indices.get_alias('*')

# Files used

In [None]:
# Files whose triples are read with subject and object swapped.
_REVERSED_FILES = {
    'knowledge_base/disambiguations_en.ttl',
    'knowledge_base/transitive_redirects_en.ttl',
}

# Maps each input file to its "reverse relations" flag. The tuple order is the
# order in which the files are processed.
files = {
    path: path in _REVERSED_FILES
    for path in (
        'knowledge_base/anchor_text_en.ttl',
        'knowledge_base/article_categories_en.ttl',
        'knowledge_base/disambiguations_en.ttl',
        'knowledge_base/infobox_properties_en.ttl',
        'knowledge_base/instance_types_transitive_en.ttl',
        'knowledge_base/labels_en.ttl',
        'knowledge_base/long_abstracts_en.ttl',
        'knowledge_base/mappingbased_literals_en.ttl',
        'knowledge_base/mappingbased_objects_en.ttl',
        'knowledge_base/page_links_en.ttl',
        'knowledge_base/persondata_en.ttl',
        'knowledge_base/short_abstracts_en.ttl',
        'knowledge_base/transitive_redirects_en.ttl',
    )
}

## Field predicates

In [None]:
# Maps an index field to the predicates whose objects are stored in that field.
# Each predicate is listed once; predicate names are case-sensitive, so e.g.
# `birthname` and `birthName` are distinct entries.
fields = {
    'names' : [
        '<http://xmlns.com/foaf/0.1/name>',
        '<http://dbpedia.org/property/name>',
        '<http://xmlns.com/foaf/0.1/givenName>',
        '<http://xmlns.com/foaf/0.1/surname>',
        '<http://dbpedia.org/property/officialName>',
        '<http://dbpedia.org/property/officialNames>',
        '<http://dbpedia.org/property/fullName>',
        '<http://dbpedia.org/property/nativename>',
        '<http://dbpedia.org/property/birthname>',
        '<http://dbpedia.org/property/nickname>',
        '<http://dbpedia.org/property/nicknames>',
        '<http://dbpedia.org/property/showName>',
        '<http://dbpedia.org/property/shipName>',
        '<http://dbpedia.org/property/clubname>',
        '<http://dbpedia.org/property/unitName>',
        '<http://dbpedia.org/property/otherNames>',
        '<http://dbpedia.org/property/othernames>',
        '<http://dbpedia.org/property/otherName>',
        '<http://dbpedia.org/property/othername>',
        '<http://dbpedia.org/property/formerNames>',
        '<http://dbpedia.org/property/formerName>',
        '<http://dbpedia.org/property/formernames>',
        '<http://dbpedia.org/property/birthName>',
        '<http://dbpedia.org/property/alternativeNames>',
        '<http://dbpedia.org/property/alternateNames>',
        '<http://dbpedia.org/property/names>',
        '<http://dbpedia.org/property/fullname>',
        '<http://www.w3.org/2000/01/rdf-schema#label>'
    ],
    'categories' : [
        '<http://purl.org/dc/terms/subject>'
    ],
    'similar_entity_names' : [
        '<http://dbpedia.org/ontology/wikiPageWikiLinkText>',
        '<http://dbpedia.org/ontology/wikiPageDisambiguates>',
        '<http://dbpedia.org/ontology/wikiPageRedirects>'
    ]
}

## Load entity, category and property labels

In [None]:
def read_ttl(filename, predicate_match):
    """Map each subject of a triple file to the object of its first matching triple.

    Args:
        filename: path of the file to read, one space-separated triple per line.
        predicate_match: only triples whose predicate equals this string are kept.

    Returns:
        dict: subject -> everything that follows the predicate on the line,
        exactly as read (it is not stripped or unquoted). When a subject has
        several matching triples, only the first one is kept.
    """
    mapping = {}

    # Explicit encoding so decoding does not depend on the platform default
    # (assumes the dumps are UTF-8 encoded).
    with open(filename, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc=f'extracting {filename}', total=get_n_lines(filename)):
            # Only lines that start with a '<'-delimited subject are triples.
            if not line.startswith('<'):
                continue

            parts = line.split(' ', maxsplit=2)
            if len(parts) < 3:
                # Malformed line without an object: skip it instead of crashing.
                continue

            subject, predicate, obj = parts
            if predicate == predicate_match:
                # setdefault keeps the first value seen for a subject.
                mapping.setdefault(subject, obj)

    return mapping

In [None]:
# The same predicate is matched in all three label files.
predicate_match = '<http://www.w3.org/2000/01/rdf-schema#label>'

# URI -> label lookups used later when resolving entities, categories and properties.
entity_labels = read_ttl('knowledge_base/labels_en.ttl', predicate_match)
category_labels = read_ttl('knowledge_base/category_labels_en.ttl', predicate_match)
property_labels = read_ttl('knowledge_base/infobox_property_definitions_en.ttl', predicate_match)

## URI resolution

In [None]:
def is_literal(x):
    """Return True when ``x`` ends with one of the two literal line endings.

    The recognised endings are an English language tag (``@en .`` + newline)
    and a closing quote directly followed by ``.`` + newline.
    """
    return x.endswith(('@en .\n', '".\n'))

def is_attribute(x):
    """Return True when ``x`` starts with the ``<http://dbp`` prefix."""
    return x.startswith('<http://dbp')

def is_category(x):
    """Return True when ``x`` is a DBpedia ``Category:`` resource URI."""
    return x.startswith('<http://dbpedia.org/resource/Category:')

def resolve_category(x):
    """Return the label stored for category URI ``x``, or ``x`` itself when none is known."""
    return category_labels[x] if x in category_labels else x

def is_property(x):
    """Return True when ``x`` is a URI in the DBpedia ``property`` namespace."""
    return x.startswith('<http://dbpedia.org/property/')

def resolve_property(x):
    """Return the label stored for property URI ``x``, or ``x`` itself when none is known."""
    # TODO: decide on a better fallback for unresolved predicates, e.g.
    # <http://www.w3.org/2000/01/rdf-schema#comment> has no label here.
    return property_labels[x] if x in property_labels else x

def is_entity(x):
    """Return True when ``x`` is a URI in the DBpedia ``resource`` namespace."""
    return x.startswith('<http://dbpedia.org/resource/')

def resolve_entity(x):
    """Return the label stored for entity URI ``x``, or ``x`` itself when none is known."""
    return entity_labels[x] if x in entity_labels else x

def is_http(x):
    """Return True when ``x`` starts with ``<http`` (any bracketed http/https URI)."""
    return x.startswith('<http')

def resolve_http(x):
    """Return the last ``/``-separated segment of the text before the first ``>``."""
    before_bracket = x.partition('>')[0]
    return before_bracket.rsplit('/', 1)[-1]

def is_typed(x):
    """Return True when ``x`` contains the ``^^<`` datatype marker."""
    return '^^<' in x

def resolve_type(x):
    """Return the part of ``x`` that precedes the first ``"^^`` (the datatype suffix is dropped).

    The opening quote of the literal, if any, is kept.
    """
    return x.partition('"^^')[0]

def resolve(x):
    """Turn a raw triple term into the text to index, using the first matching rule.

    Returns ``x`` unchanged when no rule applies.
    """
    # Order matters: a category URI also passes is_entity and is_http, and
    # property/entity URIs also pass is_http, so the most specific tests come first.
    rules = (
        (is_category, resolve_category),
        (is_property, resolve_property),
        (is_entity, resolve_entity),
        (is_http, resolve_http),
        (is_typed, resolve_type),
    )
    for applies, convert in rules:
        if applies(x):
            return convert(x)
    return x

## Entity serialization

In [None]:
def serialize_entity(entity, predicates, objects):
    """Build the fielded document of one entity from its predicate/object pairs.

    Args:
        entity: subject URI of the entity. Not used by this function; kept so
            existing callers keep working.
        predicates: predicate of each triple of the entity.
        objects: object of each triple, aligned with ``predicates``.

    Returns:
        dict: one string per index field. Every value added to a field is
        preceded by a single space; a field that received nothing is ``''``.
    """
    # Collect the pieces of each field and join them once at the end, instead
    # of rebuilding an ever-growing string for every triple.
    parts = {
        'names' : [],
        'categories' : [],
        'similar_entity_names' : [],
        'attributes' : [],
        'related_entity_names' : [],
        'related_entity_uri' : [],
        'catch_all' : [],
    }

    for pred, obj in zip(predicates, objects):
        resolved_obj = resolve(obj)

        # A predicate listed in `fields` goes to the first field that lists it.
        field = next(
            (name for name, predicate_list in fields.items() if pred in predicate_list),
            None,
        )

        if field is not None:
            parts[field].append(resolved_obj)
        elif is_attribute(pred) and is_literal(obj):
            # Attributes keep the (resolved) predicate next to the value.
            parts['attributes'].extend((resolve(pred), resolved_obj))
        elif is_attribute(pred) and is_entity(obj):
            # Related entities are stored both by resolved name and by raw URI.
            parts['related_entity_names'].append(resolved_obj)
            parts['related_entity_uri'].append(obj)
        else:
            parts['catch_all'].append(resolved_obj)

    return {
        field: ''.join(f' {value}' for value in values)
        for field, values in parts.items()
    }

## Memory indexing

In [None]:
def merge_with_collection(entity, entry, collection):
    """Store ``entry`` under ``entity`` in ``collection``, merging with any existing entry.

    A new entity gets ``entry`` itself (same object). For an entity already
    present, every field of ``entry`` is placed in front of the stored text,
    separated by one space. ``collection`` is modified in place.
    """
    if entity in collection:
        stored = collection[entity]
        for field_name in entry:
            # The new text goes first, then the text accumulated so far.
            stored[field_name] = entry[field_name] + ' ' + stored[field_name]
        return

    collection[entity] = entry

In [11]:
def generate_data(collection):
    """Yield one bulk index action per entity of ``collection`` that passes the filters.

    An entity is skipped when its id is 512 bytes or more once UTF-8 encoded,
    or when its ``names`` or ``catch_all`` field is empty or whitespace only.
    """
    for entity, entry in tqdm(collection.items(), desc='indexing collection'):
        if len(entity.encode('utf-8')) >= 512:
            continue
        if not entry['names'].strip():
            continue
        if not entry['catch_all'].strip():
            continue

        yield dict(_index=INDEX_NAME, _id=entity, _source=entry)

In [None]:
# In-memory store filled by index_collection(): entity -> entry dict.
collection = {}

In [None]:
def index_collection():
    """Read every file of ``files`` and merge its triples into the global ``collection``.

    Consecutive triples that share a subject are grouped, serialized with
    ``serialize_entity`` and merged with ``merge_with_collection``. For files
    flagged as reversed in ``files``, subject and object are swapped first.
    The group still pending at the end of a file is flushed as well, so the
    last subject of each file is not lost.
    """
    for filename, reverse_relation in files.items():

        current_subject = None
        predicates = []
        objects = []

        # Explicit encoding so decoding does not depend on the platform default
        # (assumes the dumps are UTF-8 encoded).
        with open(filename, 'r', encoding='utf-8') as f:
            for line in tqdm(f, desc=f'reading {filename}', total=get_n_lines(filename)):
                # Only lines that start with a '<'-delimited term are triples.
                if not line.startswith('<'):
                    continue

                parts = line.split(' ', maxsplit=2)
                if len(parts) < 3:
                    # Malformed line without an object: skip it instead of crashing.
                    continue

                if reverse_relation:
                    obj, predicate, subject = parts
                    # get rid of the trailing characters at the end of the line
                    subject = subject.split(' ')[0]
                else:
                    subject, predicate, obj = parts
                    # to properly retrieve entity URI
                    if not is_literal(obj):
                        obj = obj.split(' ')[0]

                # when the subject changes, merge the finished group into the collection
                if subject != current_subject:
                    if current_subject is not None:
                        entry = serialize_entity(current_subject, predicates, objects)
                        merge_with_collection(current_subject, entry, collection)
                    current_subject = subject
                    predicates = []
                    objects = []

                predicates.append(predicate)
                objects.append(obj)

        # flush the group that was still being accumulated when the file ended
        if current_subject is not None:
            entry = serialize_entity(current_subject, predicates, objects)
            merge_with_collection(current_subject, entry, collection)

In [None]:
# Fill the in-memory `collection` from every file listed in `files`.
index_collection()

## Elasticsearch indexing

In [None]:
# Finally, send the in-memory collection to Elasticsearch with the bulk helper;
# generate_data yields one index action per entity that passes its filters.
helpers.bulk(es, generate_data(collection))

## Testing our index 

In [49]:
# Sanity check: run a free-text query and keep only the ids of the hits in the response.
pprint(es.search(index=INDEX_NAME, q='encryption standard', filter_path=['hits.hits._id']))

{'hits': {'hits': [{'_id': '<http://dbpedia.org/resource/Advanced_Encryption_Standard>'},
                   {'_id': '<http://dbpedia.org/resource/Data_Encryption_Standard>'},
                   {'_id': '<http://dbpedia.org/resource/Advanced_Encryption_Standard_process>'},
                   {'_id': '<http://dbpedia.org/resource/S-63_(encryption_standard)>'},
                   {'_id': '<http://dbpedia.org/resource/AES_implementations>'},
                   {'_id': '<http://dbpedia.org/resource/Intel_Cascade_Cipher>'},
                   {'_id': '<http://dbpedia.org/resource/IEEE_P1619>'},
                   {'_id': '<http://dbpedia.org/resource/AES_instruction_set>'},
                   {'_id': '<http://dbpedia.org/resource/Brute_Force:_Cracking_the_Data_Encryption_Standard>'},
                   {'_id': '<http://dbpedia.org/resource/Poly1305>'}]}}


In [9]:
# Fetch one indexed document by id to inspect the content of its fields.
doc = es.get(index=INDEX_NAME, id='<http://dbpedia.org/resource/Advanced_Encryption_Standard>')
pprint(doc)

{'_id': '<http://dbpedia.org/resource/Advanced_Encryption_Standard>',
 '_index': 'collection_v2',
 '_primary_term': 1,
 '_seq_no': 39548,
 '_source': {'attributes': '                        abstract "The Advanced '
                           'Encryption Standard (AES), also known as Rijndael '
                           '(its original name), is a specification for the '
                           'encryption of electronic data established by the '
                           'U.S. National Institute of Standards and '
                           'Technology (NIST) in 2001.AES is based on the '
                           'Rijndael cipher developed by two Belgian '
                           'cryptographers, Joan Daemen and Vincent Rijmen, '
                           'who submitted a proposal to NIST during the AES '
                           'selection process. Rijndael is a family of ciphers '
                           'with different key and block sizes.For AES, NIST '
              

In [10]:
# Term vectors of a single document for all fields, with per-field statistics
# included. `tv` is reused further down to extract those field statistics.
tv = es.termvectors(index=INDEX_NAME, id='<http://dbpedia.org/resource/Isaac_Asimov>', fields='*', field_statistics=True)['term_vectors']
pprint(tv)

{'attributes': {'field_statistics': {'doc_count': 4673761,
                                     'sum_doc_freq': 285985189,
                                     'sum_ttf': 408590200},
                'terms': {'10': {'term_freq': 1, 'tokens': [{'position': 88}]},
                          '1920': {'term_freq': 1,
                                   'tokens': [{'position': 32}]},
                          '1948': {'term_freq': 1,
                                   'tokens': [{'position': 485}]},
                          '1964': {'term_freq': 1,
                                   'tokens': [{'position': 219}]},
                          '1965': {'term_freq': 1,
                                   'tokens': [{'position': 421}]},
                          '1992': {'term_freq': 1,
                                   'tokens': [{'position': 35}]},
                          '2': {'term_freq': 1, 'tokens': [{'position': 31}]},
                          '500': {'term_freq': 1, 'tokens': [{'positio

                                               'tokens': [{'position': 607}]},
                                    'biochemist': {'term_freq': 1,
                                                   'tokens': [{'position': 1975}]},
                                    'biochemistry': {'term_freq': 3,
                                                     'tokens': [{'position': 27},
                                                                {'position': 2187},
                                                                {'position': 2212}]},
                                    'biographical': {'term_freq': 1,
                                                     'tokens': [{'position': 1913}]},
                                    'biography': {'term_freq': 1,
                                                  'tokens': [{'position': 1027}]},
                                    'birth': {'term_freq': 1,
                                              'tokens': [{'position': 1931}]},
    

                                                          {'position': 2177},
                                                          {'position': 2219}]},
                                    'space': {'term_freq': 5,
                                              'tokens': [{'position': 541},
                                                         {'position': 1051},
                                                         {'position': 1347},
                                                         {'position': 1582},
                                                         {'position': 1858}]},
                                    'speaking': {'term_freq': 1,
                                                 'tokens': [{'position': 2126}]},
                                    'speculative': {'term_freq': 1,
                                                    'tokens': [{'position': 530}]},
                                    'spider': {'term_freq': 1,
                                

                                                                                              'position': 328,
                                                                                              'start_offset': 16014}]},
                                  '<http://dbpedia.org/resource/Honorary_degree>': {'term_freq': 1,
                                                                                    'tokens': [{'end_offset': 11972,
                                                                                                'position': 244,
                                                                                                'start_offset': 11927}]},
                                  '<http://dbpedia.org/resource/Howard_W._Blakeslee>': {'term_freq': 1,
                                                                                        'tokens': [{'end_offset': 12066,
                                                                                            

                          'terms': {'aisek': {'term_freq': 1,
                                              'tokens': [{'position': 19}]},
                                    'asenian': {'term_freq': 1,
                                                'tokens': [{'position': 3640}]},
                                    'asimof': {'term_freq': 1,
                                               'tokens': [{'position': 70}]},
                                    'asimov': {'term_freq': 1693,
                                               'tokens': [{'position': 11},
                                                          {'position': 20},
                                                          {'position': 23},
                                                          {'position': 26},
                                                          {'position': 28},
                                                          {'position': 31},
                                                     

## Computing field lengths

In [33]:
# Keep only the per-field statistics (doc_count, sum_doc_freq, sum_ttf) from the
# term vectors response. Only fields present in `tv` are captured, i.e. the
# fields returned for the single document queried above.
collection_stats = {
    field: stats['field_statistics']
    for field, stats in tv.items()
}

# Create the output directory first so the write cannot fail on a fresh checkout.
os.makedirs('data', exist_ok=True)
with open('data/collection_stats.json', 'w') as f:
    json.dump(collection_stats, f, indent=4)

In [34]:
# Display the per-field statistics collected in `collection_stats`.
collection_stats

{'attributes': {'sum_doc_freq': 285985189,
  'doc_count': 4673761,
  'sum_ttf': 408590200},
 'related_entity_names': {'sum_doc_freq': 253984319,
  'doc_count': 4672196,
  'sum_ttf': 484036251},
 'catch_all': {'sum_doc_freq': 181848260,
  'doc_count': 4670331,
  'sum_ttf': 242142938},
 'categories': {'sum_doc_freq': 48566914,
  'doc_count': 4623214,
  'sum_ttf': 67749741},
 'names': {'sum_doc_freq': 15904769,
  'doc_count': 4673671,
  'sum_ttf': 39369886},
 'similar_entity_names': {'sum_doc_freq': 21662487,
  'doc_count': 4496193,
  'sum_ttf': 256472073},
 'related_entity_uri': {'sum_doc_freq': 157668925,
  'doc_count': 4672196,
  'sum_ttf': 194688711}}

## Modifying the default scoring 

In [61]:
# Close the index before its similarity settings are changed in the next cell.
es.indices.close(index=INDEX_NAME)

{'acknowledged': True,
 'shards_acknowledged': True,
 'indices': {'collection_v2': {'closed': True}}}

In [66]:
# Replace the index's default similarity with DFI, using the "standardized"
# independence measure.
body = {
    "similarity": {
        "default": {
            "type": "DFI",
            "independence_measure": "standardized"
        }
    }
}

# The index is closed in the previous cell and reopened in the next one —
# presumably because this setting cannot be changed on an open index; confirm.
es.indices.put_settings(index=INDEX_NAME, body=body)

{'acknowledged': True}

In [67]:
# Reopen the index so it can be searched again with the updated settings.
es.indices.open(index=INDEX_NAME)

{'acknowledged': True, 'shards_acknowledged': True}