# Smart Document Retrieval System

The objective of this project is to create and implement an information retrieval system that uses Elasticsearch for document indexing and retrieval. The focus is on extracting temporal expressions and georeferences from documents to enable spatiotemporal as well as textual queries. Users can search by time, by location, or with traditional text queries. This comprehensive approach enhances the system's capability to handle a wide range of queries, making it a powerful tool for information retrieval.

Import required libraries 

In [1]:
import os
import re
import zipfile
import spacy
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from nltk.stem import PorterStemmer
from dateutil import parser
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderUnavailable
import time
from collections import Counter

## Elasticsearch Connection
Connect to the server

In [2]:
# Where the Elasticsearch node is listening.
elasticsearch_host = 'localhost'
elasticsearch_port = 9200

# Build the node URL first, then hand it to the client as a one-entry host list.
elasticsearch_url = f'http://{elasticsearch_host}:{elasticsearch_port}'
es = Elasticsearch([elasticsearch_url])

Test Server Connection 

In [3]:
# Ping the server once and report whether it answered.
connection_status = (
    "Connected to Elasticsearch"
    if es.ping()
    else "Connection to Elasticsearch failed"
)
print(connection_status)

Connected to Elasticsearch


## Collecting & Cleaning Data
### Data Collecting
Assign the zip file path to `zip_path` and the location for extracting the files to `extract_files_path`.

In [4]:
# Raw strings keep every backslash literal. The previous spelling contained
# the invalid escape sequence '\D', which newer Python versions warn about.
zip_path = r'C:\Users\yasee\Downloads\archive (1).zip'
extract_files_path = r'C:\Users\yasee\Downloads\extracted_data'

The `unzip_data_file` function takes the path of the zip archive and the path of the destination folder, then extracts all the files from the archive into that folder.

In [5]:
def unzip_data_file(zip_path, extract_path):
    """Unpack the archive at ``zip_path`` into ``extract_path``.

    The destination directory is created when it does not exist yet.
    Any failure is reported on stdout instead of being raised.
    """
    try:
        os.makedirs(extract_path, exist_ok=True)

        archive = zipfile.ZipFile(zip_path, 'r')
        with archive:
            archive.extractall(extract_path)

        print(f"Successfully extracted files to {extract_path}")
    except Exception as error:
        print(f"Error during extraction: {error}")

# Unpack the archive at `zip_path` into `extract_files_path`.
unzip_data_file(zip_path, extract_files_path)

Successfully extracted files to C:\Users\yasee\Downloads\extracted_data


The `extract_reuters` function takes the path to the extracted files of type `sgm`, extracts all the Reuters elements, and then returns them as a list.

In [6]:
def extract_reuters(extract_files_path):
    """Collect every ``reuters`` element from the ``.sgm`` files in a folder.

    Args:
        extract_files_path: Directory that holds the extracted ``.sgm`` files.

    Returns:
        A list of parsed ``reuters`` tags. A list is always returned: when
        reading fails, the error is printed and the list holds whatever was
        collected before the failure (possibly nothing), so callers can
        safely call ``len()`` on the result or iterate over it.
    """
    reuters = []
    try:
        # Sorted so the documents are collected in the same order on every run;
        # os.listdir() makes no ordering guarantee.
        for file in sorted(os.listdir(extract_files_path)):
            if not file.endswith(".sgm"):
                continue

            filename = os.path.join(extract_files_path, file)

            # Undecodable bytes are dropped instead of aborting the whole file.
            with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
                data_file = f.read()

            soup = BeautifulSoup(data_file, 'html.parser')
            reuters.extend(soup.find_all('reuters'))

        print("Successfully extract ruters.")
    except Exception as e:
        print(f"Error during extracting ruters: {e}") 

    return reuters
        
# Parse the extracted .sgm files once; `reuters` is the list the indexing loop iterates over.
reuters = extract_reuters(extract_files_path)
print(f"We have {len(reuters)} reuters.")

Successfully extract ruters.
We have 21578 reuters.


### Data Cleaning
Load `spaCy` model

In [7]:
# Shared spaCy pipeline; the extraction functions in this notebook use it through the global `nlp`.
nlp = spacy.load("en_core_web_lg")

In [24]:
def convert_date_format(date):
    """Reformat a raw date string as ``YYYY-MM-DDTHH:MM:SSZ``.

    Only the first 22 characters of ``date`` are handed to the parser.
    When anything goes wrong the error is printed and the string
    ``"N/A"`` is returned instead.
    """
    try:
        leading_part = date[:22].strip()
        return parser.parse(leading_part).strftime("%Y-%m-%dT%H:%M:%SZ")
    except Exception as error:
        print(f"Error parsing date: {error}") 
        return "N/A"

In [9]:
def extract_body_content(body):
    """Return the stemmed content tokens of ``body``.

    The text is tokenised with the shared spaCy pipeline; stop words and
    tokens shorter than three characters are discarded, and every token
    that is kept is reduced with a Porter stemmer.
    """
    stemmer = PorterStemmer()
    stems = []

    for token in nlp(body):
        if token.is_stop or len(token.text) < 3:
            continue
        stems.append(stemmer.stem(token.text))

    return stems

The `extract_authors` function takes authors as a string, then splits the string using `and` or `by` as separators. It subsequently removes extra whitespaces from the beginning and end of the string. Each author is then stored as an object containing `firstname` and `surname`.

In [10]:
def extract_authors(authors):
    """Split a byline into a list of author name records.

    The byline is cut at the whole words ``by`` / ``and`` (any case). For
    each fragment only the part before the first comma is treated as the
    name, so a trailing agency such as ``", Reuters"`` is ignored.

    Args:
        authors: Raw byline text, e.g. ``"by Janie Gabbett and Mike Ross, Reuters"``.

    Returns:
        A list of ``{"firstname": ..., "surname": ...}`` dicts. The first
        word is the first name and all remaining words form the surname
        (an empty string for a one-word name). Fragments without a name,
        and an empty or missing byline, contribute nothing.
    """
    if not authors:
        return []

    authors_list = []

    for fragment in re.split(r'\b(?:BY|AND)\b', authors, flags=re.IGNORECASE):
        # split() without arguments also collapses repeated whitespace, so
        # "Janie   Gabbett" does not produce empty name parts.
        name_parts = fragment.split(',')[0].split()
        if not name_parts:
            continue

        authors_list.append({
            "firstname": name_parts[0],
            "surname": " ".join(name_parts[1:]),
        })

    return authors_list

# Usage
print(extract_authors("    by Janie Gabbett, Reuters"))
print(extract_authors("    by Janie Gabbett and Mike Ross, Reuters"))

[{'firstname': 'Janie', 'surname': 'Gabbett'}]
[{'firstname': 'Janie', 'surname': 'Gabbett'}, {'firstname': 'Mike', 'surname': 'Ross'}]


The `extract_d_elements` function takes a tag that contains `D` elements, then extracts the text of all these elements as a list.

In [11]:
def extract_d_elements(d_elements):
    """Return the text of every ``d`` element found under a tag.

    Args:
        d_elements: A parsed tag offering ``find_all``, or a falsy value
            (such as ``None``) when the tag is missing.

    Returns:
        A list with the text of each ``d`` element. An empty list is
        returned for a missing tag, so callers always receive a list.
    """
    if not d_elements:
        return []

    return [element.text for element in d_elements.find_all('d')]

In [12]:
def clean_title(text):
    """Strip markup from ``text`` and tidy its whitespace.

    The markup-free text is trimmed at both ends and each pair of
    consecutive spaces is replaced by a single space (one pass only).
    """
    plain_text = BeautifulSoup(text, 'html.parser').get_text()
    trimmed = plain_text.strip()
    return trimmed.replace("  "," ")

In [5]:
import requests
import json

def get_coordinates(places, max_retries=3, timeout=10):
    """Look up coordinates for a group of place names with Nominatim.

    All names are joined into one free-text search query. Only results
    whose ``addresstype`` is ``"city"`` or ``"state"`` are kept.

    Args:
        places: Iterable of place-name strings; may be empty or ``None``.
        max_retries: Maximum number of request attempts before giving up.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        A list of ``{"lat": ..., "lon": ...}`` dicts (values as delivered
        by the service). The result is always a list; it is empty when no
        names were given, nothing matched, or every attempt failed.
    """
    from urllib.parse import quote

    # Nothing to search for: skip the network round-trip entirely.
    if not places:
        return []

    # Percent-encode the query so names with reserved characters cannot
    # corrupt the URL; spaces still become %20 as before.
    api_url = (
        "https://nominatim.openstreetmap.org/search?format=json&q="
        + quote(" ".join(places), safe="")
    )

    # NOTE(review): Nominatim's usage policy asks clients to send an
    # identifying User-Agent; consider passing one via `headers=`.
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(api_url, timeout=timeout)
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a response body that is not valid JSON.
            print(f"Failed to retruive corrdinates: {e}")
            if attempt < max_retries:
                # Back off a little longer after each failed attempt.
                time.sleep(attempt)
            continue

        # An error payload is a JSON object rather than a result list.
        if not isinstance(data, list):
            return []

        return [
            {"lat": place['lat'], "lon": place['lon']}
            for place in data
            if place.get('addresstype') in ("city", "state")
        ]

    return []


[]


In [15]:
def extract_temporal_expressions(text, category):
    """Return the text of every named entity labelled ``category``.

    Despite its name, the function works for any entity label: callers in
    this notebook pass ``"DATE"`` as well as ``"GPE"``.

    Args:
        text: Raw text to analyse, or an already-parsed document (any
            object exposing ``ents``). A parsed document is used as-is
            instead of being sent through the pipeline a second time.
        category: Entity label to keep.

    Returns:
        A list with the text of each matching entity, in document order.
    """
    doc = text if hasattr(text, "ents") else nlp(text)

    return [ent.text for ent in doc.ents if ent.label_ == category]

In [16]:
def extract_dateline_countrys(dateline):
    """Return coordinates for the geo-political entities named in a dateline.

    Args:
        dateline: Raw dateline text, e.g. ``"    LOS ANGELES, Feb 26 - "``.

    Returns:
        Whatever ``get_coordinates`` returns for the detected entities.
    """
    # Pass the raw text: the helper runs the spaCy pipeline itself, so
    # wrapping the dateline in nlp() here would process it twice.
    countrys = extract_temporal_expressions(dateline, "GPE")  # Geo-Political Entity

    return get_coordinates(countrys)
# Usage 
print(extract_dateline_countrys("    LOS ANGELES, Feb 26 - "))

[{'lat': '34.0536909', 'lon': '-118.242766'}, {'lat': '8.524167250000001', 'lon': '-82.19418575966449'}, {'lat': '-37.4707455', 'lon': '-72.351686'}]


In [17]:
def get_coordinates_for_places(places):
    """Return coordinates for the place names listed under a ``places`` tag.

    Args:
        places: The parsed ``places`` tag (its ``d`` children hold the names).

    Returns:
        The list produced by ``get_coordinates`` for those names, or an
        empty list when the tag is missing or lists no places.
    """
    place_names = extract_d_elements(places)

    # An empty or missing place list would otherwise become a geocoder
    # request with an empty query.
    if not place_names:
        return []

    return get_coordinates(place_names)

In [18]:
def extract_geopoints(body):
    """Return coordinates for the geo-political entities mentioned in ``body``.

    Args:
        body: Raw article text.

    Returns:
        The list produced by ``get_coordinates`` for the distinct place
        names, or an empty list when no place is mentioned.
    """
    # Pass the raw text: the helper runs the spaCy pipeline itself.
    places = extract_temporal_expressions(body, "GPE")

    # De-duplicate while keeping first-mention order, so the geocoder query
    # is identical on every run (set ordering varies between runs).
    places = list(dict.fromkeys(places))
    if not places:
        return []

    return get_coordinates(places)


The `approximate_geopoints` function returns the most frequently occurring georeference when no country or city coordinates are available.

In [19]:
from collections import Counter

def _as_point_list(points):
    """Normalise a coordinate argument to a list of point dicts."""
    if isinstance(points, dict):
        # A single point passed without its enclosing list.
        return [points]
    return list(points or [])


def approximate_geopoints(georeferences, countrys_coordinates, citys_coordinates):
    """Choose the geopoints that represent a document.

    Args:
        georeferences: Points derived from the document body.
        countrys_coordinates: Points derived from the document's places.
        citys_coordinates: Points derived from the document's dateline.

    Returns:
        A flat list of ``{"lat": ..., "lon": ...}`` dicts. When country or
        city points exist they are returned together (countries first).
        Otherwise the single most frequent georeference is returned, or an
        empty list when there are no georeferences either.
    """
    known_points = _as_point_list(countrys_coordinates) + _as_point_list(citys_coordinates)
    candidates = _as_point_list(georeferences)

    if known_points or not candidates:
        return known_points

    # Count by (lat, lon) so the key order inside each dict cannot split
    # identical points into separate buckets.
    point_counts = Counter((point['lat'], point['lon']) for point in candidates)
    (lat, lon), _ = point_counts.most_common(1)[0]

    return [{"lat": lat, "lon": lon}]

# Usage
georeferences_list = [{"lat": 0, "lon": 0}, {"lat": 12.123, "lon": 1.4213}, {"lat": 12.123, "lon": 1.4213}, {"lat": 2, "lon": 2}]
print(approximate_geopoints(georeferences_list, [], []))

[{'lat': 12.123, 'lon': 1.4213}]


In [20]:
def extract_temporal_expression_as_dict(reuter):
    """Return the ``DATE`` entities of a document body as expression records.

    Args:
        reuter: Parsed document tag; its ``body`` child is analysed.

    Returns:
        A list of ``{"expression": ...}`` dicts, one per ``DATE`` entity in
        the body. When the document has no body, a single placeholder
        record ``{"expression": "N/A"}`` is returned in a list, so the
        result always has the same shape.
    """
    body = reuter.find('body')
    if body is None:
        return [{"expression": "N/A"}]

    temporals = extract_temporal_expressions(body.text, "DATE")
    return [{"expression": temp} for temp in temporals]

In [21]:
def _text_with_keyword():
    """Text field that also carries an exact-match ``keyword`` sub-field."""
    return {"type": "text", "fields": {"keyword": {"type": "keyword"}}}


def _nested_coordinates():
    """Nested object holding ``lon`` / ``lat`` doubles."""
    return {
        "type": "nested",
        "properties": {
            "lon": {"type": "double"},
            "lat": {"type": "double"},
        },
    }


# Field definitions of the document index.
_index_properties = {
    "date": {"type": "date"},
    "topics": {"type": "keyword"},
    # Titles are indexed as edge n-grams and searched with a plain
    # lowercase analyzer (both defined in the settings below).
    "title": {
        "type": "text",
        "analyzer": "autocomplete",
        "search_analyzer": "autocomplete_search",
    },
    "author": {
        "type": "nested",
        "properties": {
            "firstname": _text_with_keyword(),
            "surname": _text_with_keyword(),
        },
    },
    "analized-body": {"type": "text"},
    "body": {"type": "text"},
    "temporal-expression": {
        "type": "nested",
        "properties": {
            "expression": {"type": "text"},
        },
    },
    "geopoints": _nested_coordinates(),
    "georeferences": _nested_coordinates(),
}

# Analyzer and tokenizer definitions referenced by the "title" field.
_index_analysis = {
    "analyzer": {
        "autocomplete": {
            "tokenizer": "autocomplete",
            "filter": ["lowercase"],
        },
        "autocomplete_search": {
            "tokenizer": "lowercase",
        },
    },
    "tokenizer": {
        "autocomplete": {
            "type": "edge_ngram",
            "min_gram": 3,
            "max_gram": 10,
            "token_chars": ["letter", "digit"],
        },
    },
}

index_mapping = {
    "mappings": {"properties": _index_properties},
    "settings": {"analysis": _index_analysis},
}


In [22]:
index_name = "smart_document_system"

# Create the index with the custom mapping only when it is not there yet.
index_already_exists = es.indices.exists(index=index_name)
if not index_already_exists:
    es.indices.create(index=index_name, body=index_mapping)

In [27]:
# Number of documents sent to Elasticsearch per bulk request.
bulk_batch_size = 500


def _tag_text(reuter, tag_name):
    """Return the text of the first ``tag_name`` child, or None when absent."""
    tag = reuter.find(tag_name)
    return tag.text if tag is not None else None


def _flush_actions(pending_actions, is_final_batch=False):
    """Bulk-index ``pending_actions`` and print the running document count."""
    success, _ = bulk(es, pending_actions)
    indexed_docs = es.count(index=index_name)['count']

    print(f"Successfully indexed new {success} articles.")
    if is_final_batch:
        print(f"Total number of indexed articles is {indexed_docs}.")
    else:
        print(f"Number of indexed articles is {indexed_docs} until now.")


actions = []

for reuter in reuters:
    # Look every tag up once; each find() walks the document tree.
    dateline_text = _tag_text(reuter, 'dateline')
    places_tag = reuter.find('places')
    body_text = _tag_text(reuter, 'body')
    date_text = _tag_text(reuter, 'date')
    topics_tag = reuter.find('topics')
    title_text = _tag_text(reuter, 'title')
    author_text = _tag_text(reuter, 'author')

    citys_coordinates = extract_dateline_countrys(dateline_text) if dateline_text is not None else []
    countrys_coordinates = get_coordinates_for_places(places_tag) if places_tag is not None else []
    georeferences = extract_geopoints(body_text) if body_text is not None else [{ "lat": 0, "lon": 0 }]

    # The "date" field is mapped as a date, so the "N/A" placeholder would
    # make Elasticsearch reject the document. A missing or unparseable
    # date is indexed as null instead.
    indexed_date = convert_date_format(date_text) if date_text is not None else None
    if indexed_date == "N/A":
        indexed_date = None

    actions.append({
        "_op_type": "index",
        "_index": index_name,
        "_source": {
            'date': indexed_date,
            'topics': extract_d_elements(topics_tag) if topics_tag is not None else "N/A",
            'title': clean_title(title_text) if title_text is not None else "N/A",
            'author': extract_authors(author_text) if author_text is not None else { "firstname": "N/A", "surname": "N/A" },
            'analized-body': extract_body_content(body_text) if body_text is not None else "N/A",
            'body': body_text if body_text is not None else "N/A",  # For frontend purpose
            'temporal-expression': extract_temporal_expression_as_dict(reuter) if body_text is not None else { "expression": "N/A" },
            'geopoints': approximate_geopoints(georeferences, countrys_coordinates, citys_coordinates),
            'georeferences': georeferences
        }
    })

    # Send a full batch and start collecting the next one.
    if len(actions) >= bulk_batch_size:
        _flush_actions(actions)
        actions = []

# Send whatever is left over after the last full batch.
if actions:
    _flush_actions(actions, is_final_batch=True)
    


Total number of indexed articles is 21579.
