# Smart Document Retrieval System

The objective of this project is to create and implement an information retrieval system utilizing Elasticsearch for document indexing and retrieval. The focus involves extracting temporal expressions and georeferences from documents to enable spatiotemporal and textual queries. Users can search for information based on time-related, geographical aspects, and traditional textual queries. This comprehensive approach enhances the system's capability to handle a wide range of queries, making it a powerful tool for information retrieval.

Import required libraries 

In [59]:
import os
import re
import zipfile
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch
import spacy
from nltk.stem import PorterStemmer

## Elasticsearch Connection
Connect to the server

In [60]:
elasticsearch_host = 'localhost'
elasticsearch_port = 9200

es = Elasticsearch([f'http://{elasticsearch_host}:{elasticsearch_port}'])

Test Server Connection 

In [61]:
if es.ping():
    print("Connected to Elasticsearch")
else:
    print("Connection to Elasticsearch failed")

Connection to Elasticsearch failed


## Collecting & Cleaning Data
### Data Collecting
Assign the zip file path as `zip_file` and the location for extracting the files as `extract_files_path`.

In [62]:
zip_path = r'C:\\Users\\yasee\Downloads\\archive (1).zip'
extract_files_path = 'C:\\Users\\yasee\\Downloads\\extracted_data'

The `unzip_data_file` function takes the path for the folder that contains data, then exteact all the files in that path.

In [63]:
def unzip_data_file(zip_path, extract_path):
    try:
        os.makedirs(extract_path, exist_ok=True)

        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)

        print(f"Successfully extracted files to {extract_path}")
    except Exception as e:
        print(f"Error during extraction: {e}")

unzip_data_file(zip_path, extract_files_path)

Successfully extracted files to C:\Users\yasee\Downloads\extracted_data


The `extract_reuters` function takes the path to the extracted files of type `sgm`, extracts all the Reuters elements, and then returns them as a list.

In [64]:
def extract_reuters(extract_files_path):
    reuters = []
    try:
        for file in os.listdir(extract_files_path):
            if file.endswith(".sgm"):
                filename = os.path.join(extract_files_path, file)
                
                with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
                    data_file = f.read()

                soup = BeautifulSoup(data_file, 'html.parser')
                reuters.extend(soup.find_all('reuters'))
            
        print(f"Successfully extract ruters.")
        return reuters
    except Exception as e:
        print(f"Error during extracting ruters: {e}") 
        
reuters = extract_reuters(extract_files_path)
print(f"We have {len(reuters)} reuters.")

Successfully extract ruters.
We have 21578 reuters.


### Data Cleaning
Initlize requier models

In [65]:
# Load spaCy model
nlp = spacy.load("en_core_web_lg")

In [66]:
def process_body_content(body):
    if not body == "N/A":
        tokens = nlp(body)

        tokens = [token.text for token in tokens if not token.is_stop and len(token.text) >= 3]

        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]

    return tokens

The `split_authors` function takes authors as a string, then splits the string using `and` or `by` as separators. It subsequently removes extra whitespaces from the beginning and end of the string. Each author is then stored as an object containing `Firstname` and `Surname`.

In [67]:
def split_authors(authors):
    unclean_author_list = re.split(r'\b(?:BY|AND)\b', authors, flags=re.IGNORECASE)
    clean_author_list = [author.strip() for author in unclean_author_list if author.strip()]
    
    authors_list = []
    for author in clean_author_list:
        author_parts = author.split(',')[0].split(' ')
        authors_list.append({"Firstname": author_parts[0], "Surname": author_parts[1]})
        
    return authors_list

The `extreact_entitys` function takes all reuters, then extract all needed entitys, then stored them in articles list as an objects.

In [68]:
def extreact_entitys(reuters):
    articles = []
    for reuter in reuters:
        
        article = {
            'date': reuter.find('date').text if reuter.find('date') else "N/A",
            'topics': reuter.find('topics').text if reuter.find('topics') else "N/A",
            'places': reuter.find('places').text if reuter.find('places') else "N/A",
            'title': reuter.find('title').text if reuter.find('title') else "N/A",
            'author': split_authors(reuter.find('author').text) if reuter.find('author') else "N/A",
            'dateline': reuter.find('dateline').text if reuter.find('dateline') else "N/A",
            'body': process_body_content(reuter.find('body').text) if reuter.find('body') else "N/A"
        }
        
        articles.append(article)
        
    return articles

articles = extreact_entitys(reuters)
print(articles[0])

{'date': '26-FEB-1987 15:01:01.79', 'topics': 'cocoa', 'places': 'el-salvadorusauruguay', 'title': 'BAHIA COCOA REVIEW', 'author': 'N/A', 'dateline': '    SALVADOR, Feb 26 - ', 'body': ['shower', 'continu', 'week', 'bahia', 'cocoa', 'zone', 'allevi', 'drought', 'earli', 'januari', 'improv', 'prospect', 'come', 'temporao', 'normal', 'humid', 'level', 'restor', 'comissaria', 'smith', 'said', 'weekli', 'review', '\n    ', 'dri', 'period', 'mean', 'temporao', 'late', 'year', '\n    ', 'arriv', 'week', 'end', 'februari', '155,221', 'bag', 'kilo', 'make', 'cumul', 'total', 'season', '5.93', 'mln', '5.81', 'stage', 'year', 'cocoa', 'deliv', 'earlier', 'consign', 'includ', 'arriv', 'figur', '\n    ', 'comissaria', 'smith', 'said', 'doubt', 'old', 'crop', 'cocoa', 'avail', 'harvest', 'practic', 'come', 'end', 'total', 'bahia', 'crop', 'estim', '6.4', 'mln', 'bag', 'sale', 'stand', '6.2', 'mln', 'thousand', 'bag', 'hand', 'farmer', 'middlemen', 'export', 'processor', '\n    ', 'doubt', 'cocoa', 

In [69]:
for article in articles:
    print(article['date'])

26-FEB-1987 15:01:01.79
26-FEB-1987 15:02:20.00
26-FEB-1987 15:03:27.51
26-FEB-1987 15:07:13.72
26-FEB-1987 15:10:44.60
26-FEB-1987 15:14:36.41
26-FEB-1987 15:14:42.83
26-FEB-1987 15:15:40.12
26-FEB-1987 15:17:11.20
26-FEB-1987 15:18:06.67
26-FEB-1987 15:18:59.34
26-FEB-1987 15:19:15.45
26-FEB-1987 15:20:13.09
26-FEB-1987 15:20:27.17
26-FEB-1987 15:20:48.43
26-FEB-1987 15:21:16.13
26-FEB-1987 15:24:48.56
26-FEB-1987 15:26:26.78
26-FEB-1987 15:26:54.12
26-FEB-1987 15:32:03.12
26-FEB-1987 15:33:23.61
26-FEB-1987 15:34:07.03
26-FEB-1987 15:34:16.30
26-FEB-1987 15:35:16.67
26-FEB-1987 15:35:39.38
26-FEB-1987 15:36:44.78
26-FEB-1987 15:36:53.42
26-FEB-1987 15:38:26.23
26-FEB-1987 15:39:41.92
26-FEB-1987 15:41:56.54
26-FEB-1987 15:43:14.36
26-FEB-1987 15:43:59.53
26-FEB-1987 15:44:36.04
26-FEB-1987 15:45:19.65
26-FEB-1987 15:45:26.55
26-FEB-1987 15:45:35.37
26-FEB-1987 15:45:39.20
26-FEB-1987 15:45:47.29
26-FEB-1987 15:46:36.16
26-FEB-1987 15:47:16.17
26-FEB-1987 15:48:26.92
26-FEB-1987 15:4

()

In [None]:
import spacy
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer

# Load spaCy model
nlp = spacy.load("en_core_web_lg")

# Function to preprocess the body text
def preprocess_text(body):
    # Remove HTML tags
    body = BeautifulSoup(body, 'html.parser').get_text()

    # Tokenize using spaCy
    tokens = nlp(body)

    # Remove stop words and tokens with length less than 3
    tokens = [token.text for token in tokens if not token.is_stop and len(token.text) >= 3]

    # Stemming using nltk PorterStemmer
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    return tokens

# Example usage:
body = "Your body text goes here. <p>HTML tags might be present.</p> It also contains stop words and short words."
processed_tokens = preprocess_text(body)
print(processed_tokens)


In [None]:

from dateutil import parser

original_date_str = "26-FEB-1987 15:02:20.00"
parsed_date = parser.parse(original_date_str)
formatted_date_str = parsed_date.strftime("%Y-%m-%d %H:%M:%S ")
print(formatted_date_str)