In [2]:
import os
import re
import zipfile
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch

# Elasticsearch Connection
Connect to the server

In [3]:
# Host and port of the Elasticsearch node this notebook connects to.
elasticsearch_host = 'localhost'
elasticsearch_port = 9200

# Build the node URL once and pass it to the client as its only node.
elasticsearch_url = f'http://{elasticsearch_host}:{elasticsearch_port}'
es = Elasticsearch([elasticsearch_url])

Test Server Connection 

In [4]:
# Pick the message from the result of the ping, then report it.
connection_status = (
    "Connected to Elasticsearch" if es.ping() else "Connection to Elasticsearch failed"
)
print(connection_status)

Connection to Elasticsearch failed


# Collecting & Cleaning Data
## Data Cleaning Functions
The `split_authors` function takes the authors as a single string and splits it on the words `and` or `by` (case-insensitive). It then strips leading and trailing whitespace from each resulting piece. Each author is stored as an object containing `Firstname` and `Surname`.

In [22]:
def split_authors(authors):
    """Split a raw byline into a list of author name records.

    The byline is split on the whole words ``by`` and ``and``
    (case-insensitive). For each resulting piece only the text before the
    first comma is kept, and that text is split on runs of whitespace into
    name tokens.

    Args:
        authors: Raw byline text, e.g. ``"    BY PATRICK RIZZO and PATTI DOMM"``.
            ``None`` or an empty string yields an empty list.

    Returns:
        A list of dicts with the keys ``"Firstname"`` and ``"Surname"``.
        ``"Firstname"`` is the first name token. ``"Surname"`` is the
        remaining tokens joined by single spaces, or ``""`` when the name
        consists of a single token.
    """
    if not authors:
        return []

    authors_list = []
    for chunk in re.split(r'\b(?:BY|AND)\b', authors, flags=re.IGNORECASE):
        # Keep only the name itself; anything after the first comma is dropped.
        # split() without arguments collapses repeated whitespace, so no empty
        # tokens end up in the name.
        name_tokens = chunk.split(',')[0].split()
        if not name_tokens:
            # Nothing usable in this piece (e.g. the empty text before "BY").
            continue

        authors_list.append({
            "Firstname": name_tokens[0],
            # A single-token name has no surname part; use "" instead of failing.
            "Surname": " ".join(name_tokens[1:]),
        })

    return authors_list


## Data Collection

In [25]:
def _text_or_na(element, tag_name):
    """Return the text of the first ``tag_name`` tag inside ``element``, or "N/A" if none is found."""
    node = element.find(tag_name)
    return node.text if node else "N/A"


articles = []
folder_path = "C:\\Users\\yasee\\Downloads\\extracted_data"

# Only the .sgm files in the folder are parsed.
sgm_names = [name for name in os.listdir(folder_path) if name.endswith(".sgm")]

for sgm_name in sgm_names:
    sgm_path = os.path.join(folder_path, sgm_name)

    # Bytes that are not valid UTF-8 are dropped while reading.
    with open(sgm_path, 'r', encoding='utf-8', errors='ignore') as handle:
        raw_markup = handle.read()

    parsed_markup = BeautifulSoup(raw_markup, 'html.parser')

    # One article record per <reuters> element.
    for reuters_element in parsed_markup.find_all('reuters'):
        author_node = reuters_element.find('author')

        # NOTE(review): `.text` joins the text of nested tags without a separator;
        # the sample output in this notebook shows places as 'el-salvadorusauruguay'.
        articles.append({
            'date': _text_or_na(reuters_element, 'date'),
            'topics': _text_or_na(reuters_element, 'topics'),
            'places': _text_or_na(reuters_element, 'places'),
            'title': _text_or_na(reuters_element, 'title'),
            # Authors become a list of name records; a missing tag keeps the "N/A" marker.
            'author': split_authors(author_node.text) if author_node else "N/A",
            'dateline': _text_or_na(reuters_element, 'dateline'),
            'body': _text_or_na(reuters_element, 'body'),
        })

In [6]:
# Report how many articles were collected, then show the first one as a sample.
document_count = len(articles)
print(f'We have {document_count} documents')

print(articles[0])


We have 21578 documents
{'date': '26-FEB-1987 15:01:01.79', 'topics': 'cocoa', 'places': 'el-salvadorusauruguay', 'title': 'BAHIA COCOA REVIEW', 'author': 'N/A', 'dateline': '    SALVADOR, Feb 26 - ', 'body': 'Showers continued throughout the week in\nthe Bahia cocoa zone, alleviating the drought since early\nJanuary and improving prospects for the coming temporao,\nalthough normal humidity levels have not been restored,\nComissaria Smith said in its weekly review.\n    The dry period means the temporao will be late this year.\n    Arrivals for the week ended February 22 were 155,221 bags\nof 60 kilos making a cumulative total for the season of 5.93\nmln against 5.81 at the same stage last year. Again it seems\nthat cocoa delivered earlier on consignment was included in the\narrivals figures.\n    Comissaria Smith said there is still some doubt as to how\nmuch old crop cocoa is still available as harvesting has\npractically come to an end. With total Bahia crop estimates\naround 6.4 ml

In [24]:
# Print the author entry of every article that has one.
for article in articles:
    if article['author'] == "N/A":
        # No author information was found for this article.
        continue
    print("Author:", article['author'])


Author: [{'Firstname': 'Janie', 'Lastname': 'Gabbett'}]
Author: [{'Firstname': 'Janie', 'Lastname': 'Gabbett'}]
Author: [{'Firstname': 'Michael', 'Lastname': 'Gelb'}]
Author: [{'Firstname': 'Cal', 'Lastname': 'Mankowski'}]
Author: [{'Firstname': 'Kathleen', 'Lastname': 'Hays'}]
Author: [{'Firstname': 'Jane', 'Lastname': 'Arraf'}]
Author: [{'Firstname': 'Patti', 'Lastname': 'Domm'}]
Author: [{'Firstname': 'TED', 'Lastname': "D'AFFLISIO"}]
Author: [{'Firstname': 'NAILENE', 'Lastname': 'CHOU'}]
Author: [{'Firstname': 'Sue', 'Lastname': 'Baker'}]
Author: [{'Firstname': 'John', 'Lastname': 'Morrison'}]
Author: [{'Firstname': 'Jeremy', 'Lastname': 'Solomons'}]
Author: [{'Firstname': 'Brian', 'Lastname': 'Childs'}]
Author: [{'Firstname': 'Jeff', 'Lastname': 'Stearns'}]
Author: [{'Firstname': 'Lisa', 'Lastname': 'Vaughan'}]
Author: [{'Firstname': 'Jeremy', 'Lastname': 'Clift'}]
Author: [{'Firstname': 'Chaitanya', 'Lastname': 'Kalbag'}]
Author: [{'Firstname': 'Alice', 'Lastname': 'Ratcliffe'}]


In [8]:
# Sanity check of the byline-splitting regex on a two-author example.
test = "    BY PATRICK RIZZO and PATTI DOMM"

# '\b(?:BY|AND)\b' matches 'BY' or 'AND' as whole words; IGNORECASE also catches 'and'.
byline_splitter = re.compile(r'\b(?:BY|AND)\b', flags=re.IGNORECASE)

# Trim every piece and discard the ones that are empty after trimming.
author_list = [piece.strip() for piece in byline_splitter.split(test) if piece.strip()]

print(author_list)


['PATRICK RIZZO', 'PATTI DOMM']


In [10]:

from dateutil import parser

# Sample timestamp in the same shape as the 'date' values printed earlier in this notebook.
original_date_str = "26-FEB-1987 15:02:20.00"

# NOTE(review): the output format ends with a space, so the rendered string does too —
# confirm that this is intended.
output_format = "%Y-%m-%d %H:%M:%S "

# Parse the raw timestamp, then render it without the fractional seconds.
parsed_date = parser.parse(original_date_str)
formatted_date_str = parsed_date.strftime(output_format)
print(formatted_date_str)

1987-02-26 15:02:20 
