In [1]:
import pandas as pd 
from bs4 import BeautifulSoup
import re
import html
import glob
import numpy as np

In [2]:
# remove unuseful data from text
def parse_text(text):
    if isinstance(text, list):
        text = ' '.join(text)
    text = str(text)
    text = re.sub(r"[^a-zA-Z<>/]|\b\w\b", ' ', text)
    return re.sub(r"\s+", ' ', text).strip()

# return true if value is single element from list of valid places
def check_places(value):
    places = ['west-germany', 'usa', 'france', 'uk', 'canada', 'japan']
    if isinstance(value, list):
        return False
    return value in places

In [3]:
# list of tags with useful informations  
tags = ["title", "dateline", "body", "places", "topics"]

df = pd.DataFrame(columns=tags)

file_names = glob.glob("data/*.sgm")

for file_name in file_names:
    with open(file_name, "r", encoding="unicode_escape") as f:
        contents = f.read()
    soup = BeautifulSoup(html.unescape(contents), "html.parser")
    articles = soup.find_all("reuters")
    for article in articles:
        art = {}
        for tag in tags:
            content = article.find(tag)
            if content:
                art[tag] = (
                    list(content.strings)
                    if len(list(content.children)) > 1
                    else content.get_text()
                )
        df = df.append(art, ignore_index=True)

df["title"] = df["title"].apply(parse_text)
df["body"] = df["body"].apply(parse_text)
df["dateline"] = df["dateline"].apply(lambda x: str(x).split(",", 1)[0].strip())
df = df.replace(r"^\s*$", np.nan, regex=True)

df = df[df['places'].apply(check_places) == True]
df.head()


Unnamed: 0,title,dateline,body,places,topics
0,CHRYSLER LATE MARCH CAR SALES UP,DETROIT,Chrysler Corp said car sales for the March per...,usa,
1,WALL STREET STOCKS/COMPAQ COMPUTER,NEW YORK,Compaq Computer Corp IBM chief rival in the pe...,usa,
2,NORANDA SETS TEMPORARY MINE SHUTDOWN,Murdochville,said production will remain shut down at its f...,canada,copper
3,CANADA BUDGET DEFICIT RISES IN JANUARY,OTTAWA,The Canadian government budget deficit rose to...,canada,
5,COPLEY PROPERTIES INC INCREASES DIVIDEND,BOSTON,Qtly div cts vs cts prior Payable APril Record...,usa,earn


In [4]:
df.to_csv("parsed.csv")