In [12]:
import re
import os
import csv

import pandas as pd
from os import listdir
from os.path import isfile, join
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

#### The sgml tags needs to be noticed
<
REUTERS TOPICS="YES" 
LEWISSPLIT="TRAIN" 
CGISPLIT="TRAINING-SET" 
OLDID="5544" 
NEWID="1">

`LEWISSPLIT="TRAIN"` means this sgml file is `training set` in ModApte split

<
TOPICS>
<
D>
cocoa
<
/D>
<
/TOPICS>

The `text inside TOPICS tags` means `Topical category`.

<
BODY>
text text text<
/BODY>

The `text inside BODY tags` means `Text content`.

### Split warc file to docs

In [2]:
# ref : https://stackoverflow.com/questions/60269904/split-text-file-after-specific-line-in-python
def split_docs_iter(stream, start, end):
    def inner(stream):
        # Yields each line until an end marker is found (or EOF)
        for line in stream:
            if line and not end.match(line):
                yield line
                continue
            break

    # Find a start marker, then break off into a nested iterator
    for line in stream:
        if line:
            if start.match(line):
                yield inner(stream)
            continue
        break

In [13]:
REUTERS_PATH = 'reuters21578'
REUTERS_SGMLS_PATH = 'reuters_sgmls'

SECTION_START = re.compile(r'<REUTERS')
SECTION_END = re.compile(r'</REUTERS>')

CUT_START_PATTERN = "---"
CUT_END_PATTERN = "----"

CUT_START = re.compile(CUT_START_PATTERN)
CUT_END = re.compile(CUT_END_PATTERN)

files = [f for f in listdir(REUTERS_PATH) if isfile(join(REUTERS_PATH, f)) and ".sgm" in f]

# create dir
os.makedirs(REUTERS_SGMLS_PATH, exist_ok=True)

tmp_sgml = ""

# initialize the progress bar
p = tqdm(total=len(files), nrows=4, position=0, leave=True)

for file in files:
    # add patterns for later sgmls splitting
    with open(f'./{REUTERS_PATH}/{file}', 'r', encoding='ISO-8859-1') as fh_in:
        lines = fh_in.readlines()
        for line in lines:
            if SECTION_START.match(line):
                tmp_sgml += f"{CUT_START_PATTERN}\n"
                tmp_sgml += line
            elif SECTION_END.match(line):
                tmp_sgml += line
                tmp_sgml += f"{CUT_END_PATTERN}\n"
            else:
                tmp_sgml += line

    # save temporary
    with open(f'./{REUTERS_SGMLS_PATH}/tmp_sgml', 'w', encoding='UTF-8') as fh_out:
        fh_out.write(tmp_sgml)

    # split sgmls by our custom patterns
    with open(f'./{REUTERS_SGMLS_PATH}/tmp_sgml', 'r', encoding="ISO-8859-1") as fh_in:
        for (i, nested_iter) in enumerate(split_docs_iter(fh_in, CUT_START, CUT_END)):
            with open('./reuters_sgmls/docID_{:05d}'.format(i), 'w', encoding='UTF-8') as fh_out:
                for line in nested_iter:
                    fh_out.write(line)

    p.set_description('SGMLs splitted needed: ', refresh=True)
    p.update(1) # update progress bar

p.close()

os.remove(f'./{REUTERS_SGMLS_PATH}/tmp_sgml')


  0%|          | 0/22 [00:00<?, ?it/s]

#### Extract information from splitted sgmls

In [38]:
sgmls_files = [f for f in listdir(REUTERS_SGMLS_PATH) if isfile(join(REUTERS_SGMLS_PATH, f))]

# initialize the progress bar
pb = tqdm(total=len(sgmls_files), nrows=4, position=0, leave=True)

with open('reuter_docs.csv', "w", newline='') as csv_file:
    writer = csv.writer(csv_file)

    # header
    writer.writerow(['topics', 'body', 'train_test_label'])

    for sgml in sgmls_files:
        try:
            with open(f"{REUTERS_SGMLS_PATH}/{sgml}", 'r') as f:
                data = f.read()
                soup = BeautifulSoup(data, "html.parser")

                if not soup.find("reuters"):
                    train_test_label = "None"
                else:
                    train_test_label = soup.find("reuters")['lewissplit']

                if not soup.find("topics"):
                    topics = "None"
                else:
                    topics = soup.find("topics").get_text()

                if not soup.find("body"):
                    body = "None"
                else:
                    body = soup.find("body").get_text()

                writer.writerow([topics, body, train_test_label])

        except UnicodeDecodeError:
            print(f"{sgml} UnicodeDecodeError, pass")

        pb.set_description('Extracting SGMLs content: ', refresh=True)
        pb.update(1) # update progress bar

pb.close()

  0%|          | 0/21578 [00:00<?, ?it/s]

docID_17979 UnicodeDecodeError, pass


In [39]:
df = pd.read_csv('reuter_docs.csv')
df

Unnamed: 0,topics,body,train_test_label
0,cocoa,Showers continued throughout the week in\nthe ...,TRAIN
1,,Standard Oil Co and BP North America\nInc said...,TRAIN
2,,Texas Commerce Bancshares Inc's Texas\nCommerc...,TRAIN
3,,BankAmerica Corp is not under\npressure to act...,TRAIN
4,grainwheatcornbarleyoatsorghum,The U.S. Agriculture Department\nreported the ...,TRAIN
...,...,...,...
21572,ship,The Japan/India-Pakistan-Gulf/Japan\nshipping ...,TEST
21573,ipi,The Soviet Union's industrial output is\ngrowi...,TEST
21574,gold,Six black miners have been killed\nand two inj...,TEST
21575,,The prospect of a dominant alliance of\nsocial...,TEST


In [43]:
df.groupby('topics').size()

topics
acq                        2362
acqalum                       2
acqcopper                     3
acqcrude                      9
acqcrudenat-gas               8
                           ... 
yen                           6
zinc                         16
zinclead                      2
zincleadcopper                2
zincleadstrategic-metal       1
Length: 655, dtype: int64

In [44]:
df['topics'].unique()

array(['cocoa', nan, 'grainwheatcornbarleyoatsorghum',
       'veg-oillinseedlin-oilsoy-oilsun-oilsoybeanoilseedcornsunseedgrainsorghumwheat',
       'earn', 'acq', 'earnacq', 'wheatgrain', 'copper', 'housing',
       'money-supply', 'coffee', 'acqship', 'sugar', 'trade', 'reserves',
       'ship', 'graincorn', 'veg-oilsoybeanoilseedmeal-feedsoy-meal',
       'grainwheatcornoatryesorghumsoybeanoilseed', 'cotton', 'grainship',
       'carcasslivestock', 'grain', 'crude', 'nat-gas', 'cpignp',
       'grainwheat', 'graincornoat',
       'veg-oiloilseedmeal-feedsoybeansoy-oilsoy-meal', 'cpi',
       'money-fxinterest', 'interest', 'gnpbop', 'grainrice',
       'soybeanred-beanoilseed',
       'grainwheatriceveg-oilsoybeansugarrubbercopra-cakecornpalm-oilpalmkernelcoffeeteaplywoodsoy-mealcotton',
       'money-fx', 'meal-feedcopra-cake', 'alum', 'veg-oilpalm-oil',
       'teacocoacoffee', 'oilseedsoybean',
       'oilseedsoybeanmeal-feedsoy-meal', 'goldplatinumstrategic-metal',
       'meal