In [1]:
import re
import os
import csv

import pandas as pd
from os import listdir
from os.path import isfile, join
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

#### The sgml tags needs to be noticed
<
REUTERS TOPICS="YES" 
LEWISSPLIT="TRAIN" 
CGISPLIT="TRAINING-SET" 
OLDID="5544" 
NEWID="1">

`LEWISSPLIT="TRAIN"` means this sgml file is `training set` in ModApte split

<
TOPICS>
<
D>
cocoa
<
/D>
<
/TOPICS>

The `text inside TOPICS tags` means `Topical category`.

<
BODY>
text text text<
/BODY>

The `text inside BODY tags` means `Text content`.

### Split warc file to docs

In [2]:
# ref : https://stackoverflow.com/questions/60269904/split-text-file-after-specific-line-in-python
def split_docs_iter(stream, start, end):
    def inner(stream):
        # Yields each line until an end marker is found (or EOF)
        for line in stream:
            if line and not end.match(line):
                yield line
                continue
            break

    # Find a start marker, then break off into a nested iterator
    for line in stream:
        if line:
            if start.match(line):
                yield inner(stream)
            continue
        break

In [5]:
REUTERS_PATH = 'reuters21578'
REUTERS_SGMLS_PATH = 'reuters_sgmls'

SECTION_START = re.compile(r'<REUTERS')
SECTION_END = re.compile(r'</REUTERS>')

CUT_START_PATTERN = "---"
CUT_END_PATTERN = "----"

CUT_START = re.compile(CUT_START_PATTERN)
CUT_END = re.compile(CUT_END_PATTERN)

files = [f for f in listdir(REUTERS_PATH) if isfile(join(REUTERS_PATH, f)) and ".sgm" in f]

# create dir
os.makedirs(REUTERS_SGMLS_PATH, exist_ok=True)

tmp_sgml = ""

# initialize the progress bar
p = tqdm(total=len(files), nrows=4, position=0, leave=True)

for file in files:
    # add patterns for later sgmls splitting
    with open(f'./{REUTERS_PATH}/{file}', 'r', encoding='ISO-8859-1') as fh_in:
        lines = fh_in.readlines()
        for line in lines:
            if SECTION_START.match(line):
                tmp_sgml += f"{CUT_START_PATTERN}\n"
                tmp_sgml += line
            elif SECTION_END.match(line):
                tmp_sgml += line
                tmp_sgml += f"{CUT_END_PATTERN}\n"
            else:
                tmp_sgml += line

    # save temporary
    with open(f'./{REUTERS_SGMLS_PATH}/tmp_sgml', 'w', encoding='UTF-8') as fh_out:
        fh_out.write(tmp_sgml)

    # split sgmls by our custom patterns
    with open(f'./{REUTERS_SGMLS_PATH}/tmp_sgml', 'r', encoding="ISO-8859-1") as fh_in:
        for (i, nested_iter) in enumerate(split_docs_iter(fh_in, CUT_START, CUT_END)):
            with open('./reuters_sgmls/docID_{:05d}'.format(i), 'w', encoding='UTF-8') as fh_out:
                for line in nested_iter:
                    fh_out.write(line)

    p.set_description('SGMLs splitted needed: ', refresh=True)
    p.update(1) # update progress bar

p.close()

os.remove(f'./{REUTERS_SGMLS_PATH}/tmp_sgml')


0it [00:00, ?it/s]


FileNotFoundError: [WinError 2] 系統找不到指定的檔案。: './reuters_sgmls/tmp_sgml'

#### Extract information from splitted sgmls

In [38]:
sgmls_files = [f for f in listdir(REUTERS_SGMLS_PATH) if isfile(join(REUTERS_SGMLS_PATH, f))]

# initialize the progress bar
pb = tqdm(total=len(sgmls_files), nrows=4, position=0, leave=True)

with open('reuter_docs.csv', "w", newline='') as csv_file:
    writer = csv.writer(csv_file)

    # header
    writer.writerow(['topics', 'body', 'train_test_label'])

    for sgml in sgmls_files:
        try:
            with open(f"{REUTERS_SGMLS_PATH}/{sgml}", 'r') as f:
                data = f.read()
                soup = BeautifulSoup(data, "html.parser")

                if not soup.find("reuters"):
                    train_test_label = "None"
                else:
                    train_test_label = soup.find("reuters")['lewissplit']

                if not soup.find("topics"):
                    topics = "None"
                else:
                    topics = soup.find("topics").get_text()

                if not soup.find("body"):
                    body = "None"
                else:
                    body = soup.find("body").get_text()

                writer.writerow([topics, body, train_test_label])

        except UnicodeDecodeError:
            print(f"{sgml} UnicodeDecodeError, pass")

        pb.set_description('Extracting SGMLs content: ', refresh=True)
        pb.update(1) # update progress bar

pb.close()

  0%|          | 0/21578 [00:00<?, ?it/s]

docID_17979 UnicodeDecodeError, pass


In [4]:
df = pd.read_csv('reuter_docs.csv')
df

Unnamed: 0,topics,body,train_test_label
0,cocoa,Showers continued throughout the week in\r\nth...,TRAIN
1,,Standard Oil Co and BP North America\r\nInc sa...,TRAIN
2,,Texas Commerce Bancshares Inc's Texas\r\nComme...,TRAIN
3,,BankAmerica Corp is not under\r\npressure to a...,TRAIN
4,grainwheatcornbarleyoatsorghum,The U.S. Agriculture Department\r\nreported th...,TRAIN
...,...,...,...
21572,ship,The Japan/India-Pakistan-Gulf/Japan\r\nshippin...,TEST
21573,ipi,The Soviet Union's industrial output is\r\ngro...,TEST
21574,gold,Six black miners have been killed\r\nand two i...,TEST
21575,,The prospect of a dominant alliance of\r\nsoci...,TEST


In [5]:
df.groupby('topics').size()

topics
acq                        2362
acqalum                       2
acqcopper                     3
acqcrude                      9
acqcrudenat-gas               8
                           ... 
yen                           6
zinc                         16
zinclead                      2
zincleadcopper                2
zincleadstrategic-metal       1
Length: 655, dtype: int64

In [None]:
df['topics'].unique()

In [7]:
print(type(df))

<class 'pandas.core.frame.DataFrame'>


In [8]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21577 entries, 0 to 21576
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   topics            11366 non-null  object
 1   body              21577 non-null  object
 2   train_test_label  21577 non-null  object
dtypes: object(3)
memory usage: 505.8+ KB
None


In [9]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [10]:
print(df.describe())

       topics   body train_test_label
count   11366  21577            21577
unique    655  18781                3
top      earn   None            TRAIN
freq     3945   2535            14668


In [13]:
count_vec = CountVectorizer()

In [19]:
train_set = df[(df['train_test_label']=='TRAIN')]
train_set = train_set.dropna()
train_set

Unnamed: 0,topics,body,train_test_label
0,cocoa,Showers continued throughout the week in\r\nth...,TRAIN
4,grainwheatcornbarleyoatsorghum,The U.S. Agriculture Department\r\nreported th...,TRAIN
5,veg-oillinseedlin-oilsoy-oilsun-oilsoybeanoils...,Argentine grain board figures show\r\ncrop reg...,TRAIN
8,earn,Champion Products Inc said its\r\nboard of dir...,TRAIN
9,acq,Computer Terminal Systems Inc said\r\nit has c...,TRAIN
...,...,...,...
14778,dlrmoney-fx,The Bank of Japan bought a small amount of\r\n...,TRAIN
14782,rubber,"Japan's rubber stocks fell to 44,980\r\ntonnes...",TRAIN
14784,money-fx,THE BANK OF KOREA SAID IT FIXED THE\r\nMIDRATE...,TRAIN
14804,copper,Nippon Mining Co Ltd said it lowered its\r\nse...,TRAIN


In [20]:
train_set_body = count_vec.fit_transform(train_set['body'])
train_set_body = np.array(train_set_body.todense())

In [21]:
test_set['topics']

14825                               trade
14827                               grain
14828                        crudenat-gas
14831    tradegrainricecornsugartinrubber
14832                     veg-oilpalm-oil
                       ...               
21569                                 acq
21571                      money-fxdlryen
21572                                ship
21573                                 ipi
21574                                gold
Name: topics, Length: 3021, dtype: object

In [22]:
test_set = df[df['train_test_label']=='TEST']
test_set = test_set.dropna()
test_set

Unnamed: 0,topics,body,train_test_label
14825,trade,Mounting trade friction between the\r\nU.S. An...,TEST
14827,grain,A survey of 19 provinces and seven cities\r\ns...,TEST
14828,crudenat-gas,The Ministry of International Trade and\r\nInd...,TEST
14831,tradegrainricecornsugartinrubber,Thailand's trade deficit widened to 4.5\r\nbil...,TEST
14832,veg-oilpalm-oil,Indonesia expects crude palm oil (CPO)\r\npric...,TEST
...,...,...,...
21569,acq,Chase Corp Ltd <CHCA.WE> said it will\r\nmake ...,TEST
21571,money-fxdlryen,Tokyo's foreign exchange market is watching\r\...,TEST
21572,ship,The Japan/India-Pakistan-Gulf/Japan\r\nshippin...,TEST
21573,ipi,The Soviet Union's industrial output is\r\ngro...,TEST


In [23]:
test_set_body = count_vec.fit_transform(test_set['body'])
test_set_body = np.array(test_set_body.todense())

In [24]:
model = MultinomialNB().fit(train_set_body, train_set['topics'])

In [None]:
pred_topics = model.predict(test_set_body)

In [None]:
print('Accuracy:', accuracy_score(test_set['topics'], pred_topics))
