In [1]:
import re
import os
import csv

import pandas as pd
from os import listdir
from os.path import isfile, join
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

#### The sgml tags needs to be noticed
<
REUTERS TOPICS="YES" 
LEWISSPLIT="TRAIN" 
CGISPLIT="TRAINING-SET" 
OLDID="5544" 
NEWID="1">

`LEWISSPLIT="TRAIN"` means this sgml file is `training set` in ModApte split

<
TOPICS>
<
D>
cocoa
<
/D>
<
/TOPICS>

The `text inside TOPICS tags` means `Topical category`.

<
BODY>
text text text<
/BODY>

The `text inside BODY tags` means `Text content`.

### Split warc file to docs

In [2]:
# ref : https://stackoverflow.com/questions/60269904/split-text-file-after-specific-line-in-python
def split_docs_iter(stream, start, end):
    def inner(stream):
        # Yields each line until an end marker is found (or EOF)
        for line in stream:
            if line and not end.match(line):
                yield line
                continue
            break

    # Find a start marker, then break off into a nested iterator
    for line in stream:
        if line:
            if start.match(line):
                yield inner(stream)
            continue
        break

In [5]:
REUTERS_PATH = 'reuters21578'
REUTERS_SGMLS_PATH = 'reuters_sgmls'

SECTION_START = re.compile(r'<REUTERS')
SECTION_END = re.compile(r'</REUTERS>')

CUT_START_PATTERN = "---"
CUT_END_PATTERN = "----"

CUT_START = re.compile(CUT_START_PATTERN)
CUT_END = re.compile(CUT_END_PATTERN)

files = [f for f in listdir(REUTERS_PATH) if isfile(join(REUTERS_PATH, f)) and ".sgm" in f]

# create dir
os.makedirs(REUTERS_SGMLS_PATH, exist_ok=True)

tmp_sgml = ""

# initialize the progress bar
p = tqdm(total=len(files), nrows=4, position=0, leave=True)

for file in files:
    # add patterns for later sgmls splitting
    with open(f'./{REUTERS_PATH}/{file}', 'r', encoding='ISO-8859-1') as fh_in:
        lines = fh_in.readlines()
        for line in lines:
            if SECTION_START.match(line):
                tmp_sgml += f"{CUT_START_PATTERN}\n"
                tmp_sgml += line
            elif SECTION_END.match(line):
                tmp_sgml += line
                tmp_sgml += f"{CUT_END_PATTERN}\n"
            else:
                tmp_sgml += line

    # save temporary
    with open(f'./{REUTERS_SGMLS_PATH}/tmp_sgml', 'w', encoding='UTF-8') as fh_out:
        fh_out.write(tmp_sgml)

    # split sgmls by our custom patterns
    with open(f'./{REUTERS_SGMLS_PATH}/tmp_sgml', 'r', encoding="ISO-8859-1") as fh_in:
        for (i, nested_iter) in enumerate(split_docs_iter(fh_in, CUT_START, CUT_END)):
            with open('./reuters_sgmls/docID_{:05d}'.format(i), 'w', encoding='UTF-8') as fh_out:
                for line in nested_iter:
                    fh_out.write(line)

    p.set_description('SGMLs splitted needed: ', refresh=True)
    p.update(1) # update progress bar

p.close()

os.remove(f'./{REUTERS_SGMLS_PATH}/tmp_sgml')


0it [00:00, ?it/s]


FileNotFoundError: [WinError 2] 系統找不到指定的檔案。: './reuters_sgmls/tmp_sgml'

#### Extract information from splitted sgmls

In [43]:
sgmls_files = [f for f in listdir(REUTERS_SGMLS_PATH) if isfile(join(REUTERS_SGMLS_PATH, f))]

# initialize the progress bar
pb = tqdm(total=len(sgmls_files), nrows=4, position=0, leave=True)

with open('reuter_docs.csv', "w", newline='') as csv_file:
    writer = csv.writer(csv_file)

    # header
    writer.writerow(['topics', 'body', 'train_test_label'])

    for sgml in sgmls_files:
        try:
            with open(f"{REUTERS_SGMLS_PATH}/{sgml}", 'r') as f:
                data = f.read()
                soup = BeautifulSoup(data, "html.parser")

                if not soup.find("reuters"):
                    train_test_label = "None"
                else:
                    train_test_label = soup.find("reuters")['lewissplit']

                if not soup.find("topics"):
                    topics = "None"
                else:
                    topics = soup.find("topics").get_text()

                if not soup.find("body"):
                    body = "None"
                else:
                    body = soup.find("body").get_text()

                writer.writerow([topics, body, train_test_label])

        except UnicodeDecodeError:
            print(f"{sgml} UnicodeDecodeError, pass")

        pb.set_description('Extracting SGMLs content: ', refresh=True)
        pb.update(1) # update progress bar

pb.close()

NameError: name 'REUTERS_SGMLS_PATH' is not defined

In [2]:
df = pd.read_csv('reuter_docs.csv')
df

Unnamed: 0,topics,body,train_test_label
0,cocoa,Showers continued throughout the week in\r\nth...,TRAIN
1,,Standard Oil Co and BP North America\r\nInc sa...,TRAIN
2,,Texas Commerce Bancshares Inc's Texas\r\nComme...,TRAIN
3,,BankAmerica Corp is not under\r\npressure to a...,TRAIN
4,grainwheatcornbarleyoatsorghum,The U.S. Agriculture Department\r\nreported th...,TRAIN
...,...,...,...
21572,ship,The Japan/India-Pakistan-Gulf/Japan\r\nshippin...,TEST
21573,ipi,The Soviet Union's industrial output is\r\ngro...,TEST
21574,gold,Six black miners have been killed\r\nand two i...,TEST
21575,,The prospect of a dominant alliance of\r\nsoci...,TEST


In [3]:
df.groupby('topics').size()

topics
acq                        2362
acqalum                       2
acqcopper                     3
acqcrude                      9
acqcrudenat-gas               8
                           ... 
yen                           6
zinc                         16
zinclead                      2
zincleadcopper                2
zincleadstrategic-metal       1
Length: 655, dtype: int64

In [4]:
df['topics'].unique()

array(['cocoa', nan, 'grainwheatcornbarleyoatsorghum',
       'veg-oillinseedlin-oilsoy-oilsun-oilsoybeanoilseedcornsunseedgrainsorghumwheat',
       'earn', 'acq', 'earnacq', 'wheatgrain', 'copper', 'housing',
       'money-supply', 'coffee', 'acqship', 'sugar', 'trade', 'reserves',
       'ship', 'graincorn', 'veg-oilsoybeanoilseedmeal-feedsoy-meal',
       'grainwheatcornoatryesorghumsoybeanoilseed', 'cotton', 'grainship',
       'carcasslivestock', 'grain', 'crude', 'nat-gas', 'cpignp',
       'grainwheat', 'graincornoat',
       'veg-oiloilseedmeal-feedsoybeansoy-oilsoy-meal', 'cpi',
       'money-fxinterest', 'interest', 'gnpbop', 'grainrice',
       'soybeanred-beanoilseed',
       'grainwheatriceveg-oilsoybeansugarrubbercopra-cakecornpalm-oilpalmkernelcoffeeteaplywoodsoy-mealcotton',
       'money-fx', 'meal-feedcopra-cake', 'alum', 'veg-oilpalm-oil',
       'teacocoacoffee', 'oilseedsoybean',
       'oilseedsoybeanmeal-feedsoy-meal', 'goldplatinumstrategic-metal',
       'meal

In [5]:
print(type(df))

<class 'pandas.core.frame.DataFrame'>


In [6]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21577 entries, 0 to 21576
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   topics            11366 non-null  object
 1   body              21577 non-null  object
 2   train_test_label  21577 non-null  object
dtypes: object(3)
memory usage: 505.8+ KB
None


In [10]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer  


from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report

import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
print(df.describe())

       topics   body train_test_label
count   11366  21577            21577
unique    655  18781                3
top      earn   None            TRAIN
freq     3945   2535            14668


In [13]:
sw = stopwords.words('english')
lemmatizer = WordNetLemmatizer() 

def clean_text(text):
    
    text = text.lower()
    
    text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text) # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")

    text = re.sub(r"http\S+", "",text) #Removing URLs 
    #text = re.sub(r"http", "",text)
    
    html=re.compile(r'<.*?>') 
    
    text = html.sub(r'',text) #Removing html tags
    
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
    for p in punctuations:
        text = text.replace(p,'') #Removing punctuations
        
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    
    text = [lemmatizer.lemmatize(word) for word in text]
    
    text = " ".join(text) #removing stopwords
    
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text) #Removing emojis
    
    return text

In [14]:
df['body'] = df['body'].apply(lambda x: clean_text(x))

In [15]:
df.head()

Unnamed: 0,topics,body,train_test_label
0,cocoa,shower continued throughout week bahia cocoa z...,TRAIN
1,,standard oil co bp north america inc said plan...,TRAIN
2,,texas commerce bancshares inc texas commerce b...,TRAIN
3,,bankamerica corp pressure act quickly proposed...,TRAIN
4,grainwheatcornbarleyoatsorghum,u agriculture department reported farmer owned...,TRAIN


In [16]:
train_set = df[(df['train_test_label']=='TRAIN')]
train_set = train_set.dropna()
train_set

Unnamed: 0,topics,body,train_test_label
0,cocoa,shower continued throughout week bahia cocoa z...,TRAIN
4,grainwheatcornbarleyoatsorghum,u agriculture department reported farmer owned...,TRAIN
5,veg-oillinseedlin-oilsoy-oilsun-oilsoybeanoils...,argentine grain board figure show crop registr...,TRAIN
8,earn,champion product inc said board director appro...,TRAIN
9,acq,computer terminal system inc said completed sa...,TRAIN
...,...,...,...
14778,dlrmoney-fx,bank japan bought small amount dollar shortly ...,TRAIN
14782,rubber,"japan rubber stock fell , tonne march , februa...",TRAIN
14784,money-fx,"bank korea said fixed midrate dollar, highest ...",TRAIN
14804,copper,nippon mining co ltd said lowered selling pric...,TRAIN


In [17]:
test_set = df[df['train_test_label']=='TEST']
test_set = test_set.dropna()
test_set

Unnamed: 0,topics,body,train_test_label
14825,trade,mounting trade friction u japan raised fear am...,TEST
14827,grain,survey province seven city showed vermin consu...,TEST
14828,crudenat-gas,ministry international trade industry miti rev...,TEST
14831,tradegrainricecornsugartinrubber,thailand trade deficit widened billion baht fi...,TEST
14832,veg-oilpalm-oil,indonesia expects crude palm oil cpo price ris...,TEST
...,...,...,...
21569,acq,chase corp ltd chcawe said make offer fully pa...,TEST
21571,money-fxdlryen,tokyo foreign exchange market watching nervous...,TEST
21572,ship,japan india pakistan gulf japan shipping confe...,TEST
21573,ipi,soviet union industrial output growing slower ...,TEST


In [18]:
tfidf_vectorizer = TfidfVectorizer() 

tfidf_train_vectors = tfidf_vectorizer.fit_transform(train_set['body'])

tfidf_test_vectors = tfidf_vectorizer.transform(test_set['body'])

In [19]:
model = MultinomialNB().fit(tfidf_train_vectors, train_set['topics'])

In [20]:
pred_topics = model.predict(tfidf_test_vectors)

In [21]:
print('Accuracy:', accuracy_score(test_set['topics'], pred_topics))


Accuracy: 0.5746441575637207
