In [1]:
import html
import glob
import numpy as np
import pandas as pd
import re
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [2]:
# files to read data from
file_names = glob.glob("data/*.sgm")
# classes to predict
places = ["west-germany", "usa", "france", "uk", "canada", "japan"]
# tags to extract from html files
tags = ["title", "dateline", "body", "places", "topics"]

# remove unuseful data from text
def parse_text(text):
    if isinstance(text, list):
        text = " ".join(text)
    text = re.sub(r"[^a-zA-Z<>/]|\b\w\b", " ", str(text))
    return re.sub(r"\s+", " ", text).strip()


# helper function to extract only articles with place tags as single element from places list
def place_validation(value):
    if isinstance(value, list):
        return False
    return value in places

In [3]:
# Preprocessing 

df = pd.DataFrame(columns=tags)
for file_name in file_names:
    with open(file_name, "r", encoding="unicode_escape") as f:
        contents = f.read()
    soup = BeautifulSoup(html.unescape(contents), "html.parser")
    articles = soup.find_all("reuters")
    for article in articles:
        art = {}
        for tag in tags:
            content = article.find(tag)
            if content:
                art[tag] = (
                    list(content.strings)
                    if len(list(content.children)) > 1
                    else content.get_text()
                )
        df = df.append(art, ignore_index=True)

# parse text
df["title"] = df["title"].apply(parse_text)
df["body"] = df["body"].apply(parse_text)
# save city from dateline and remove date
df["dateline"] = df["dateline"].apply(
    lambda x: str(x).split(",", 1)[0].strip()
)
# replace empty cells with NaN
df = df.replace(r"^\s*$", np.nan, regex=True)
# get articles with specified class
df = df[df["places"].apply(place_validation) == True]

df.head()

# df.to_csv("./parsed.csv", index=False)
# df = pd.read_csv("./parsed.csv")

Unnamed: 0,title,dateline,body,places,topics
0,CHRYSLER LATE MARCH CAR SALES UP,DETROIT,Chrysler Corp said car sales for the March per...,usa,
1,WALL STREET STOCKS/COMPAQ COMPUTER,NEW YORK,Compaq Computer Corp IBM chief rival in the pe...,usa,
2,NORANDA SETS TEMPORARY MINE SHUTDOWN,Murdochville,said production will remain shut down at its f...,canada,copper
3,CANADA BUDGET DEFICIT RISES IN JANUARY,OTTAWA,The Canadian government budget deficit rose to...,canada,
5,COPLEY PROPERTIES INC INCREASES DIVIDEND,BOSTON,Qtly div cts vs cts prior Payable APril Record...,usa,earn


In [4]:
# function to evaluate model and print f1 score on test set
def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test):
    print("Number of training samples:", len(X_train))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(
        "Micro-averaged F1 score on test set: "
        "%0.3f" % f1_score(y_test, y_pred, average="micro")
    )
    print("-" * 10)


# get data from dataframe
X, y = (
    df["body"].astype(str) + df["title"],
    df["places"].str.get_dummies(),
)
# split data into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# params for classifier
sdg_params = dict(alpha=1e-5, penalty="l2", loss="log")
# params for vectorizer
vectorizer_params = dict(
    stop_words="english", ngram_range=(2, 7), min_df=6, max_df=0.8
)

pipeline = Pipeline(
    [
        ("vect", CountVectorizer(**vectorizer_params)),
        ("tfidf", TfidfTransformer()),
        ("clf", OneVsRestClassifier(SGDClassifier(**sdg_params))),
    ]
)

eval_and_print_metrics(pipeline, X_train, y_train, X_test, y_test)

Number of training samples: 10324
Micro-averaged F1 score on test set: 0.933
----------
