# Configurables

In [15]:
CATEGORIES = ["business", "entertainment", "politics", "sport", "tech"]

# Data Preprocessing

In [19]:
def id2categories() -> dict:
    o = dict()
    for idx, cat in enumerate(CATEGORIES):
        o[idx] = cat
    return o

def categories2id() -> dict:
    o = dict()
    for idx, cat in enumerate(CATEGORIES):
        o[cat] = idx
    return o

In [20]:
from gensim.parsing.preprocessing import remove_stopwords, strip_multiple_whitespaces, strip_punctuation2, preprocess_string, strip_short, strip_numeric, stem_text, strip_tags

# Define custom filters
CUSTOM_FILTERS = [lambda x:
                  x.lower(),
                  strip_multiple_whitespaces,
                  strip_numeric,
                  remove_stopwords,
                  strip_short,
                  stem_text,
                  strip_tags,
                  strip_punctuation2
                  ]

In [21]:
from nltk.corpus import stopwords

STOP_WORDS = stopwords.words('english')

def load_stopwords() -> list:
    stop_words = []
    with open("stopword", "r") as fh:
        words = fh.readlines()
        for word in words:
            word = word.rstrip("\n").strip()
            stop_words.append(word)
    return stop_words

STOP_WORDS.extend(load_stopwords())

In [25]:
import os
from collections import defaultdict

DATA_PATH = "./BBC News Summary/BBC News Summary/News Articles"

# Load the text from directory, identify category from folder and 
# put the texts in a list in the order of the categories specified in configurables
def get_texts() -> (list, list, list):
    category_text_map = defaultdict(list)
    texts = []
    categories = []
    ignored = []
    ignore = False
    for root, _, files in os.walk(DATA_PATH):
        if len(files) == 0: continue
        category = root.split('/')[-1].lower()
        if category not in CATEGORIES:
            print("Category {} is not in pre-set categories. Please add it in and re-run the program!".format(category))
            ignore = True
        else:
            ignore = False
        for f in files:
            txt_path = os.path.join(root, f)
            text = ""
            with open(txt_path, 'r', encoding="ISO-8859-1") as fh:
                lines = fh.readlines()
                for line in lines:
                    text += line
            if ignore:
                ignored.append(text)
            else:
                category_text_map[category].append(text)
    
    for cat in CATEGORIES:
        files = category_text_map[cat]
        texts.extend(files)
        categories.extend([categories2id()[cat] for _ in range(len(files))])

    if len(texts) != len(categories):
        raise Exception("Number of articles and number of target categories do not have the same length: [{} != {}]".format(len(texts), len(categories)))

    return texts, categories, ignored


def clean_texts(texts: list) -> list:
    clean_texts = []
    for text in texts:
        processed_texts = preprocess_string(text, CUSTOM_FILTERS)
        processed_texts = [w for w in processed_texts if not w in STOP_WORDS]
        clean_texts.append(processed_texts)
    return clean_texts

texts, categories, ignored = get_texts()
texts = clean_texts(texts)
print("{} articles loaded. {} articles ignored due to non-existing categories.".format(len(texts), len(ignored)))

2225 articles loaded. 0 articles ignored due to non-existing categories.


# Get Corpus

In [26]:
import gensim
from gensim import corpora, models, similarities

def bigrams(words, bi_min=15, tri_min=10):
    bigram = gensim.models.Phrases(words, min_count = bi_min)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return bigram_mod

def get_corpus(texts, bigram_mod):
    bigram_mod = bigrams(texts)
    bigram = [bigram_mod[text] for text in texts]
    id2word = gensim.corpora.Dictionary(bigram)
    id2word.filter_extremes(no_below=10, no_above=0.35)
    id2word.compactify()
    corpus = [id2word.doc2bow(text) for text in bigram]
    return corpus, id2word, bigram

bigram_mod = bigrams(texts)
corpus, dictionary, bigram = get_corpus(texts, bigram_mod)
print(corpus)

), (3037, 1), (3059, 1), (3103, 2), (3128, 1), (3187, 1), (3214, 2), (3258, 1), (3302, 3), (3318, 12), (3321, 2), (3324, 1), (3330, 2), (3529, 1), (3545, 1), (3692, 1), (3703, 1), (3753, 2), (3805, 3), (3887, 2), (3972, 4), (4034, 1), (4105, 1), (4164, 1), (4166, 4), (4180, 1), (4559, 2), (4567, 1), (4575, 2), (4587, 1)], [(9, 1), (26, 3), (33, 1), (66, 1), (68, 1), (70, 1), (85, 1), (99, 1), (103, 1), (106, 1), (182, 1), (191, 1), (200, 1), (202, 1), (209, 1), (247, 1), (250, 1), (263, 1), (284, 1), (293, 1), (322, 1), (323, 4), (324, 1), (325, 1), (328, 1), (357, 3), (385, 1), (387, 1), (406, 1), (421, 1), (457, 1), (464, 1), (469, 1), (501, 1), (527, 1), (530, 1), (540, 1), (541, 1), (547, 1), (568, 1), (576, 1), (577, 1), (582, 1), (585, 1), (591, 1), (600, 1), (606, 1), (608, 1), (609, 1), (611, 1), (618, 2), (624, 2), (640, 1), (649, 1), (660, 1), (682, 1), (700, 1), (702, 4), (737, 1), (772, 1), (776, 1), (800, 1), (860, 1), (867, 1), (884, 1), (900, 1), (933, 1), (951, 1), (966

# Use HDP to auto-get topics

In [27]:
def get_hdp_model(corpus, dictionary):
    Hdp_model = gensim.models.hdpmodel.HdpModel(corpus=corpus, id2word=dictionary)
    return Hdp_model

hdp_model = get_hdp_model(corpus, dictionary)
topics = hdp_model.print_topics()
print(topics)

[(0, '0.007*peopl + 0.005*game + 0.004*plai + 0.004*work + 0.003*win + 0.003*music + 0.003*film + 0.003*number + 0.003*sai + 0.003*set'), (1, '0.003*labour + 0.003*blair + 0.003*brown + 0.002*govern + 0.002*plan + 0.002*compani + 0.002*firm + 0.002*indian + 0.002*accord + 0.002*india'), (2, '0.002*profit + 0.002*firm + 0.002*sale + 0.002*compani + 0.002*film + 0.002*gazprom + 0.002*govern + 0.002*produc + 0.001*wine + 0.001*japan'), (3, '0.002*bank + 0.002*govern + 0.002*peopl + 0.002*sharp + 0.002*rate + 0.001*remain + 0.001*england + 0.001*compani + 0.001*journei + 0.001*british'), (4, '0.002*market + 0.002*share + 0.002*lord + 0.002*countri + 0.001*expect + 0.001*govern + 0.001*risen + 0.001*warn + 0.001*asia + 0.001*compani'), (5, '0.002*britain + 0.002*consum_spend + 0.002*trade + 0.002*telecom + 0.002*data + 0.001*group + 0.001*peopl + 0.001*plan + 0.001*govern + 0.001*september'), (6, '0.003*film + 0.002*dollar + 0.002*best + 0.002*martin_scorsese + 0.002*director + 0.002*win + 

# Create LDA model

In [28]:

import gensim
import warnings
import logging # This allows for seeing if the model converges. A log file is created.
logging.basicConfig(filename='lda_model.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

def create_lda_model(corpus, dictionary, num_topics):
    if not os.path.exists("models"):
        os.mkdir("models")
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        lda_train = gensim.models.ldamulticore.LdaMulticore(
                            corpus=corpus,
                            num_topics=num_topics,
                            id2word=dictionary,
                            chunksize=100,
                            workers=7, # Num. Processing Cores - 1
                            passes=50,
                            eval_every = 1,
                            per_word_topics=True)
        lda_train.save(os.path.join("models", "lda_train.model"))
    return lda_train

train_lda_model = create_lda_model(corpus, dictionary, len(CATEGORIES))
print(train_lda_model)

LdaModel(num_terms=4600, num_topics=5, decay=0.5, chunksize=100)


## Print topics

In [29]:
train_lda_model.print_topics(len(CATEGORIES),num_words=15)

[(0,
  '0.013*"site" + 0.013*"peopl" + 0.012*"net" + 0.012*"user" + 0.009*"firm" + 0.009*"softwar" + 0.009*"mail" + 0.008*"web" + 0.008*"secur" + 0.007*"blog" + 0.007*"attack" + 0.007*"microsoft" + 0.007*"compani" + 0.007*"file" + 0.007*"internet"'),
 (1,
  '0.008*"govern" + 0.008*"labour" + 0.007*"peopl" + 0.006*"plan" + 0.006*"sai" + 0.005*"parti" + 0.005*"elect" + 0.004*"blair" + 0.004*"public" + 0.004*"work" + 0.004*"tori" + 0.004*"brown" + 0.004*"issu" + 0.004*"claim" + 0.004*"law"'),
 (2,
  '0.008*"firm" + 0.008*"compani" + 0.007*"market" + 0.007*"bn" + 0.006*"sale" + 0.006*"china" + 0.006*"share" + 0.006*"bank" + 0.006*"growth" + 0.006*"report" + 0.005*"rise" + 0.005*"rate" + 0.005*"month" + 0.005*"expect" + 0.005*"analyst"'),
 (3,
  '0.009*"plai" + 0.009*"game" + 0.009*"win" + 0.008*"film" + 0.006*"best" + 0.005*"england" + 0.005*"player" + 0.005*"star" + 0.005*"award" + 0.004*"team" + 0.004*"club" + 0.004*"set" + 0.004*"match" + 0.004*"good" + 0.004*"includ"'),
 (4,
  '0.013*"

# Supervised Training

## Get Feature Vectors

In [37]:

def get_feature_vectors(texts: list, corpus, lda_model):
    train_vecs = []
    for i in range(len(texts)):
        top_topics = lda_model.get_document_topics(corpus[i],minimum_probability=0.0)  
        topic_vec = [top_topics[i][1] for i in range(len(CATEGORIES))]
        topic_vec.append(len(texts))
        topic_vec.append(len(texts[i]))
        train_vecs.append(topic_vec)
    return train_vecs

## Prepare dataset

In [73]:
from sklearn.model_selection import train_test_split

texts_corpus_zip = list(zip(texts, corpus))

train_texts, test_texts, train_categories, test_categories = train_test_split(texts, categories, test_size=0.2, shuffle=True)

feature_vectors = get_feature_vectors(train_texts, corpus, train_lda_model)
print(len(feature_vectors), len(train_categories))

1780 1780


## Neural Network Multi-Class Classifier Model 

## Model

In [183]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

def create_model(activation="relu", optimizer="adam", dropout_rate=0.0):
    model = Sequential()
    model.add(Dense(32, input_dim=len(CATEGORIES)+2, activation=activation))
    model.add(Dropout(dropout_rate))
    model.add(Dense(64, input_dim=len(CATEGORIES)+2, activation=activation))
    model.add(Dropout(dropout_rate))
    model.add(Dense(256, input_dim=len(CATEGORIES)+2, activation=activation))
    model.add(Dropout(dropout_rate))
    model.add(Dense(128, input_dim=len(CATEGORIES)+2, activation=activation))
    model.add(Dropout(dropout_rate))
    model.add(Dense(len(CATEGORIES), activation="softmax"))
    model.compile(loss="categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])
    return model

In [184]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

OPTIMIZERS = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
ACTIVATION = ['relu', 'tanh', 'sigmoid', 'linear']
BATCH_SIZE = [5, 10, 20, 30, 40, 50]
EPOCHS = [10, 50, 100]
DROPOUT = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]

param_grid = {
    "batch_size": BATCH_SIZE,
    "epochs": EPOCHS,
    "optimizer": OPTIMIZERS,
    "dropout_rate": DROPOUT,
    "activation": ACTIVATION,
}

def train(train_vecs, targets):
    X = np.array(train_vecs)
    Y = np_utils.to_categorical(np.array(targets))

    model = KerasClassifier(build_fn=create_model, verbose=0)
    grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=10, verbose=5)

    grid_result = grid.fit(X, Y)
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']

    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))

    # kf = KFold(n_splits=10, shuffle=True)

    # acc_per_fold = []
    # loss_per_fold = []

    # for train_index, val_index in kf.split(X, y):
    #     x_train = X[train_index]
    #     y_train = y[train_index]

    #     model.fit(x_train, y_train, epochs=100, batch_size=5)
    #     scores = model.evaluate(X[val_index], y[val_index], verbose=0)
    #     print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]:.6f}; {model.metrics_names[1]} of {scores[1]*100:.3f}%')
    #     acc_per_fold.append(scores[1])
    #     loss_per_fold.append(scores[0])
    #     fold_no += 1

    # print("Mean Loss: {:.6f} Mean Accuracy: {:.3f}".format(np.mean(acc_per_fold), np.mean(loss_per_fold)))

    best_model = grid_result.best_estimator_
    best_model.save(os.path.join("models", "classifier.pth"))

In [185]:
train(feature_vectors, train_categories)

Fitting 10 folds for each of 3024 candidates, totalling 30240 fits


KeyboardInterrupt: 

# Testing

In [153]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import f1_score, accuracy_score
from keras.models import load_model

test_feature_vectors = get_feature_vectors(test_texts, corpus, train_lda_model) # Important to use the same LDA model
X = np.array(test_feature_vectors)
Y = np_utils.to_categorical(np.array(test_categories))

# Load model
model = load_model(os.path.join("models", "classifier.pth"))

y_pred = model.predict(X)

In [157]:
print(y_pred)
y_pred.argmax(axis=1)

[[0.22782992 0.1724608  0.18308431 0.23607926 0.1805457 ]
 [0.22782992 0.1724608  0.18308431 0.23607926 0.1805457 ]
 [0.22782992 0.1724608  0.18308431 0.23607926 0.1805457 ]
 ...
 [0.22782992 0.1724608  0.18308431 0.23607926 0.1805457 ]
 [0.22782992 0.1724608  0.18308431 0.23607926 0.1805457 ]
 [0.22782992 0.1724608  0.18308431 0.23607926 0.18054572]]


array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3,
       3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3,

In [158]:
np.array(test_categories)

array([0, 3, 0, 3, 0, 4, 2, 3, 0, 4, 1, 2, 2, 1, 3, 2, 1, 1, 1, 3, 4, 2,
       1, 1, 1, 1, 1, 1, 2, 0, 0, 1, 2, 3, 2, 1, 0, 2, 1, 3, 4, 2, 0, 3,
       1, 0, 2, 0, 3, 1, 4, 1, 2, 1, 3, 1, 0, 1, 4, 4, 4, 4, 4, 4, 2, 4,
       4, 0, 2, 3, 3, 0, 4, 3, 3, 2, 0, 3, 4, 0, 0, 3, 0, 3, 0, 2, 2, 1,
       1, 2, 4, 3, 3, 4, 0, 1, 2, 3, 2, 1, 4, 3, 4, 2, 2, 2, 1, 2, 0, 2,
       2, 3, 1, 1, 3, 1, 0, 4, 3, 2, 3, 0, 0, 4, 0, 0, 2, 4, 2, 0, 1, 3,
       2, 0, 3, 1, 3, 1, 2, 2, 3, 3, 4, 3, 3, 0, 2, 4, 0, 2, 0, 0, 0, 0,
       1, 4, 0, 3, 3, 1, 1, 2, 4, 3, 2, 4, 1, 0, 4, 2, 4, 3, 3, 0, 3, 1,
       0, 0, 2, 2, 2, 3, 0, 4, 1, 3, 3, 2, 1, 3, 1, 1, 4, 4, 2, 4, 1, 3,
       2, 3, 3, 0, 1, 3, 4, 2, 2, 0, 0, 4, 2, 2, 2, 4, 3, 0, 0, 0, 2, 1,
       2, 4, 0, 3, 4, 4, 2, 3, 0, 2, 3, 2, 1, 0, 0, 4, 2, 4, 1, 2, 1, 0,
       2, 0, 1, 3, 0, 0, 4, 0, 3, 4, 4, 2, 0, 3, 3, 4, 1, 1, 0, 0, 3, 4,
       2, 4, 0, 1, 2, 4, 4, 0, 2, 4, 2, 0, 4, 1, 0, 2, 2, 0, 0, 3, 2, 2,
       3, 0, 2, 2, 0, 4, 2, 2, 1, 2, 1, 3, 2, 4, 0,