# Data Preprocessing

In [54]:
from gensim.parsing.preprocessing import remove_stopwords, strip_multiple_whitespaces, strip_punctuation2, preprocess_string, strip_short, strip_numeric, stem_text, strip_tags

# Define custom filters
CUSTOM_FILTERS = [lambda x:
                  x.lower(),
                  strip_multiple_whitespaces,
                  strip_numeric,
                  remove_stopwords,
                  strip_short,
                  stem_text,
                  strip_tags,
                  strip_punctuation2
                  ]

In [55]:
import os
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come', 'said'])

DATA_PATH = "./BBC News Summary/BBC News Summary/News Articles"

def get_texts() -> list:
    texts = []
    for root, _, files in os.walk(DATA_PATH):
        for f in files:
            txt_path = os.path.join(root, f)
            text = ""
            with open(txt_path, 'r', encoding="ISO-8859-1") as fh:
                lines = fh.readlines()
                for line in lines:
                    text += line
            texts.append(text)
    return texts


def clean_texts(texts: list) -> list:
    clean_texts = []
    for text in texts:
        processed_texts = preprocess_string(text, CUSTOM_FILTERS)
        processed_texts = [w for w in processed_texts if not w in stop_words]
        clean_texts.append(processed_texts)
    return clean_texts

texts = get_texts()
texts = clean_texts(texts)
print(len(texts))

2225


# Get Corpus

In [56]:
from gensim import corpora, models, similarities

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
print(corpus)

 1), (3575, 3), (3646, 3), (3763, 1), (4002, 1), (4004, 1), (4074, 3), (4276, 2), (4281, 1), (4331, 2), (4438, 1), (4860, 1), (4939, 1), (5350, 2), (5423, 1), (5868, 1), (5988, 1), (6082, 1), (7136, 1), (7439, 1), (7525, 1), (7627, 1), (8231, 1), (9042, 1), (9152, 1), (9450, 2), (9576, 1), (9614, 1), (9930, 1), (10144, 1), (10214, 2), (13071, 1), (13495, 1), (14227, 1), (14959, 1), (15820, 1), (16461, 4), (16910, 1), (25568, 1), (25569, 2), (25570, 1)], [(64, 3), (69, 1), (81, 1), (84, 1), (141, 2), (151, 1), (162, 1), (202, 2), (347, 1), (525, 1), (532, 1), (551, 1), (600, 1), (724, 1), (774, 1), (834, 1), (950, 1), (981, 1), (1104, 1), (1173, 1), (1281, 1), (1297, 1), (1330, 1), (1367, 1), (1408, 1), (1423, 2), (1456, 2), (1701, 1), (1715, 1), (1840, 1), (1894, 3), (2073, 1), (2083, 1), (2089, 1), (2098, 1), (2391, 1), (2515, 1), (2570, 1), (2954, 1), (3058, 1), (3089, 3), (3176, 1), (3181, 1), (3219, 1), (3221, 4), (3651, 1), (3835, 1), (4330, 1), (4557, 2), (4709, 2), (4958, 1), (5

# Use HDP to get number of topics

In [57]:
def get_hdp_model(corpus, dictionary):
    Hdp_model = gensim.models.hdpmodel.HdpModel(corpus=corpus, id2word=dictionary)
    return Hdp_model

hdp_model = get_hdp_model(corpus, dictionary)
topics = hdp_model.print_topics()
print(topics)
num_of_topics = len(topics)

[(0, '0.007*year + 0.006*new + 0.006*film + 0.004*peopl + 0.004*best + 0.004*music + 0.003*time + 0.003*like + 0.003*includ + 0.003*award'), (1, '0.006*year + 0.006*new + 0.004*peopl + 0.004*game + 0.003*time + 0.003*market + 0.003*like + 0.003*world + 0.003*music + 0.003*firm'), (2, '0.006*year + 0.005*new + 0.005*peopl + 0.003*firm + 0.003*like + 0.003*film + 0.003*time + 0.002*work + 0.002*music + 0.002*us'), (3, '0.005*year + 0.004*new + 0.004*peopl + 0.003*mobil + 0.003*servic + 0.002*time + 0.002*phone + 0.002*work + 0.002*govern + 0.002*us'), (4, '0.003*year + 0.003*new + 0.002*time + 0.002*govern + 0.002*minist + 0.002*plai + 0.002*mobil + 0.002*like + 0.002*servic + 0.002*peopl'), (5, '0.002*year + 0.002*hunt + 0.002*england + 0.002*law + 0.002*game + 0.002*new + 0.002*time + 0.002*told + 0.002*robinson + 0.002*secur'), (6, '0.004*labour + 0.003*elect + 0.003*peopl + 0.002*new + 0.002*year + 0.002*parti + 0.002*tori + 0.002*work + 0.001*brown + 0.001*vote'), (7, '0.003*new + 0

# Create LDA model

In [58]:

import gensim
import warnings
import logging # This allows for seeing if the model converges. A log file is created.
logging.basicConfig(filename='lda_model.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

def create_lda_model(corpus, dictionary, num_topics):
    if not os.path.exists("models"):
        os.mkdir("models")
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        lda_train = gensim.models.ldamulticore.LdaMulticore(
                            corpus=corpus,
                            num_topics=num_topics,
                            id2word=dictionary,
                            chunksize=100,
                            workers=7, # Num. Processing Cores - 1
                            passes=50,
                            eval_every = 1,
                            per_word_topics=True)
        lda_train.save(os.path.join("models", "lda_train.model"))
    return lda_train

train_lda_model = create_lda_model(corpus, dictionary, 5)
print(train_lda_model)

LdaModel(num_terms=25666, num_topics=5, decay=0.5, chunksize=100)


## Print topics

In [60]:
train_lda_model.print_topics(5,num_words=10)

[(0,
  '0.010*"peopl" + 0.008*"game" + 0.007*"new" + 0.006*"mobil" + 0.006*"us" + 0.006*"phone" + 0.006*"technolog" + 0.005*"like" + 0.005*"digit" + 0.004*"user"'),
 (1,
  '0.007*"year" + 0.006*"new" + 0.005*"govern" + 0.004*"bn" + 0.004*"labour" + 0.004*"peopl" + 0.003*"plan" + 0.003*"market" + 0.003*"firm" + 0.003*"compani"'),
 (2,
  '0.008*"game" + 0.007*"year" + 0.007*"plai" + 0.007*"win" + 0.006*"england" + 0.005*"world" + 0.005*"player" + 0.005*"time" + 0.004*"club" + 0.004*"team"'),
 (3,
  '0.020*"film" + 0.013*"best" + 0.010*"year" + 0.009*"award" + 0.009*"star" + 0.008*"music" + 0.005*"includ" + 0.005*"new" + 0.005*"actor" + 0.004*"song"'),
 (4,
  '0.007*"music" + 0.005*"sale" + 0.005*"chart" + 0.005*"new" + 0.004*"hunt" + 0.004*"includ" + 0.004*"band" + 0.004*"year" + 0.004*"record" + 0.003*"singl"')]

# Supervised Training

## Get Feature Vectors

In [22]:

def get_feature_vectors(texts: list, corpus, lda_model):
    train_vecs = []
    for i in range(len(texts)):
        top_topics = lda_model.get_document_topics(corpus[i],minimum_probability=0.0)  
        topic_vec = [top_topics[i][1] for i in range(20)]
        topic_vec.append(len(texts))
        topic_vec.append(len(texts[i]))
        train_vecs.append(topic_vec)
    return train_vecs

feature_vectors = get_feature_vectors(texts, corpus, train_lda_model)
print(len(feature_vectors))

2225


## Logistics Regression Model 

In [None]:
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import f1_score
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.metrics import fbeta_score

def train(train_vecs, targets):
    X = np.array(train_vecs)
    y = np.array(targets)

    kf = KFold(5, shuffle=True, random_state=42)
    cv_lr_f1, cv_lrsgd_f1, cv_svcsgd_f1,  = [], [], []

    for train_ind, val_ind in kf.split(X, y):
        # Assign CV IDX
        X_train, y_train = X[train_ind], y[train_ind]
        X_val, y_val = X[val_ind], y[val_ind]
        
        # Scale Data
        scaler = StandardScaler()
        X_train_scale = scaler.fit_transform(X_train)
        X_val_scale = scaler.transform(X_val)

        # Logisitic Regression
        lr = LogisticRegression(
            class_weight= 'balanced',
            solver='newton-cg',
            fit_intercept=True
        ).fit(X_train_scale, y_train)

        y_pred = lr.predict(X_val_scale)
        cv_lr_f1.append(f1_score(y_val, y_pred, average='binary'))
        
        # Logistic Regression SGD
        sgd = linear_model.SGDClassifier(
            max_iter=1000,
            tol=1e-3,
            loss='log',
            class_weight='balanced'
        ).fit(X_train_scale, y_train)
        
        y_pred = sgd.predict(X_val_scale)
        cv_lrsgd_f1.append(f1_score(y_val, y_pred, average='binary'))
        
        # SGD Modified Huber
        sgd_huber = linear_model.SGDClassifier(
            max_iter=1000,
            tol=1e-3,
            alpha=20,
            loss='modified_huber',
            class_weight='balanced'
        ).fit(X_train_scale, y_train)
        
        y_pred = sgd_huber.predict(X_val_scale)
        cv_svcsgd_f1.append(f1_score(y_val, y_pred, average='binary'))

    print(f'Logistic Regression Val f1: {np.mean(cv_lr_f1):.3f} +- {np.std(cv_lr_f1):.3f}')
    print(f'Logisitic Regression SGD Val f1: {np.mean(cv_lrsgd_f1):.3f} +- {np.std(cv_lrsgd_f1):.3f}')
    print(f'SVM Huber Val f1: {np.mean(cv_svcsgd_f1):.3f} +- {np.std(cv_svcsgd_f1):.3f}')

    with open(os.path.join("models", "logistic_regression.pkl"), 'wb') as f:
        pickle.dump(lr, f)
    
    with open(os.path.join("models", "logistic_regression_sgd.pkl"), 'wb') as f:
        pickle.dump(sgd, f)

    with open(os.path.join("models", "sgd_modified_huber.pkl"), 'wb') as f:
        pickle.dump(sgd_huber, f)

train(feature_vectors, )