In [None]:
import pandas as pd
import re
import csv
from statistics import mean, median

from zipfile import ZipFile
from io import BytesIO
from urllib.request import urlopen
from pathlib import Path

from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import fasttext
from wordcloud import WordCloud

import matplotlib.pyplot as plt

In [None]:
# Disable pandas' chained-assignment (SettingWithCopy) warning notebook-wide.
# NOTE(review): this also hides genuine writes to DataFrame slices further down.
pd.options.mode.chained_assignment = None

# Preparation

In [None]:
# Make sure the folder that will hold the raw data exists.
Path('data').mkdir(parents=True, exist_ok=True)

# Fetch the password-protected archive into memory and unpack it into ./data.
zipurl = 'ZIP-URL-HERE'
zippassword = 'ZIP-PASSWORD-HERE'
with urlopen(zipurl) as zipresp, ZipFile(BytesIO(zipresp.read())) as zfile:
    zfile.extractall(path='data', members=None, pwd=zippassword.encode())

In [None]:
# Load the tab-separated dataset from ./data into a DataFrame.
df = pd.read_csv("./data/data_redacted.tsv", sep="\t")

# Table of Contents

* [Data Processing](#dataprocess)
* [Data Analysis](#dataanalysis)
    * [Magic in URL](#magicurl)
    * [Text Content](#textcontent)
* [Train Model with Fasttext](#trainfasttext)
    * [Manually Trained Wordvec](#manualwordvec)
    * [Pretrained Wordvec](#pretrainedwordvec)
* [Evaluation](#evaluation)
* [Conclusion](#conclusion)

# Data Processing <a class="anchor" id="dataprocess"></a>

Now take a quick look at the data

In [None]:
# Display the freshly loaded DataFrame.
df

We have plain text in the title and text fields, so let's do some simple text processing: remove numbers (they don't carry much information about the article category), convert everything to lowercase, remove punctuation, and remove stopwords.

In [None]:
# Fetch the NLTK stopword corpus that stopwords.words('english') reads.
nltk.download('stopwords')

In [None]:
ENGLISH_STOPWORDS = set(stopwords.words('english'))

# Matches runs of word characters that contain no digits, delimited by word
# boundaries. Built once instead of on every tokenize() call.
_WORD_TOKENIZER = RegexpTokenizer(r'\b[^\d\W]+\b')


def tokenize(text):
    """Normalise free text into a space-separated string of lowercase tokens.

    Digits and punctuation are dropped by the tokenizer pattern; tokens of
    length 1 and English stopwords are removed.

    Parameters
    ----------
    text : str
        Raw text. Non-string values (e.g. NaN from a missing cell) yield ''.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''

    # Lowercase BEFORE the stopword test: ENGLISH_STOPWORDS is all lowercase,
    # so capitalised stopwords such as "The" would otherwise slip through.
    lowered = (w.lower() for w in _WORD_TOKENIZER.tokenize(text))
    return ' '.join(w for w in lowered if len(w) > 1 and w not in ENGLISH_STOPWORDS)

In [None]:
# Add a cleaned counterpart for each free-text column.
for raw_col, clean_col in (('title', 'cleaned_title'), ('text', 'cleaned_text')):
    df[clean_col] = df[raw_col].apply(tokenize)

For the url, we can do more than normal processing because they have some interesting information. 
- Some websites specialize in a small subset of categories.
- The website url usually follows a structured path. Some websites use date or article category in the url.

In [None]:
def tokenize_url(url):
    """Split an article URL into (website, category words, title words).

    Parameters
    ----------
    url : str
        Article URL; assumed to look like 'scheme://host/segment/.../slug'.

    Returns
    -------
    tuple
        (website_url, article_cats, article_title). website_url is the third
        '/'-separated token, or None when the URL has 3 or fewer tokens; the
        other two are strings cleaned with tokenize().
    """
    # For a URL, the fastest approach is to split on '/'.
    tokens = url.split('/')
    if len(tokens) <= 3:
        # No path to analyse: treat the whole string as the title.
        return None, '', tokenize(url)

    # tokens[0] is the scheme and tokens[1] the empty string between the
    # double slash, so the website address is the 3rd token.
    website_url = tokens[2]

    # The last segment should hold the article title; drop a trailing 3-4
    # character extension first. Raw strings avoid invalid-escape warnings.
    article_title = tokenize(re.sub(r'\.\w{3,4}$', '', tokens[-1]))

    # The segments in between usually carry categories or dates; skip any
    # segment that contains a digit.
    article_cats = tokenize(' '.join(c for c in tokens[3:-1] if not re.search(r'\d', c)))
    return website_url, article_cats, article_title

In [None]:
# tokenize_url returns a 3-tuple per row; transpose the tuples into three columns.
url_parts = df['url'].apply(tokenize_url)
df['website_url'], df['article_cats'], df['article_title'] = zip(*url_parts)

In [None]:
# Display the DataFrame again, now with the cleaned text and URL-derived columns.
df

Everything is looking fine so far

# Data Analysis <a class="anchor" id="dataanalysis"></a>

First, we need to check some basic information

In [None]:
# Number of articles in the dataset.
len(df)

In [None]:
# Number of articles per category.
df['category'].value_counts()

We have 8646 entries belonging to 12 categories, which is a fairly small dataset. The classes are somewhat imbalanced, but even the smallest category still has 367 entries, which is not bad.

### Magic in URL <a class="anchor" id="magicurl"></a>

Before we do the usual checks, let's focus on the URL, which can contain valuable information that we may need. As I said before, many websites use the category in the URL to organize their structure.

We can do a wordcloud for the article cats that we extracted from URL

In [None]:
# Flatten every URL-derived category word into one space-separated string
# (empty entries contribute no words).
all_article_cats = ' '.join(word for cats in df['article_cats'].tolist() for word in cats.split())

In [None]:
wordcloud = WordCloud(width=800, height=400).generate(all_article_cats)

# Render the generated image on a wide figure, without axes.
fig = plt.figure(figsize=(20, 10))
ax = fig.gca()
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")

Let's ignore the word "new" and "article" for now because they have no meaning, and graph again

In [None]:
# Drop the uninformative words before regenerating the cloud.
ignored_words = {'new', 'article', 'articles'}
all_article_cats = ' '.join(s for s in all_article_cats.split(' ') if s not in ignored_words)

wordcloud = WordCloud(width=800, height=400).generate(all_article_cats)

# Render the generated image on a wide figure, without axes.
fig = plt.figure(figsize=(20, 10))
ax = fig.gca()
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")

It's quite interesting that the top words in the URL have some common grounds with the categories that we want to map. I want to try a simple method: match the words in url (excluding the article title and website) with the category mapping.

In [None]:
def check_category_article_cats(category, article_cats):
    '''
    Tell whether any word of the (underscore-separated) category also appears
    among the space-separated category words extracted from the URL.

    A few singular/alternative spellings are accepted for the category, and
    "life" + "style" in the URL counts as "lifestyle".
    '''
    # Extra spellings accepted when the plural form is part of the category.
    variants = {
        'cars': ('car',),
        'motors': ('motor', 'motoring'),
        'sports': ('sport',),
    }

    category_words = set(category.lower().split('_'))
    for plural, extras in variants.items():
        if plural in category_words:
            category_words.update(extras)

    url_words = set(article_cats.split(' '))
    if {'life', 'style'} <= url_words:
        url_words.add('lifestyle')

    return bool(category_words & url_words)

In [None]:
def check_category_article_cats_false(category, article_cats):
    '''
    Tell whether the category words extracted from the URL match a category
    OTHER than the article's own.

    category: underscore-separated label, e.g. "cars_motors".
    article_cats: space-separated category words taken from the URL.
    Returns True when at least one URL word belongs to another category.
    '''
    # 'business' and 'news' must be separate entries; they used to be fused
    # into the single string 'business, news', which no URL word could match.
    all_category = set(['fashion', 'beauty', 'lifestyle', 'sports', 'technology', 'science', 'digital', 'life', 'money', 'business', 'news',
                    'music', 'culture', 'travel', 'cars', 'motors', 'politics', 'people', 'shows'])
    category_set = set(category.lower().split('_'))
    other_category_set = all_category.difference(category_set)

    # Accept the same singular/alternative spellings as the positive check.
    if 'cars' in other_category_set:
        other_category_set.add('car')
    if 'motors' in other_category_set:
        other_category_set.add('motor')
        other_category_set.add('motoring')
    if 'sports' in other_category_set:
        other_category_set.add('sport')

    article_cat_set = set(article_cats.split(' '))

    # "life" + "style" in the URL counts as "lifestyle".
    if 'life' in article_cat_set and 'style' in article_cat_set:
        article_cat_set.add('lifestyle')
    return not other_category_set.isdisjoint(article_cat_set)

In [None]:
# Row by row: does the URL hint at the true category / at a different one?
category_url_pairs = list(zip(df['category'], df['article_cats']))
df['check_article_cats'] = [check_category_article_cats(cat, cats) for cat, cats in category_url_pairs]
df['check_article_cats_false'] = [check_category_article_cats_false(cat, cats) for cat, cats in category_url_pairs]

First, we test our theory to see if we can use matching between url and category

In [None]:
# Rows whose URL words match the true category and no other category;
# .shape is (rows, columns).
df[df['check_article_cats'] & ~df['check_article_cats_false']].shape

We can correctly categorize 2003, around 23% of data by just matching the url! How about False Positive?

In [None]:
# Rows whose URL words match only categories other than the true one.
check_df = df[~df['check_article_cats'] & df['check_article_cats_false']]

In [None]:
# (rows, columns) of the URL-mismatch subset.
check_df.shape

There are 248 articles, 2.86% that we wrongly map because the url matches one of the incorrect categories. Let's have a look at those articles

In [None]:
# Display the URL-mismatch subset for manual inspection.
check_df

Some quick look in the data shows some interesting insights into the data
- Some articles seem to be wrongly mapped: why is an article about stress in technology? Shouldn't an article about Lionel Messi's private life be in sports, or in people_shows since he is a sports star? Is the primary school admission process really about technology?
- Some categories are loosely defined. "news" category is too broad, because every article can be considered news. "cars" should be a subset of "technology". "digital_life" and "technology" can be mixed up in some ways.
- If "news" is a general category (in case we couldn't fit the article in other categories), should we remove it from the category list and just assign it if the predict probability couldn't reach a specified threshold?

Look at the website that we grabbed articles from

In [None]:
# First ten entries of the per-website article counts.
df['website_url'].value_counts()[:10]

We can see the top websites are general news site, which can contain all kind of categories. Let's skip this information for now.

## Text Content <a class="anchor" id="textcontent"></a>

Now we look at the article titles and content

In [None]:
# Display the cleaned title and text columns side by side.
df[['cleaned_title', 'cleaned_text']]

In [None]:
# Character length of every cleaned article body, then mean / max / min.
len_text = list(map(len, df['cleaned_text'].tolist()))
for summarise in (mean, max, min):
    print(summarise(len_text))

Some articles are more than 10 times longer than the average, which can make the content too general for classification. We may need to find a way to capture the essential part of the article.

# Train Model with Fasttext <a class="anchor" id="trainfasttext"></a>

I chose the Fasttext model for text classification. Fasttext is a word embedding method that is similar to word2vec. It is designed to handle rare words and words that are not in the dictionary, because it can learn prefixes and suffixes from the training dataset. We can train the word vectors on our training dataset, or download them from other sources.

In my experience, I am quite satisfied with fasttext performance for text classification. We could explore other algorithms of course, but I believe fasttext is a solid choice in beginning.

### Manually Trained Wordvec <a class="anchor" id="manualwordvec"></a>

First we will train the wordvec by fasttext from our training dataset and use it for classifier. We need to do some processing steps for url, title and text, then combine them together in one. This step can be optimized later if we want to emphasize the importance of article cats that we get from url.

Fasttext requires a CSV file without quoting, in which the last word of each line is the label of the text, prefixed with `__label__`.

In [None]:
def combine_text(data, feature_fields):
    """Build the fastText training columns from the selected feature fields.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain every column in ``feature_fields`` (string values) and a
        ``category`` column.
    feature_fields : list of str
        Columns to concatenate, in order, each followed by a space.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with three extra columns: ``Text`` (concatenated
        fields), ``Label`` ('__label__' + category) and ``TextWLabels``
        (Text, a space, then Label).
    """
    # Work on a copy so the caller's frame (often a slice of a larger frame)
    # is not modified as a side effect.
    data = data.copy()
    data["Text"] = ''
    for field in feature_fields:
        data["Text"] += data[field] + ' '
    data["Label"] = '__label__' + data["category"]
    data["TextWLabels"] = data["Text"] + ' ' + data["Label"]
    return data

def train_model(train_data, wordNgrams, lr, epoch, feature_fields, dim=100, pretrained_vectors=None):
    """Train a supervised fastText classifier on the given rows.

    The rows are turned into labelled lines with combine_text(), written to
    'training_file.csv' in the working directory, and passed to
    fasttext.train_supervised.

    wordNgrams, lr, epoch and dim are forwarded to fastText unchanged.
    pretrained_vectors is an optional path forwarded as ``pretrainedVectors``;
    when None the option is not passed at all.

    Returns the trained fastText model.
    """
    labelled = combine_text(train_data, feature_fields)
    labelled = labelled[labelled['TextWLabels'].notnull()]

    # One unquoted, headerless line per article.
    labelled[['TextWLabels']].to_csv('training_file.csv', sep='\t', encoding='utf-8', index=False, quoting=csv.QUOTE_NONE, escapechar='"', header=False)

    fasttext_options = dict(wordNgrams=wordNgrams, lr=lr, epoch=epoch, dim=dim)
    if pretrained_vectors is not None:
        fasttext_options['pretrainedVectors'] = pretrained_vectors

    return fasttext.train_supervised('training_file.csv', **fasttext_options)

def predict(model, row, feature_fields):
    """Classify one row and return (label, score) of the top prediction.

    The selected fields are concatenated (each followed by a space), newlines
    are replaced by spaces, and the text is cleaned with tokenize() before
    being passed to ``model.predict``. The '__label__' prefix is stripped
    from the returned label.

    NOTE(review): combine_text() feeds the fields to training verbatim, while
    this function re-tokenizes them, so e.g. a dotted website_url is split
    into words only at prediction time - confirm whether that is intended.
    """
    joined = ''.join(row[field] + ' ' for field in feature_fields)
    cleaned = tokenize(joined.replace('\n', ' ')).strip()

    # Top 3 predictions are requested; only the best one is used.
    labels, scores = model.predict(cleaned, k=3)

    return labels[0][len('__label__'):], scores[0]

We should do some quick parameter tuning for the 3 most important factors in fasttext: wordNgrams, learning rate and epoch. I also designed the training function to take different feature fields, to see whether we actually need all of the article features.

In [None]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is zero."""
    return numerator / denominator if denominator else 0


def evaluate_performance(temp_df, threshold=0):
    """Compute per-label precision/recall/F1 and print macro/weighted averages.

    Parameters
    ----------
    temp_df : pandas.DataFrame
        Needs the columns ``category`` (true label), ``predict_label`` and
        ``predict_value`` (score of the predicted label).
    threshold : float, default 0
        A prediction only counts when ``predict_value >= threshold``.

    Returns
    -------
    pandas.DataFrame
        One row per true label with columns
        ['label', 'precision', 'recall', 'f1', 'count'], where ``count`` is
        the number of rows with that true label. A metric whose denominator
        is zero is reported as 0.
    """
    confident = temp_df.predict_value >= threshold
    rows = []

    for label in temp_df['category'].drop_duplicates().tolist():
        is_label = temp_df.category == label
        predicted_label = (temp_df.predict_label == label) & confident

        true_positives = int((predicted_label & is_label).sum())
        support = int(is_label.sum())

        # Explicit zero-denominator handling instead of a bare `except`, so
        # genuine errors (e.g. a missing column) are no longer swallowed.
        precision = _safe_ratio(true_positives, int(predicted_label.sum()))
        recall = _safe_ratio(true_positives, support)
        f1 = _safe_ratio(2 * precision * recall, precision + recall)

        rows.append([label, precision, recall, f1, support])

    stat_df = pd.DataFrame(rows, columns = ['label', 'precision', 'recall', 'f1', 'count'])
    counts = stat_df['count'].values
    total = stat_df['count'].sum()

    macrof1 = stat_df['f1'].mean()
    weightedf1 = stat_df['f1'].values.dot(counts)/total
    macroprecision = stat_df['precision'].mean()
    weightedprecision = stat_df['precision'].values.dot(counts)/total
    macrorecall = stat_df['recall'].mean()
    weightedrecall = stat_df['recall'].values.dot(counts)/total

    print('Macro F1: {}'.format(macrof1))
    print('Weighted F1: {}'.format(weightedf1))
    print('Macro Precision: {}'.format(macroprecision))
    print('Weighted Precision: {}'.format(weightedprecision))
    print('Macro Recall: {}'.format(macrorecall))
    print('Weighted Recall: {}'.format(weightedrecall))

    return stat_df

In [None]:
# Build the modelling frame here: this cell runs before any other cell that
# defines `d`, so on a fresh kernel `d` would otherwise be undefined.
d = df.copy()
for column in ["website_url", "article_cats", "cleaned_title", "cleaned_text"]:
    d[column] = d[column].fillna('')

# Stratified 80/20 hold-out split; the fixed seed keeps it reproducible.
train_df, test_df = train_test_split(d, test_size=0.2, stratify=d.category, random_state=100)

In [None]:
# Fresh copy of the data with missing text features replaced by ''.
d = df.copy()
for text_column in ("website_url", "article_cats", "cleaned_title", "cleaned_text"):
    d[text_column] = d[text_column].fillna('')
threshold = 0

# Feature subsets to compare, each paired with the separator line printed
# after its evaluation.
feature_sets = [
    (["website_url", "article_cats", "cleaned_title", "cleaned_text"], '---------'),
    (["article_cats", "cleaned_title", "cleaned_text"], '---------'),
    (["cleaned_title", "cleaned_text"], '==============================================='),
]

# Grid search over wordNgrams x learning rate x epochs.
for wordngrams in [1, 2]:
    for lr in [0.1, 0.25, 0.5]:
        for epoch in [5, 10, 25]:
            name = f'{wordngrams}|{lr}|{epoch}'
            print(name)

            for feature_fields, separator in feature_sets:
                print(feature_fields)
                model = train_model(train_df, wordngrams, lr, epoch, feature_fields)
                predictions = test_df.apply(lambda x: predict(model, x, feature_fields), axis=1)
                test_df['predict_label'], test_df['predict_value'] = zip(*predictions)
                evaluate_performance(test_df)

                print(separator)
            

One interesting fact is that there is not much difference in overall performance when we exclude the URL features (website_url and article_cats) while training the fasttext model. This may be because we simply concatenate the text without emphasizing the importance of those features. There is also not much difference between wordngrams 1 and 2, but let's choose the best one for now: wordngrams = 1, lr = 0.5, epoch = 25, with a weighted F1 of 88.51%.

### Pretrained Wordvec <a class="anchor" id="pretrainedwordvec"></a>

We can see that the training dataset is quite small for our purpose: news classification. The articles cannot cover the whole English dictionary, so we may have problems with new words that the model has never seen. Fortunately, there are pretrained word vectors that were trained on large news sources. We can integrate them as the word embedding of our fasttext model and use them for the classifier.

For simplicity, I will use the small ones, 1 million word vectors trained on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset 

In [None]:
# create directory
pretrained_dir = Path("pretrained_wordvec")
pretrained_dir.mkdir(parents=True, exist_ok=True)

# download the pretrained wordvec
# The archive is large, so skip the download when the extracted vectors file
# (the path later passed to train_model) is already present.
zipurl = 'https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip'
pretrained_vec_path = pretrained_dir / 'wiki-news-300d-1M.vec'
if not pretrained_vec_path.exists():
    with urlopen(zipurl) as zipresp:
        with ZipFile(BytesIO(zipresp.read())) as zfile:
            zfile.extractall(pretrained_dir)

In [None]:
# Fresh copy of the data with missing text features replaced by ''.
d = df.copy()
for text_column in ("website_url", "article_cats", "cleaned_title", "cleaned_text"):
    d[text_column] = d[text_column].fillna('')
threshold = 0

# Same grid as before, but with all features and the 300-dimensional
# pretrained vectors.
feature_fields = ["website_url", "article_cats", "cleaned_title", "cleaned_text"]
pretrained_path = 'pretrained_wordvec/wiki-news-300d-1M.vec'

for wordngrams in [1, 2]:
    for lr in [0.1, 0.25, 0.5]:
        for epoch in [5, 10, 25]:
            name = f'{wordngrams}|{lr}|{epoch}'
            print(name)

            print(feature_fields)
            model = train_model(train_df, wordngrams, lr, epoch, feature_fields, dim=300, pretrained_vectors=pretrained_path)
            predictions = test_df.apply(lambda x: predict(model, x, feature_fields), axis=1)
            test_df['predict_label'], test_df['predict_value'] = zip(*predictions)
            evaluate_performance(test_df)

            print('==========================')
            

We could achieve up to 89.86% with wordngrams=1, lr=0.5, epoch=5, which is a noticeable improvement! This confirms that our dataset is still too small for news classification, and that by utilizing a dictionary from other sources we can achieve higher performance. I also tried 2 million word vectors trained on Common Crawl and got a weighted F1 score of 90.1%.

# Evaluation <a class="anchor" id="evaluation"></a>

Because the improvements are not significant, and the time to train/load the big pretrained vectors are quite high, I would choose manually trained wordvectors based on training dataset for now.

For the final evaluation of our classifier with the optimized parameters, we will run a repeated stratified K-fold so that the F1 score is not biased by one particular train/test split.

In [None]:
# Fresh copy of the data with missing text features replaced by ''.
d = df.copy()
for column in ["website_url", "article_cats", "cleaned_title", "cleaned_text"]:
    d[column] = d[column].fillna('')

k = 5
# n_splits is passed by keyword (it is keyword-only in current scikit-learn).
kfold = RepeatedStratifiedKFold(n_splits=k, n_repeats=1, random_state=100)

# NOTE(review): the tuning section does not say which feature set produced
# the chosen parameters; all four fields are assumed here - confirm.
feature_fields = ["website_url", "article_cats", "cleaned_title", "cleaned_text"]
result = []

for i, (train_index, test_index) in enumerate(kfold.split(d, d.category)):
    print("Running {}-fold".format(i))
    train_df = d.iloc[train_index]
    # Copy the fold so the prediction columns are not written into a slice of `d`.
    test_df = d.iloc[test_index].copy()

    # train_model/predict are the helpers defined in this notebook; the
    # previously referenced train_model2/predict2 do not exist.
    model = train_model(train_df, 1, 0.5, 25, feature_fields)
    test_df['predict_label'], test_df['predict_value'] = zip(*test_df.apply(lambda x: predict(model, x, feature_fields), axis=1))

    result.append(test_df)

# Out-of-fold predictions for every article, scored once; this prints the
# macro/weighted averages and returns the per-label table.
temp_df = pd.concat(result)
stat_df = evaluate_performance(temp_df)

The final Weighted F1-score is 88.66%

In [None]:
# Another factor that controls our model is the threshold value, which
# balances precision against recall (and therefore the F1-score).

In [None]:
def _ratio_or_zero(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is zero."""
    return numerator / denominator if denominator else 0


# Weighted precision / recall / F1 of the out-of-fold predictions when only
# predictions scoring at least `threshold` are counted.
threshold_data = []
labels = df['category'].drop_duplicates().tolist()
for threshold in [0, 0.1, 0.2, 0.3, 0.4, 0.5,0.6,0.7,0.8, 0.85, 0.9, 0.95]:
    confident = temp_df.predict_value >= threshold
    data = []
    for label in labels:
        is_label = temp_df.category == label
        predicted_label = (temp_df.predict_label == label) & confident

        true_positives = int((predicted_label & is_label).sum())
        number = int(is_label.sum())

        # Explicit zero-denominator handling instead of a bare `except`:
        # at high thresholds a label may have no confident prediction at all.
        precision = _ratio_or_zero(true_positives, int(predicted_label.sum()))
        recall = _ratio_or_zero(true_positives, number)
        f1 = _ratio_or_zero(2*precision*recall, precision + recall)

        data.append([label, precision, recall, f1, number])
    stat_df = pd.DataFrame(data, columns = ['label', 'precision', 'recall', 'f1', 'count'])
    weightedf1 = stat_df['f1'].values.dot(stat_df['count'].values)/stat_df['count'].sum()
    weightedprecision = stat_df['precision'].values.dot(stat_df['count'].values)/stat_df['count'].sum()
    weightedrecall = stat_df['recall'].values.dot(stat_df['count'].values)/stat_df['count'].sum()

    threshold_data.append([threshold,weightedprecision,weightedrecall,weightedf1])

threshold_df = pd.DataFrame(threshold_data, columns = ['threshold', 'precision', 'recall', 'f1'])

In [None]:
# One line per metric against the threshold, all on the same axes.
fig, ax = plt.subplots()

for metric in ('precision', 'recall', 'f1'):
    ax = threshold_df.plot(ax=ax, kind='line', x='threshold', y=metric, label=metric)

plt.legend(loc='best')
plt.show()

We can see that the model is quite stable up to a threshold value of 0.4; beyond that, precision can go up to 95% while recall goes down to 75%. The threshold should be chosen according to the scope and requirements of the application. I would choose a value of 0.5 because that is where the slope of the recall curve becomes much steeper, and we get a small boost in precision from 88% to 90%.

# Conclusion <a class="anchor" id="conclusion"></a>

In this notebook I did some analysis of the data and also trained a Machine Learning model to automatically categorize articles. There are still lots of things we need to explore and optimize.

For a news application, the word dictionary is very general, cover all kinds of topics. Taking more data will help a lot to make our model smarter, as shown when we use a general pretrained word vectors from multiple articles around internet. Of course using our data for word vectors is still better if we want to focus on some kind of topics (more about life, technology and less about wars for example). 

The categories that we defined also play a big role in our model. The current categories are still a bit confusing, and we need to consider whether we should have a hierarchical structure for them, in case we want a deeper level of understanding of the article. Some categories still overlap, and we should have a strategy to deal with those cases.

The url part that we have is also very useful. We can learn from it to better categorize our platform, as well as increase its priority in our classification by repeating multiple times before concatenating. 

The fasttext model I created is quite small and practical in production, with one small server to run the model. However it's not suitable if we want to have deeper understanding for other purpose. We may need to use other deep learning techniques and having bigger infrastructure to house it.