@dataset{
    dataset,
    author = {Timilsina, Bimal},
    year = {2021},
    month = {08},
    pages = {},
    title = {News Article Category Dataset},
}

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../input/newsarticlecategories/news-article-categories.csv')

In [3]:
df['category'] = df['category'].astype('category')

# Prepare Data

## Remove stop words

Stop words are words that do not significantly contribute to the meaning of the text. Words like 'is', 'a', and 'the' can be removed as part of the data preparation so that the categorization can focus on the words that contribute the most meaning.

To accomplish this, we'll import the Natural Language Toolkit and then download the English stop words dataset.

In [4]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stopwords = (stopwords.words('english'))

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
import re
from nltk import pos_tag, word_tokenize

def tokenize_title(title):
    tokens = word_tokenize(title)
    cleaned = []
    for t in tokens:
        if t.lower() in stopwords:
            continue
        t = re.sub(r'(?i)\d{1,3}-year-old', 'AGE', t)
        t = re.sub(r'[^A-Za-z0-9]', '', t).strip()
        t = re.sub(r'^20\d{2}$', '21STCENTURY',t)
        t = re.sub(r'^19\d{2}$', '20THCENTURY',t)
        t = re.sub(r'^\d+$', 'NUMBERCOUNT', t)
        if len(t) > 1:
            cleaned.append(t)
    return cleaned

df['tokenized'] = df['title'].map(tokenize_title)

df = df.drop(columns=['title', 'body'], axis=1).explode('tokenized')

In [6]:
categories = (df
    .groupby(['category', 'tokenized'], observed=True)
    .value_counts().to_frame('count')
)
categories = categories[categories['count'] > 2].reset_index()

In [7]:
display(categories)

Unnamed: 0,category,tokenized,count
0,ARTS & CULTURE,20THCENTURY,8
1,ARTS & CULTURE,21STCENTURY,26
2,ARTS & CULTURE,AGE,7
3,ARTS & CULTURE,Abandoned,3
4,ARTS & CULTURE,Acclaimed,3
...,...,...,...
3894,WOMEN,Year,6
3895,WOMEN,Years,3
3896,WOMEN,Young,4
3897,WOMEN,nt,14
