In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# All data paths below are built relative to the current working directory.
ROOT_PATH: str = os.getcwd()

In [3]:
contents_path = f'{ROOT_PATH}/data/datastore/article_titles_plus_contents_all.txt'
categories_path = f'{ROOT_PATH}/data/target/article_categories_all.txt'

# Every line of the contents file becomes one entry of raw_contents.
with open(contents_path, mode='r', encoding='utf-8') as file:
  raw_contents = file.read().splitlines()

# Every line of the categories file becomes one entry of target;
# presumably line i labels article i -- verify against the data source.
with open(categories_path, mode='r', encoding='utf-8') as file:
  target = file.read().splitlines()


In [8]:
# Pair each label with its text, position by position, in a two-column frame.
raw_df = pd.DataFrame(
    data={'category': target, 'content': raw_contents},
    columns=['category', 'content'],
)
raw_df

Unnamed: 0,category,content
0,technology,21st-Century Sports: How Digital Technology Is...
1,business,Asian quake hits European shares Shares in Eur...
2,technology,BT offers free net phone calls BT is offering ...
3,business,Barclays shares up on merger talk Shares in UK...
4,sport,Barkley fit for match in Ireland England centr...
...,...,...
1403,sport,Woodward eyes Brennan for Lions Toulouse's for...
1404,business,WorldCom trial starts in New York The trial of...
1405,business,Yukos accused of lying to court Russian oil fi...
1406,business,Yukos drops banks from court bid Russian oil c...


In [5]:
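# One-time setup: uncomment and run once to download the NLTK resources
# used by the tokenizers, stop-word list, POS tagger and lemmatizer below.
# import nltk
# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')
# nltk.download('omw-1.4')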

# Default Tokenizer

## Define STOP_WORDS

In [68]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# English stop-word collections from each library
NLTK_STOP_WORDS = set(stopwords.words('english'))
SKLEARN_STOP_WORDS = ENGLISH_STOP_WORDS

# Merge stop words from NLTK and scikit-learn into a single set
STOP_WORDS = NLTK_STOP_WORDS | SKLEARN_STOP_WORDS

## Define Tokenizer

In [128]:
from nltk.tokenize import word_tokenize
import re

def word_tokenizer(text:str)->list:
  """
  Split text into lowercase tokens with NLTK's word tokenizer
  and keep only the tokens that look like meaningful words.

  A token is dropped when, after stripping surrounding apostrophes, it
  - consists solely of digits and dots (numbers),
  - does not start with a word character, apostrophe or hyphen
    (punctuation marks and symbols),
  - is a lone hyphen, or
  - is listed in STOP_WORDS.

  Parameters
  ----------
  text : str
    Text to be tokenized

  Returns
  ----------
  list
    Filtered tokens, in their original order
  """
  kept = []
  for raw_token in word_tokenize(text.lower()):
    candidate = raw_token.strip("'")

    # Numbers such as "42" or "6.80"
    if re.match(r"^[\d.]+$", candidate):
      continue
    # re.match only anchors at the start, so a token passes as soon as its
    # first character is acceptable (e.g. "mr." is kept with its dot).
    if not re.match(r"[\w'-]+", candidate):
      continue
    if candidate in ['-',"'"] or candidate in STOP_WORDS:
      continue

    kept.append(candidate)

  return kept

## Test Tokenizer

In [126]:
# Sample covering contractions, hyphenated words, abbreviations,
# accented characters, numbers, currency and the various dash characters.
test_text = """It's 21-century education. 
They are Mr. and Mrs. Brown. 
While this seems like a cliché, it is true. 
6.80 pounds :; or £6.80
— em dash – en dash - hyphen"""
divider = '-'*10

print("Original text:")
print(test_text)
print(divider)

# Raw NLTK output, for comparison with the filtered tokens below
print("NLTK's word tokenizer:")
print(word_tokenize(test_text))
print(divider)

print("Custom word tokenizer:")
print(word_tokenizer(test_text))

Original text:
It's 21-century education. 
They are Mr. and Mrs. Brown. 
While this seems like a cliché, it is true. 
6.80 pounds :; or £6.80
— em dash – en dash - hyphen
----------
NLTK's word tokenizer:
['It', "'s", '21-century', 'education', '.', 'They', 'are', 'Mr.', 'and', 'Mrs.', 'Brown', '.', 'While', 'this', 'seems', 'like', 'a', 'cliché', ',', 'it', 'is', 'true', '.', '6.80', 'pounds', ':', ';', 'or', '£6.80', '—', 'em', 'dash', '–', 'en', 'dash', '-', 'hyphen']
----------
Custom word tokenizer:
['21-century', 'education', 'mr.', 'mrs.', 'brown', 'like', 'cliché', 'true', 'pounds', 'em', 'dash', 'en', 'dash', 'hyphen']


# Tokenizer with Stemmer or Lemmatizer

## Tokenizer with Stemmer

In [130]:
# Option1: NLTK PorterStemmer
from nltk.stem.porter import PorterStemmer

def porter_stem_tokenizer(text:str)->list:
  """
  Run the custom word tokenizer on the text and reduce
  each resulting token to its Porter stem.

  Parameters
  ----------
  text : str
    Text to be tokenized

  Returns
  ----------
  list
    Stemmed tokens, in their original order
  """
  stemmer = PorterStemmer()
  return [stemmer.stem(token) for token in word_tokenizer(text)]

In [129]:
# Option2: NLTK SnowballStemmer
from nltk.stem.snowball import EnglishStemmer as SnowballStemmer

def snowball_stem_tokenizer(text:str)->list:
  """
  Run the custom word tokenizer on the text and reduce
  each resulting token to its (English) Snowball stem.

  Parameters
  ----------
  text : str
    Text to be tokenized

  Returns
  ----------
  list
    Stemmed tokens, in their original order
  """
  stemmer = SnowballStemmer()
  return [stemmer.stem(token) for token in word_tokenizer(text)]

In [132]:
# Option3: NLTK LancasterStemmer
from nltk.stem.lancaster import LancasterStemmer

def lancaster_stem_tokenizer(text:str)->list:
  """
  Run the custom word tokenizer on the text and reduce
  each resulting token to its Lancaster stem.

  Parameters
  ----------
  text : str
    Text to be tokenized

  Returns
  ----------
  list
    Stemmed tokens, in their original order
  """
  stemmer = LancasterStemmer()
  return [stemmer.stem(token) for token in word_tokenizer(text)]

## Tokenizer with Lemmatizer

In [134]:
# Option4: NLTK WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer

def convert_tag(tag:str)->str:
  """
  Convert part-of-speech tag to tag compatible 
  with WordNet lemmatizer.

  Only the first character of the tag is inspected: "V" maps to verb,
  "J" to adjective and "R" to adverb. Every other tag, including an
  empty one, falls back to noun.

  Parameters
  ----------
  tag : str
    Part-of-speech tag to convert (e.g. "VBD", "JJ", "RB", "NN")

  Returns
  ----------
  str
    Part-of-speech tag compatible with WordNet lemmatizer; 
    "n" for noun, "v" for verb, "a" for adjective and "r" for adverb
  """
  wordnet_tag_by_initial = {'V': 'v', 'J': 'a', 'R': 'r'}
  # Slice instead of indexing so an empty tag yields '' (-> noun default)
  # rather than raising IndexError.
  return wordnet_tag_by_initial.get(tag[:1], 'n')

def wordnet_lemma_tokenizer(text:str, with_pos:bool=False)->list:
  """
  Run the custom word tokenizer on the text and lemmatize
  each resulting token with the WordNet lemmatizer.

  Parameters
  ----------
  text : str
    Text to be tokenized
  with_pos : bool, default False
    When True, tokens are POS-tagged first and each tag (converted with
    convert_tag) is passed to the lemmatizer; when False, the lemmatizer
    is called without a part-of-speech argument.

  Returns
  ----------
  list
    Lemmatized tokens, in their original order
  """
  tokens = word_tokenizer(text)
  lemmatizer = WordNetLemmatizer()

  if not with_pos:
    return [lemmatizer.lemmatize(token) for token in tokens]

  # pos_tag yields (word, tag) pairs for the already-filtered tokens
  return [
    lemmatizer.lemmatize(word, pos=convert_tag(tag))
    for word, tag in pos_tag(tokens)
  ]

# Term Weighting

In [76]:
from sklearn.feature_extraction.text import TfidfVectorizer

def term_weighting(tokenizer, all_texts, min_df=0.01):
  """
  Fit a TF-IDF vectorizer on the given texts and return
  the resulting weighted document-term matrix.

  Parameters
  ----------
  tokenizer : callable
    Function taking one text (str) and returning its list of tokens,
    e.g. one of the stemmer or lemmatizer tokenizers

  all_texts : iterable of str
    All contents to fit the vectorizer on and to transform

  min_df : float or int, default 0.01
    Passed through to TfidfVectorizer as its minimum document
    frequency cut-off

  Returns
  ----------
  matrix
    TF-IDF weighting matrix as returned by
    TfidfVectorizer.fit_transform
  """
  # token_pattern is unused once a custom tokenizer is supplied; passing
  # None avoids scikit-learn's "token_pattern will not be used" warning.
  vectorizer = TfidfVectorizer(tokenizer=tokenizer,
                              token_pattern=None,
                              min_df=min_df)
  return vectorizer.fit_transform(all_texts)

# raw_df's text column is named 'content' (singular); 'contents' raises KeyError.
text_feature = term_weighting(snowball_stem_tokenizer, raw_df['content'].values)

In [78]:
# Show the stored TF-IDF entries as "(row, column)  weight" lines.
print(text_feature)

  (0, 286)	0.14244186559641372
  (0, 394)	0.08897947797666445
  (0, 129)	0.08345971352777776
  (0, 569)	0.0800371539108057
  (0, 1783)	0.08028163527649192
  (0, 1963)	0.08789475802964346
  (0, 183)	0.104781121759625
  (0, 583)	0.06419716487237341
  (0, 1053)	0.052560693555919115
  (0, 1538)	0.0720375362840792
  (0, 2283)	0.10727192703393339
  (0, 550)	0.09869739315309342
  (0, 1137)	0.09543750666594165
  (0, 1385)	0.1055792316696807
  (0, 2179)	0.07751935769141835
  (0, 1138)	0.06663058231608103
  (0, 1671)	0.06967664086823136
  (0, 244)	0.0858791952855784
  (0, 765)	0.10640864788093655
  (0, 950)	0.09152007037477075
  (0, 2344)	0.04267942348354652
  (0, 1865)	0.06550077880753861
  (0, 2360)	0.03208555019953715
  (0, 1773)	0.08403913066572191
  (0, 625)	0.06908917524204597
  :	:
  (1407, 1845)	0.08605860434921996
  (1407, 988)	0.05734745954919708
  (1407, 1997)	0.09245945876726355
  (1407, 860)	0.09305340136082642
  (1407, 1437)	0.09397275350262445
  (1407, 1586)	0.10714311935072668
  

# Modeling

In [105]:
# Features are the TF-IDF matrix; labels come from raw_df's 'category'
# column (singular) -- 'categories' does not exist and raises KeyError.
X = text_feature
Y = raw_df['category']

In [106]:
from sklearn.model_selection import train_test_split

# Hold out the last 33% of the rows as the test set.
# NOTE(review): with shuffle=False the split is purely positional, so
# random_state has no effect here -- confirm an unshuffled hold-out is intended.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=7,
    shuffle=False,
)

In [107]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so the fitted model, and the metrics reported below,
# are reproducible across kernel restarts.
clf = RandomForestClassifier(random_state=7)
clf.fit(X_train, Y_train)


RandomForestClassifier()

In [108]:
from sklearn.metrics import classification_report

# Score the fitted classifier on the held-out rows and print
# per-class precision, recall and F1.
pred = clf.predict(X_test)
print(classification_report(y_true=Y_test, y_pred=pred))


              precision    recall  f1-score   support

    business       0.94      0.97      0.96       158
       sport       0.98      0.98      0.98       172
  technology       0.98      0.93      0.95       135

    accuracy                           0.96       465
   macro avg       0.96      0.96      0.96       465
weighted avg       0.96      0.96      0.96       465

