In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
ROOT_PATH = os.getcwd()

In [3]:
with open(f'{ROOT_PATH}/data/datastore/article_titles_plus_contents_all.txt', mode='r', encoding='utf-8') as file:
  raw_contents = file.read().splitlines()

with open(f'{ROOT_PATH}/data/target/article_categories_all.txt', mode='r', encoding='utf-8') as file:
  target = file.read().splitlines()


In [4]:
raw_df = pd.DataFrame({
    'category': target,
    'content': raw_contents
})
raw_df

Unnamed: 0,category,content
0,technology,21st-Century Sports: How Digital Technology Is...
1,business,Asian quake hits European shares Shares in Eur...
2,technology,BT offers free net phone calls BT is offering ...
3,business,Barclays shares up on merger talk Shares in UK...
4,sport,Barkley fit for match in Ireland England centr...
...,...,...
1403,sport,Woodward eyes Brennan for Lions Toulouse's for...
1404,business,WorldCom trial starts in New York The trial of...
1405,business,Yukos accused of lying to court Russian oil fi...
1406,business,Yukos drops banks from court bid Russian oil c...


In [5]:
# import nltk
# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')
# nltk.download('omw-1.4')

# Default Tokenizer

## Define STOP_WORDS

In [6]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

NLTK_STOP_WORDS = set(stopwords.words('english'))
SKLEARN_STOP_WORDS = ENGLISH_STOP_WORDS

# Merge stop words from ntlk and sklearn
STOP_WORDS = NLTK_STOP_WORDS.union(SKLEARN_STOP_WORDS)

## Define Tokenizer

In [7]:
from nltk.tokenize import word_tokenize
import re

def word_tokenizer(text:str)->list:
  """
  Tokenize given text using NLTK's word tokenizer
  with normalizing (lowercasing string) and filtering 
  stop words, numbers and punctuation marks.

  Parameters
  ----------
  text : str
    Text to be tokenized

  Returns
  ----------
  list
    List of tokens
  """
  tokens = word_tokenize(text.lower())

  tokens_to_return = list()
  for token in tokens:
    token = token.strip("'")

    # Filter number
    if (re.match(r"^[\d.]+$", token)): 
      continue
    # Filter punctuation mark and stop word
    elif (re.match(r"[\w'-]+", token) and (token not in ['-',"'"]) 
    and (token not in STOP_WORDS)):
      tokens_to_return.append(token)

  return tokens_to_return

## Test Tokenizer

In [8]:
test_text = """It's 21-century education. 
They are Mr. and Mrs. Brown. 
While this seems like a cliché, it is true. 
6.80 pounds :; or £6.80
— em dash – en dash - hyphen"""
print("Original text:")
print(test_text)
print('-'*10)
print("NLTK's word tokenizer:")
print(word_tokenize(test_text))
print('-'*10)
print("Custom word tokenizer:")
print(word_tokenizer(test_text))

Original text:
It's 21-century education. 
They are Mr. and Mrs. Brown. 
While this seems like a cliché, it is true. 
6.80 pounds :; or £6.80
— em dash – en dash - hyphen
----------
NLTK's word tokenizer:
['It', "'s", '21-century', 'education', '.', 'They', 'are', 'Mr.', 'and', 'Mrs.', 'Brown', '.', 'While', 'this', 'seems', 'like', 'a', 'cliché', ',', 'it', 'is', 'true', '.', '6.80', 'pounds', ':', ';', 'or', '£6.80', '—', 'em', 'dash', '–', 'en', 'dash', '-', 'hyphen']
----------
Custom word tokenizer:
['21-century', 'education', 'mr.', 'mrs.', 'brown', 'like', 'cliché', 'true', 'pounds', 'em', 'dash', 'en', 'dash', 'hyphen']


# Tokenizer with Stemmer or Lemmatizer

## Tokenizer with Stemmer

In [9]:
# Option1: NLTK PorterStemmer
from nltk.stem.porter import PorterStemmer

def porter_stem_tokenizer(text:str)->list:
  """
  Tokenize given text using custom word tokenizer
  (based on NLTK word tokenizer) with Porter stemmer

  Parameters
  ----------
  text : str
    Text to be tokenized

  Returns
  ----------
  list
    List of tokens
  """
  tokens = word_tokenizer(text)
  stemmer = PorterStemmer()
  stems = list()
  for token in tokens:
    stems.append(stemmer.stem(token))
  return stems

In [10]:
# Option2: NLTK SnowballStemmer
from nltk.stem.snowball import EnglishStemmer as SnowballStemmer

def snowball_stem_tokenizer(text:str)->list:
  """
  Tokenize given text using custom word tokenizer
  (based on NLTK word tokenizer) with Snowball stemmer

  Parameters
  ----------
  text : str
    Text to be tokenized

  Returns
  ----------
  list
    List of tokens
  """
  tokens = word_tokenizer(text)
  stemmer = SnowballStemmer()
  stems = list()
  for token in tokens:
      stems.append(stemmer.stem(token))
  return stems

In [11]:
# Option3: NLTK LancasterStemmer
from nltk.stem.lancaster import LancasterStemmer

def lancaster_stem_tokenizer(text):
  """
  Tokenize given text using custom word tokenizer
  (based on NLTK word tokenizer) with Lancaster stemmer

  Parameters
  ----------
  text : str
    Text to be tokenized

  Returns
  ----------
  list
    List of tokens
  """
  tokens = word_tokenizer(text)
  stemmer = LancasterStemmer()
  stems = list()
  for token in tokens:
    stems.append(stemmer.stem(token))
  return stems

## Tokenizer with Lemmatizer

In [12]:
# Option4: NLTK WordNetLemmatizer
from nltk.stem import WordNetLemmatizer

def wordnet_lemma_tokenizer(text:str)->list:
  """
  Tokenize given text using custom word tokenizer
  (based on NLTK word tokenizer) with Wordnet lemmatizer

  Parameters
  ----------
  text : str
    Text to be tokenized

  Returns
  ----------
  list
    List of tokens
  """
  tokens = word_tokenizer(text)
  lemmatizer = WordNetLemmatizer()
  lemmas = list()
  for token in tokens:
    lemmas.append(lemmatizer.lemmatize(token))
  return lemmas

In [None]:
# Option5: NLTK WordNet Lemmatizer with POS
from nltk.tag import pos_tag

def convert_tag(tag:str)->str:
  """
  Convert part-of-speech tag to tag compatible 
  with WordNet lemmatizer.

  Parameters
  ----------
  tag : str
    Text to be tokenized

  Returns
  ----------
  str
    Part-of-speech tag compatible with WordNet lemmatizer; 
    "n" for noun, "v" for verb, "a" for adjective and "r" for adverb
  """
  if tag[0] == 'V':
    return 'v'
  elif tag[0] == 'J':
    return 'a'
  elif tag[0] == 'R':
    return 'r'
  else:
    return 'n'

def wordnet_lemma_pos_tokenizer(text:str)->list:
  """
  Tokenize given text using custom word tokenizer
  (based on NLTK word tokenizer) with Wordnet lemmatizer
  with predicting word's part-of-speech

  Parameters
  ----------
  text : str
    Text to be tokenized

  Returns
  ----------
  list
    List of tokens
  """
  tokens = word_tokenizer(text)
  lemmatizer = WordNetLemmatizer()
  lemmas = list()
  tokens_with_pos_tag = pos_tag(tokens)
  for token in tokens_with_pos_tag:
    word = token[0]
    pos = convert_tag(token[1])
    lemmas.append(lemmatizer.lemmatize(word, pos=pos))
  return lemmas

# Term Weighting

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

def term_weighting(tokenizer, all_texts):
  """
  Parameters
  ----------
  tokenizer : list
    A function of stemmer or lemmatizer method

  all_text : list
    List of all contents

  Returns
  ----------
  matrix
    Weighting matrix
  """
  vectorizer = TfidfVectorizer(tokenizer=tokenizer,
                              min_df=0.01)
  term_weighted = vectorizer.fit_transform(all_texts)
  return term_weighted

text_feature = term_weighting(snowball_stem_tokenizer, raw_df['content'].values)

In [15]:
print(text_feature)

  (0, 201)	0.14244129734799157
  (0, 309)	0.08897912300765479
  (0, 44)	0.08345938057896157
  (0, 484)	0.08003683461572883
  (0, 1696)	0.08028131500609681
  (0, 1876)	0.08789440738794584
  (0, 98)	0.1047807037525534
  (0, 498)	0.06419690876833059
  (0, 968)	0.052560483873666956
  (0, 1452)	0.07203724890216268
  (0, 2195)	0.10727149909020334
  (0, 465)	0.09869699941605765
  (0, 1052)	0.09543712593368747
  (0, 1299)	0.10557881047868073
  (0, 2092)	0.07751904844067588
  (0, 1053)	0.0666303165043191
  (0, 1585)	0.06967636290471924
  (0, 159)	0.08587885268463721
  (0, 680)	0.10640822338111686
  (0, 865)	0.09151970527048509
  (0, 2256)	0.042679253220998
  (0, 1778)	0.06550051750294215
  (0, 2272)	0.032085422199505706
  (0, 1686)	0.08403879540541624
  (0, 540)	0.06908889962213145
  :	:
  (1407, 1501)	0.22675670283303906
  (1407, 443)	0.16195944583138722
  (1407, 1758)	0.08868296030521729
  (1407, 903)	0.059096269539404826
  (1407, 1910)	0.0952790086907031
  (1407, 775)	0.09589106355548772
  (

# Modeling

In [18]:
X = text_feature
Y = raw_df['category']

In [100]:
# Define Model
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

models = []
models.append(RandomForestClassifier())
models.append(KNeighborsClassifier())

In [117]:
from sklearn.model_selection import cross_val_score

def model_testing(models, feature, target):
    """
    Parameters
    ----------
    models : list
        List of sklearn's model

    feature : matrix
        Weighting matrix

    target : pandas series
        Target of data

    Returns
    ----------
    pandas dataframe
        Result of cross validation with mean score
    """

    cv_result = {
        "model_name": [],
        "score": []
    }
    X = feature
    Y = target
    for model in models:
        cv_score = cross_val_score(model, X, Y, cv=5)
        cv_result["model_name"].append(str(model)[:-2])
        cv_result["score"].append(cv_score.mean())

    result_df = pd.DataFrame(cv_result).sort_values(by=['score'], ascending=False)
    return result_df

In [145]:
model_test_result = model_testing(models, X, Y)
model_test_result

Unnamed: 0,model_name,score
1,RandomForestClassifier,0.969453
0,KNeighborsClassifier,0.965907


# Hyperparameter Tuning

In [138]:
# define parameter
params ={
    "knn_params":{
        'n_neighbors' : list(range(1, 16))
    },
    "r2f_params":{
        'max_depth': list(range(2, 21, 2)),
        'min_samples_split': list(range(2, 21, 2))
    }
}

In [143]:
from sklearn.model_selection import GridSearchCV

def parameter_tuning(models, params, feature, target):
    """
    Parameters
    ----------
    models : list
        List of sklearn's model

    params : dict
        Dict of parameters depending on model

    feature : matrix
        Weighting matrix

    target : pandas series
        Target of data

    Returns
    ----------
    pandas dataframe
        Result of GridSearchCV hyperparameter-tuning with best parameter and best score
    """
    tuning_result = {
        "model_name": [],
        "best_parameter": [],
        "best_score": []
    }
    X = feature
    Y = target
    for model, param in zip(models, params.values()):
        clf = GridSearchCV(model, param, cv=5, n_jobs=-1, verbose=1) # 5 fold, n_jobs=-1 (use all core of processors)
        result = clf.fit(X, Y)
        tuning_result['model_name'].append(str(model)[:-2])
        tuning_result['best_score'].append(result.best_score_)
        tuning_result['best_parameter'].append(result.best_params_)

    tuning_result = pd.DataFrame(tuning_result).sort_values(by=['best_score'], ascending=False)


    return tuning_result

In [146]:
model_tuning_result = parameter_tuning(models, params, X, Y)
model_tuning_result

Fitting 5 folds for each of 15 candidates, totalling 75 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits


Unnamed: 0,model_name,best_parameter,best_score
1,RandomForestClassifier,"{'max_depth': 18, 'min_samples_split': 4}",0.974433
0,KNeighborsClassifier,{'n_neighbors': 7},0.968751


In [160]:
compare_result = pd.merge(model_test_result,
                            model_tuning_result,
                            how='inner',
                            on='model_name')[['model_name', 'score', 'best_score', 'best_parameter']]
compare_result['increase'] = compare_result['best_score'] - compare_result['score']
compare_result

Unnamed: 0,model_name,score,best_score,best_parameter,increase
0,RandomForestClassifier,0.969453,0.974433,"{'max_depth': 18, 'min_samples_split': 4}",0.00498
1,KNeighborsClassifier,0.965907,0.968751,{'n_neighbors': 7},0.002844
