In [237]:
import os
import pandas as pd

In [238]:
ROOT_PATH = os.getcwd()

In [239]:
with open(f'{ROOT_PATH}/data/datastore/article_titles_plus_contents_all.txt', mode='r', encoding='utf-8') as file:
   raw_contents = file.read().splitlines()

with open(f'{ROOT_PATH}/data/target/article_categories_all.txt', mode='r', encoding='utf-8') as file:
   target = file.read().splitlines()


In [240]:
raw_df = pd.DataFrame({
    'categories': target,
    'contents': raw_contents
})
raw_df

Unnamed: 0,categories,contents
0,technology,21st-Century Sports: How Digital Technology Is...
1,business,Asian quake hits European shares Shares in Eur...
2,technology,BT offers free net phone calls BT is offering ...
3,business,Barclays shares up on merger talk Shares in UK...
4,sport,Barkley fit for match in Ireland England centr...
...,...,...
1403,sport,Woodward eyes Brennan for Lions Toulouse's for...
1404,business,WorldCom trial starts in New York The trial of...
1405,business,Yukos accused of lying to court Russian oil fi...
1406,business,Yukos drops banks from court bid Russian oil c...


In [241]:
# import nltk
# nltk.download('wordnet')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')
# nltk.download('omw-1.4')

# Tokenizer

In [242]:
# Merge stop words from ntlk and sklearn
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

stop_words = set(stopwords.words('english')).union(ENGLISH_STOP_WORDS)

In [243]:
# from sklearn.feature_extraction.text import CountVectorizer

# def sklearn_tokenizer(text):
#     tokenizer = CountVectorizer(stop_words=stop_words).build_tokenizer()
#     tokens = tokenizer(text)
#     return tokens

In [244]:
from nltk.tokenize import word_tokenize
import re

def nltk_word_tokenizer(text):
  tokens = word_tokenize(text)
  tokens_to_return = list()
  for token in tokens:
    if(re.match(r"[\w'-]+", token) and (token not in ['-',"'"]) and (token not in stop_words)):
      tokens_to_return.append(token)

  return tokens_to_return

# Stemmer

In [245]:
# Option1: nltk PorterStemmer
from nltk.stem.porter import PorterStemmer

def porter_stem_tokenizer(text):
    tokens = nltk_word_tokenizer(text.lower())
    stemmer = PorterStemmer()
    stems = list()
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems

In [246]:
# Option2: nltk SnowballStemmer
from nltk.stem.snowball import EnglishStemmer as SnowballStemmer

def snowball_stem_tokenizer(text):
    tokens = nltk_word_tokenizer(text.lower())
    stemmer = SnowballStemmer()
    stems = list()
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems

In [247]:
# Option3: nltk LancasterStemmer
from nltk.stem.lancaster import LancasterStemmer

def lancaster_stem_tokenizer(text):
    tokens = nltk_word_tokenizer(text.lower())
    stemmer = LancasterStemmer()
    stems = list()
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems

# Lemmatizer

## Lemmatizer without pos tag

In [248]:
# Option4: nltk WordNetLemmatizer without pos tag
from nltk.stem import WordNetLemmatizer

def wordnet_lemma_tokenizer(text):
    tokens = nltk_word_tokenizer(text.lower())
    lemmatizer = WordNetLemmatizer()
    lemmas = list()
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

## Lemmatizer with pos tag

In [249]:
# Option5: nltk WordNetLemmatizer with pos tag
def convert_tag(tag):
  if tag[0] == 'V':
    return 'v'
  elif tag[0] == 'J':
    return 'a'
  elif tag[0] == 'R':
    return 'r'
  else:
    return 'n'

def wordnet_lemma_pos_tokenizer(text):
  tokens = nltk_word_tokenizer(text.lower())
  tokens_with_pos_tag = nltk.pos_tag(tokens)
  lemmatizer = WordNetLemmatizer()
  lemma_tokens = list()
  for token in tokens_with_pos_tag:
    word = token[0]
    pos = convert_tag(token[1])
    lemma_tokens.append(lemmatizer.lemmatize(word, pos=pos))
  return lemma_tokens  

# Term Weighting

In [250]:
from sklearn.feature_extraction.text import TfidfVectorizer

def term_weighting(tokenizer, all_texts):
    """
    Parameters
    ----------
    tokenizer : list
        A function of stemmer or lemmatizer method

    all_text : list
        List of all contents

    Returns
    ----------
    matrix
        Weighting matrix
    """
    vectorizer = TfidfVectorizer(tokenizer=tokenizer,
                                min_df=0.01)
    term_weighted = vectorizer.fit_transform(all_texts)
    return term_weighted

text_feature = term_weighting(snowball_stem_tokenizer, raw_df['contents'].values)

In [251]:
# Concat text feature to dataframe
raw_df['weight_feature'] = text_feature.toarray().tolist()

In [252]:
raw_df

Unnamed: 0,categories,contents,weight_feature
0,technology,21st-Century Sports: How Digital Technology Is...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,business,Asian quake hits European shares Shares in Eur...,"[0.05315856074546948, 0.0, 0.0, 0.173218607356..."
2,technology,BT offers free net phone calls BT is offering ...,"[0.03556298211753117, 0.0, 0.0, 0.011036470351..."
3,business,Barclays shares up on merger talk Shares in UK...,"[0.022094298001699646, 0.0, 0.0, 0.02056996210..."
4,sport,Barkley fit for match in Ireland England centr...,"[0.03651482394555501, 0.0668501490153514, 0.0,..."
...,...,...,...
1403,sport,Woodward eyes Brennan for Lions Toulouse's for...,"[0.10142723315034727, 0.0, 0.0, 0.118036921912..."
1404,business,WorldCom trial starts in New York The trial of...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1405,business,Yukos accused of lying to court Russian oil fi...,"[0.01063187804449671, 0.0, 0.0, 0.039593442338..."
1406,business,Yukos drops banks from court bid Russian oil c...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
