# Preprocessing text data with TF-IDF

- TF-IDF representation of a document $d$ in the corpus $D$:

$r_d = [\text{tf-idf}(w_1, d, D), \text{tf-idf}(w_2, d, D), \ldots, \text{tf-idf}(w_{|V|}, d, D)]$

where $r_d \in \mathbb{R}^{|V|}$ is a $|V|$-dimensional vector and $V = \{w_i\}$ is the dictionary (vocabulary) built from $D$, i.e. the set of words that appear in $D$.

- Each component is defined as:

$\text{tf-idf}(w_i, d, D) = \text{tf}(w_i, d) \cdot \text{idf}(w_i, D)$

where

$\text{tf}(w_i, d) = \dfrac{f(w_i, d)}{\max\{f(w_j, d) : w_j \in V\}}$

$\text{idf}(w_i, D) = \log_{10} \dfrac{|D|}{|\{d' \in D : w_i \in d'\}|}$

- Building the dictionary $V$:

  - For each document $d$ in $D$:
    - Split $d$ into words at punctuation (non-word characters) to obtain $W_d$
    - Remove stop words from $W_d$
    - Reduce each remaining word to its stem (stemming) and update $W_d$
  - Finally:
    $V = \bigcup_{d \in D} W_d$ (the union of $W_d$ over all $d \in D$)

# 1. Gather Data 

In [56]:
# Module Path
import os

# Module Stemming
from nltk.stem import PorterStemmer

# defaultdict avoids KeyError by creating a default value for any key that has not been set yet.
# A plain dict can achieve the same for lookups with a default: d.get(key, 0)
# https://bit.ly/3BWCX0E
# [[a,b]] / [(a,b)] -> dict/default() -> 2 var in pack
from collections import defaultdict

# Other lib
import pandas as pd 
import numpy as np
import math 
import re

In [47]:
# Delete .DS_Store file (only for Mac)
# cd /Users/charles/MLGT/SESSION_1
# find . -name ".DS_Store" -print -delete

In [48]:
def gather_data(path):
    """Tokenise, stop-word-filter and stem every document under ``path``.

    ``path`` must contain:
      * a sub-folder whose name contains "train" and one whose name
        contains "test"; each sub-folder of the train folder is treated as
        a newsgroup, and the same newsgroup names are read from the test
        folder;
      * a ``stop_word.txt`` file with one stop word per line.

    The label of a document is the index of its newsgroup in the sorted
    list of newsgroup names. Each document becomes one record of the form
    ``label<fff>filename<fff>content`` where ``content`` is the
    space-joined list of stemmed, lower-cased, non-stop-word tokens.

    Three files are written into ``path`` (one record per line):
    ``20news_train_processed.txt``, ``20news_test_processed.txt`` and
    ``20news_full_processed.txt`` (train records followed by test records).

    Args:
        path: Root folder of the dataset, with or without a trailing
            path separator.

    Raises:
        ValueError: If no "train" or no "test" sub-folder can be found.
    """
    # Only directories are candidates; sorting makes the choice independent
    # of the arbitrary order returned by os.listdir().
    subdirs = sorted(name for name in os.listdir(path)
                     if os.path.isdir(os.path.join(path, name)))
    # Match on the folder *name*: testing `"train" in dirs` against a list of
    # full paths is an exact-membership test and can never succeed.
    train_names = [name for name in subdirs if "train" in name]
    test_names = [name for name in subdirs if "test" in name]
    if not train_names or not test_names:
        raise ValueError(
            "Expected a 'train' and a 'test' sub-folder in " + repr(path)
            + ", found: " + repr(subdirs))
    train_dir = os.path.join(path, train_names[0])
    test_dir = os.path.join(path, test_names[0])

    # Newsgroups are the sub-folders of the train folder; stray files
    # (e.g. .DS_Store) are ignored so they cannot become bogus labels.
    list_newsgroup = sorted(name for name in os.listdir(train_dir)
                            if os.path.isdir(os.path.join(train_dir, name)))

    # A set gives O(1) membership tests; this check runs once per token.
    with open(os.path.join(path, "stop_word.txt")) as f:
        stop_words = set(f.read().splitlines())
    stemmer = PorterStemmer()
    # \W+ matches runs of non-word characters; compiled once for all files.
    token_splitter = re.compile(r'\W+')

    def collect_data_from(parent_dir, newsgroup_list):
        """Return one ``label<fff>filename<fff>content`` record per file."""
        data = []
        for label, newsgroup in enumerate(newsgroup_list):
            dir_path = os.path.join(parent_dir, newsgroup)
            # Skip hidden files (e.g. .DS_Store); they are not documents.
            filenames = sorted(
                name for name in os.listdir(dir_path)
                if not name.startswith('.')
                and os.path.isfile(os.path.join(dir_path, name)))

            for filename in filenames:
                # errors='ignore' drops byte sequences that cannot be
                # decoded instead of raising UnicodeDecodeError.
                with open(os.path.join(dir_path, filename),
                          errors='ignore') as f:
                    text = f.read().lower()
                # Drop the empty tokens re.split yields at the text
                # boundaries, remove stop words, then stem.
                words = [stemmer.stem(word)
                         for word in token_splitter.split(text)
                         if word and word not in stop_words]
                content = ' '.join(words)
                # A record must fit on one line. Empty content (a document
                # made only of stop words) is legal, hence "<= 1".
                assert len(content.splitlines()) <= 1
                data.append('<fff>'.join((str(label), filename, content)))
        return data

    data_train = collect_data_from(train_dir, list_newsgroup)
    data_test = collect_data_from(test_dir, list_newsgroup)
    data_full = data_train + data_test

    outputs = (
        ("20news_train_processed.txt", data_train),
        ("20news_test_processed.txt", data_test),
        ("20news_full_processed.txt", data_full),
    )
    for out_name, records in outputs:
        with open(os.path.join(path, out_name), "w") as f:
            f.write("\n".join(records))

# Root folder of the 20news-bydate dataset (holds the train/test folders and
# stop_word.txt). Running gather_data writes the three *_processed.txt files
# into this folder.
path = "/Users/charles/MLGT/SESSION_1/Data/20news-bydate/" 
gather_data(path)

# 2. Preprocessing Text

In [60]:
# Sanity check: load the combined (train + test) processed file and print
# the number of documents it contains.
with open("/Users/charles/MLGT/SESSION_1/Data/20news-bydate/20news_full_processed.txt", errors="ignore") as f:
    doc = f.read().splitlines()
    print(len(doc))
# Each line is one document encoded as label<fff>doc_id<fff>text;
# display the first 30 characters of the first record.
doc[0][:30]

18846


'0<fff>49960<fff>mathew mathew '

In [57]:
# Compute IDF 
def compute_idf(doc_freq, corpus_size):
    """Return the base-10 inverse document frequency of a term.

    Args:
        doc_freq: Number of documents containing the term; must be > 0.
        corpus_size: Total number of documents in the corpus.

    Returns:
        log10(corpus_size / doc_freq).
    """
    assert doc_freq > 0
    ratio = (corpus_size * 1.) / doc_freq
    return np.log10(ratio)

In [61]:
# Generate Vocab (word, idf): abc<fff>7.8230203023421
def generate_vocabulary(
        data_path,
        output_path="/Users/charles/MLGT/SESSION_1/Data/20news-bydate/words_idfs.txt"):
    """Build the vocabulary with IDF weights and write it to ``output_path``.

    Each line of ``data_path`` is one document whose last ``<fff>``-separated
    field is its space-separated text. A word is kept when it occurs in more
    than 10 documents and is not made only of digits. The output file has
    one ``word<fff>idf`` line per kept word, ordered by decreasing IDF.

    Args:
        data_path: Path of the processed corpus file.
        output_path: Where to write the vocabulary. Defaults to the location
            this function originally had hard-coded.
    """
    with open(data_path, errors='ignore') as f:
        lines = f.read().splitlines()
    corpus_size = len(lines)

    # doc_count[word] = number of documents containing `word` at least once.
    doc_count = defaultdict(int)
    for line in lines:
        text = line.split('<fff>')[-1]
        # set() so a word is counted once per document.
        for word in set(text.split()):
            doc_count[word] += 1

    words_idfs = [(word, compute_idf(document_freq, corpus_size))
                  for word, document_freq in doc_count.items()
                  if document_freq > 10 and not word.isdigit()]
    # Highest IDF first. Ties are broken by the word itself: without this the
    # order of equal-IDF words follows set iteration order, which changes
    # between interpreter runs because string hashing is randomized.
    words_idfs.sort(key=lambda word_idf: (-word_idf[1], word_idf[0]))

    with open(output_path, 'w') as f:
        f.write('\n'.join(word + '<fff>' + str(idf)
                          for word, idf in words_idfs))
    
# Build the vocabulary from the combined (train + test) processed corpus;
# the result is written to words_idfs.txt.
generate_vocabulary("/Users/charles/MLGT/SESSION_1/Data/20news-bydate/20news_full_processed.txt")

In [68]:
# TF-IDF
def get_tf_idf(
        data_path,
        idf_path="/Users/charles/MLGT/SESSION_1/Data/20news-bydate/words_idfs.txt"):
    """Compute L2-normalised TF-IDF vectors for every document in a file.

    Args:
        data_path: Processed corpus file, one ``label<fff>doc_id<fff>text``
            record per line (``label`` and ``doc_id`` must parse as int).
        idf_path: Vocabulary file with one ``word<fff>idf`` record per line.
            The zero-based line index of a word is its ID. Defaults to the
            location this function originally had hard-coded.

    Returns:
        A list of ``(label, doc_id, sparse_rep)`` tuples in file order, where
        ``sparse_rep`` is a space-separated string of ``word_id:value``
        entries sorted by ``word_id``. A document that contains no
        vocabulary word gets an empty ``sparse_rep``.
    """
    with open(idf_path, errors='ignore') as f:
        words_idfs = []
        for line in f.read().splitlines():
            fields = line.split('<fff>')
            words_idfs.append((fields[0], float(fields[1])))
    words_IDs = {word: index for index, (word, _) in enumerate(words_idfs)}
    idfs = dict(words_idfs)

    with open(data_path, errors='ignore') as f:
        lines = f.read().splitlines()

    data_tf_idf = []
    for line in lines:
        fields = line.split('<fff>')
        label, doc_id, text = int(fields[0]), int(fields[1]), fields[2]

        # Single pass count of in-vocabulary words (instead of calling
        # list.count once per distinct word, which is quadratic).
        term_freqs = defaultdict(int)
        for word in text.split():
            if word in idfs:
                term_freqs[word] += 1

        # No vocabulary word at all: max() below would raise on an empty
        # sequence, so emit an empty vector for this document.
        if not term_freqs:
            data_tf_idf.append((label, doc_id, ''))
            continue

        max_term_freq = max(term_freqs.values())
        # Sorted by word ID so the output does not depend on hash order.
        words_tfidfs = sorted(
            (words_IDs[word], term_freq * (1. / max_term_freq) * idfs[word])
            for word, term_freq in term_freqs.items())

        sum_squares = sum(value ** 2 for _, value in words_tfidfs)
        # All weights are zero when every present word has idf == 0;
        # divide by 1 then so the entries stay 0.0 instead of becoming NaN.
        norm = np.sqrt(sum_squares) if sum_squares > 0 else 1.0

        sparse_rep = ' '.join(str(index) + ':' + str(value / norm)
                              for index, value in words_tfidfs)
        data_tf_idf.append((label, doc_id, sparse_rep))
    return data_tf_idf

# Compute TF-IDF vectors for the combined (train + test) processed corpus.
# data_tf_idf is a list of (label, doc_id, sparse_rep) tuples.
data_tf_idf = get_tf_idf("/Users/charles/MLGT/SESSION_1/Data/20news-bydate/20news_full_processed.txt")

In [103]:
# Save the TF-IDF vectors, one document per line, encoded as
# label<fff>doc_id<fff>sparse_rep
with open("/Users/charles/MLGT/SESSION_1/Data/20news-bydate/data_tf_idf.txt", 'w+') as f:
    result = ["<fff>".join((str(label), str(doc_id), sparse_rep))
              for label, doc_id, sparse_rep in data_tf_idf]
    f.write('\n'.join(result))