# Preprocessing text data with TF-IDF

- TF-IDF with a context d in D (corpus):

$r_d = [tf-idf(w_1, d, D), tf-idf(w_2, d, D), ..., tf-idf(w_{|V|}, d, D)]$

with, $r_d \in R^{|V|}$ is a vector $|V|$ dims and $V = {w_i}$ is a dictionary (all words appear in $D$) respect to $D$

- Inside:

$tf-idf(w_i, d, D) = tf(w_i, d) * idf(w_i, D)$

with,

$tf(w_i, d) = \dfrac{f(w_i, d)}{max(f(w_j, d): w_j \in V)}$

$idf(w_i, D) = log_{10}^{\dfrac{|D|}{|d' \in D: w_i \in d'|}}$

- Identify dictionary V:

  - With each context $d$ in $D$:
    - Separate d to some word by punctuation, then collect $W_d$
    - Delete stop words from $W_d$
    - Convert word to original (stemming), then collect $W_d$
  - Finally:
    $V = $ Intersection of $W_d$ with $d \in D$

# 0. TF-IDF Tutorial

## 0.1. Processing Data

In [76]:
# Module Path
import os
# Module Stemming
from nltk.stem import PorterStemmer
# Other lib
import pandas as pd 
import numpy as np
import math 
import re

In [141]:
# Init data
sentence_1 = "Data Science is the sexiest job of the 21st century, Data Scientist is my dream"
sentence_2 = "Machine Learning is the key for Data Science, Machine Learning is my life"

# Process data 
sentence_1, sentence_2 = sentence_1.lower().split(), sentence_2.lower().split()
sentence_1n2 = set(sentence_1).union(sentence_2)

print(sentence_1, sentence_2, sentence_1n2, sep = "\n")

['data', 'science', 'is', 'the', 'sexiest', 'job', 'of', 'the', '21st', 'century,', 'data', 'scientist', 'is', 'my', 'dream']
['machine', 'learning', 'is', 'the', 'key', 'for', 'data', 'science,', 'machine', 'learning', 'is', 'my', 'life']
{'data', 'learning', 'for', 'the', 'job', 'my', 'science,', 'scientist', 'of', 'science', 'dream', '21st', 'is', 'sexiest', 'life', 'century,', 'key', 'machine'}


In [142]:
# Download file stopwords
import nltk
nltk.download('stopwords')         
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Filter sentence by stopwords with module ntlk
ft_sentence_1 = [word for word in sentence_1 if word not in stop_words]
ft_sentence_2 = [word for word in sentence_2 if word not in stop_words]
ft_sentence_1n2 = [word for word in sentence_1n2 if word not in stop_words]

print(ft_sentence_1, ft_sentence_2, ft_sentence_1n2, sep = "\n")

['data', 'science', 'sexiest', 'job', '21st', 'century,', 'data', 'scientist', 'dream']
['machine', 'learning', 'key', 'data', 'science,', 'machine', 'learning', 'life']
['data', 'learning', 'job', 'science,', 'scientist', 'science', 'dream', '21st', 'sexiest', 'life', 'century,', 'key', 'machine']


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/charles/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [143]:
# dict.fromkeys: create  dictionary with keys from iterable and values set to value.
dictA, dictB = dict.fromkeys(ft_sentence_1n2, 0), dict.fromkeys(ft_sentence_1n2, 0)
# Check element for each sentence 
for _ in ft_sentence_1:
    dictA[_] = dictA.get(_, 0) + 1 
for _ in ft_sentence_2:
    dictB[_] = dictB.get(_, 0) + 1 

# Create DF
df = pd.DataFrame([dictA, dictB])
df

Unnamed: 0,data,learning,job,"science,",scientist,science,dream,21st,sexiest,life,"century,",key,machine
0,2,0,1,0,1,1,1,1,1,0,1,0,0
1,1,2,0,1,0,0,0,0,0,1,0,1,2


## 0.2. TF-IDF

In [146]:
# Compute TF-IDF
def compute_tfidf(word_dict):
    # Compute TF 
    def compute_tf(word_dict):
        tf_dict = {}
        for key, val in word_dict.items():
            tf_dict[key] = word_dict[key] / max(word_dict.values())
        return tf_dict
    
    # Compute IDF 
    def compute_idf(word_dict):
        # |D| is number of elements D (chose dict)
        N = len(word_dict)
        idf_dict = {}
        for key, val in word_dict.items():
            if val != 0:
                idf_dict[key] = math.log10(N / val)
        return idf_dict   
    
    # Compute TF-IDF
    tf_dict, idf_dict, tfidf_dict = compute_tf(word_dict), compute_idf(word_dict), {}
    for key_tf, val_tf in tf_dict.items():
        for key_idf, val_idf in idf_dict.items():
            if key_idf == key_tf:
                tfidf_dict[key_tf] = val_tf * val_idf
    return tfidf_dict

# Convert DF with TF-IDF
compute_tfidf(dictA)

{'data': 0.8129133566428556,
 'job': 0.5569716761534184,
 'scientist': 0.5569716761534184,
 'science': 0.5569716761534184,
 'dream': 0.5569716761534184,
 '21st': 0.5569716761534184,
 'sexiest': 0.5569716761534184,
 'century,': 0.5569716761534184}

# 1. Read & Gather Data 

In [80]:
# Module Path
import os
# Module Stemming
from nltk.stem import PorterStemmer

In [81]:
def gather_data(path):
    # Get list dir of folder & news_group
    # listdir(): get the list of all files and directories in the specified directory. 
    dirs = [path + dir_name + "/"
            for dir_name in os.listdir(path)
                if not os.path.isfile(path + dir_name)]
    # Assign folder train & test dir
    train_dir, test_dir = (dirs[0], dirs[1]) if "train" in dirs else (dirs[1], dirs[0])
    # Crawl news group
    list_newsgroup = [news for news in os.listdir(train_dir)]
    list_newsgroup.sort()
    
    # Create Dictionary
    # Read stop words
    with open("/Users/charles/MLGT/SESSION 1/Data/stop_word") as f:
        stop_words = f.read().splitlines()
    # Stemming data 
    

path = "/Users/charles/MLGT/SESSION 1/Data/20news-bydate/" 
gather_data(path)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/charles/MLGT/SESSION 1/Data/stop_word'