## Introduction to TF-IDF

https://www.analyticsvidhya.com/blog/2021/09/creating-a-movie-reviews-classifier-using-tf-idf-in-python/

**Term Frequency**

Term frequency (TF) is a measure of how often a word w occurs in a document (text) d. It is equal to the number of instances of word w in document d divided by the total number of words in document d: $\mathrm{TF}(w, d) = \mathrm{count}(w, d) / |d|$. Term frequency thus expresses a word’s occurrence in a document relative to the total number of words in that document.

**Inverse Document Frequency (IDF)**

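This parameter gives a numeric value for the importance of a word across the corpus. The inverse document frequency of word w is defined as the logarithm of the total number of documents (N) in a text corpus D divided by the number of documents containing w: $\mathrm{IDF}(w, D) = \log\left(N / |\{d \in D : w \in d\}|\right)$. Words that occur in only a few documents therefore receive a high IDF, while words that occur in almost every document receive a value close to zero. The TF-IDF score of w in d is the product $\mathrm{TF}(w, d) \times \mathrm{IDF}(w, D)$.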

A limitation of frequency-based methods such as TF-IDF is that they cannot capture synonyms, semantics, or the emotional tone of language. For example, "large" and "big" are synonymous, but such methods treat them as two unrelated terms.

----
https://www.kaggle.com/code/rowhitswami/keywords-extraction-using-tf-idf-method

In [None]:
# General libraries
import re, os, string
import pandas as pd

# Scikit-learn importings
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def get_stopwords_list(stop_file_path):
    """Load stop words from a text file containing one word per line.

    Args:
        stop_file_path: Path to a UTF-8 encoded text file.

    Returns:
        A sorted list of the unique, whitespace-stripped stop words.
        Blank lines are skipped so the empty string never ends up in the
        stop-word list.
    """
    with open(stop_file_path, 'r', encoding="utf-8") as f:
        # Strip each line once; the set comprehension removes duplicates.
        unique_words = {line.strip() for line in f}

    # Blank (or whitespace-only) lines strip down to "", which is not a word.
    unique_words.discard("")

    # Sorting makes the result reproducible: iteration order of a set of
    # strings can differ from one interpreter run to the next.
    return sorted(unique_words)

In [None]:
def clean_text(text):
    """Normalise a document before vectorisation.

    The text is lower-cased, every character listed in the module-level
    ``PUNCTUATION`` constant is removed, and each run of whitespace
    (spaces, tabs, newlines) is collapsed into a single space.

    Args:
        text: The raw document as a string.

    Returns:
        The cleaned string. Leading/trailing whitespace is collapsed to a
        single space but not stripped.
    """
    # Lowering text
    text = text.lower()

    # Removing punctuation in a single pass: the three-argument form of
    # str.maketrans marks every character of PUNCTUATION for deletion.
    text = text.translate(str.maketrans('', '', PUNCTUATION))

    # Collapsing whitespace runs (including newlines) into one space.
    # Raw string, because '\s' in a plain literal is an invalid escape sequence.
    return re.sub(r'\s+', ' ', text)

In [None]:
def sort_coo(coo_matrix):
    """Rank the entries of a COO-format matrix from highest to lowest score.

    Reads the ``col`` and ``data`` attributes of ``coo_matrix`` and returns a
    list of ``(column_index, value)`` tuples ordered by value descending;
    ties are broken by column index, also descending.
    """
    entries = list(zip(coo_matrix.col, coo_matrix.data))
    entries.sort(key=lambda entry: (entry[1], entry[0]), reverse=True)
    return entries

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the top-scoring features to their rounded TF-IDF scores.

    Args:
        feature_names: Sequence in which position ``i`` holds the term for
            column index ``i``.
        sorted_items: ``(column_index, score)`` pairs already ordered from
            best to worst, e.g. the output of ``sort_coo``.
        topn: Maximum number of leading pairs to keep.

    Returns:
        A dict ``{feature_name: score}`` with scores rounded to 3 decimals,
        in the same order as ``sorted_items``.
    """
    # Only the first `topn` pairs are used; each index is translated to its
    # feature name and paired with its rounded score.
    return {
        feature_names[idx]: round(score, 3)
        for idx, score in sorted_items[:topn]
    }

In [None]:
def get_keywords(vectorizer, feature_names, doc):
    """Return the best-scoring TF-IDF terms of ``doc`` as a list.

    ``doc`` is transformed with the already-fitted ``vectorizer``, its
    entries are ranked with ``sort_coo``, and the names of the first
    ``TOP_K_KEYWORDS`` entries are returned, best first.
    """
    # The document is wrapped in a one-element list for transform().
    doc_scores = vectorizer.transform([doc]).tocoo()

    # (column, score) pairs, highest score first.
    ranked = sort_coo(doc_scores)

    # Keep only the leading TOP_K_KEYWORDS terms.
    top_terms = extract_topn_from_vector(feature_names, ranked, TOP_K_KEYWORDS)

    return list(top_terms)

In [None]:
# Constants
# Characters removed by clean_text. Raw string so the backslash before "]" is
# a literal backslash rather than an invalid escape sequence; the value is
# unchanged.
PUNCTUATION = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""
TOP_K_KEYWORDS = 10  # top k number of keywords to retrieve in a ranked document
STOPWORD_PATH = "./kaggle/input/stopwords.txt"
PAPERS_PATH = "./kaggle/input/papers.csv"

### Reading data

In [None]:
# Load the papers CSV into a DataFrame and preview its first rows
data = pd.read_csv(PAPERS_PATH)
data.head()

In [None]:
# Drop rows with a missing 'full_text' (in place); clean_text calls .lower() on every value
data.dropna(subset=['full_text'], inplace=True)

### Preparing data

In [None]:
# Normalise every document: lower-case, remove punctuation, collapse whitespace
data['full_text'] = data['full_text'].apply(clean_text)

In [None]:
# Plain Python list of the cleaned documents, one string per paper
corpora = data['full_text'].to_list()

### Keywords Extraction using TF-IDF

In [None]:
# Load the stop words
stopwords = get_stopwords_list(STOPWORD_PATH)

# Initializing TF-IDF Vectorizer with stopwords
vectorizer = TfidfVectorizer(stop_words=stopwords, smooth_idf=True, use_idf=True)

# Learning the vocab and IDF weights from our corpora.
# Excluding the first 10 docs, which are held out for testing purposes.
# fit() is sufficient: the matrix returned by fit_transform() was discarded.
vectorizer.fit(corpora[10:])

# Storing vocab
feature_names = vectorizer.get_feature_names_out()

In [None]:
# Extract the top keywords for each of the 10 documents held out from fitting
result = []
for doc in corpora[:10]:
    result.append({
        'full_text': doc,
        'top_keywords': get_keywords(vectorizer, feature_names, doc),
    })

# One row per document: its text and its list of keywords
final = pd.DataFrame(result)
final

------
#### Broken down without functions

### Reading data

In [None]:
# Load the papers CSV into a DataFrame and preview its first rows
data = pd.read_csv(PAPERS_PATH)
data.head()

In [None]:
# Drop rows with a missing 'full_text' (in place); clean_text calls .lower() on every value
data.dropna(subset=['full_text'], inplace=True)

### Preparing data

In [None]:
# Normalise every document: lower-case, remove punctuation, collapse whitespace
data['full_text'] = data['full_text'].apply(clean_text)

In [None]:
# Plain Python list of the cleaned documents, one string per paper
corpora = data['full_text'].to_list()

### Keywords Extraction using TF-IDF

In [None]:
# Extract list of stopwords - note: I probably just want to use a standard set
with open(STOPWORD_PATH, 'r', encoding="utf-8") as f:
    # One stop word per line; strip surrounding whitespace and de-duplicate.
    # Blank lines are skipped so "" never becomes a stop word, and the result
    # is sorted so the list is the same on every run.
    stopwords = sorted({line.strip() for line in f if line.strip()})

In [None]:
# Display the loaded stop words for inspection
stopwords

In [None]:
# Initializing TF-IDF Vectorizer; the words in `stopwords` are passed as its stop_words
vectorizer = TfidfVectorizer(stop_words=stopwords, smooth_idf=True, use_idf=True)

In [None]:
# Creating vocab with our corpora
# Excluding the first 10 docs for testing purposes.
# NOTE: do we need to exclude our category of interest?
vectorizer.fit_transform(corpora[10::])

In [None]:
# Storing vocab: feature_names[i] is looked up with the column index i of a TF-IDF entry
feature_names = vectorizer.get_feature_names_out()

In [None]:
# For each of the 10 documents held out from fitting, compute its top
# keywords inline (the same steps get_keywords performs).
result = []
for doc in corpora[:10]:
    # TF-IDF scores of this document against the fitted vocabulary
    tf_idf_vector = vectorizer.transform([doc])

    # (column, score) pairs, highest score first
    sorted_items = sort_coo(tf_idf_vector.tocoo())

    # Keep only the TOP_K_KEYWORDS best-scoring terms
    keywords = extract_topn_from_vector(feature_names, sorted_items, TOP_K_KEYWORDS)

    result.append({'full_text': doc, 'top_keywords': list(keywords)})

# One row per document: its text and its list of keywords
final = pd.DataFrame(result)
final

-----
The code above, applied to a single document:

In [None]:
# Take the first document, one of the 10 that were left out when fitting the vectorizer
doc = corpora[0]

In [None]:
# Generate the TF-IDF vector for the given document (wrapped in a one-element list)
tf_idf_vector = vectorizer.transform([doc])

In [None]:
# Sort the (column, score) entries of the TF-IDF vector by descending score
sorted_items=sort_coo(tf_idf_vector.tocoo())

In [None]:
# Extract only the first TOP_K_KEYWORDS entries as a {term: rounded score} dict
keywords=extract_topn_from_vector(feature_names,sorted_items,TOP_K_KEYWORDS)

In [None]:
# Display the extracted keywords and their scores
keywords