# **TF-IDF**
## TF: Term Frequency — how many times a specific term appears in a document
## IDF: Inverse Document Frequency — how rare a term is across the corpus; the more documents contain the term, the lower its IDF

In [1]:
import os
import re
from collections import Counter, defaultdict
from math import log
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from heapq import nlargest
import time

# Ensure you have the required NLTK resources.
# Each nltk.download call fetches one data package into the local nltk_data
# directory and is a no-op when the package is already up to date.
import nltk
nltk.download('punkt')  # tokenizer data; presumably what word_tokenize relies on -- verify for the installed NLTK version
nltk.download('stopwords')  # corpus read by stopwords.words('english') in clean_text
nltk.download('punkt_tab')  # NOTE(review): looks like the tokenizer data newer NLTK releases expect -- confirm

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Data cleaning
Remove stop words and special characters, and convert the text to lowercase.

In [2]:
# Function to clean text
def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK only once.

    The set is cached on the function object so that repeated calls to
    ``clean_text`` (one per document) do not re-read the corpus and rebuild
    the set each time.
    """
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def clean_text(text):
    """Normalize raw text into a list of lowercase, stop-word-free tokens.

    Steps, in order:
      1. Drop every character that is neither a word character
         (letter, digit, underscore) nor whitespace.
      2. Lowercase the text.
      3. Tokenize with NLTK's ``word_tokenize``.
      4. Discard tokens found in the English stop-word list.

    Args:
        text: The raw document text.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    stop_words = _english_stop_words()
    # Punctuation is stripped *before* tokenizing, so a contraction such as
    # "don't" reaches the filter as "dont".
    # NOTE(review): such tokens may not match the stop-word list -- confirm
    # whether that is intended.
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    return [word for word in word_tokenize(text) if word not in stop_words]

## TF-IDF Computation

In [3]:
# Function to compute TF
def compute_tf(word_list):
    """Compute the relative frequency of every term in one document.

    Args:
        word_list: The document's tokens (a sized sequence of hashables).

    Returns:
        A dict mapping each distinct term to ``occurrences / len(word_list)``,
        keyed in order of first appearance. An empty document yields ``{}``.
    """
    n_tokens = len(word_list)
    frequencies = {}
    for term, occurrences in Counter(word_list).items():
        frequencies[term] = occurrences / n_tokens
    return frequencies

In [4]:
# Function to compute IDF
def compute_idf(documents):
    """Compute the inverse document frequency of every term in a corpus.

    Args:
        documents: Any iterable of tokenized documents (each an iterable of
            hashable terms). It is consumed in a single pass, so lists,
            dict views, generators and iterators are all accepted.

    Returns:
        A dict mapping each term to ``log(total_docs / doc_count)``, where
        ``doc_count`` is the number of documents containing the term. A term
        present in every document therefore scores ``0.0``. An empty corpus
        yields ``{}``.
    """
    doc_counts = Counter()
    total_docs = 0
    for doc in documents:
        # Count the documents here rather than with len(), so that unsized
        # iterables work too.
        total_docs += 1
        # set() makes each document contribute at most 1 per term.
        doc_counts.update(set(doc))
    return {word: log(total_docs / count) for word, count in doc_counts.items()}

In [5]:
# Main function to calculate TF-IDF and extract keywords
def extract_keywords(folder_path, top_n=15):
    """Extract the highest-scoring TF-IDF keywords for each text file in a folder.

    Every file in ``folder_path`` whose name ends in ``.txt`` is read as
    UTF-8, cleaned with ``clean_text``, and scored with ``compute_tf`` and
    ``compute_idf``. Per-step timings are printed to stdout.

    Args:
        folder_path: Directory containing the ``.txt`` documents.
        top_n: Maximum number of keywords to keep per document (default 15).

    Returns:
        A list of ``(filename, keywords)`` tuples in sorted filename order,
        where ``keywords`` is a list of ``(word, score)`` pairs ordered from
        highest to lowest score, with scores rounded to 5 decimal places.
    """
    # perf_counter is a monotonic, high-resolution clock, so the elapsed
    # times cannot be skewed by system clock adjustments.
    overall_start = time.perf_counter()

    # Step 1: Read and clean all files
    start = time.perf_counter()
    documents = {}
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # result order reproducible across runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):  # Assuming text files
            continue
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            documents[filename] = clean_text(file.read())
    step1_time = time.perf_counter() - start

    # Step 2: Calculate TF
    start = time.perf_counter()
    tfs = {doc: compute_tf(words) for doc, words in documents.items()}
    step2_time = time.perf_counter() - start

    # Step 3: Calculate IDF
    start = time.perf_counter()
    idf = compute_idf(documents.values())
    step3_time = time.perf_counter() - start

    # Step 4: Calculate TF-IDF and extract keywords
    start = time.perf_counter()
    results = []
    for doc, tf in tfs.items():
        tf_idf = {word: weight * idf[word] for word, weight in tf.items()}
        top_keywords = nlargest(top_n, tf_idf.items(), key=lambda x: x[1])
        results.append((doc, [(word, round(score, 5)) for word, score in top_keywords]))
    step4_time = time.perf_counter() - start

    overall_time = time.perf_counter() - overall_start

    print("Timing Metrics:")
    print(f"1. Text Cleaning and Loading: {step1_time:.2f} seconds")
    print(f"2. TF Calculation: {step2_time:.2f} seconds")
    print(f"3. IDF Calculation: {step3_time:.2f} seconds")
    print(f"4. TF-IDF and Keyword Extraction: {step4_time:.2f} seconds")
    print(f"Overall Time: {overall_time:.2f} seconds")
    # Visual spacer before whatever the caller prints next.
    print("\n")
    print("\n")
    print("\n")

    return results

## Keyword extraction

In [6]:
# Run the extraction over every .txt file in the data folder (the path is
# relative to the notebook's working directory) and print one line per
# document: the filename followed by its list of (keyword, score) pairs.
folder_path = "../Data"
keywords = extract_keywords(folder_path)
for doc, top_keywords in keywords:
    print(f"{doc}: {top_keywords}")

Timing Metrics:
1. Text Cleaning and Loading: 440.66 seconds
2. TF Calculation: 11.40 seconds
3. IDF Calculation: 11.63 seconds
4. TF-IDF and Keyword Extraction: 10.81 seconds
Overall Time: 474.50 seconds






pg238.txt: [('grier', 0.01269), ('judy', 0.01022), ('sallie', 0.00879), ('mcbride', 0.00765), ('sadie', 0.00622), ('jervis', 0.00621), ('macrae', 0.0053), ('mcb', 0.00494), ('betsy', 0.0049), ('snaith', 0.00477), ('witherspoon', 0.00461), ('allegra', 0.00446), ('mcgurk', 0.0042), ('gordon', 0.00405), ('asylum', 0.00394)]
pg378.txt: [('tirant', 0.10738), ('plaerdemavida', 0.01683), ('tirants', 0.01674), ('diafebus', 0.01458), ('hippolytus', 0.01196), ('emperor', 0.01028), ('escariano', 0.00837), ('princess', 0.0081), ('stephanie', 0.00599), ('empress', 0.00584), ('moors', 0.00518), ('infanta', 0.00515), ('philippe', 0.00475), ('knights', 0.00415), ('agramunt', 0.00414)]
pg779.txt: [('faustus', 0.11667), ('mephist', 0.03461), ('footnote', 0.03286), ('mephistophilis', 0.03138), ('4