# Objective

1- The objective of this project is to implement a basic information retrieval system using TF-IDF (Term Frequency–Inverse Document Frequency) on a book provided in PDF format.
You will read the PDF, extract and preprocess its content, vectorize the textual data, and finally retrieve relevant information based on a user query.

2- Objective of the Extension:
Modify the previous TF-IDF-based retrieval system by combining TF-IDF weighting with Word2Vec embeddings. Instead of using raw TF-IDF vectors for similarity, represent each paragraph as a TF-IDF-weighted average of Word2Vec word embeddings. This allows capturing semantic similarity in addition to term importance.

# Install required libraries and Imports


In [None]:
!pip install PyPDF2 nltk scikit-learn gensim gradio
!pip install pdfplumber nltk scikit-learn gensim gradio



In [None]:
import pdfplumber
import re
import nltk
import numpy as np
import gradio as gr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Fetch the NLTK data packages this notebook relies on (tokenizer data,
# the stop-word corpus and WordNet), one download call per package.
for resource in ("punkt", "punkt_tab", "stopwords", "wordnet"):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# READ PDF

In [None]:
# 1. READ PDF USING PDFPLUMBER
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Pages are read in order with pdfplumber. Each page that yields text
    contributes that text followed by a newline; pages for which
    ``extract_text()`` returns nothing (``None`` or an empty string) are
    skipped.
    """
    with pdfplumber.open(pdf_path) as document:
        extracted = (page.extract_text() for page in document.pages)
        # The join runs while the PDF is still open, since the generator
        # above is lazy.
        return "".join(f"{chunk}\n" for chunk in extracted if chunk)


# CHUNKS

In [None]:
# 2. CHUNK INTO PARAGRAPHS
def chunk_into_paragraphs(text, min_length=50):
    """Split *text* on newlines and keep only the sufficiently long lines.

    Args:
        text: Raw text; every newline-separated line is one candidate chunk.
        min_length: A line is kept only when its stripped length is strictly
            greater than this value. Defaults to 50.

    Returns:
        list[str]: The kept lines, stripped of surrounding whitespace, in
        their original order.
    """
    # Strip each line once and reuse the result for both the length test
    # and the returned value.
    stripped_lines = (line.strip() for line in text.split("\n"))
    return [line for line in stripped_lines if len(line) > min_length]

# PREPROCESSING

In [None]:
# 3. PREPROCESSING
stop_words = set(stopwords.words("english"))
lemm = WordNetLemmatizer()


def preprocess(text):
    """Normalize *text* into a space-separated string of lemmas.

    Steps, in order: lowercase; delete every character that is not a-z or
    whitespace; tokenize with NLTK; drop English stop words; lemmatize each
    remaining token with WordNet.

    Non-letter characters are deleted rather than replaced by a space, so
    hyphenated words are fused (e.g. "scikit-learn" becomes "scikitlearn").
    """
    letters_only = re.sub(r"[^a-z\s]", "", text.lower())
    lemmas = [
        lemm.lemmatize(token)
        for token in nltk.word_tokenize(letters_only)
        if token not in stop_words
    ]
    return " ".join(lemmas)


# CLEAN PARAGRAPHS

In [None]:
# LOAD PDF → CLEAN PARAGRAPHS
PDF_PATH = "/content/Hands_On_Machine_Learning_with_Scikit_Learn_and_TensorFlow.pdf"

# Extract -> chunk -> normalize. `paragraphs` keeps the original text for
# display; `clean_paragraphs` holds the preprocessed form at the same index.
raw_text = extract_text_from_pdf(PDF_PATH)
paragraphs = chunk_into_paragraphs(raw_text)
clean_paragraphs = list(map(preprocess, paragraphs))

print("Number of paragraphs extracted:", len(paragraphs))


Number of paragraphs extracted: 10826


# EXTENSION: TF-IDF WEIGHTED WORD2VEC

In [None]:
# EXTENSION: TF-IDF WEIGHTED WORD2VEC
# Word2Vec takes each sentence as a list of tokens; the cleaned paragraphs
# are space-separated strings, so a plain split recovers the tokens.
tokenized_paragraphs = list(map(str.split, clean_paragraphs))

# min_count=1 keeps every token in the vocabulary, including words that
# occur only once.
w2v_model = Word2Vec(
    sentences=tokenized_paragraphs,
    vector_size=300,
    min_count=1,
    workers=4,
)

In [None]:
# Create weighted embedding for each paragraph
def get_weighted_embedding(text):
    """Return the IDF-weighted mean Word2Vec vector for *text*.

    Args:
        text: Space-separated tokens (the output of ``preprocess``).

    Returns:
        numpy.ndarray: 1-D float vector of length ``w2v_model.vector_size``.
        Every token known to both ``w2v_model`` and ``tfidf`` contributes
        its embedding scaled by its IDF weight; a token that occurs several
        times contributes once per occurrence. When no token is known to
        both models, the zero vector is returned.
    """
    # Take the dimensionality from the trained model instead of repeating
    # the literal 300, so the two cannot drift apart.
    weighted_sum = np.zeros(w2v_model.vector_size)
    total_weight = 0.0

    for token in text.split():
        # vocabulary_ maps term -> column index; index 0 is valid, so test
        # against None rather than truthiness.
        column = tfidf.vocabulary_.get(token)
        if column is None or token not in w2v_model.wv:
            continue
        idf = tfidf.idf_[column]
        weighted_sum += w2v_model.wv[token] * idf
        total_weight += idf

    if total_weight == 0:
        # Nothing usable in the text: avoid dividing by zero.
        return weighted_sum
    return weighted_sum / total_weight

# get_weighted_embedding() reads the fitted `tfidf` vectorizer, but the cell
# that fits it appears further down in this notebook, so running the cells
# top to bottom raised NameError here. Fit it before building the embeddings;
# the later TF-IDF cell then simply re-fits on the same data.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(clean_paragraphs)

# One embedding per paragraph, in the same order as `paragraphs`.
paragraph_embeddings = np.array([get_weighted_embedding(p) for p in clean_paragraphs])

In [None]:
# SEARCH ENGINE FUNCTIONS
def search_tfidf(query, top_k=5):
    """Return the paragraphs most similar to *query* under TF-IDF.

    Args:
        query: Free-text user query; it is passed through ``preprocess``
            before vectorization.
        top_k: Maximum number of paragraphs to return. Coerced to ``int``
            so a float value (e.g. from a UI slider) can be used for slicing.

    Returns:
        str: The matching original paragraphs, best first, separated by a
        ``---`` divider line, or a short notice when no paragraph shares a
        term with the query.
    """
    top_k = int(top_k)
    q = preprocess(query)
    q_vec = tfidf.transform([q])
    scores = cosine_similarity(q_vec, tfidf_matrix)[0]
    ranked = scores.argsort()[::-1][:top_k]
    # A score of 0 means the paragraph has no term in common with the query.
    # Without this filter an empty or out-of-vocabulary query would return
    # arbitrary paragraphs.
    hits = [paragraphs[i] for i in ranked if scores[i] > 0]
    if not hits:
        return "No matching paragraphs found."
    return "\n\n---\n\n".join(hits)


def search_semantic(query, top_k=5):
    """Return the paragraphs closest to *query* in weighted-embedding space.

    Args:
        query: Free-text user query; it is passed through ``preprocess``
            and then embedded with ``get_weighted_embedding``.
        top_k: Maximum number of paragraphs to return. Coerced to ``int``
            so a float value (e.g. from a UI slider) can be used for slicing.

    Returns:
        str: The closest original paragraphs, best first, separated by a
        ``---`` divider line, or a short notice when the query cannot be
        embedded.
    """
    top_k = int(top_k)
    q = preprocess(query)
    q_embed = get_weighted_embedding(q)
    # An all-zero embedding means no query token was usable. Its cosine
    # similarity to every paragraph is 0, so any ranking would be arbitrary.
    if not np.any(q_embed):
        return "No matching paragraphs found."
    sims = cosine_similarity([q_embed], paragraph_embeddings)[0]
    ranked = sims.argsort()[::-1][:top_k]
    return "\n\n---\n\n".join(paragraphs[i] for i in ranked)

# TF-IDF VECTORIZATION

In [None]:
# 4. TF-IDF VECTORIZATION
# Learn the vocabulary and IDF weights from the cleaned paragraphs and
# vectorize them in the same call. `tfidf` is read by
# get_weighted_embedding() and search_tfidf(); `tfidf_matrix` is read by
# search_tfidf().
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(clean_paragraphs)

# GRADIO INTERFACE

In [None]:
# GRADIO INTERFACE
def retrieve(query, mode, top_k):
    """Run *query* through the retrieval backend selected by *mode*.

    ``"TF-IDF"`` selects ``search_tfidf``; any other value (including the
    ``"TF-IDF + Word2Vec"`` radio option) selects ``search_semantic``.
    """
    search = search_tfidf if mode == "TF-IDF" else search_semantic
    return search(query, top_k)

# Build the UI components first so the Interface call stays readable.
query_box = gr.Textbox(label="Enter your query")
mode_selector = gr.Radio(["TF-IDF", "TF-IDF + Word2Vec"], label="Retrieval Mode")
top_k_slider = gr.Slider(1, 10, value=5, step=1, label="Top K Results")
results_box = gr.Textbox(label="Results", lines=3)

# The input order must match the positional parameters of retrieve():
# (query, mode, top_k).
interface = gr.Interface(
    fn=retrieve,
    inputs=[query_box, mode_selector, top_k_slider],
    outputs=results_box,
)

interface.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1459b3cd8d0c606ff3.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


