In [3]:
!pip install gradio transformers torch sentence-transformers PyPDF2
import gradio as gr
import PyPDF2
import os
from sentence_transformers import SentenceTransformer, util
import torch
import re

# Sentence-embedding model. In this file it is used only through
# model.encode() to turn document chunks and search queries into vectors
# that are then compared by cosine similarity.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Index state shared by the functions below:
#   documents      - flat list of text chunks from every indexed file
#   doc_embeddings - list of embedding tensors, one appended per indexed file;
#                    concatenated at search time so row i matches documents[i]
documents = []
doc_embeddings = []

# Function to extract text from uploaded files (supports PDF and TXT)
def extract_text_from_file(file):
    """Return the text content of an uploaded PDF or TXT file.

    Args:
        file: The uploaded file, either an object exposing its path as
            ``.name`` or a plain path string. ``None`` (nothing uploaded)
            is accepted.

    Returns:
        The extracted text, or ``""`` when no file was given or the file
        extension is neither ``.pdf`` nor ``.txt``.
    """
    if file is None:
        return ""

    # Accept both upload wrappers (path stored in ``.name``) and bare paths.
    path = os.fspath(getattr(file, 'name', file))
    # Compare case-insensitively so "REPORT.PDF" / "NOTES.TXT" are recognised.
    lowered = path.lower()

    if lowered.endswith('.pdf'):
        pdf_reader = PyPDF2.PdfReader(path)
        # ``or ""`` guards against pages for which no text is returned.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
    if lowered.endswith('.txt'):
        # errors='replace' keeps a non-UTF-8 file from aborting the upload;
        # undecodable bytes become U+FFFD instead of raising.
        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            return f.read()
    return ""

# Function to preprocess and split text into chunks
def preprocess_text(text):
    """Normalise whitespace and split *text* into sentence chunks.

    Args:
        text: Raw document text; ``None`` is treated as empty.

    Returns:
        A list of non-empty sentences. Each keeps its own terminator
        (``.``, ``!`` or ``?``); a ``.`` is appended only to a chunk that
        has none.
    """
    # Collapse every whitespace run (including newlines) to a single space.
    text = re.sub(r'\s+', ' ', text or '').strip()
    if not text:
        return []

    chunks = []
    # Split on the space that follows ., ! or ?; the lookbehind leaves the
    # terminator attached to its sentence.
    for sentence in re.split(r'(?<=[.!?]) ', text):
        sentence = sentence.strip()
        if not sentence:
            continue
        # Add a full stop only when missing, so "Bye." does not become "Bye..".
        if sentence[-1] not in '.!?':
            sentence += '.'
        chunks.append(sentence)
    return chunks

# Function to index a document
def index_document(file):
    """Extract, chunk and embed an uploaded file and add it to the index.

    Args:
        file: The uploaded file (an object exposing its path as ``.name``,
            or a path string); ``None`` when nothing was uploaded.

    Returns:
        A status message: the file name and chunk count on success, or an
        ``"Error: ..."`` string when there is nothing to index.
    """
    if file is None:
        return "Error: No file uploaded."

    text = extract_text_from_file(file)
    if not text:
        return "Error: Could not extract text from the file."

    # Split into chunks (sentences)
    chunks = preprocess_text(text)
    if not chunks:
        # Whitespace-only text yields no chunks; do not index an empty tensor.
        return "Error: Could not extract text from the file."

    # Encode before touching the index: if encoding raises, neither
    # `documents` nor `doc_embeddings` is modified, so they stay aligned.
    embeddings = model.encode(chunks, convert_to_tensor=True)

    documents.extend(chunks)
    doc_embeddings.append(embeddings)

    filename = os.path.basename(getattr(file, 'name', file))
    return f"Indexed document: {filename} ({len(chunks)} chunks)"

# Function to search documents
def search_documents(query, top_k=3):
    """Return the indexed chunks most similar to *query*.

    Args:
        query: Natural-language search string.
        top_k: Maximum number of chunks to return (default 3). Values
            below 1 are treated as 1.

    Returns:
        The matching chunks, best first, each formatted as
        ``"Score: <cosine similarity> - <chunk>"`` and separated by blank
        lines; or a hint message when the index or the query is empty.
    """
    if not documents:
        return "No documents indexed yet. Please upload a document first."
    if not query or not query.strip():
        # Embedding an empty string would only produce meaningless matches.
        return "Please enter a search query."

    # Encode the query
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Compute cosine similarity between query and document chunks
    all_embeddings = torch.cat(doc_embeddings, dim=0)
    scores = util.cos_sim(query_embedding, all_embeddings)[0]

    # torch.topk requires k <= number of scores, so bound k by the rows
    # actually scored as well as by the chunk list used for the lookup.
    k = min(max(top_k, 1), scores.shape[0], len(documents))
    top_results = torch.topk(scores, k=k)

    return "\n\n".join(
        f"Score: {score:.4f} - {documents[idx]}"
        for idx, score in zip(top_results.indices.tolist(),
                              top_results.values.tolist())
    )

# Clear indexed documents (optional reset functionality)
def clear_index():
    """Discard every indexed chunk and embedding, then report the reset."""
    global documents, doc_embeddings
    # Rebind both module-level stores to fresh, empty lists.
    documents, doc_embeddings = [], []
    return "Index cleared."

# Gradio Interface
with gr.Blocks(title="AI-Powered Document Search") as demo:
    # Page heading and a one-line usage hint.
    gr.Markdown("# AI-Powered Document Search and Retrieval System")
    gr.Markdown("Upload documents (PDF or TXT) and search through them using natural language queries.")

    # One row holding two columns: indexing controls, then search controls.
    with gr.Row():
        with gr.Column():
            # Upload widget plus the buttons that add to / reset the index.
            file_input = gr.File(label="Upload Document (PDF or TXT)")
            upload_btn = gr.Button("Index Document")
            clear_btn = gr.Button("Clear Index")
            upload_output = gr.Textbox(label="Upload Status")

        with gr.Column():
            # Query box, search trigger and the results area.
            query_input = gr.Textbox(label="Enter your search query")
            search_btn = gr.Button("Search")
            search_output = gr.Textbox(label="Search Results", lines=10)

    # Connect functions to buttons
    # Indexing and clearing both write their status message to upload_output;
    # search writes its formatted results to search_output.
    upload_btn.click(fn=index_document, inputs=file_input, outputs=upload_output)
    clear_btn.click(fn=clear_index, inputs=None, outputs=upload_output)
    search_btn.click(fn=search_documents, inputs=query_input, outputs=search_output)

# Launch the interface
# NOTE: this runs at module top level, so executing the code starts the app.
demo.launch()
import os
from sentence_transformers import SentenceTransformer, util
import torch
import re

# Sentence-embedding model. In this file it is used only through
# model.encode() to turn document chunks and search queries into vectors
# that are then compared by cosine similarity.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Index state shared by the functions below:
#   documents      - flat list of text chunks from every indexed file
#   doc_embeddings - list of embedding tensors, one appended per indexed file;
#                    concatenated at search time so row i matches documents[i]
documents = []
doc_embeddings = []

# Function to extract text from uploaded files (supports PDF and TXT)
def extract_text_from_file(file):
    """Return the text content of an uploaded PDF or TXT file.

    Args:
        file: The uploaded file, either an object exposing its path as
            ``.name`` or a plain path string. ``None`` (nothing uploaded)
            is accepted.

    Returns:
        The extracted text, or ``""`` when no file was given or the file
        extension is neither ``.pdf`` nor ``.txt``.
    """
    if file is None:
        return ""

    # Accept both upload wrappers (path stored in ``.name``) and bare paths.
    path = os.fspath(getattr(file, 'name', file))
    # Compare case-insensitively so "REPORT.PDF" / "NOTES.TXT" are recognised.
    lowered = path.lower()

    if lowered.endswith('.pdf'):
        pdf_reader = PyPDF2.PdfReader(path)
        # ``or ""`` guards against pages for which no text is returned.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
    if lowered.endswith('.txt'):
        # errors='replace' keeps a non-UTF-8 file from aborting the upload;
        # undecodable bytes become U+FFFD instead of raising.
        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            return f.read()
    return ""

# Function to preprocess and split text into chunks
def preprocess_text(text):
    """Normalise whitespace and split *text* into sentence chunks.

    Args:
        text: Raw document text; ``None`` is treated as empty.

    Returns:
        A list of non-empty sentences. Each keeps its own terminator
        (``.``, ``!`` or ``?``); a ``.`` is appended only to a chunk that
        has none.
    """
    # Collapse every whitespace run (including newlines) to a single space.
    text = re.sub(r'\s+', ' ', text or '').strip()
    if not text:
        return []

    chunks = []
    # Split on the space that follows ., ! or ?; the lookbehind leaves the
    # terminator attached to its sentence.
    for sentence in re.split(r'(?<=[.!?]) ', text):
        sentence = sentence.strip()
        if not sentence:
            continue
        # Add a full stop only when missing, so "Bye." does not become "Bye..".
        if sentence[-1] not in '.!?':
            sentence += '.'
        chunks.append(sentence)
    return chunks

# Function to index a document
def index_document(file):
    """Extract, chunk and embed an uploaded file and add it to the index.

    Args:
        file: The uploaded file (an object exposing its path as ``.name``,
            or a path string); ``None`` when nothing was uploaded.

    Returns:
        A status message: the file name and chunk count on success, or an
        ``"Error: ..."`` string when there is nothing to index.
    """
    if file is None:
        return "Error: No file uploaded."

    text = extract_text_from_file(file)
    if not text:
        return "Error: Could not extract text from the file."

    # Split into chunks (sentences)
    chunks = preprocess_text(text)
    if not chunks:
        # Whitespace-only text yields no chunks; do not index an empty tensor.
        return "Error: Could not extract text from the file."

    # Encode before touching the index: if encoding raises, neither
    # `documents` nor `doc_embeddings` is modified, so they stay aligned.
    embeddings = model.encode(chunks, convert_to_tensor=True)

    documents.extend(chunks)
    doc_embeddings.append(embeddings)

    filename = os.path.basename(getattr(file, 'name', file))
    return f"Indexed document: {filename} ({len(chunks)} chunks)"

# Function to search documents
def search_documents(query, top_k=3):
    """Return the indexed chunks most similar to *query*.

    Args:
        query: Natural-language search string.
        top_k: Maximum number of chunks to return (default 3). Values
            below 1 are treated as 1.

    Returns:
        The matching chunks, best first, each formatted as
        ``"Score: <cosine similarity> - <chunk>"`` and separated by blank
        lines; or a hint message when the index or the query is empty.
    """
    if not documents:
        return "No documents indexed yet. Please upload a document first."
    if not query or not query.strip():
        # Embedding an empty string would only produce meaningless matches.
        return "Please enter a search query."

    # Encode the query
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Compute cosine similarity between query and document chunks
    all_embeddings = torch.cat(doc_embeddings, dim=0)
    scores = util.cos_sim(query_embedding, all_embeddings)[0]

    # torch.topk requires k <= number of scores, so bound k by the rows
    # actually scored as well as by the chunk list used for the lookup.
    k = min(max(top_k, 1), scores.shape[0], len(documents))
    top_results = torch.topk(scores, k=k)

    return "\n\n".join(
        f"Score: {score:.4f} - {documents[idx]}"
        for idx, score in zip(top_results.indices.tolist(),
                              top_results.values.tolist())
    )

# Clear indexed documents (optional reset functionality)
def clear_index():
    """Discard every indexed chunk and embedding, then report the reset."""
    global documents, doc_embeddings
    # Rebind both module-level stores to fresh, empty lists.
    documents, doc_embeddings = [], []
    return "Index cleared."

# Gradio Interface
with gr.Blocks(title="AI-Powered Document Search") as demo:
    # Page heading and a one-line usage hint.
    gr.Markdown("# AI-Powered Document Search and Retrieval System")
    gr.Markdown("Upload documents (PDF or TXT) and search through them using natural language queries.")

    # One row holding two columns: indexing controls, then search controls.
    with gr.Row():
        with gr.Column():
            # Upload widget plus the buttons that add to / reset the index.
            file_input = gr.File(label="Upload Document (PDF or TXT)")
            upload_btn = gr.Button("Index Document")
            clear_btn = gr.Button("Clear Index")
            upload_output = gr.Textbox(label="Upload Status")

        with gr.Column():
            # Query box, search trigger and the results area.
            query_input = gr.Textbox(label="Enter your search query")
            search_btn = gr.Button("Search")
            search_output = gr.Textbox(label="Search Results", lines=10)

    # Connect functions to buttons
    # Indexing and clearing both write their status message to upload_output;
    # search writes its formatted results to search_output.
    upload_btn.click(fn=index_document, inputs=file_input, outputs=upload_output)
    clear_btn.click(fn=clear_index, inputs=None, outputs=upload_output)
    search_btn.click(fn=search_documents, inputs=query_input, outputs=search_output)

# Launch the interface
# NOTE: this runs at module top level, so executing the code starts the app.
demo.launch()
import os
from sentence_transformers import SentenceTransformer, util
import torch
import re

# Sentence-embedding model. In this file it is used only through
# model.encode() to turn document chunks and search queries into vectors
# that are then compared by cosine similarity.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Index state shared by the functions below:
#   documents      - flat list of text chunks from every indexed file
#   doc_embeddings - list of embedding tensors, one appended per indexed file;
#                    concatenated at search time so row i matches documents[i]
documents = []
doc_embeddings = []

# Function to extract text from uploaded files (supports PDF and TXT)
def extract_text_from_file(file):
    """Return the text content of an uploaded PDF or TXT file.

    Args:
        file: The uploaded file, either an object exposing its path as
            ``.name`` or a plain path string. ``None`` (nothing uploaded)
            is accepted.

    Returns:
        The extracted text, or ``""`` when no file was given or the file
        extension is neither ``.pdf`` nor ``.txt``.
    """
    if file is None:
        return ""

    # Accept both upload wrappers (path stored in ``.name``) and bare paths.
    path = os.fspath(getattr(file, 'name', file))
    # Compare case-insensitively so "REPORT.PDF" / "NOTES.TXT" are recognised.
    lowered = path.lower()

    if lowered.endswith('.pdf'):
        # Open by path rather than handing over the upload object itself, so
        # reading does not depend on what kind of object the caller passed.
        pdf_reader = PyPDF2.PdfReader(path)
        # ``or ""`` guards against pages for which no text is returned.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
    if lowered.endswith('.txt'):
        # errors='replace' keeps a non-UTF-8 file from aborting the upload;
        # undecodable bytes become U+FFFD instead of raising.
        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            return f.read()
    return ""

# Function to preprocess and split text into chunks
def preprocess_text(text):
    """Normalise whitespace and split *text* into sentence chunks.

    Args:
        text: Raw document text; ``None`` is treated as empty.

    Returns:
        A list of non-empty sentences. Each keeps its own terminator
        (``.``, ``!`` or ``?``); a ``.`` is appended only to a chunk that
        has none.
    """
    # Collapse every whitespace run (including newlines) to a single space.
    text = re.sub(r'\s+', ' ', text or '').strip()
    if not text:
        return []

    chunks = []
    # Split on the space that follows ., ! or ?; the lookbehind leaves the
    # terminator attached to its sentence.
    for sentence in re.split(r'(?<=[.!?]) ', text):
        sentence = sentence.strip()
        if not sentence:
            continue
        # Add a full stop only when missing, so "Bye." does not become "Bye..".
        if sentence[-1] not in '.!?':
            sentence += '.'
        chunks.append(sentence)
    return chunks

# Function to index a document
def index_document(file):
    """Extract, chunk and embed an uploaded file and add it to the index.

    Args:
        file: The uploaded file (an object exposing its path as ``.name``,
            or a path string); ``None`` when nothing was uploaded.

    Returns:
        A status message: the file name and chunk count on success, or an
        ``"Error: ..."`` string when there is nothing to index.
    """
    if file is None:
        return "Error: No file uploaded."

    text = extract_text_from_file(file)
    if not text:
        return "Error: Could not extract text from the file."

    # Split into chunks (sentences)
    chunks = preprocess_text(text)
    if not chunks:
        # Whitespace-only text yields no chunks; do not index an empty tensor.
        return "Error: Could not extract text from the file."

    # Encode before touching the index: if encoding raises, neither
    # `documents` nor `doc_embeddings` is modified, so they stay aligned.
    embeddings = model.encode(chunks, convert_to_tensor=True)

    documents.extend(chunks)
    doc_embeddings.append(embeddings)

    filename = os.path.basename(getattr(file, 'name', file))
    return f"Indexed document: {filename} ({len(chunks)} chunks)"

# Function to search documents
def search_documents(query, top_k=3):
    """Return the indexed chunks most similar to *query*.

    Args:
        query: Natural-language search string.
        top_k: Maximum number of chunks to return (default 3). Values
            below 1 are treated as 1.

    Returns:
        The matching chunks, best first, each formatted as
        ``"Score: <cosine similarity> - <chunk>"`` and separated by blank
        lines; or a hint message when the index or the query is empty.
    """
    if not documents:
        return "No documents indexed yet. Please upload a document first."
    if not query or not query.strip():
        # Embedding an empty string would only produce meaningless matches.
        return "Please enter a search query."

    # Encode the query
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Compute cosine similarity between query and document chunks
    all_embeddings = torch.cat(doc_embeddings, dim=0)
    scores = util.cos_sim(query_embedding, all_embeddings)[0]

    # torch.topk requires k <= number of scores, so bound k by the rows
    # actually scored as well as by the chunk list used for the lookup.
    k = min(max(top_k, 1), scores.shape[0], len(documents))
    top_results = torch.topk(scores, k=k)

    return "\n\n".join(
        f"Score: {score:.4f} - {documents[idx]}"
        for idx, score in zip(top_results.indices.tolist(),
                              top_results.values.tolist())
    )

# Clear indexed documents (optional reset functionality)
def clear_index():
    """Discard every indexed chunk and embedding, then report the reset."""
    global documents, doc_embeddings
    # Rebind both module-level stores to fresh, empty lists.
    documents, doc_embeddings = [], []
    return "Index cleared."

# Gradio Interface
with gr.Blocks(title="AI-Powered Document Search") as demo:
    # Page heading and a one-line usage hint.
    gr.Markdown("# AI-Powered Document Search and Retrieval System")
    gr.Markdown("Upload documents (PDF or TXT) and search through them using natural language queries.")

    # One row holding two columns: indexing controls, then search controls.
    with gr.Row():
        with gr.Column():
            # Upload widget plus the buttons that add to / reset the index.
            file_input = gr.File(label="Upload Document (PDF or TXT)")
            upload_btn = gr.Button("Index Document")
            clear_btn = gr.Button("Clear Index")
            upload_output = gr.Textbox(label="Upload Status")

        with gr.Column():
            # Query box, search trigger and the results area.
            query_input = gr.Textbox(label="Enter your search query")
            search_btn = gr.Button("Search")
            search_output = gr.Textbox(label="Search Results", lines=10)

    # Connect functions to buttons
    # Indexing and clearing both write their status message to upload_output;
    # search writes its formatted results to search_output.
    upload_btn.click(fn=index_document, inputs=file_input, outputs=upload_output)
    clear_btn.click(fn=clear_index, inputs=None, outputs=upload_output)
    search_btn.click(fn=search_documents, inputs=query_input, outputs=search_output)

# Launch the interface
# NOTE: this runs at module top level, so executing the code starts the app.
demo.launch()


Collecting gradio
  Downloading gradio-5.23.3-py3-none-any.whl.metadata (16 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.4-py3-none-manylinux_2_17_x86_

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://6afa67ebbfada92776.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7c8d884f4b3e0dbf4c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e6ecdb56805e74770b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


