# Extracting Text from PDFs

In [None]:
import os
from PyPDF2 import PdfReader
import numpy as np

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the PDF, passed straight to ``PdfReader``.

    Returns:
        One string made of each page's ``extract_text()`` result, with no
        separator inserted between pages.
    """
    reader = PdfReader(pdf_path)
    # Join once instead of growing a string with ``+=`` for every page.
    return "".join(page.extract_text() for page in reader.pages)

def extract_text_from_pdfs_in_directory(directory):
    """Write a ``.txt`` companion next to every ``.pdf`` entry of a directory.

    Only entries whose name ends with ``".pdf"`` are processed. For each one
    the text returned by ``extract_text_from_pdf`` is written to a file with
    the same stem and a ``.txt`` extension inside ``directory``; the file is
    opened in ``"w"`` mode, so an existing file of that name is overwritten.

    Args:
        directory: Path of the directory to scan (not searched recursively).

    Returns:
        None.
    """
    pdf_names = [entry for entry in os.listdir(directory) if entry.endswith(".pdf")]
    for pdf_name in pdf_names:
        text = extract_text_from_pdf(os.path.join(directory, pdf_name))
        stem, _extension = os.path.splitext(pdf_name)
        target_path = os.path.join(directory, stem + ".txt")
        with open(target_path, "w") as target:
            target.write(text)

# Directory that is scanned for PDF files. The generated .txt files are
# written back into this same directory.
directory_path = "Docs/"

# Convert every PDF found there into a text file with the same base name.
extract_text_from_pdfs_in_directory(directory_path)

In [None]:
import os
from nltk.tokenize import sent_tokenize

# Folder that holds the text files to be split into sentences
directory_path = "Docs"

# Names of every entry in that folder ending with .txt
txt_files = [entry for entry in os.listdir(directory_path) if entry.endswith('.txt')]

# Sentences collected across all text files, in the order they are read
all_sentences = []

for txt_file in txt_files:
    file_path = os.path.join(directory_path, txt_file)
    with open(file_path, "r") as file:
        text = file.read()
    # The whole file is already in memory, so tokenizing can happen after the
    # handle has been closed.
    sentences = sent_tokenize(text)
    all_sentences += sentences

# Show the first 10 sentences as a quick sanity check
print(all_sentences[:10])


# Generating Embeddings for the Text Using FastEmbed

In [None]:
from fastembed import TextEmbedding
import numpy as np
import time

# Create the FastEmbed model once; the same instance is used to embed both
# the documents and the user queries further down.
# NOTE(review): cache_dir presumably controls where the model files are
# stored locally -- confirm against the FastEmbed documentation.
embedding_model = TextEmbedding(model_name="BAAI/bge-base-en", cache_dir="./embeddings")

def embed_documents(documents):
    """Embed every document with the shared FastEmbed model.

    Args:
        documents: Iterable of strings to embed.

    Returns:
        A list with one entry per document, in input order. Each entry is a
        NumPy array whose first axis has length 1 and holds that document's
        embedding, so callers read the vector itself with ``entry[0]``.
    """
    batch = list(documents)
    if not batch:
        # Nothing to embed, so the model is not invoked at all.
        return []

    # Hand the whole batch to the model in a single call instead of invoking
    # it once per document; the model yields one vector per input string.
    return [np.array([vector]) for vector in embedding_model.embed(batch)]

# The documents to embed are the sentences collected earlier. This binds a
# second name to the same list object; it does not copy it.
documents = all_sentences

# Compute one embedding entry per sentence.
embeddings = embed_documents(documents)

In [None]:
# Each entry produced by embed_documents wraps a single embedding; keep only
# that first element so `embeddings` becomes a flat list of vectors.
# NOTE(review): this rebinds `embeddings` in place, so running the cell a
# second time would index into the already unwrapped vectors.
embeddings = [sublist[0] for sublist in embeddings]

# Starting Qdrant-Client

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from qdrant_client.http.models import PointStruct

# NOTE(review): the endpoint URL and the API key are written inline; the key
# shown here looks like a placeholder to be replaced before running.
client = QdrantClient(
    url="https://c065099d-b51c-4e03-b680-646b177fc993.us-east4-0.gcp.cloud.qdrant.io:6333", 
    api_key="[Qdrant-API-Key]",
    https=True,
)
# Collection name shared by the upload cell and the search code below.
collection_name = 'RAG-Usage-Example'
# NOTE(review): recreate_collection presumably drops an existing collection of
# this name before creating it -- confirm against the qdrant-client docs.
# size=768 has to equal the length of the vectors uploaded later; presumably
# that is the output width of the embedding model in use -- TODO confirm.
client.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=768, distance=Distance.COSINE),
)

In [None]:
# Bare expression: the result is not stored, so it is only useful as the
# displayed output of this notebook cell (e.g. to check the collection exists).
client.get_collections()

# Uploading Embeddings to the Qdrant Vector DB

In [None]:
# Upload one point per sentence: the enumerate position becomes the point id,
# the embedding is converted to a plain list, and the sentence itself is kept
# in the payload under the "text" key.
client.upload_points(
    collection_name=collection_name,
    points=[
        PointStruct(id=position, vector=embedding.tolist(), payload={"text": sentence})
        for position, (embedding, sentence) in enumerate(zip(embeddings, documents))
    ],
)

# Building a RAG System with OpenAI for Any Query

# Using OpenAI API (Gradio)

In [None]:
from typing import List
from qdrant_client import QdrantClient
from openai import OpenAI
import gradio as gr
import numpy as np

# Client used by generate_completion below.
# NOTE(review): the API key is written inline and looks like a placeholder to
# be replaced before running.
OpenAI_client = OpenAI(api_key='[OpenAI-API-Key]')

# Function to generate completion from prompt
def generate_completion(prompt):
    """Send ``prompt`` to the OpenAI chat model and return the reply text.

    Args:
        prompt: Text sent as the user message, after a fixed system message.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    system_message = {"role": "system", "content": "You are assisting in answering a question."}
    user_message = {"role": "user", "content": prompt}
    response = OpenAI_client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[system_message, user_message],
    )
    first_choice = response.choices[0]
    return first_choice.message.content

# Function to embed Queries
def embed_query(Question):
    """Embed a single query string with the shared embedding model.

    Args:
        Question: The query text.

    Returns:
        A NumPy array built from the model's output for the one-element
        input list ``[Question]``.
    """
    query_vectors = list(embedding_model.embed([Question]))
    return np.array(query_vectors)

# Connect to Qdrant for the search calls in generate_response. This rebinds
# `client` using the same URL and key as the client created earlier.
# NOTE(review): the API key is written inline and looks like a placeholder.
client = QdrantClient(
    url="https://c065099d-b51c-4e03-b680-646b177fc993.us-east4-0.gcp.cloud.qdrant.io:6333",
    api_key="[Qdrant-API-Key]",
    https=True,
)

def generate_response(Question):
    """Answer ``Question`` using sentences retrieved from Qdrant as context.

    The question is embedded, up to 8 matching points are fetched from the
    ``RAG-Usage-Example`` collection, and their stored text is placed in a
    prompt together with the question itself.

    Args:
        Question: The user's question as a string.

    Returns:
        Whatever ``generate_completion`` returns for the assembled prompt.
    """
    collection_name = 'RAG-Usage-Example'
    retrieved_texts = []

    # embed_query returns an array with one row per embedded string.
    for query_embedding in embed_query(Question):
        hits = client.search(
            collection_name=collection_name,
            # Plain Python floats, matching how the vectors were uploaded.
            query_vector=query_embedding.tolist(),
            limit=8,
        )
        retrieved_texts.extend(hit.payload["text"] for hit in hits)

    # Every retrieved sentence is followed by a blank line.
    all_text = "".join(text + "\n\n" for text in retrieved_texts)

    # The prompt must carry the user's own question; otherwise the input would
    # only steer retrieval and the model would answer something else.
    prompt = f"Given the following text, answer the following question:\n\n{all_text}\n\nQuestion: {Question}\n\nAnswer:"
    return generate_completion(prompt)

# Build the Gradio UI: one "Question" textbox as input, one textbox for the
# generated answer, with generate_response as the handler function.
iface = gr.Interface(
    fn=generate_response,
    inputs=[gr.Textbox(label="Question")],
    outputs=[gr.Textbox(label="Generated Response")],
    title="RAG with Qdrant, FastEmbed and OpenAI",
    description="Enter a question and get a generated response based on the retrieved text.",
)

# Start serving the interface.
iface.launch()

# Using Google Gemini API (Gradio)

In [None]:
!pip install -q -U google-generativeai
!pip install gradio

In [None]:
from typing import List
from qdrant_client import QdrantClient
import google.generativeai as genai
import gradio as gr
import numpy as np

# Configure the Gemini SDK and create the model used by generate_completion.
# NOTE(review): the API key is written inline and looks like a placeholder to
# be replaced before running.
genai.configure(api_key="[Google-API-Key]")
model = genai.GenerativeModel('gemini-pro')

# Function to generate completion from prompt
def generate_completion(prompt):
    """Send ``prompt`` to the Gemini model and return the reply text.

    Args:
        prompt: Text passed to ``model.generate_content``.

    Returns:
        The ``text`` attribute of the generated response.
    """
    return model.generate_content(prompt).text

# Function to embed Queries
def embed_query(Question):
    """Embed a single query string with the shared embedding model.

    Args:
        Question: The query text.

    Returns:
        A NumPy array built from the model's output for the one-element
        input list ``[Question]``.
    """
    query_vectors = list(embedding_model.embed([Question]))
    return np.array(query_vectors)

# Connect to Qdrant for the search calls in generate_response. This rebinds
# `client` using the same URL and key as the clients created earlier.
# NOTE(review): the API key is written inline and looks like a placeholder.
client = QdrantClient(
    url="https://c065099d-b51c-4e03-b680-646b177fc993.us-east4-0.gcp.cloud.qdrant.io:6333",
    api_key="[Qdrant-API-Key]",
    https=True,
)

def generate_response(Question):
    """Answer ``Question`` using sentences retrieved from Qdrant as context.

    The question is embedded, up to 50 matching points are fetched from the
    ``RAG-Usage-Example`` collection, and their stored text is placed in a
    prompt together with the question itself.

    Args:
        Question: The user's question as a string.

    Returns:
        Whatever ``generate_completion`` returns for the assembled prompt.
    """
    collection_name = 'RAG-Usage-Example'
    retrieved_texts = []

    # embed_query returns an array with one row per embedded string.
    for query_embedding in embed_query(Question):
        hits = client.search(
            collection_name=collection_name,
            # Plain Python floats, matching how the vectors were uploaded.
            query_vector=query_embedding.tolist(),
            limit=50,
        )
        retrieved_texts.extend(hit.payload["text"] for hit in hits)

    # Every retrieved sentence is followed by a blank line.
    all_text = "".join(text + "\n\n" for text in retrieved_texts)

    # The prompt must carry the user's own question; otherwise the input would
    # only steer retrieval and the model would answer something else.
    prompt = f"Given the following text, answer the following question:\n\n{all_text}\n\nQuestion: {Question}\n\nAnswer:"
    return generate_completion(prompt)

# Build the Gradio UI: one "Question" textbox as input, one textbox for the
# generated answer, with generate_response as the handler function.
iface = gr.Interface(
    fn=generate_response,
    inputs=[gr.Textbox(label="Question")],  # single input component, given as a list
    outputs=[gr.Textbox(label="Generated Response")],  # single output component, given as a list
    title="RAG with Qdrant, FastEmbed and Gemini",
    description="Enter a question and get a generated response based on the retrieved text.",
)

# Start serving the interface.
iface.launch()