In [None]:
# Install the third-party packages imported by the next cell.
# NOTE(review): versions are not pinned, so a fresh run may pull newer,
# incompatible releases — consider pinning once known-good versions are confirmed.
!pip install -qqq PyPDF2
!pip install -qqq chromadb
!pip install -qqq sentence_transformers
!pip install -qqq langchain
!pip install -qqq groq

In [25]:
import os
import PyPDF2
import chromadb
from google.colab import files  # For Colab file uploading
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import textwrap

from groq import Groq

# Groq API key: read it from the GROQ_API_KEY environment variable so the
# secret never has to be pasted into (and saved with) the notebook. The
# original placeholder string is kept as the fallback, so behavior is
# unchanged when the variable is not set.
API_KEY = os.environ.get("GROQ_API_KEY", "GROQ_API_KEY")

# Initialize the Groq client
client = Groq(api_key=API_KEY)

# Set model and parameters for Groq API
MODEL = "llama-3.1-70b-versatile"
TEMPERATURE = 0.7
MAX_TOKENS = 500  # Adjust based on your needs

In [6]:
# Step 1: Extract text from PDF files
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at ``pdf_path``.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        str: The page texts joined in page order with no separator. A page
        whose ``extract_text()`` result is empty or ``None`` contributes
        nothing.
    """
    with open(pdf_path, "rb") as f:
        pdf_reader = PyPDF2.PdfReader(f)
        # Extraction happens inside the ``with`` block because the reader
        # works on the still-open file handle. ``or ""`` keeps a page that
        # yields None from raising TypeError during concatenation.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    # A single join avoids the quadratic cost of repeated ``+=`` on strings.
    return "".join(page_texts)

# Step 2: Load PDFs from a specified directory in Colab
def load_pdfs(directory_path):
    """Extract the text of every PDF file directly inside ``directory_path``.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        list[str]: One extracted text per PDF, ordered by sorted filename.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # (and therefore the chunk ids derived from it later) reproducible.
    # The extension check is case-insensitive so "REPORT.PDF" is not skipped.
    pdf_names = sorted(
        name for name in os.listdir(directory_path)
        if name.lower().endswith(".pdf")
    )
    return [
        extract_text_from_pdf(os.path.join(directory_path, name))
        for name in pdf_names
    ]

# Step 3: Use Recursive Text Splitter to split the text into chunks
def split_text_recursive(text, chunk_size=500, chunk_overlap=50):
    """Split ``text`` into chunks with a RecursiveCharacterTextSplitter.

    Args:
        text: The text to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever the splitter's ``split_text`` returns for ``text``.
    """
    # Separators handed to the splitter: blank line, newline, space, empty string.
    separator_list = ["\n\n", "\n", " ", ""]
    return RecursiveCharacterTextSplitter(
        separators=separator_list,
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    ).split_text(text)

# Step 4: Embed text chunks using a pre-trained transformer model
def embed_texts(chunks, model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"):
    """Embed ``chunks`` with a SentenceTransformer model.

    Args:
        chunks: The texts to embed.
        model_name: Name of the SentenceTransformer model to load. Defaults to
            the multilingual model this notebook originally hard-coded;
            replace it with a Turkish-specific model if needed.

    Returns:
        The result of ``SentenceTransformer.encode`` for ``chunks``.
    """
    # The model is loaded on every call; callers embedding repeatedly should
    # keep their own model instance instead.
    embedding_model = SentenceTransformer(model_name)
    return embedding_model.encode(chunks, show_progress_bar=True)

# Step 5: Store embeddings in Chroma
def create_chroma_db(chunks, embeddings):
    """Create a fresh Chroma collection holding ``chunks`` and their embeddings.

    Any existing collection named "document_chunks" is deleted first, so each
    call starts from an empty collection.

    Args:
        chunks: The text chunks to store as documents.
        embeddings: One embedding per chunk, either an object with a
            ``tolist()`` method (e.g. a NumPy array) or a plain sequence of
            vectors.

    Returns:
        The newly created collection. Chunk ``i`` is stored under id
        ``str(i)`` with metadata ``{"chunk": i}``. If ``chunks`` is empty the
        collection is returned without adding anything.
    """
    collection_name = "document_chunks"  # Choose a name for your collection
    # Named chroma_client so it does not shadow the module-level Groq `client`.
    chroma_client = chromadb.Client()  # Initialize Chroma client

    try:
        # Try to delete the collection if it already exists
        chroma_client.delete_collection(collection_name)
        print(f"Deleted existing collection: {collection_name}")
    except Exception as e:
        # Best effort: deletion fails when there is nothing to delete yet.
        print(f"Could not delete collection: {e}")

    collection = chroma_client.create_collection(collection_name)

    chunk_count = len(chunks)
    if chunk_count == 0:
        # Nothing to store; skip the add call rather than pass empty lists.
        return collection

    # Accept both array-like embeddings and plain nested sequences.
    if hasattr(embeddings, "tolist"):
        embedding_rows = embeddings.tolist()
    else:
        embedding_rows = [list(vector) for vector in embeddings]

    # Add chunks and their corresponding embeddings to the Chroma collection
    collection.add(
        documents=chunks,
        embeddings=embedding_rows,
        metadatas=[{"chunk": i} for i in range(chunk_count)],  # Metadata to track each chunk
        ids=[str(i) for i in range(chunk_count)]
    )
    return collection

# Step 6: Retrieve relevant chunks from Chroma based on the user's question
def retrieve_relevant_chunks(question, embedding_model, collection, top_k=3):
    """Return the documents the collection reports for ``question``.

    Args:
        question: The user's question text.
        embedding_model: Object whose ``encode`` embeds a list of texts.
        collection: Collection to query with the question embedding.
        top_k: Passed to the query as ``n_results``.

    Returns:
        list: The documents from the first (only) query result.
    """
    # A single question goes in as a one-element batch; take its vector back out.
    query_vector = embedding_model.encode([question])[0].tolist()
    hits = collection.query(query_embeddings=[query_vector], n_results=top_k)
    # 'documents' holds one list per query embedding; we sent exactly one.
    return list(hits['documents'][0])

# Step 7: Use Groq API via the GroqClient to generate answers using chat completions
class _FallbackAnswer(str):
    """Fallback text that also exposes ``.content`` like a chat message object."""

    @property
    def content(self):
        return str(self)


def generate_answer_with_groq(question, retrieved_chunks):
    """Ask the Groq chat-completions API to answer ``question`` from the chunks.

    Args:
        question: The user's question text.
        retrieved_chunks: Iterable of strings; they are joined with newlines
            and sent as the context message.

    Returns:
        The ``message`` of the first choice in the response. When the response
        is falsy or carries no choices, a string equal to
        "Could not generate an answer." is returned; it also has a
        ``.content`` attribute, so callers can read ``.content`` in both cases.
    """
    context = "\n".join(retrieved_chunks)  # Combine retrieved chunks into context

    # Format messages in a chat-like structure for the Groq API
    messages = [
        {"role": "system", "content": "You are an assistant for answering questions based on the provided context."},
        {"role": "user", "content": f"Context: {context}"},
        {"role": "user", "content": f"Question: {question}"}
    ]

    # Use the Groq client to send the messages for inference
    response = client.chat.completions.create(
        model=MODEL,
        messages=messages,
        temperature=TEMPERATURE,
        max_tokens=MAX_TOKENS,
    )

    # An empty choices list would otherwise raise IndexError on choices[0].
    choices = getattr(response, "choices", None) if response else None
    if not choices:
        return _FallbackAnswer("Could not generate an answer.")
    return choices[0].message



In [None]:

# Step 1: Upload PDF files in Colab manually or mount Google Drive
print("Please upload PDF files.")
uploaded_files = files.upload()  # Use the Colab file uploader

# Save uploaded files to a temporary folder
directory_path = "/content/pdf_files"
os.makedirs(directory_path, exist_ok=True)

for filename, file_bytes in uploaded_files.items():
    with open(os.path.join(directory_path, filename), "wb") as f:
        f.write(file_bytes)

# Step 2: Load and extract text from PDFs
texts = load_pdfs(directory_path)

# Step 3: Split the text into smaller chunks using the recursive splitter
text_chunks = []
for text in texts:
    chunks = split_text_recursive(text)
    text_chunks.extend(chunks)

# Fail early with a clear message: with no chunks there is nothing to embed,
# store, or retrieve, and the later steps would fail less readably.
if not text_chunks:
    raise ValueError(f"No text could be extracted from the PDF files in {directory_path}")

# Step 4: Embed the text chunks
embeddings = embed_texts(text_chunks)

# Step 5: Create a Chroma DB and store embeddings
collection = create_chroma_db(text_chunks, embeddings)

# Step 6: Ask questions
embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")  # Same model as used for embedding text

# Interactive loop; type "exit" to stop.
while True:
    question = input("\nEnter your question: ").strip()
    if question.lower() == "exit":
        break
    if not question:
        # A blank line is not a question; prompt again instead of querying.
        continue

    # Retrieve relevant chunks based on the user's question
    retrieved_chunks = retrieve_relevant_chunks(question, embedding_model, collection)

    # Generate an answer based on the retrieved chunks and the question using Groq API
    answer = generate_answer_with_groq(question, retrieved_chunks)
    # The answer is normally a message object exposing `.content`; fall back
    # to the value itself when it is a plain string.
    answer_text = getattr(answer, "content", answer) or ""
    wrapped_lines = [textwrap.fill(line, width=80) for line in answer_text.splitlines()]
    wrapped_text = "\n".join(wrapped_lines)
    print("\n")
    print(wrapped_text)



In [29]:
# Sanity check: how many chunks the splitting step produced across all PDFs.
len(text_chunks)

132