In [None]:
!pip install faiss-cpu
!pip install groq
!pip install langchain-groq
!pip install PyPDF2
!pip install langchain_google_genai
!pip install langchain
!pip install streamlit
!pip install langchain_community
!pip install python-dotenv
!pip install pypdf
!pip install google-cloud-aiplatform>=1.38
!pip install fpdf
!pip install google-auth

In [None]:
import os
from getpass import getpass
from google.colab import files
from langchain_groq import ChatGroq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from fpdf import FPDF
import re
import time

# Step 1: Initialize API keys
# Secrets must never live in source control: read each key from the
# environment and fall back to an interactive, non-echoing prompt when absent.
# NOTE(review): the keys that were previously hard-coded here should be treated
# as leaked and revoked with the respective providers.
# Both module-level names are always bound so later cells can rely on them.
groq_api_key = os.environ.get("GROQ_API_KEY") or getpass("Enter your GROQ API Key: ")
os.environ["GROQ_API_KEY"] = groq_api_key

google_api_key = os.environ.get("GOOGLE_API_KEY") or getpass("Enter your Google API Key: ")
# The Google embeddings client used later is expected to pick the key up from
# the environment, so export it after prompting.
os.environ["GOOGLE_API_KEY"] = google_api_key

# Step 2: Upload files
# Ask the user to pick files through the Colab upload widget; the result maps
# each uploaded file name to its raw bytes.
print("Please upload your PDF files.")
uploaded = files.upload()

# Step 3: Save uploaded files locally
# Keep the uploaded names (in upload order) for the later processing steps.
pdf_files = list(uploaded)
print(f"Uploaded files: {pdf_files}")

# Persist every payload under its own name in the working directory.
for upload_name, payload in uploaded.items():
    with open(upload_name, "wb") as handle:
        handle.write(payload)

# Step 4: Helper Functions
# Compiled once: extract_urls is called for every page of every document.
_URL_PATTERN = re.compile(r'https?://[^\s]+')
# Sentence punctuation that commonly follows a URL in prose but is not part of it.
_TRAILING_PUNCTUATION = ".,;:!?\"'"
# Closing bracket -> matching opener, used to drop only *unbalanced* closers.
_BRACKET_PAIRS = {")": "(", "]": "[", "}": "{", ">": "<"}


def _strip_trailing_punctuation(url):
    """Remove prose punctuation and unbalanced closing brackets from the end of *url*."""
    while url:
        last = url[-1]
        if last in _TRAILING_PUNCTUATION:
            url = url[:-1]
        elif last in _BRACKET_PAIRS and url.count(last) > url.count(_BRACKET_PAIRS[last]):
            # e.g. "(see https://example.com/x)" -- the ")" belongs to the sentence,
            # whereas a balanced ".../Python_(language)" is left untouched.
            url = url[:-1]
        else:
            break
    return url


def extract_urls(text):
    """Extract http(s) URLs from the given text.

    Args:
        text: The string to scan. ``None`` or an empty string yields no URLs.

    Returns:
        A list of URLs in order of appearance (duplicates are kept). Trailing
        sentence punctuation and unbalanced closing brackets are stripped so
        that "See https://example.com." yields "https://example.com".
    """
    if not text:
        return []
    return [_strip_trailing_punctuation(match) for match in _URL_PATTERN.findall(text)]

def create_pdf(text, filename="response.pdf"):
    """Save the given text as a single-flow PDF document.

    Args:
        text: The text to write into the PDF.
        filename: Path of the PDF file to create. Defaults to "response.pdf".
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    # The text is round-tripped through Latin-1 so that any character outside
    # that encoding is replaced with "?" before being handed to FPDF.
    pdf.multi_cell(0, 10, text.encode('latin-1', 'replace').decode('latin-1'))
    pdf.output(filename, dest='F')

# Step 5: Process Files
def process_uploaded_pdfs(pdf_paths):
    """Load the given PDFs, collect their URLs and index their text in FAISS.

    Args:
        pdf_paths: Iterable of paths to the PDF files to load.

    Returns:
        A ``(vectors, extracted_urls)`` tuple: the FAISS vector store built
        from the split documents, and a dict mapping each PDF path to the list
        of URLs found in its pages.
    """
    embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    combined_pages = []
    urls_by_pdf = {}

    for path in pdf_paths:
        pages = PyPDFLoader(path).load()
        combined_pages += pages

        # Gather the URLs of every page of this PDF, in page order.
        urls_by_pdf[path] = [
            url
            for page in pages
            for url in extract_urls(page.page_content)
        ]

    # Chunk the loaded pages, then embed the chunks into a vector store.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
    chunks = splitter.split_documents(combined_pages)
    store = FAISS.from_documents(chunks, embedding_model)

    return store, urls_by_pdf

# Step 6: Load PDFs and Create Vector Embeddings
# Index every uploaded PDF and keep the per-file URL listing for reporting.
vectors, extracted_urls = process_uploaded_pdfs(pdf_files)
print(f"Processed {len(pdf_files)} PDF(s). Vector embeddings are ready.")

# Report the URLs found in each PDF on one line per file.
for source_pdf in extracted_urls:
    joined_urls = ', '.join(extracted_urls[source_pdf])
    print(f"Extracted URLs from {source_pdf}: {joined_urls}")

# Step 7: Initialize Language Model and Prompt
# Read the key from the environment (populated in Step 1) rather than from a
# module-level variable that is not bound on every code path.
# NOTE(review): confirm "mixtral-8x7b-32768" is still served by Groq; switch to
# a currently supported model if the API rejects it.
llm = ChatGroq(groq_api_key=os.environ["GROQ_API_KEY"], model_name="mixtral-8x7b-32768")

# {context} receives the retrieved document text and {input} the user question.
# The context section is closed with a proper </context> tag so the model can
# tell where the retrieved material ends.
prompt = ChatPromptTemplate.from_template("""
Answer the following questions based on the provided context ONLY.
Please provide detailed, accurate, and professional responses.
Where possible, include references to URLs or source documents.
<context>
{context}
</context>
Question: {input}
""")

# Main Loop
# The retrieval chain does not depend on the question, so build it once
# instead of on every iteration.
document_chain = create_stuff_documents_chain(llm, prompt)
retriever = vectors.as_retriever()
retrieval_chain = create_retrieval_chain(retriever, document_chain)

while True:
    # Strip surrounding whitespace so " exit " and blank input are handled uniformly.
    user_question = input("Enter your question about the uploaded documents (or type 'exit' to quit): ").strip()

    if user_question.lower() == "exit":
        print("Exiting the program. Goodbye!")
        break

    if not user_question:
        print("Please provide a valid question!")
        continue

    # Step 9: Generate Response
    # Measure wall-clock latency: the call is network-bound, so CPU time
    # (time.process_time) would under-report what the user actually waits.
    start = time.perf_counter()
    response_with_docs = retrieval_chain.invoke({'input': user_question})
    elapsed_time = time.perf_counter() - start

    # create_retrieval_chain returns the retrieved documents under "context"
    # (alongside "input" and "answer"); there is no "source_documents" key.
    retrieved_docs = response_with_docs.get('context', [])

    # Collect URLs from the retrieved chunks, de-duplicated while preserving
    # first-seen order so the reference list is stable between runs.
    relevant_urls = list(dict.fromkeys(
        url
        for doc in retrieved_docs
        for url in extract_urls(doc.page_content)
    ))

    # Prepare response text
    response_text = response_with_docs['answer']
    if relevant_urls:
        response_text += "\n\nRelevant References:\n" + "\n".join(relevant_urls)

    # Display response and save to PDF (overwrites the previous response.pdf)
    print(f"Response time: {elapsed_time:.2f} seconds")
    print(response_text)
    create_pdf(response_text)
    print('The response has been saved to a PDF file as "response.pdf". Download it from the Colab file manager.')
