In [None]:
!pip install faiss-cpu
!pip install groq
!pip install langchain-groq
!pip install PyPDF2
!pip install langchain_google_genai
!pip install langchain
!pip install streamlit
!pip install langchain_community
!pip install python-dotenv
!pip install pypdf
!pip install google-cloud-aiplatform>=1.38
!pip install fpdf
!pip install google-auth
!pip install transformers

In [None]:
from langchain.schema import Document
import os
import zipfile
from getpass import getpass
from google.colab import files
from langchain_groq import ChatGroq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import create_retrieval_chain
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import TextLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from fpdf import FPDF
import re
import time

# Step 1: Initialize API keys
# For each service: reuse the key if it is already in the environment,
# otherwise prompt for it (input is not echoed) and store it in os.environ.
groq_api_key = os.environ.get("GROQ_API_KEY")
if groq_api_key is None:
    groq_api_key = getpass("Enter your GROQ API Key: ")
    os.environ["GROQ_API_KEY"] = groq_api_key

google_api_key = os.environ.get("GOOGLE_API_KEY")
if google_api_key is None:
    google_api_key = getpass("Enter your Google API Key: ")
    os.environ["GOOGLE_API_KEY"] = google_api_key

# Step 2: Upload the ZIP File
# files.upload() opens the Colab upload widget; the first key of the returned
# mapping is used as the archive's file name.
print("Please upload your ZIP file containing Markdown files.")
uploaded_zip = files.upload()
zip_file_name = list(uploaded_zip)[0]

# Step 3: Extract Files from ZIP
# Unpack the whole archive into a dedicated folder (created if missing).
extracted_folder = "extracted_docs"
os.makedirs(extracted_folder, exist_ok=True)

with zipfile.ZipFile(zip_file_name) as zip_ref:
    zip_ref.extractall(path=extracted_folder)

print(f"Extracted files to {extracted_folder}.")

# Step 4: Upload the Train Text File
# Same upload flow as above; the first uploaded file is taken as the Q&A file.
print("Please upload your 'train.txt' file.")
uploaded_txt = files.upload()
train_file_name = list(uploaded_txt)[0]

# Step 5: Helper Functions
def _strip_url_trailer(url):
    """Remove trailing punctuation that belongs to the surrounding text."""
    while url:
        last = url[-1]
        if last in ".,;:!?":
            url = url[:-1]
        elif last == ")" and url.count(")") > url.count("("):
            # An unbalanced ")" closes a Markdown link or a parenthetical
            # remark; balanced ones (e.g. Wikipedia titles) are kept.
            url = url[:-1]
        else:
            break
    return url


def extract_urls(text):
    """Extract URLs from the given text.

    Args:
        text: Plain text or Markdown to scan for http(s) URLs.

    Returns:
        A list of URL strings in order of appearance (duplicates kept).
        Markdown/kramdown residue that directly follows a link, such as
        ``){:target="_blank"}``, and trailing sentence punctuation are not
        part of the returned URLs.
    """
    # Stop at whitespace and at characters that delimit a URL in prose or
    # Markdown; "{" in particular cuts off kramdown attribute lists.
    url_pattern = r'https?://[^\s<>"\'`{}\[\]]+'
    return [_strip_url_trailer(match) for match in re.findall(url_pattern, text)]

def create_pdf(text, filename="response.pdf"):
    """Write ``text`` to a single-flow PDF document at ``filename``.

    Args:
        text: The text to render.
        filename: Output path of the PDF (defaults to ``response.pdf``).
    """
    document = FPDF()
    document.add_page()
    document.set_font("Arial", size=12)
    # Round-trip through Latin-1 so that any character outside that charset
    # is substituted with "?" before it is handed to the PDF writer.
    latin1_text = text.encode('latin-1', 'replace').decode('latin-1')
    document.multi_cell(0, 10, latin1_text)
    document.output(filename, dest='F')

def parse_train_txt(txt_path):
    """Extract multiple Q&A pairs from the training text file.

    The file is expected to hold records separated by one or more blank
    lines. Within a record, everything before the first ``Answer:`` marker is
    the question and everything after it is the answer. Records without an
    ``Answer:`` marker are skipped.

    Args:
        txt_path: Path to the training text file.

    Returns:
        A list of ``{"question": str, "answer": str}`` dicts, in file order,
        with surrounding whitespace stripped from both values.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(txt_path, 'r', encoding='utf-8') as file:
        content = file.read()

    qa_pairs = []
    # A separator is a blank line, which may itself contain stray spaces or
    # tabs; splitting on the literal "\n\n" would glue such records together.
    for record in re.split(r'\n\s*\n', content):
        question, marker, answer = record.partition("Answer:")
        if marker:
            qa_pairs.append({"question": question.strip(), "answer": answer.strip()})
    return qa_pairs

# Step 6: Process Markdown Files and Training Data
def process_documents(folder_path, qa_pairs):
    """Build a FAISS vector store from Markdown files and Q&A pairs.

    Args:
        folder_path: Directory that is walked recursively for ``.md`` files.
        qa_pairs: Iterable of dicts with ``"question"`` and ``"answer"`` keys;
            each pair is indexed as an additional document.

    Returns:
        A ``(vectors, extracted_urls)`` tuple where ``vectors`` is the FAISS
        store built from the chunked documents and ``extracted_urls`` maps
        each Markdown file name to the list of URLs found in it.

    Raises:
        ValueError: If there is neither a Markdown file nor a Q&A pair to
            index.
    """
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    all_documents = []
    extracted_urls = {}

    # Process Markdown Files
    # (`file_names`, not `files`, to avoid shadowing the google.colab module
    # imported under that name.)
    for root, _, file_names in os.walk(folder_path):
        for file_name in file_names:
            if not file_name.endswith('.md'):
                continue
            file_path = os.path.join(root, file_name)
            # Decode explicitly as UTF-8 rather than the platform default.
            documents = TextLoader(file_path, encoding="utf-8").load()
            all_documents.extend(documents)

            # Extract URLs from each document. Files with the same name in
            # different sub-folders share one entry instead of overwriting
            # each other.
            urls = extracted_urls.setdefault(file_name, [])
            for doc in documents:
                urls.extend(extract_urls(doc.page_content))

    # Add Q&A Pairs as Additional Documents
    for qa in qa_pairs:
        qa_doc = Document(page_content=f"Q: {qa['question']}\nA: {qa['answer']}")
        all_documents.append(qa_doc)

    # Fail early with a clear message instead of an opaque error from the
    # vector store when there is nothing to embed.
    if not all_documents:
        raise ValueError(
            f"No Markdown files found under {folder_path!r} and no Q&A pairs "
            "supplied; nothing to index."
        )

    # Split documents and create vector embeddings
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
    final_documents = text_splitter.split_documents(all_documents)
    vectors = FAISS.from_documents(final_documents, embeddings)

    return vectors, extracted_urls

# Step 7: Parse Training Text File
# Turn the uploaded text file into a list of question/answer dicts.
qa_pairs = parse_train_txt(train_file_name)
print(f"Extracted {len(qa_pairs)} Q&A pairs from training data.")

# Step 8: Load Markdown Files and Training Data into Vector Store
# Index the extracted Markdown files together with the Q&A pairs.
vectors, extracted_urls = process_documents(extracted_folder, qa_pairs)
print("Processed Markdown files and Q&A training data. Vector embeddings are ready.")

# Display extracted URLs
# One line per Markdown file, URLs joined by ", " (empty if none were found).
for md_file, urls in extracted_urls.items():
    joined_urls = ', '.join(urls)
    print(f"Extracted URLs from {md_file}: {joined_urls}")

# Step 9: Initialize Language Model and Prompt
# NOTE(review): confirm that this model id is still served by Groq.
llm = ChatGroq(groq_api_key=groq_api_key, model_name="mixtral-8x7b-32768")

# {context} receives the retrieved document chunks and {input} the user's
# question. The context section is closed with </context> so the model sees a
# well-formed delimiter pair around the retrieved text.
prompt = ChatPromptTemplate.from_template("""
Answer the following question using the retrieved context.
Include references to the original documents or URLs whenever possible.

<context>
{context}
</context>

Question: {input}
""")

# Step 10: Build the RAG chain
# The chain does not depend on the question, so it is built once instead of
# on every loop iteration.
document_chain = create_stuff_documents_chain(llm, prompt)
retriever = vectors.as_retriever()
retrieval_chain = create_retrieval_chain(retriever, document_chain)

# Main Loop
# Read questions until the user types 'exit'; print each answer with its
# supporting excerpts and save it to response.pdf.
while True:
    # Strip first so that " exit " and whitespace-only input are handled.
    user_question = input("Enter your question about the uploaded documents (or type 'exit' to quit): ").strip()

    if user_question.lower() == "exit":
        print("Exiting the program. Goodbye!")
        break

    if not user_question:
        print("Please provide a valid question!")
        continue

    # Measure wall-clock response time. process_time() would only count CPU
    # time of this process and ignore the time spent waiting on the APIs.
    start = time.perf_counter()
    response_with_docs = retrieval_chain.invoke({'input': user_question})
    elapsed_time = time.perf_counter() - start

    # Extract relevant documents. create_retrieval_chain returns the
    # retrieved documents under the 'context' key (there is no
    # 'source_documents' key in its output).
    retrieved_docs = response_with_docs.get('context', [])
    relevant_references = []
    for doc in retrieved_docs:
        # Documents without a 'source' entry in their metadata get a
        # placeholder label.
        source = doc.metadata.get('source', 'Unknown Document')
        relevant_references.append(f"{source}: {doc.page_content[:200]}...")

    # Prepare response text
    response_text = response_with_docs['answer']
    if relevant_references:
        response_text += "\n\nReferences:\n" + "\n".join(relevant_references)

    # Display response and save to PDF
    print(f"Response time: {elapsed_time:.2f} seconds")
    print(response_text)
    create_pdf(response_text)
    print('The response has been saved to a PDF file as "response.pdf". Download it from the Colab file manager.')


Please upload your ZIP file containing Markdown files.


Saving docs.zip to docs (2).zip
Extracted files to extracted_docs.
Please upload your 'train.txt' file.


Saving train.txt to train (2).txt
Extracted 5 Q&A pairs from training data.
Processed Markdown files and Q&A training data. Vector embeddings are ready.
Extracted URLs from todo.md: 
Extracted URLs from writing_guide.md: 
Extracted URLs from index.md: https://www.fairmat-nfdi.eu/events/fairmat-tutorial-1/tutorial-1-materials){:target="_blank"}, https://youtube.com/playlist?list=PLrRaxjvn6FDW-_DzZ4OShfMPcTtnFoynT){:target="_blank"}, https://nomad-lab.eu/nomad-lab/support.html){:target="_blank"}, https://matsci.org/c/nomad/32){:target="_blank"}, https://nomad-lab.eu/nomad-lab/features.html){:target="_blank"}
Extracted URLs from aitoolkit.md: 
Extracted URLs from processing.md: 
Extracted URLs from architecture.md: https://pypi.org/project/nomad-lab/){:target="_blank"})., https://www.python.org/dev/peps/pep-0008/){:target="_blank"},, https://www.python.org/dev/peps/pep-0484/){:target="_blank"}, https://docs.astral.sh/ruff){:target="_blank"},, http://mypy-lang.org/){:target="_blank"}, http