In [None]:
# Install the Python packages needed by the notebook and by app.py.
!pip install langchain langchain-core langchain-community colab-xterm pypdf ollama transformers streamlit torch numpy scikit-learn
# Download the Ollama install script and pipe it to sh.
!curl -fsSL https://ollama.com/install.sh | sh

# Left commented out: command that starts the Ollama server and runs llama3
# (presumably meant to be typed into the terminal opened below -- confirm).
# !ollama serve & ollama run llama3

# Load the colab-xterm extension and open a terminal inside the notebook.
%load_ext colabxterm
%xterm

In [None]:
%%writefile app.py
import hashlib
import os
import pickle
import re
import time
from io import BytesIO
from typing import Dict, List

import numpy as np
import ollama
import streamlit as st
import torch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pypdf import PdfReader
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

# Embedding tokenizer and model.
# Streamlit re-executes this script on every widget interaction, so the loader
# is cached: the weights are loaded once per server process instead of on
# every rerun. The spinner is disabled so that no UI element is emitted before
# main() calls st.set_page_config().
@st.cache_resource(show_spinner=False)
def _load_embedding_model(model_name: str = "BAAI/bge-base-en-v1.5"):
    """Load the Hugging Face tokenizer and encoder used to embed text.

    Args:
        model_name: Hub identifier passed to ``from_pretrained``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModel.from_pretrained(model_name),
    )


tokenizer, model = _load_embedding_model()

def _clean_extracted_text(text: str) -> str:
    """Tidy raw extracted text: re-join hyphenated words and normalise newlines."""
    # Re-join words that were split with a hyphen at a line break.
    text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
    # Replace line breaks with spaces, except newlines that sit next to a
    # newline/whitespace pair (those are treated as paragraph breaks).
    text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
    # Collapse any whitespace-only gap between two newlines to one blank line.
    return re.sub(r"\n\s*\n", "\n\n", text)


def parse_folder(files: List[BytesIO], filenames: List[str]) -> Dict[str, List[str]]:
    """Parse uploaded PDF, TXT and MD files into cleaned text.

    Args:
        files: In-memory file contents, in the same order as ``filenames``.
        filenames: Names of the uploaded files; the extension (matched
            case-insensitively) selects the parser.

    Returns:
        A mapping from filename to a list of cleaned text segments: one entry
        per page for PDFs, a single entry for text/markdown files. Files with
        any other extension are skipped.
    """
    def parse_pdf(file: BytesIO) -> List[str]:
        # extract_text() may yield nothing for image-only pages; treat as "".
        return [
            _clean_extracted_text(page.extract_text() or "")
            for page in PdfReader(file).pages
        ]

    def parse_text(file: BytesIO) -> List[str]:
        # Undecodable bytes are replaced rather than aborting the whole upload.
        raw_text = file.read().decode("utf-8", errors="replace")
        return [_clean_extracted_text(raw_text)]

    parsed_files = {}
    for file, filename in zip(files, filenames):
        # Compare extensions case-insensitively so "REPORT.PDF" is not dropped.
        lowered = filename.lower()
        if lowered.endswith(".pdf"):
            parsed_files[filename] = parse_pdf(file)
        elif lowered.endswith((".md", ".txt")):
            parsed_files[filename] = parse_text(file)

    return parsed_files

def split_text_into_chunks(parsed_files: Dict[str, List[str]], chunk_size=2000, chunk_overlap=0) -> Dict[str, List[str]]:
    """Split each file's parsed text into chunks of at most ``chunk_size`` characters.

    Args:
        parsed_files: Mapping from filename to its list of text segments.
        chunk_size: Chunk size handed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks handed to the splitter.

    Returns:
        Mapping from filename to the flat list of chunks produced from all of
        that file's segments, in order.
    """
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )

    # Flatten the per-segment splits into one chunk list per file.
    return {
        name: [piece for segment in segments for piece in splitter.split_text(segment)]
        for name, segments in parsed_files.items()
    }

def _chunks_digest(chunked_texts: List[str]) -> str:
    """Return a short, stable hex digest identifying a list of chunk texts."""
    hasher = hashlib.sha256()
    for chunk in chunked_texts:
        encoded = chunk.encode("utf-8", errors="replace")
        # Length-prefix each chunk so ["ab", "c"] and ["a", "bc"] hash differently.
        hasher.update(len(encoded).to_bytes(8, "big"))
        hasher.update(encoded)
    return hasher.hexdigest()[:16]


def convert_chunks_to_embeddings(chunks: Dict[str, List[str]], embeddings_dir: str = "embeddings") -> Dict[str, List[np.ndarray]]:
    """Embed every chunk with the Hugging Face model, caching results on disk.

    The cache file name includes a digest of the chunk texts, so uploading a
    different document under an already-seen filename recomputes the
    embeddings instead of reusing stale ones that no longer line up with the
    chunk list.

    Args:
        chunks: Mapping from filename to its list of text chunks.
        embeddings_dir: Directory holding the pickled embedding caches; it is
            created when missing.

    Returns:
        Mapping from filename to a list with one mean-pooled embedding array
        per chunk, in chunk order.
    """
    os.makedirs(embeddings_dir, exist_ok=True)

    embeddings = {}

    for filename, chunked_texts in chunks.items():
        cache_name = f"{os.path.basename(filename)}_{_chunks_digest(chunked_texts)}_embeddings.pkl"
        embedding_file = os.path.join(embeddings_dir, cache_name)

        if os.path.exists(embedding_file):
            # NOTE(security): pickle.load executes arbitrary code from the file.
            # This is only safe while embeddings_dir contains files written by
            # this function; never point it at untrusted data.
            with open(embedding_file, 'rb') as f:
                embeddings[filename] = pickle.load(f)
            continue

        file_embeddings = []
        for chunk in chunked_texts:
            inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)

            # Inference only: no gradients needed.
            with torch.no_grad():
                token_states = model(**inputs).last_hidden_state

            # Mean pooling over tokens, weighted by the attention mask so that
            # padding positions do not contribute.
            attention_mask = inputs['attention_mask']
            pooled = torch.sum(token_states * attention_mask.unsqueeze(-1), dim=1) / torch.clamp(attention_mask.sum(dim=1, keepdim=True), min=1e-9)

            file_embeddings.append(pooled.numpy())

        embeddings[filename] = file_embeddings

        with open(embedding_file, 'wb') as f:
            pickle.dump(file_embeddings, f)

    return embeddings

def find_top3_relevant_chunks(question_embeddings: np.ndarray, chunks_embeddings: Dict[str, List[np.ndarray]], chunk_texts: Dict[str, List[str]], top_k: int = 3) -> Dict[str, List[str]]:
    """Return, per file, the chunks most similar to the question embedding.

    Args:
        question_embeddings: Embedding vector of the question.
        chunks_embeddings: Mapping from filename to its per-chunk embedding
            arrays; each array has its leading axis squeezed away before
            comparison.
        chunk_texts: Mapping from filename to the chunk texts, aligned by index
            with ``chunks_embeddings``.
        top_k: Maximum number of chunks to return per file (defaults to 3).

    Returns:
        Mapping from filename to up to ``top_k`` chunk texts ordered from most
        to least similar by cosine similarity. Files without any embeddings
        map to an empty list.
    """
    top3_relevant_chunks = {}

    for filename, embeddings in chunks_embeddings.items():
        # A file that produced no chunks (e.g. no extractable text) has nothing
        # to rank, and np.vstack would raise on an empty list.
        if len(embeddings) == 0:
            top3_relevant_chunks[filename] = []
            continue

        chunk_matrix = np.vstack([emb.squeeze(axis=0) for emb in embeddings])

        # Cosine similarity between the question and every chunk of this file.
        similarities = cosine_similarity([question_embeddings], chunk_matrix).flatten()

        best_first = np.argsort(similarities)[::-1][:top_k]  # descending order
        top3_relevant_chunks[filename] = [chunk_texts[filename][idx] for idx in best_first]

    return top3_relevant_chunks

def prepare_and_get_response(question: str, top3_relevant_chunks: Dict[str, List[str]]) -> str:
    """Build the system prompt from the retrieved chunks and query llama3.

    Args:
        question: The user's question, sent as the user message.
        top3_relevant_chunks: Mapping from filename to the chunks selected for
            that file; each file is appended to the system prompt as a
            "Filename:" section followed by its chunks as bullet lines.

    Returns:
        The text content of the model's reply.
    """
    system_prompt = """
      You are a helpful Assistant who answers to users questions based on multiple contexts given to you.

      Keep your answer short and to the point.

      The evidence are the context of the pdf extract with metadata.

      Carefully focus on the metadata specially 'filename' and 'page' whenever answering.

      Make sure to add filename and page number at the end of sentence you are citing to.

      Reply "Not applicable" if text is irrelevant.

      The file content is:
    """

    # One section per file: a filename header followed by its bulleted chunks.
    context_sections = [
        f"\n\nFilename: {fname}\n" + "\n".join(f"- {chunk}" for chunk in chunks)
        for fname, chunks in top3_relevant_chunks.items()
    ]
    system_prompt += "".join(context_sections)

    conversation = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': question},
    ]
    reply = ollama.chat(model='llama3', messages=conversation)

    return reply['message']['content']

def _embed_question(question: str) -> np.ndarray:
    """Embed a question with the same mean-pooling scheme used for chunks."""
    inputs = tokenizer(question, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        token_states = model(**inputs).last_hidden_state
    attention_mask = inputs['attention_mask']
    pooled = torch.sum(token_states * attention_mask.unsqueeze(-1), dim=1) / torch.clamp(attention_mask.sum(dim=1, keepdim=True), min=1e-9)
    # Drop the batch axis so the result is a single vector.
    return pooled.numpy().squeeze(axis=0)


def main():
    """Streamlit entry point: upload documents, ask questions, show answers.

    Uploaded files are parsed, chunked and embedded; each submitted question
    is embedded, matched against the chunks, answered by the LLM, stored in
    the session history and rendered word by word.
    """
    st.set_page_config(layout="wide")
    st.title("RAG-Powered Llama AI")

    st.sidebar.header("Upload Documents")
    uploaded_files = st.sidebar.file_uploader("Upload your PDF, TXT, or MD files", type=["pdf", "txt", "md"], accept_multiple_files=True)

    if not uploaded_files:
        return

    filenames = [uploaded_file.name for uploaded_file in uploaded_files]
    # getvalue() returns the full content regardless of the current stream
    # position, unlike read(), which yields nothing once the buffer has
    # already been consumed.
    file_bytes = [BytesIO(uploaded_file.getvalue()) for uploaded_file in uploaded_files]

    parsed_contents = parse_folder(file_bytes, filenames)
    chunks = split_text_into_chunks(parsed_contents)
    embeddings = convert_chunks_to_embeddings(chunks)

    if 'history' not in st.session_state:
        st.session_state.history = []

    if st.session_state.history:
        st.subheader("History")
        for entry in st.session_state.history:
            st.write(f"**Question:** {entry['question']}")
            st.write(f"**Response:** {entry['response']}")
            st.write("---")

    with st.form(key='question_form', clear_on_submit=True):
        question = st.text_input("Enter your question:", key="question_input")
        submit_button = st.form_submit_button(label='Submit')

        if submit_button and question:
            question_embeddings = _embed_question(question)

            # Find the most relevant chunks of every file for this question.
            top3_relevant_chunks = find_top3_relevant_chunks(question_embeddings, embeddings, chunks)

            response = prepare_and_get_response(question, top3_relevant_chunks)

            st.session_state.history.append({
                'question': question,
                'response': response
            })

            # Display the response word by word by rewriting one placeholder.
            response_placeholder = st.empty()
            shown_words = []
            for word in response.split():
                shown_words.append(word)
                response_placeholder.write(" ".join(shown_words))
                time.sleep(0.05)

# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

In [None]:
# Fetch ipv4.icanhazip.com and print the response to stdout (presumably this
# machine's public IPv4 address, needed to access the tunnel -- confirm).
!wget -q -O - ipv4.icanhazip.com
# Start the Streamlit app in the background and open a localtunnel to port 8501
# (presumably the port Streamlit serves on -- confirm).
! streamlit run app.py & npx localtunnel --port 8501