In [None]:
!pip install pymupdf
!pip install nltk
!pip install sentence_transformers
!pip install openai

Collecting pymupdf
  Downloading PyMuPDF-1.24.7-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.6 (from pymupdf)
  Downloading PyMuPDFb-1.24.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m56.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.24.6 pymupdf-1.24.7
Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvi

In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

import fitz  # PyMuPDF
import re
from nltk.tokenize import word_tokenize, sent_tokenize


def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        str: Text of all pages joined without any separator.
    """
    # The context manager closes the document even if extraction fails midway;
    # the previous version never released the handle.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        return "".join(
            doc.load_page(page_num).get_text() for page_num in range(len(doc))
        )

# Extract text from three textbooks
from pathlib import Path

# Single place to change where the PDFs live.
DATA_DIR = Path("/content/drive/MyDrive/stepsis")
PDF_FILENAMES = [
    "BIOINFORMATICS AN INTRODUCTION_BY_J.RAMSEDEN.pdf",
    "Bailey & Scott's Diagnostic Microbiology 14e.pdf",
    "bioethics_and_biosafety_in_biotechnology.pdf",
]
# pdf_paths stays a list of plain strings, with the same values as before.
pdf_paths = [str(DATA_DIR / filename) for filename in PDF_FILENAMES]
texts = [extract_text_from_pdf(pdf_path) for pdf_path in pdf_paths]
# Ensure text extraction is successful
for i, text in enumerate(texts):
    print(f"Textbook {i+1} has {len(text.split())} words")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Textbook 1 has 123601 words
Textbook 2 has 681702 words
Textbook 3 has 72385 words


In [None]:
def preprocess_text(text):
    """
    Normalise raw text before chunking.

    Steps, in order: delete digit runs, lower-case, delete every character that
    is not a word character, whitespace or one of ``. ? !``, tokenize, drop
    English stop words, and join the remaining tokens with single spaces.

    """
    without_digits = re.sub(r'\d+', '', text)
    lowered = without_digits.lower()
    # Sentence-ending punctuation (. ? !) survives this filter
    cleaned = re.sub(r'[^\w\s\.\?!]', '', lowered)
    tokens = word_tokenize(cleaned)
    english_stop_words = set(stopwords.words('english'))
    kept_tokens = []
    for token in tokens:
        if token in english_stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Run the normalisation step over every extracted textbook
preprocessed_texts = list(map(preprocess_text, texts))

# Report how many whitespace-separated words survive per textbook
for book_number, cleaned_text in enumerate(preprocessed_texts, start=1):
    word_count = len(cleaned_text.split())
    print(f"Textbook {book_number} has {word_count} words after preprocessing")

Textbook 1 has 78174 words after preprocessing
Textbook 2 has 472465 words after preprocessing
Textbook 3 has 44038 words after preprocessing


In [None]:
def chunk_text(text, chunk_size=100):
    """Group consecutive sentences into chunks of at most ``chunk_size`` tokens.

    Sentences are never split: a single sentence longer than ``chunk_size``
    becomes a chunk of its own.

    Args:
        text: Text to split; sentence boundaries come from ``sent_tokenize``.
        chunk_size: Maximum number of ``word_tokenize`` tokens per chunk.

    Returns:
        list[str]: Non-empty chunks, each the sentences of the chunk joined by
        a single space. An empty list when ``text`` contains no sentences.
    """
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sent_tokenize(text):
        sentence_length = len(word_tokenize(sentence))
        # Flush only when there is something to flush. Previously an over-long
        # first sentence caused an empty '' chunk to be emitted.
        if current_chunk and current_length + sentence_length > chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += sentence_length

    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

# Split every preprocessed textbook into sentence-aligned chunks
chunked_texts = list(map(chunk_text, preprocessed_texts))

# Report the number of chunks produced per textbook
for book_number, book_chunks in enumerate(chunked_texts, start=1):
    chunk_count = len(book_chunks)
    print(f"Textbook {book_number} has {chunk_count} chunks")


Textbook 1 has 847 chunks
Textbook 2 has 5162 chunks
Textbook 3 has 483 chunks


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.mixture import GaussianMixture
import openai

# Load the Sentence-BERT model used for every embedding in this notebook
# (chunk embeddings, summary embeddings and query embeddings all go through it).
sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Function to generate embeddings
def get_embeddings(chunks):
    """Encode text chunks with the shared Sentence-BERT model.

    Returns an empty list for empty input; otherwise the encoded tensor
    converted to nested Python lists (one vector per chunk).
    """
    if chunks:
        encoded = sbert_model.encode(chunks, convert_to_tensor=True)
        return encoded.tolist()
    return []

# Embed the chunks of every textbook
chunk_embeddings = list(map(get_embeddings, chunked_texts))

# Report how many embeddings were produced per textbook
for book_number, book_vectors in enumerate(chunk_embeddings, start=1):
    if book_vectors:
        print(f"Textbook {book_number} has {len(book_vectors)} embeddings")
    else:
        print(f"Textbook {book_number} embeddings are empty")





  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Textbook 1 has 847 embeddings
Textbook 2 has 5162 embeddings
Textbook 3 has 483 embeddings


In [None]:
from sklearn.mixture import GaussianMixture

def cluster_embeddings(embeddings, n_clusters=15):
    """Fit a tied-covariance Gaussian mixture and hard-assign each embedding.

    Args:
        embeddings: Embedding vectors, one row per item.
        n_clusters: Number of mixture components to fit.

    Returns:
        tuple: (predicted component index per embedding, component means).
    """
    mixture = GaussianMixture(
        n_components=n_clusters,
        covariance_type='tied',
        max_iter=150,
        random_state=42,
    )
    fitted = mixture.fit(embeddings)  # fit() hands back the estimator itself
    return fitted.predict(embeddings), fitted.means_

# Cluster the chunk embeddings of the first textbook only
first_book_embeddings = chunk_embeddings[0]
cluster_assignments, cluster_centers = cluster_embeddings(first_book_embeddings, n_clusters=10)

# Show the per-chunk assignments and the shape of the centre matrix
print("Cluster assignments: {}".format(cluster_assignments))
print("Cluster centers shape: {}".format(cluster_centers.shape))


Cluster assignments: [2 0 0 2 2 0 2 0 0 2 2 0 2 6 6 7 6 6 6 6 7 6 6 7 7 7 7 7 7 7 6 7 7 7 6 6 7
 7 7 3 7 7 6 3 3 3 3 3 3 3 3 3 3 3 3 3 3 6 3 3 3 7 7 3 3 3 3 3 6 6 3 3 3 7
 6 7 3 3 6 3 6 3 7 3 6 6 6 7 6 2 2 4 2 0 0 2 0 0 0 0 0 0 0 0 2 2 2 2 2 2 9
 2 1 9 9 9 9 9 2 1 1 9 8 2 2 2 2 2 9 2 2 9 9 9 2 2 9 9 9 9 9 9 2 2 2 2 2 2
 2 2 2 2 4 0 5 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 8 2 2 2 0 2 2 2 2 2 9 8 2 8 4
 4 4 4 5 8 1 1 9 9 9 9 1 2 9 9 1 1 2 9 9 9 9 9 8 9 9 9 0 9 2 1 1 9 1 1 1 1
 1 1 9 9 9 9 9 9 9 9 9 9 9 9 1 9 9 9 9 9 9 9 1 1 1 9 9 9 9 9 9 2 9 9 9 9 9
 9 9 9 9 2 9 9 4 4 9 9 9 9 9 1 1 1 1 1 1 1 1 8 1 9 9 2 9 9 9 9 9 2 9 9 9 9
 9 2 2 1 1 1 1 1 1 1 8 8 8 1 1 8 8 8 8 9 8 8 8 8 8 8 4 1 1 1 1 1 1 2 1 2 8
 8 1 1 1 8 9 4 4 2 2 2 1 9 1 2 1 1 1 1 1 1 1 8 1 1 1 1 1 2 2 2 2 4 4 4 4 4
 2 4 2 2 4 4 2 8 8 8 4 1 1 2 8 2 4 2 2 2 2 2 0 2 0 0 4 5 5 5 5 5 5 5 5 5 5
 5 1 4 4 4 0 4 5 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 0 5 4 9 4 4 4 4 4 4 4 4 4 4 4 4 0 4 5 5 4 4 4 4 4 5 4 5 4 4 5 4 5 4
 2 2

In [None]:
from transformers import pipeline

# Load the Hugging Face summarization pipeline backed by facebook/bart-large-cnn.
# `summarizer` is the module-level object that summarize_cluster() calls.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_cluster(cluster_texts, max_chars=1024):
    """Summarise the texts of one cluster with the module-level ``summarizer``.

    Args:
        cluster_texts: List of strings belonging to one cluster.
        max_chars: Maximum number of CHARACTERS of the space-joined text handed
            to the model. Defaults to 1024, the value that used to be
            hard-coded.

    Returns:
        str: The generated summary, or "" when ``cluster_texts`` is empty.
    """
    if not cluster_texts:
        return ""
    # This is a character cut, not a token cut (the old comment called it
    # "tokens"); it only bounds how much text is sent to the model.
    cluster_text = ' '.join(cluster_texts)[:max_chars]
    # truncation=True makes the tokenizer enforce the model's real token limit,
    # which a character cut cannot guarantee for text that tokenizes to more
    # than one token per character.
    summary = summarizer(
        cluster_text,
        max_length=100,
        min_length=50,
        do_sample=False,
        truncation=True,
    )
    return summary[0]['summary_text']

# Summarize each cluster.
# Invariant: summarized_texts[i] is the summary of cluster i. Empty or failed
# clusters get an "" placeholder so later cells that pair summaries with
# clusters by index (e.g. inspect_clusters) stay aligned.
summarized_texts = []
for i in range(len(cluster_centers)):
    cluster_texts = [
        chunk
        for chunk, cluster_id in zip(chunked_texts[0], cluster_assignments)
        if cluster_id == i
    ]
    if not cluster_texts:
        print(f"Cluster {i} has no texts assigned.")
        summarized_texts.append("")
        continue
    print(f"Cluster {i} has {len(cluster_texts)} texts.")
    try:
        summary = summarize_cluster(cluster_texts)
    except Exception as e:
        # Best effort: report the failure and keep going with a placeholder.
        print(f"Error summarizing cluster {i}: {e}")
        summary = ""
    summarized_texts.append(summary)

# Print summaries for inspection
for i, summary in enumerate(summarized_texts):
    print(f"Summary for cluster {i}: {summary}")


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Cluster 0 has 111 texts.
Cluster 1 has 76 texts.
Cluster 2 has 127 texts.
Cluster 3 has 31 texts.
Cluster 4 has 206 texts.
Cluster 5 has 113 texts.
Cluster 6 has 22 texts.
Cluster 7 has 23 texts.
Cluster 8 has 45 texts.
Cluster 9 has 93 texts.
Summary for cluster 0:  computational biology jeremy ramsden bioinformatics introduction third edition computational biology volume editorsinchief andreas dress casmpg partner institute computational biology shanghai china michal linial hebrew university jerusalem jerusalem israel olga troyanskaya princeton university princeton nj usa martin vingron max planck institute molecular genetics berlin germany editorial board robert giegerich university bielefeld bie
Summary for cluster 1: One way to quantify variety is to count number of different kinds objects. objects considered course category category speciﬁed ball. shown tray containing balls three colours red r blue b white w might reasonably assert variety three . hence one way quantify variety 

In [None]:
# Function to manually inspect a few clusters and their summaries
def inspect_clusters(cluster_texts, summaries, num_clusters=3):
    """Print a preview of the first clusters next to their summaries.

    For each of the first ``num_clusters`` clusters (fewer if there are not
    that many) this prints up to five member texts, each cut to 200 characters,
    followed by ``summaries[i]`` and an 80-dash divider.
    """
    shown = min(num_clusters, len(cluster_texts))
    divider = "-" * 80
    for cluster_id, members in zip(range(shown), cluster_texts):
        print(f"\nCluster {cluster_id} Original Texts:")
        for member in members[:5]:
            preview = member[:200]
            print(f"- {preview}...")

        print(f"\nCluster {cluster_id} Summary:")
        print(summaries[cluster_id])
        print(divider)

# Bucket the first textbook's chunks by their assigned cluster
cluster_texts = [[] for _ in range(len(cluster_centers))]
first_book_chunks = chunked_texts[0]
for position in range(min(len(first_book_chunks), len(cluster_assignments))):
    bucket = cluster_texts[cluster_assignments[position]]
    bucket.append(first_book_chunks[position])

# Print the first three clusters alongside their summaries
inspect_clusters(cluster_texts, summarized_texts, num_clusters=3)



Cluster 0 Original Texts:
- computational biology jeremy ramsden bioinformatics introduction third edition computational biology volume editorsinchief andreas dress casmpg partner institute computational biology shanghai china m...
- main emphasis current scientiﬁc developments innovative techniques computational biology bioinformatics bringing light methods mathemat ics statistics computer science directly address biological prob...
- main additions part iii applications acquired new sections chapters seemingly everexpanding omicsnow metagenomics toxicogenomics glycomics lipidomics microbiomics phenomics covered albeit mostly brieﬂ...
- organization features chapters grouped three parts respectively covering relevant fun damentals information science overviewing biology surveying applications . thus part fundamentals carefully explai...
- chapters start discussion experimental aspects dna sequencing genomics chapter move thorough discussion data analysed . speciﬁcally medical applica

In [None]:
# Re-embed the summarized texts
def get_summarized_embeddings(summarized_texts):
    """Encode summary strings with the shared Sentence-BERT model.

    Returns the encoded tensor converted to nested Python lists.
    """
    encoded = sbert_model.encode(summarized_texts, convert_to_tensor=True)
    return encoded.tolist()

summarized_embeddings = get_summarized_embeddings(summarized_texts)

# Report whether any summary embeddings were produced
if summarized_embeddings:
    print(f"Generated {len(summarized_embeddings)} summarized embeddings")
else:
    print("Summarized embeddings are empty")


Generated 10 summarized embeddings


In [None]:
def recursive_clustering(embeddings, level=0, max_level=3, n_clusters=5, random_state=42):
    """Recursively split embeddings into a nested list of clusters.

    At each level the embeddings are clustered with a tied-covariance Gaussian
    mixture and every non-empty cluster is split again, until ``max_level`` is
    reached or a group has ``n_clusters`` items or fewer.

    Args:
        embeddings: List of embedding vectors.
        level: Current recursion depth (callers normally leave this at 0).
        max_level: Depth at which splitting stops.
        n_clusters: Number of mixture components per split; groups of this
            size or smaller are returned as a leaf.
        random_state: Seed passed to ``GaussianMixture`` so repeated runs give
            the same hierarchy.

    Returns:
        list: ``[embeddings]`` for a leaf, otherwise a list with one nested
        result per non-empty cluster.

    Note:
        Earlier revisions also summarised every cluster here. The summaries
        were discarded, and they were built by indexing the global
        ``summarized_texts`` with positions local to this call, which is wrong
        below the top level. That work was removed; the return value is
        unchanged.
    """
    if level >= max_level or len(embeddings) <= n_clusters:
        return [embeddings]

    gmm = GaussianMixture(
        n_components=n_clusters,
        covariance_type='tied',
        random_state=random_state,
    )
    cluster_assignments = gmm.fit(embeddings).predict(embeddings)

    # Bucket the embeddings by their assigned component.
    clusters = [[] for _ in range(n_clusters)]
    for embedding, assignment in zip(embeddings, cluster_assignments):
        clusters[assignment].append(embedding)

    # Recurse into every non-empty cluster.
    return [
        recursive_clustering(members, level + 1, max_level, n_clusters, random_state)
        for members in clusters
        if members
    ]

# Apply recursive clustering to the summary embeddings.
# The result is a nested list whose innermost lists hold embedding vectors.
hierarchical_clusters = recursive_clustering(summarized_embeddings)


Your max_length is set to 100, but your input_length is only 90. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=45)
Your max_length is set to 100, but your input_length is only 93. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=46)


In [None]:
# # Flatten hierarchical structure for insertion
# def flatten_hierarchical_clusters(hierarchical_clusters, flattened=None, level=0):
#     if flattened is None:
#         flattened = []
#     for cluster in hierarchical_clusters:
#         if isinstance(cluster, list):
#             flatten_hierarchical_clusters(cluster, flattened, level + 1)
#         else:
#             flattened.append((level, cluster))
#     return flattened

# flattened_clusters = flatten_hierarchical_clusters(hierarchical_clusters)
# ids, final_embeddings, metadata = zip(*[(i, cluster[1], f"level_{cluster[0]}") for i, cluster in enumerate(flattened_clusters)])


In [None]:
!pip install faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.8.0.post1


In [None]:
import faiss
import numpy as np
def flatten_hierarchical_clusters(hierarchical_clusters, flattened=None, level=0):
    """Flatten a nested cluster structure into ``(level, item)`` pairs.

    A list whose first element is itself a list is treated as a branch and
    descended into with ``level + 1``. Anything else is treated as a leaf and
    recorded with the level at which it was found. Empty lists carry no item
    and are skipped.

    Args:
        hierarchical_clusters: Nested lists to walk.
        flattened: Accumulator used by the recursion; leave as ``None``.
        level: Nesting depth of ``hierarchical_clusters``.

    Returns:
        list[tuple]: ``(level, item)`` pairs in traversal order.
    """
    if flattened is None:
        flattened = []
    for cluster in hierarchical_clusters:
        if isinstance(cluster, list):
            if not cluster:
                # Previously cluster[0] raised IndexError here, e.g. for the
                # [[]] that recursive_clustering returns for empty input.
                continue
            if isinstance(cluster[0], list):
                flatten_hierarchical_clusters(cluster, flattened, level + 1)
                continue
        flattened.append((level, cluster))
    return flattened

flattened_clusters = flatten_hierarchical_clusters(hierarchical_clusters)
# One (row id, embedding, level tag) record per flattened entry
index_records = [
    (row_id, embedding, f"level_{depth}")
    for row_id, (depth, embedding) in enumerate(flattened_clusters)
]
ids, final_embeddings, metadata = zip(*index_records)

# Show what zip() produced before any conversion
print(f"First element of final_embeddings: {final_embeddings[0]}")
print(f"Type of final_embeddings: {type(final_embeddings)}")
print(f"Type of first element in final_embeddings: {type(final_embeddings[0])}")

# Turn the tuple of embeddings into a list of lists
final_embeddings = [list(embedding) for embedding in final_embeddings]

# Dimensionality is taken from the first embedding
embedding_dim = len(final_embeddings[0])

# FAISS expects a 2D float32 array
embeddings_array = np.asarray(final_embeddings).astype('float32')
print(f"Shape of embeddings_array: {embeddings_array.shape}")

# Build an exact L2 (Euclidean) index and add every embedding to it
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings_array)

# Round-trip through disk: write the index, then load it back
faiss.write_index(index, "textbook_index.faiss")
index = faiss.read_index("textbook_index.faiss")

def retrieve_faiss(query, index, texts, k=5):
    """Return the texts whose index rows are nearest to ``query``.

    Args:
        query: Query string, embedded with the module-level ``sbert_model``.
        index: FAISS index to search.
        texts: Sequence where ``texts[i]`` is the text for index row ``i``.
        k: Number of neighbours to request.

    Returns:
        list[tuple]: ``(text, distance)`` pairs in the order FAISS returned
        them. Row ids outside ``texts`` are skipped, including the ``-1``
        FAISS uses when fewer than ``k`` rows exist.
    """
    # encode() returns a NumPy array by default, so no device-specific
    # tensor-to-NumPy conversion is needed (``.detach().numpy()`` fails on GPU).
    query_embedding = np.asarray(sbert_model.encode(query), dtype='float32').reshape(1, -1)
    D, I = index.search(query_embedding, k)
    # D[0] and I[0] are parallel arrays ordered by rank, so the distance of a
    # hit is D[0][rank], not D[0][row_id] as the previous version used.
    return [
        (texts[row_id], D[0][rank])
        for rank, row_id in enumerate(I[0])
        if 0 <= row_id < len(texts)
    ]

# Example query
query = " explain bioethics"
# NOTE(review): `texts` holds the full extracted text of each PDF (one string per
# textbook), whereas the index rows were added from the flattened cluster
# embeddings. Row ids therefore do not identify entries of `texts`.
# TODO: pass a list that is aligned row-for-row with the index.
results = retrieve_faiss(query, index, texts) # Pass the texts list
for result in results:
    print(result)

First element of final_embeddings: [-0.6670345067977905, 0.24375610053539276, -0.22594739496707916, -0.2778429090976715, -0.10117591917514801, -0.24830898642539978, -0.34930306673049927, -0.007992339320480824, 0.2928318381309509, -0.13887549936771393, 0.08715827763080597, -0.4636167287826538, -0.2233162671327591, 0.23821231722831726, -0.13676515221595764, -0.19285067915916443, -0.43806901574134827, 0.07302986085414886, -0.25754526257514954, 0.11900726705789566, 0.08934244513511658, 0.0028994630556553602, 0.4232771694660187, -0.08273644745349884, -0.03459816798567772, 0.01821252517402172, -0.059699345380067825, -0.11792311072349548, -0.04850798472762108, 0.1332034319639206, 0.05643795058131218, 0.13684716820716858, 0.28861209750175476, 0.22141504287719727, -0.1322302669286728, 0.3151949644088745, 0.02444002963602543, -0.0026938768569380045, 0.04185950756072998, 0.11469083279371262, -0.05642210319638252, 0.04589836671948433, 0.23386463522911072, 0.05293332040309906, 0.10462131351232529, 

In [None]:
pip install streamlit

Collecting streamlit
  Downloading streamlit-1.36.0-py2.py3-none-any.whl (8.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
Collecting watchdog<5,>=2.1.5 (from streamlit)
  Downloading watchdog-4.0.1-py3-none-manylinux2014_x86_64.whl (83 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.0/83.0 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.19,<4,>=3.0.7->streamlit)
  Downloading gitdb-4

In [None]:
import streamlit as st
import faiss
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

# Load FAISS index
index = faiss.read_index("textbook_index.faiss")

# Name of a sentence-embedding model. It is no longer used to load the QA
# tokenizer (see below) and is kept only so the name stays defined.
# NOTE(review): the index on disk was built with 'paraphrase-MiniLM-L6-v2', and
# retrieve_faiss() relies on a global `sbert_model` that this cell does not
# create — confirm both before running this as a standalone script.
sbert_model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Load the extractive QA model together with ITS OWN tokenizer. Pairing the
# RoBERTa checkpoint with the MiniLM tokenizer (as before) feeds the model
# token ids from a different vocabulary.
qa_model_name = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
model = AutoModelForQuestionAnswering.from_pretrained(qa_model_name)

# Initialize the question answering pipeline
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

# Dummy summarized texts and metadata for demonstration
summarized_texts = [
    "Computational biology is the study of biology using computational techniques.",
    "Genetic information is stored in the DNA of living organisms.",
    "Proteins are made of amino acids and perform various functions in the body.",
    # Add more dummy summarized texts corresponding to your data
]
metadata = [
    {"title": "Textbook 1", "page_number": 10},
    {"title": "Textbook 2", "page_number": 23},
    {"title": "Textbook 3", "page_number": 45},
    # Add more metadata corresponding to your data
]

# Example retrieval function using FAISS
def retrieve_faiss(query, index, k=5):
    """Return summaries and metadata for the index rows nearest to ``query``.

    Looks results up in the module-level ``summarized_texts`` and ``metadata``
    lists, and embeds the query with the module-level ``sbert_model``.

    Args:
        query: Query string.
        index: FAISS index to search.
        k: Number of neighbours to request.

    Returns:
        list[tuple]: ``(summary, metadata, distance)`` triples in the order
        FAISS returned them. Row ids with no entry in both lists are skipped,
        including the ``-1`` FAISS uses when fewer than ``k`` rows exist.
    """
    # encode() returns a NumPy array by default, so no device-specific
    # tensor-to-NumPy conversion is needed.
    query_embedding = np.asarray(sbert_model.encode(query), dtype='float32').reshape(1, -1)
    D, I = index.search(query_embedding, k)
    # Only rows present in BOTH lookup lists can be reported.
    n_known = min(len(summarized_texts), len(metadata))
    # D[0] is ordered by rank, so pair each hit with D[0][rank], not D[0][row_id].
    return [
        (summarized_texts[row_id], metadata[row_id], D[0][rank])
        for rank, row_id in enumerate(I[0])
        if 0 <= row_id < n_known
    ]

# Streamlit page: a title, one text box, and the retrieval/QA results
st.title("Textbook Question Answering System")

query = st.text_input("Enter your query:", "")

if query:
    # Nearest passages from the FAISS index
    results = retrieve_faiss(query, index)

    st.write(f"Top {len(results)} relevant passages retrieved:")

    for passage_number, (passage, passage_meta, distance) in enumerate(results, start=1):
        st.write(f"**Passage {passage_number}:**")
        st.write(f"Text: {passage}")
        title = passage_meta['title']
        page_number = passage_meta['page_number']
        st.write(f"Title: {title}, Page Number: {page_number}")
        st.write(f"Relevance Score: {distance:.4f}")

        # Extractive answer for the query, using this passage as context
        answer = qa_pipeline(question=query, context=passage)
        st.write(f"**Answer:** {answer['answer']}\n")



tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

2024-07-19 06:17:37.895 
  command:

    streamlit run /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2024-07-19 06:17:37.904 Session state does not function when running a script without `streamlit run`


In [None]:
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.2.0-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.0


In [None]:
import os
from pyngrok import ngrok

from getpass import getpass

# Never store the ngrok authtoken in the notebook. Read it from the
# NGROK_AUTH_TOKEN environment variable, or prompt for it without echoing.
# NOTE(review): a token was previously committed on this line; treat it as
# leaked and revoke it in the ngrok dashboard.
ngrok_auth_token = os.environ.get("NGROK_AUTH_TOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_auth_token)

# Run the Streamlit app in the background.
# NOTE(review): no cell shown here writes app.py — confirm the file exists
# before this command is run.
os.system('streamlit run app.py &')

# Create a public URL for the Streamlit app
public_url = ngrok.connect(8501)
print(f"Streamlit app is live at: {public_url}")


Streamlit app is live at: NgrokTunnel: "https://443b-35-245-49-164.ngrok-free.app" -> "http://localhost:8501"
