<a href="https://colab.research.google.com/github/talinm23/ML/blob/main/Copy_of_parsing_pdfs_step2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# To run these notebooks (parsing PDFs, steps 1 and 2), open them in Google Colab.
# For faster execution, connect the Colab runtime to a GPU.
# In the Colab Secrets section, add your HUGGING_FACE_TOKEN and MONGO_URI (MongoDB connection URI).

In [None]:
# Clear all variables from the interactive namespace (-f: without asking for confirmation).
%reset -f
# Delete the current user's ~/.cache directory so no previously cached files are reused.
import os
import shutil
cache_dir = os.path.expanduser('~/.cache')
# ignore_errors=True: do not raise if the directory is missing or cannot be fully removed.
shutil.rmtree(cache_dir, ignore_errors=True)

In [None]:
# Imports, plus mounting Google Drive to read the embedding vectors saved by the step 1 notebook.
import pandas as pd
from ast import literal_eval

# Mount Google Drive at /content/drive so that files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the CSV of embedded sentences that was saved to Google Drive in step 1.
file_path_to_save = '/content/drive/My Drive/Colab_Notebooks/csv_saved/'
dataset = pd.read_csv(f"{file_path_to_save}dataset_embedded.csv")

In [None]:
# The "embedding_" column holds each vector as the string form of a Python list.
# Parse every row back into a real list and store it in a new "embedding" column.
dataset['embedding'] = dataset['embedding_'].map(literal_eval)

In [None]:
pip install pymongo

Collecting pymongo
  Downloading pymongo-4.14.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading pymongo-4.14.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.7.0 pymongo-4.14.1


In [None]:
# Connecting to MongoDB
import pymongo
from google.colab import userdata

def get_mongo_client(mongo_uri):
  """Create a MongoDB client and verify that the server is actually reachable.

  Args:
    mongo_uri: MongoDB connection string passed to ``pymongo.MongoClient``.

  Returns:
    A connected ``pymongo.MongoClient``, or ``None`` if the client could not
    be created or the server did not answer (the error is printed).
  """
  client = None
  try:
    client = pymongo.MongoClient(
        mongo_uri,
        appname="devrel.content.python",
        connectTimeoutMS=40000,
        socketTimeoutMS=40000,
    )
    # Constructing MongoClient does not contact the server, so on its own it
    # cannot fail with a connection error. A cheap "ping" forces a round trip
    # so that the success message below is only printed for a live connection.
    client.admin.command("ping")
    print("Connection to MongoDB successful")
    return client
  except pymongo.errors.PyMongoError as e:
    # Covers connection/timeout failures (ConnectionFailure and subclasses) as
    # well as invalid URIs and authentication errors raised by the driver.
    print(f"Connection failed: {e}")
    if client is not None:
      # Release the background monitoring resources of the unusable client.
      client.close()
    return None

# Read the MongoDB connection string from the Colab secrets.
mongo_uri = userdata.get('MONGO_URI')
if not mongo_uri:
  # Stop here: continuing without a URI would make MongoClient fall back to
  # its default host instead of the intended cluster.
  raise ValueError("MONGO_URI is not set in the Colab secrets")

mongo_client = get_mongo_client(mongo_uri)
if mongo_client is None:
  # get_mongo_client returns None on failure; indexing None below would
  # otherwise surface as an unrelated TypeError.
  raise ConnectionError("Could not connect to MongoDB")

# Select the database and collection the dataset is ingested into.
db = mongo_client['coldwell']
collection = db['coldwell_collection']
print('db:',db)
print('collection:',collection)

Connection to MongoDB successful
db: Database(MongoClient(host=['cluster0-shard-00-00.ptqsh.mongodb.net:27017', 'cluster0-shard-00-02.ptqsh.mongodb.net:27017', 'cluster0-shard-00-01.ptqsh.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-zkcvmi-shard-0', appname='devrel.content.python', connecttimeoutms=40000, sockettimeoutms=40000, tls=True), 'coldwell')
collection: Collection(Database(MongoClient(host=['cluster0-shard-00-00.ptqsh.mongodb.net:27017', 'cluster0-shard-00-02.ptqsh.mongodb.net:27017', 'cluster0-shard-00-01.ptqsh.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-zkcvmi-shard-0', appname='devrel.content.python', connecttimeoutms=40000, sockettimeoutms=40000, tls=True), 'coldwell'), 'coldwell_collection')


In [None]:
# Remove every document currently in the collection (the empty filter {} matches all of them),
# so that the ingestion that follows starts from an empty collection.
collection.delete_many({})

DeleteResult({'n': 6921, 'electionId': ObjectId('7fffffff00000000000001fa'), 'opTime': {'ts': Timestamp(1755877323, 481), 't': 506}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1755877323, 481), 'signature': {'hash': b'\\)V\xce\xa6\x9eq\xd1F\xa4s\x87L\xad9\xbc\xf9S<G', 'keyId': 7495212647327989761}}, 'operationTime': Timestamp(1755877323, 481)}, acknowledged=True)

In [None]:
# Turn each row of the dataset into one dictionary (one MongoDB document per row),
# then write the whole list to the collection with a single bulk insert.
documents = dataset.to_dict(orient="records")
collection.insert_many(documents)
print("Data ingestion into MongoDB completed")

Data ingestion into MongoDB completed


In [None]:
# Display the repr of the collection object (shows the client, database and collection names).
collection

Collection(Database(MongoClient(host=['cluster0-shard-00-00.ptqsh.mongodb.net:27017', 'cluster0-shard-00-02.ptqsh.mongodb.net:27017', 'cluster0-shard-00-01.ptqsh.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-zkcvmi-shard-0', appname='devrel.content.python', connecttimeoutms=40000, sockettimeoutms=40000, tls=True), 'coldwell'), 'coldwell_collection')

In [None]:
from sentence_transformers import SentenceTransformer

# Load the "thenlper/gte-large" sentence-embedding model used by get_embedding below.
# Model card: https://huggingface.co/thenlper/gte-large
# NOTE(review): presumably the same model produced the stored embeddings in step 1 — confirm.
embedding_model = SentenceTransformer("thenlper/gte-large")


def get_embedding(text: str) -> list[float]:
    """Return the embedding of ``text`` as a plain list of floats.

    Args:
        text: The string to embed.

    Returns:
        The vector produced by ``embedding_model.encode`` converted with
        ``.tolist()``, or an empty list when ``text`` is not a string or
        contains only whitespace.
    """
    # Guard against None / NaN / other non-string values as well as blank
    # text, instead of crashing on ``.strip()``.
    if not isinstance(text, str) or not text.strip():
        print("Attempted to get embedding for empty text.")
        return []

    return embedding_model.encode(text).tolist()

In [None]:
# Build a vector search over the MongoDB collection for the user's query:
# embed the query string, then describe the aggregation pipeline that
# returns the list of matching documents.

# Generate embedding for the user query
query = "What is the name of the author? what are the natural remedies according to the book? list the natural remedies from the book."

query_embedding = get_embedding(query)

# Define the vector search pipeline
vector_search_stage = {
    "$vectorSearch": {
        "index": "vector_index",
        "queryVector": query_embedding,
        "path": "embedding",
        "numCandidates": 1000,  # Number of candidate matches to consider
        "limit": 50,  # Return top n matches
    }
}

project_stage = {
    "$project": {
        "_id": 1,  # Include the document _id (set to 0 to exclude it)
        "sentences": 1,  # Include the sentences field so that we see the actual sentences.
        "score": {
            "$meta": "vectorSearchScore"  # Include the search score
        },
    }
}

pipeline = [vector_search_stage, project_stage]
# The pipeline is executed in the next cell; running it here as well would
# only issue a duplicate query whose cursor is immediately discarded.


In [None]:
# Run the vector search pipeline against the collection.
results = collection.aggregate(pipeline)

# Materialise the cursor into a list of the matching documents.
get_knowledge = list(results)

# Concatenate the "sentences" field of every match, one "result: ..." line each.
search_result = "".join(
    f"result: {match.get('sentences')}\n" for match in get_knowledge
)

In [None]:
# Assemble the prompt: the user's query followed by the retrieved sentences.
source_information = search_result
combined_information = (
    f"Query: {query}\n"
    "Continue to answer the query by using the Search Results:\n"
    f"{source_information}."
)

In [None]:
from google.colab import userdata
from huggingface_hub import notebook_login
from huggingface_hub import login

# Authenticate with the Hugging Face Hub. Use the HUGGING_FACE_TOKEN stored in
# the Colab secrets when it is available, so the notebook can run top to bottom
# unattended; otherwise fall back to the interactive login widget.
try:
    hf_token = userdata.get('HUGGING_FACE_TOKEN')
except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
    # Secret not defined, or this notebook was not granted access to it.
    hf_token = None

if hf_token:
    login(token=hf_token)
else:
    notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Checkpoint used for both the tokenizer and the language model.
MODEL_ID = "google/gemma-2b-it"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# CPU Enabled uncomment below 👇🏽
# model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
# GPU Enabled use below 👇🏽
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Tokenize the prompt and move the tensors to whichever device the model was
# loaded on, so this cell works for both the GPU and the CPU loading options
# instead of failing when no CUDA device is present.
input_ids = tokenizer(combined_information, return_tensors="pt").to(model.device)
# Generate up to 1000 new tokens that continue the prompt.
response = model.generate(**input_ids, max_new_tokens=1000)
# Decode the first (only) sequence; the decoded text includes the prompt itself.
print(tokenizer.decode(response[0]))

<bos>Query: What is the name of the author? what are the natural remedies according to the book? list the natural remedies from the book.
Continue to answer the query by using the Search Results:
result:  Read what the legendary natural healer Dr
result:  After  being  an award -winning  author,  after  writing  22 best -selling  books  many  in 9 languages or  more  and seeing  why they  are so successful,  I decided  to write  an educational  book  about  health  and self-help  for all those  people  who don’t have as much knowledge about medicine or natural healing as a fully educated practitioner
result: My books have the purpose to educate people so that they can protect themselves and cure themselves with natural means
result: This book is written for the person with  common  sense  and the will to achieve  optimum  health  without getting killed by the medical profession, pharmaceutical industry  or the hocus pocus new age “wannabe healers” an d the new crowd of naturopathic  pr