In [2]:
# In order to run these codes (parsing pdfs step 1 and 2), you can run them in a Google Colab notebook.
# To run fast, you should get connected to a GPU in the Colab environment.
# In the Colab secrets, create and copy your HUGGING_FACE_TOKEN and MONGO_URI (MongoDB URI) to the Secrets section.

In [3]:
# Clear all variables
%reset -f
#Check for Persistent Extensions or Cache Files
import os
import shutil
cache_dir = os.path.expanduser('~/.cache')
shutil.rmtree(cache_dir, ignore_errors=True)

In [4]:
# imports and granting access to the google drive to extrace the saved embedding vector information from the step 1 notebook.
import pandas as pd
from ast import literal_eval

# Get connected to Google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
# Access the saved file from step 1.
file_path_to_save = '/content/drive/My Drive/Colab Notebooks/csv_saved/'
dataset = pd.read_csv(file_path_to_save+'dataset_embedded.csv')

In [12]:
# Apply literal_eval on the "embeddings" column to extract the list inside the string in each row and save it into a new column.
dataset['embedding'] = dataset['embedding_'].apply(literal_eval)

In [13]:
pip install pymongo

Collecting pymongo
  Downloading pymongo-4.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting dnspython<3.0.0,>=1.16.0 (from pymongo)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading pymongo-4.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.4/1.4 MB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m313.6/313.6 kB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, pymongo
Successfully installed dnspython-2.7.0 pymongo-4.10.1


In [14]:
# Connecting to MongoDB
import pymongo
from google.colab import userdata

def get_mongo_client(mongo_uri):
  """Establish connection to the MongoDB."""
  try:
    client = pymongo.MongoClient(mongo_uri, appname="devrel.content.python"
    #,ssl=True,tlsAllowInvalidCertificates=True
    ,connectTimeoutMS=40000
    ,socketTimeoutMS=40000)
    print("Connection to MongoDB successful")
    return client
  except pymongo.errors.ConnectionFailure as e:
    print(f"Connection failed: {e}")
    return None

mongo_uri = userdata.get('MONGO_URI')
if not mongo_uri:
  print("MONGO_URI not set in environment variables")

mongo_client = get_mongo_client(mongo_uri)

# Ingest data into MongoDB (into the created collection)
db = mongo_client['coldwell']
collection = db['coldwell_collection']
print('db:',db)
print('collection:',collection)

Connection to MongoDB successful
db: Database(MongoClient(host=['cluster0-shard-00-00.ptqsh.mongodb.net:27017', 'cluster0-shard-00-01.ptqsh.mongodb.net:27017', 'cluster0-shard-00-02.ptqsh.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-zkcvmi-shard-0', appname='devrel.content.python', connecttimeoutms=40000, sockettimeoutms=40000, tls=True), 'coldwell')
collection: Collection(Database(MongoClient(host=['cluster0-shard-00-00.ptqsh.mongodb.net:27017', 'cluster0-shard-00-01.ptqsh.mongodb.net:27017', 'cluster0-shard-00-02.ptqsh.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-zkcvmi-shard-0', appname='devrel.content.python', connecttimeoutms=40000, sockettimeoutms=40000, tls=True), 'coldwell'), 'coldwell_collection')


In [15]:
# Delete any existing records in the collection just in case. So we start with an empty collection.
collection.delete_many({})

DeleteResult({'n': 6921, 'electionId': ObjectId('7fffffff00000000000001c0'), 'opTime': {'ts': Timestamp(1733802034, 341), 't': 448}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1733802034, 341), 'signature': {'hash': b'\x9f\xf2\xaf\xb4ZF<JCU\xa6\xda\xef\n!\x12hDc>', 'keyId': 7394800362101669901}}, 'operationTime': Timestamp(1733802034, 341)}, acknowledged=True)

In [16]:
# Convert the dataset into a list of dictionary, where each data row is a new record.
# Then insert that documents variable in batch into the collection.
documents = dataset.to_dict("records")
collection.insert_many(documents)
print("Data ingestion into MongoDB completed")

Data ingestion into MongoDB completed


In [17]:
from sentence_transformers import SentenceTransformer

# https://huggingface.co/thenlper/gte-large
embedding_model = SentenceTransformer("thenlper/gte-large")


def get_embedding(text: str) -> list[float]:
    if not text.strip():
        print("Attempted to get embedding for empty text.")
        return []

    embedding = embedding_model.encode(text)

    return embedding.tolist()

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [20]:
# Next, we perform a vector search in the MongoDB collection based on the
# user query. We pass in the user's query string.
# And it searches the MongoDB collection and returns a list of matchig documents.

# Generate embedding for the user query
query = "What is the name of the author? what are the natural remedies? What are the most effective cures? list the natural remedies."

query_embedding = get_embedding(query)
print('query_embedding',query_embedding)

# Define the vector search pipeline
vector_search_stage = {
    "$vectorSearch": {
        "index": "vector_index",
        "queryVector": query_embedding,
        "path": "embedding",
        "numCandidates": 1000 ,  # Number of candidate matches to consider
        "limit": 50 # Return top n matches
    }
}

project_stage = {
    "$project": {
        "_id": 1,  # Can exclude the _id field
        "sentences": 1, # Include the sentences fiels so that we see the actual sentences.
        "score": {
            "$meta": "vectorSearchScore"  # Include the search score
        }
    }
}

pipeline = [vector_search_stage, project_stage]

# Execute the search
results = collection.aggregate(pipeline)

# Define Get knowledge by the list of results returned from the vector search.
get_knowledge = list(results)

# Add all the qualifying sentences from the vector search.
search_result = ""
for result in get_knowledge:
    search_result += f"result: {result.get('sentences')}\n"



query_embedding [-0.02393944561481476, -0.02705160528421402, -0.006338696926832199, 0.00910158921033144, -0.001449763891287148, 0.002730857115238905, 0.018347304314374924, 0.04555801674723625, 0.023043468594551086, 0.024207409471273422, 0.025806918740272522, -0.019094331189990044, 0.0029512585606426, -0.031242551282048225, -0.020021384581923485, -0.0217873714864254, -0.0016816550632938743, -0.049117278307676315, 0.009886451065540314, 0.026753777638077736, 0.019858410581946373, 0.01756894402205944, -0.07939179241657257, -0.00200450886040926, -0.006666397675871849, 0.017530661076307297, 0.01997450180351734, -0.01734425313770771, 0.047861624509096146, 0.04593351110816002, 0.002251269528642297, -0.01052248664200306, 0.03839362412691116, -0.040523894131183624, 0.0041484530083835125, -0.011045150458812714, 0.0361473374068737, -0.03430759161710739, -0.024395663291215897, -0.06387729197740555, 0.011693598702549934, -0.0153859443962574, 0.033910587430000305, -0.021988490596413612, -0.0174059029

In [21]:
source_information = search_result
combined_information = (
    f"Query: {query}\nContinue to answer the query by using the Search Results:\n{source_information}."
)

In [22]:
from huggingface_hub import notebook_login
from huggingface_hub import login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

In [24]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
# CPU Enabled uncomment below üëáüèΩ
# model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it")
# GPU Enabled use below üëáüèΩ
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", device_map="auto")

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [25]:
# Moving tensors to GPU
input_ids = tokenizer(combined_information, return_tensors="pt").to("cuda")
response = model.generate(**input_ids, max_new_tokens=1000)
print(tokenizer.decode(response[0]))

<bos>Query: What is the name of the author? what are the natural remedies? What are the most effective cures? list the natural remedies.
Continue to answer the query by using the Search Results:
result:  Read what the legendary natural healer Dr
result:  After  being  an award -winning  author,  after  writing  22 best -selling  books  many  in 9 languages or  more  and seeing  why they  are so successful,  I decided  to write  an educational  book  about  health  and self-help  for all those  people  who don‚Äôt have as much knowledge about medicine or natural healing as a fully educated practitioner
result: My books have the purpose to educate people so that they can protect themselves and cure themselves with natural means
result:  Many  of my relatives,  friends,  and acqua intances have been cured of all kinds of health problems through my  method of  natural  healing
result:  Nature  has cures  for everything  and without  any negative  side effects
result: This book is written f