<a href="https://colab.research.google.com/github/talinm23/ML/blob/main/parsing_pdfs_step2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# These notebooks (parsing PDFs, steps 1 and 2) are meant to be run in Google Colab.
# For faster execution, connect the Colab runtime to a GPU.
# In the Colab Secrets section, add your HUGGING_FACE_TOKEN and MONGO_URI (MongoDB connection URI).

In [None]:
# Clear all variables
%reset -f
# Delete the user-level cache directory (~/.cache) so that no cached files
# persist from earlier runs. The whole directory tree is removed; errors
# (for example, the directory not existing) are ignored.
import os
import shutil
cache_dir = os.path.expanduser('~/.cache')
shutil.rmtree(cache_dir, ignore_errors=True)

In [None]:
# Imports, plus granting access to Google Drive in order to extract the embedding vectors saved by the step 1 notebook.
import pandas as pd
from ast import literal_eval

# Mount Google Drive at /content/drive so that files stored there can be read
# from this notebook.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the embedded dataset that the step 1 notebook saved to Google Drive.
file_path_to_save = '/content/drive/My Drive/Colab_Notebooks/csv_saved/'
dataset = pd.read_csv(f"{file_path_to_save}dataset_embedded.csv")

In [None]:
# Each row of the "embedding_" column holds a list written out as a string.
# Parse every row back into a real Python list and keep it in a new
# "embedding" column.
dataset['embedding'] = dataset['embedding_'].map(literal_eval)

In [None]:
pip install pymongo

In [None]:
# Connecting to MongoDB
import pymongo
from google.colab import userdata

def get_mongo_client(mongo_uri):
    """Connect to MongoDB and verify that the server is actually reachable.

    Args:
        mongo_uri: MongoDB connection string.

    Returns:
        A ``pymongo.MongoClient`` that has successfully pinged the server,
        or ``None`` if the connection could not be established.
    """
    client = None
    try:
        client = pymongo.MongoClient(
            mongo_uri,
            appname="devrel.content.python",
            # Optional TLS overrides, left disabled:
            # ssl=True, tlsAllowInvalidCertificates=True
            connectTimeoutMS=40000,
            socketTimeoutMS=40000,
        )
        # MongoClient connects lazily: constructing it does not contact the
        # server. Issue a cheap "ping" so that an unreachable server is
        # reported here rather than on the first real operation.
        client.admin.command("ping")
    except pymongo.errors.ConnectionFailure as e:
        print(f"Connection failed: {e}")
        if client is not None:
            # Release the client's background resources before giving up.
            client.close()
        return None

    print("Connection to MongoDB successful")
    return client

# Read the MongoDB connection string from the Colab secrets.
mongo_uri = userdata.get('MONGO_URI')
if not mongo_uri:
  # Fail fast: continuing without a URI would try to connect with no
  # connection string at all instead of reporting the real problem.
  raise ValueError("MONGO_URI not set in Colab secrets")

mongo_client = get_mongo_client(mongo_uri)
if mongo_client is None:
  # get_mongo_client returns None when the connection fails; stop here with a
  # clear message rather than hitting a TypeError when subscripting None below.
  raise ConnectionError("Could not connect to MongoDB; check MONGO_URI and network access")

# Select the database and the collection that the data will be ingested into.
db = mongo_client['coldwell']
collection = db['coldwell_collection']
print('db:',db)
print('collection:',collection)

In [None]:
# Delete every existing document in the collection (the empty filter {} is
# passed to delete_many), so that ingestion starts from an empty collection.
collection.delete_many({})

In [None]:
# Turn every row of the dataset into its own dictionary (one record per row),
# then insert the whole list into the collection as a single batch.
documents = dataset.to_dict(orient="records")
collection.insert_many(documents)
print("Data ingestion into MongoDB completed")

In [None]:
from sentence_transformers import SentenceTransformer

# Embedding model: https://huggingface.co/thenlper/gte-large
# Created once here and used by get_embedding to encode text.
embedding_model = SentenceTransformer("thenlper/gte-large")


def get_embedding(text: str) -> list[float]:
    """Return the embedding of *text* as a plain list of floats.

    Text that is empty or contains only whitespace is not encoded: a notice
    is printed and an empty list is returned instead.
    """
    if text.strip():
        vector = embedding_model.encode(text)
        return vector.tolist()

    print("Attempted to get embedding for empty text.")
    return []

In [None]:
# Run a vector search over the MongoDB collection for the user's query:
# embed the query string, retrieve the closest matching documents, and gather
# their sentences into one block of text.

# The user's query and its embedding.
query = "What is the name of the author? what are the natural remedies? What are the most effective cures? list the natural remedies."

query_embedding = get_embedding(query)

# $vectorSearch stage: match the query vector against the "embedding" field
# using the "vector_index" index.
vector_search_stage = {
    "$vectorSearch": {
        "index": "vector_index",
        "queryVector": query_embedding,
        "path": "embedding",
        "numCandidates": 1000,  # Number of candidate matches to consider
        "limit": 50,  # Return the top n matches
    }
}

# $project stage: choose which fields each returned document carries.
project_stage = {
    "$project": {
        "_id": 1,  # 1 keeps the _id field; use 0 to exclude it
        "sentences": 1,  # Keep the sentences field so the actual text is visible
        "score": {"$meta": "vectorSearchScore"},  # Attach the search score
    }
}

pipeline = [vector_search_stage, project_stage]

# Run the pipeline and materialize the returned documents into a list.
results = collection.aggregate(pipeline)
get_knowledge = list(results)

# Concatenate the sentences of every matching document, one "result:" line each.
search_result = "".join(
    f"result: {result.get('sentences')}\n" for result in get_knowledge
)



In [None]:
# Build the prompt: the user's query followed by the retrieved search results.
source_information = search_result
combined_information = (
    f"Query: {query}\n"
    "Continue to answer the query by using the Search Results:\n"
    f"{source_information}."
)

In [None]:
# Log in to the Hugging Face Hub via the interactive notebook prompt.
# NOTE(review): `login` is imported below but is not used in this cell.
from huggingface_hub import notebook_login
from huggingface_hub import login
notebook_login()

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer for the google/gemma-2b-it checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
# CPU only: uncomment the line below (and comment out the GPU line) 👇🏽
# model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it")
# GPU: load the model with device_map="auto" 👇🏽
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it", device_map="auto")

In [None]:
# Tokenize the prompt and move the tensors to whichever device the model was
# loaded on, rather than a hard-coded "cuda", so the cell also runs when the
# model is not on a GPU.
input_ids = tokenizer(combined_information, return_tensors="pt").to(model.device)
# Generate up to 1000 new tokens and print the decoded output sequence.
response = model.generate(**input_ids, max_new_tokens=1000)
print(tokenizer.decode(response[0]))