In [None]:
from langchain.docstore.document import Document
from langchain_text_splitters import CharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma

In [None]:
import os

# Point the Hugging Face cache at scratch storage. This is set before the
# unsloth import below so it is already in the environment when that library
# is imported. The shell does not expand "~" inside an environment variable
# set from Python, so it is expanded here rather than relying on every
# consumer of HF_HOME to do so.
os.environ["HF_HOME"] = os.path.expanduser("~/scratch/hf-cache")
from unsloth import FastLanguageModel
import torch
max_seq_length = 8192 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Read the access token from the environment instead of keeping it in the
# notebook. When HF_TOKEN is unset this is None and no explicit token is passed.
hf_token = os.environ.get("HF_TOKEN")

# Load the 4-bit quantized Llama 3.1 70B Instruct checkpoint and its tokenizer.
with torch.no_grad():
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "unsloth/Meta-Llama-3.1-70B-Instruct-bnb-4bit",
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
        token = hf_token, # use one if using gated models like meta-llama/Llama-2-7b-hf
    )

In [None]:
# Shell out to nvidia-smi to show the current GPU state (this cell follows the model-loading cell).
!nvidia-smi

In [None]:
# Grab the generation configuration object attached to the loaded model.
gen_config = model.generation_config

# Print all hyperparameters
print(gen_config)

In [None]:
# Prompt template with three positional `{}` slots, filled with str.format in
# this order: the instruction, the input, and the response.
my_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

In [None]:
# Settings for the persisted Chroma collection, plus a handle opened on it.
occupation = "politician"
CHROMA_SOURCE = "json_" + occupation
CHROMA_CHUNK_SIZE = 1000
CHROMA_PERSIST_DIRECTORY = "./chroma_db"
CHROMA_COLLECTION_NAME = occupation + "_json_files"
# Alias for CHROMA_COLLECTION_NAME, kept so existing references to this name keep working.
target_collection_name = CHROMA_COLLECTION_NAME
# The model name has to be assigned before the embedding wrapper is built from
# it; with the two lines in the opposite order a fresh kernel raises NameError.
EMBEDDINGS_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDINGS_MODEL)
chroma_db_from_disk = Chroma(
    persist_directory=CHROMA_PERSIST_DIRECTORY,
    collection_name=target_collection_name,
    embedding_function=embeddings,
)

In [None]:
# Destructive: uncomment the next line to delete the target collection from the Chroma store.
#chroma_db_from_disk._client.delete_collection(target_collection_name)

In [None]:
# Ask the Chroma client behind the store (reached through its private
# `_client` attribute) for its collections and display them as the cell output.
collections = chroma_db_from_disk._client.list_collections()
collections

In [None]:
from pymongo.mongo_client import MongoClient
import json
import re
import time

# Fetch every document of the occupation's collection from the local MongoDB
# and keep a copy of each one without the keys that contain the word "ID".
occupation = 'politician'
uri = "mongodb://localhost:27017/"
gt_user = "YOUR GT USER"
pattern = r'\bID\b'
count = 0

client = MongoClient(uri)
db = client['wikidata_json']
collection = db[occupation]
results = collection.find()

# NOTE(review): the pattern is case-sensitive and needs "ID" as a whole word,
# so a key such as "_id" is kept - confirm that is intended.
id_key = re.compile(pattern)
search_res_cleaned = []
for record in results:
    search_res_cleaned.append(
        {field: val for field, val in record.items() if id_key.search(field) is None}
    )

In [None]:
import time
import json


def _first_or_blank(record, key):
    """Return the first entry of ``record[key]``, or "" if the key is absent,
    the value is empty, or the value cannot be indexed."""
    try:
        return record[key][0]
    except (KeyError, IndexError, TypeError):
        return ""


# Instruction sent with every record. The text (including the indentation of
# the continuation lines) is passed to the model exactly as written.
TRANSLATION_INSTRUCTION = """Translate this English JSON File into a Hindi Language JSON File. 
                Do not any other extra text or note after the conversion of JSON. 
                Make sure everything in your response is in Hindi strictly and there are no additional NOTE afterwards"""

# The loop stops this many records before the end of search_res_cleaned.
# NOTE(review): the reason for leaving the tail unprocessed is not visible
# in this notebook - confirm.
SKIP_LAST_N = 100

# Per-record timing/token statistics, also written to Analysis.txt below.
final_stats = []

# Loop-invariant setup, done once instead of on every iteration.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
embeddings = SentenceTransformerEmbeddings(model_name=EMBEDDINGS_MODEL)
text_splitter = CharacterTextSplitter(chunk_size=CHROMA_CHUNK_SIZE, chunk_overlap=0)

# max(0, ...) keeps the selection empty when there are fewer than SKIP_LAST_N
# records, matching what range(0, negative) did.
records_to_process = search_res_cleaned[:max(0, len(search_res_cleaned) - SKIP_LAST_N)]

with open("Analysis.txt", "w") as f:
    for record in records_to_process:
        last_name = _first_or_blank(record, "family name")
        first_name = _first_or_blank(record, "given name")

        # --- Step 1: translate the record with the LLM and time it ---
        start_time = time.time()
        with torch.no_grad():
            inputs = tokenizer(
                [
                    my_prompt.format(
                        TRANSLATION_INSTRUCTION, # instruction
                        record, # input
                        "", # output - leave this blank for generation!
                    )
                ],
                return_tensors = "pt",
            ).to("cuda")
            print("Number of Tokens in Input: " + str(inputs['input_ids'].shape[1]))
            outputs = model.generate(**inputs, max_new_tokens = 4096, use_cache = True)
            # NOTE(review): this count (and the decoded text below) presumably
            # includes the prompt tokens as well as the generated ones - confirm.
            print("Number of Tokens in output: " + str(outputs.shape[1]))
            final_output = tokenizer.batch_decode(outputs)
        end_time = time.time()
        execution_time = end_time - start_time
        print(f"Execution time: {execution_time:.4f} seconds")

        # --- Step 2: chunk the decoded text and write it to Chroma, timed separately ---
        start_time_2 = time.time()
        # NOTE(review): final_output is a list, so str() stores the list's repr
        # (brackets, quotes, escaped newlines) rather than the text itself -
        # confirm that is what should be indexed.
        docs = [Document(page_content=str(final_output), metadata={"source": CHROMA_SOURCE, "first_name": first_name, "last_name": last_name })]
        docs = text_splitter.split_documents(docs)
        print(len(docs))
        local_chroma = Chroma.from_documents(docs, embeddings, persist_directory=CHROMA_PERSIST_DIRECTORY, collection_name=CHROMA_COLLECTION_NAME)
        end_time_2 = time.time()
        execution_time_2 = end_time_2 - start_time_2
        print(f"Execution time: {execution_time_2:.4f} seconds")

        op = { "First Name": first_name, "Last Name": last_name, "Time for JSON Conversion": execution_time,
              "Time For Chroma Write": execution_time_2, "Input Tokens": inputs['input_ids'].shape[1],
              "Output Tokens": outputs.shape[1] }
        final_stats.append(op)
        json.dump(op, f, indent=4)
        # Separate consecutive objects, and flush so the statistics gathered so
        # far survive if this long-running loop is interrupted.
        f.write("\n")
        f.flush()