In [None]:
import os
import pandas as pd
import tqdm as notebook_tqdm
from langchain_core.prompts import PromptTemplate
import json
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template

In [None]:
# Context window size in tokens; reused as the default tokenization length
# and passed to the model loader further down in the notebook.
max_seq_length = 128_000


def load_lama31(
    model_name: str = "unsloth/Meta-Llama-3.1-8B-Instruct", max_seq_length: int = 128000
):
    """Load a 4-bit LLaMA 3.1 model with unsloth and wrap it with ``peft_model``.

    ``model_name`` is probed as a local directory. It is used only when it
    contains ``tokenizer_config.json``, ``tokenizer.json`` and
    ``special_tokens_map.json``; otherwise the default
    ``unsloth/Meta-Llama-3.1-8B-Instruct`` is loaded instead.

    Args:
        model_name: Directory to probe for a saved checkpoint.
        max_seq_length: Context window size forwarded to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple, where ``model`` is whatever
        ``peft_model`` returns for the freshly loaded model.
    """
    # All three tokenizer artefacts must be present for the directory to count.
    required_files = (
        "tokenizer_config.json",
        "tokenizer.json",
        "special_tokens_map.json",
    )
    found_locally = all(
        os.path.exists(os.path.join(model_name, fname)) for fname in required_files
    )

    if not found_locally:
        model_name_1 = "unsloth/Meta-Llama-3.1-8B-Instruct"
        print(
            f"Model not found at repo {model_name}. Loading default model {model_name_1}"
        )
    else:
        model_name_1 = model_name
        print(f"Modelfound at repo {model_name}")

    loaded = FastLanguageModel.from_pretrained(
        model_name=model_name_1,
        max_seq_length=max_seq_length,  # context window requested by the caller
        dtype=None,
        load_in_4bit=True,  # To reduce memory usage
    )
    model, tokenizer = loaded
    print("Model loaded to VRAM Sucessfully")

    return peft_model(model), tokenizer


def tokenize_data(texts, tokenizer, max_length=max_seq_length):
    """Run ``tokenizer`` over ``texts`` and return its output.

    The tokenizer is called with truncation enabled, ``padding="max_length"``
    and PyTorch (``"pt"``) tensors requested.

    Args:
        texts: Input passed straight through to ``tokenizer``.
        tokenizer: Callable tokenizer accepting the keyword options below.
        max_length: Value forwarded as ``max_length``; defaults to the
            notebook-level ``max_seq_length``.

    Returns:
        Whatever ``tokenizer`` returns for these options.
    """
    encode_options = {
        "max_length": max_length,
        "truncation": True,
        "padding": "max_length",
        "return_tensors": "pt",
    }
    return tokenizer(texts, **encode_options)


def peft_model(model):
    """Wrap ``model`` with LoRA settings via ``FastLanguageModel.get_peft_model``.

    Uses rank and alpha 16, no dropout, no bias, and the ``"unsloth"``
    gradient-checkpointing mode, applied to the projection modules listed below.

    Args:
        model: Model object handed to ``get_peft_model``.

    Returns:
        The object returned by ``get_peft_model``.
    """
    lora_targets = [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ]
    return FastLanguageModel.get_peft_model(
        model,
        r=16,
        target_modules=lora_targets,
        lora_alpha=16,
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing="unsloth",  # Optimized memory usage
    )

In [None]:
def unsloth_chat_inf(model, tokenizer):
    """Prepare a model/tokenizer pair for chat-style inference.

    Applies the ``"llama-3.1"`` chat template to ``tokenizer`` and switches
    ``model`` to inference mode with ``FastLanguageModel.for_inference``.

    Args:
        model: Model to switch to inference mode.
        tokenizer: Tokenizer to wrap with the chat template.

    Returns:
        ``(model, tokenizer)``: the inference-ready model and the templated
        tokenizer. ``model`` is never ``None`` unless ``None`` was passed in.
    """
    chat_tokenizer = get_chat_template(
        tokenizer,
        chat_template="llama-3.1",
    )

    # This notebook also calls for_inference purely for its effect on the model,
    # without using the return value. The return value may not be the model
    # (possibly None - TODO confirm for the installed unsloth version), so only
    # adopt it when something was actually returned.
    prepared = FastLanguageModel.for_inference(model)
    if prepared is not None:
        model = prepared

    return model, chat_tokenizer


# print("sam")

In [None]:
# Local checkpoint directory. The default below only exists on one machine, so
# it can be overridden with the LLAMA_MODEL_PATH environment variable.
model_path = os.environ.get(
    "LLAMA_MODEL_PATH",
    "/media/qult/volume/Taimour/Github Repos/Entity-Extraction-Fine-Tuning/Llama Inference/model",
)

# load_lama31 falls back to its default model when the directory lacks the
# tokenizer files it checks for.
model, token = load_lama31(model_path, max_seq_length)

In [None]:
from transformers import TextStreamer
from unsloth.chat_templates import get_chat_template

# ShareGPT style: conversations carry "from"/"value" keys with "human"/"gpt"
# speakers, which are mapped onto the template's role/content fields.
sharegpt_mapping = {
    "role": "from",
    "content": "value",
    "user": "human",
    "assistant": "gpt",
}
tokenizer = get_chat_template(
    token, chat_template="llama-3.1", mapping=sharegpt_mapping
)

# Enable native 2x faster inference (kept as the cell's final expression).
FastLanguageModel.for_inference(model)

In [None]:
# Metrics file used as prompt context. The default path is machine-specific;
# set the METRICS_JSON_PATH environment variable to use a different location.
JSON_PATH = os.environ.get(
    "METRICS_JSON_PATH",
    "/media/qult/volume/Taimour/Real Time Machine Telemetry Dashboard/data/aggreate_metrics.json",
)

# Read as UTF-8 explicitly instead of relying on the platform default encoding.
with open(JSON_PATH, "r", encoding="utf-8") as file:
    input_data = json.load(file)

In [None]:
# City the analysis should focus on; interpolated into the instruction text below.
cityname = "Karachi"

# Instruction text that is concatenated after the dataset when the final prompt
# is built. The newline right after the opening quotes and the one before the
# closing quotes are part of the resulting string.
prompt_general = f"""
You are an advanced data analysis assistant specialized in weather and climate data. Your task is to analyze detailed weather data for various cities and provide structured, insightful responses tailored to specific user queries. Using the provided dataset as context, generate insights about the weather in a city of interest. In this case, the focus is on {cityname}.
"""

In [None]:
# Combine the user data with the instructions. The data is serialized with
# json.dumps so the model sees real JSON (double quotes, true/false/null)
# rather than the Python repr that str() would produce; ensure_ascii=False
# keeps any non-ASCII text readable instead of \u-escaped.
prompt = json.dumps(input_data, ensure_ascii=False) + prompt_general

In [None]:
# Single-turn conversation in the "from"/"value" form: the combined prompt is
# sent as the human message.
messages = [{"from": "human", "value": prompt}]

# Render the conversation through the chat template into token ids, with the
# generation prompt appended, then move the tensor to the GPU.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Stream the text through TextStreamer while the model generates.
text_streamer = TextStreamer(tokenizer)
results = model.generate(
    input_ids=inputs,
    streamer=text_streamer,
    max_new_tokens=10000,
    use_cache=True,
)

In [None]:
# model.generate returns the prompt tokens followed by the newly generated
# ones, so skip the first inputs.shape[-1] positions to keep only the answer
# instead of echoing the whole prompt back.
generated_ids = results[0][inputs.shape[-1] :]

# Decode the generated text and remove special tokens
response = tokenizer.decode(generated_ids, skip_special_tokens=True)

# Print the exact response
print(response)