<a href="https://colab.research.google.com/github/thesis17/Afaan-Oromoo-chatGPT/blob/main/testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the dataset through kagglehub and keep whatever dataset_download returns.
# NOTE(review): this variable is not referenced again in the visible notebook, and
# the data-loading cell reads from /kaggle/input/ai4bharat-indicsentiment instead --
# confirm which data source is actually intended.
artemminiailo_medicalconversations2disease_path = kagglehub.dataset_download('artemminiailo/medicalconversations2disease')

print('Data source import complete.')


In [None]:
!pip install datasets transformers bitsandbytes peft > /dev/null

In [None]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset, load_dataset, DatasetDict
from peft import LoraConfig, get_peft_model, PeftConfig
from transformers import (AutoModelForCausalLM,
                          DataCollatorForLanguageModeling,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          Trainer,
                          logging)
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split

In [None]:
# Load the sentiment CSVs from the Kaggle input directory.
train_df = pd.read_csv("/kaggle/input/ai4bharat-indicsentiment/train.csv")
# NOTE(review): this frame is replaced by the train_test_split result a few lines
# below, so the contents of test.csv are never used -- confirm that is intended.
test_df = pd.read_csv("/kaggle/input/ai4bharat-indicsentiment/test.csv")
# Replace every missing value (in any column, not only "label") with "Neutral".
train_df.fillna("Neutral", inplace=True)

# Hold out 10% of the training rows, stratified jointly on label and language;
# random_state is fixed so the split is reproducible.
train_df, test_df = train_test_split(train_df, test_size=0.1, stratify=train_df[["label", "language"]], random_state=42)
print(train_df.shape)

# Wrap both splits as Hugging Face datasets, keyed by split name.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

# 1. Create Label Mapping
# Distinct label values seen in the training split; predict() and evaluate()
# read this global later in the notebook.
unique_labels = dataset["train"].unique("label")
num_labels = len(unique_labels)

In [None]:
# Sanity check: show the label set derived from the training split and its size.
print(f"Unique Labels: {unique_labels}") # Debug print
print(f"Number of labels: {num_labels}") # Debug print

In [None]:
# Define the prompt generation functions
def generate_prompt(sent, lable):
    """Build the training prompt for one labelled sentence.

    The prompt is three lines: the fixed classification instruction, a
    ``text:`` line holding ``sent`` and a ``label:`` line holding ``lable``.
    Leading and trailing whitespace of the assembled prompt is stripped.
    """
    instruction = (
        "Classify the text into 'Positive', 'Negative', "
        "and return the answer as the predicted sentiment."
    )
    prompt_lines = [instruction, f"text: {sent}", f"label: {lable}"]
    return "\n".join(prompt_lines).strip()

def generate_test_prompt(data_point):
    """Build the inference prompt for one piece of text.

    Same layout as the training prompt but with the ``label:`` slot left
    empty; because the result is stripped, it ends in ``"label:"`` with no
    trailing space.
    """
    instruction = (
        "Classify the text into 'Positive', 'Negative', "
        "and return the answer as the predicted sentiment."
    )
    prompt = instruction + "\n" + f"text: {data_point}" + "\n" + "label: "
    return prompt.strip()

In [None]:
# Local Kaggle path of the base checkpoint to load.
base_model_name = "/kaggle/input/llama-3.1/transformers/8b-instruct/2"

# 4-bit NF4 quantization settings handed to from_pretrained: double quantization
# is turned off and the compute dtype is given as "float16".
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

# Load the causal LM with the quantization config above; device placement is
# delegated to the library via device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype="float16",
    quantization_config=bnb_config,
)

# Turn off the KV cache on the model config.
# NOTE(review): presumably done for training -- confirm it is also wanted for the
# generate() calls made elsewhere in this notebook.
model.config.use_cache = False
# NOTE(review): presumably pins the checkpoint's tensor-parallel setting to 1 --
# confirm this attribute is needed for this model.
model.config.pretraining_tp = 1
# Use the config's EOS id as the padding id.
# NOTE(review): if this checkpoint's config lists several EOS ids, pad_token_id
# becomes a list here -- confirm that downstream code accepts that.
model.config.pad_token_id = model.config.eos_token_id

In [None]:
# Load the tokenizer stored alongside the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Reuse the EOS token as the padding token so that padded tokenization
# (padding="max_length" in tokenize_fn) has a pad id to use.
tokenizer.pad_token_id = tokenizer.eos_token_id

In [None]:
# Materialize the held-out split as a DataFrame and pull out its gold labels.
X_test = dataset["test"].to_pandas()
y_true = X_test.loc[:,'label']

In [None]:
def predict(test, model, tokenizer):
    """Generate a sentiment label for every row of ``test``.

    Each value of ``test["sentence"]`` is turned into a prompt with
    ``generate_test_prompt``, the model generates at most two new tokens, and
    the text after the last ``"label:"`` marker is matched (case-insensitive
    substring test) against the module-level ``unique_labels``.

    Args:
        test: pandas DataFrame with a ``"sentence"`` column.
        model: causal language model exposing ``generate``, ``config``,
            ``device`` and ``eval``.
        tokenizer: tokenizer that belongs to ``model``.

    Returns:
        A list with one entry per row of ``test``: the first label from
        ``unique_labels`` found in the model's answer, or ``"none"`` when no
        label matches.
    """
    categories = unique_labels
    model.config.pad_token_id = model.config.eos_token_id
    # generate() does not change the module mode; force eval mode so dropout
    # (e.g. LoRA dropout when called after training) cannot perturb predictions.
    model.eval()

    y_pred = []
    for sentence in tqdm(test["sentence"]):
        # Shared helper keeps the inference prompt identical to the one used
        # by the rest of the notebook.
        prompt = generate_test_prompt(sentence)

        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # Generate output from the model; an explicit pad id avoids relying on
        # generate()'s implicit fallback for models without one.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=2,
                temperature=0.1,
                pad_token_id=tokenizer.eos_token_id,
            )

        # Decode prompt + continuation, then keep only what follows the last
        # "label:" marker, i.e. the model's answer.
        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
        answer = decoded.split("label:")[-1].strip().lower()

        # First known label mentioned in the answer wins; "none" otherwise.
        y_pred.append(
            next((category for category in categories if category.lower() in answer), "none")
        )

    return y_pred

# Baseline: score the model on the held-out split before any fine-tuning.
y_pred = predict(X_test, model, tokenizer)

In [None]:
def evaluate(y_true, y_pred):
    """Print accuracy, per-label accuracy, a classification report and a confusion matrix.

    Labels are converted to integer ids by their position in the module-level
    ``unique_labels``. Any value that is not in that list -- for example the
    ``"none"`` placeholder that ``predict`` emits when the model's answer
    matches no label -- is mapped to ``-1``. Such entries count as errors in
    the overall accuracy and are excluded from the per-class report and
    confusion matrix, which only cover the known label ids.

    Args:
        y_true: iterable of gold labels.
        y_pred: iterable of predicted labels, aligned with ``y_true``.

    Raises:
        ValueError: if ``y_true`` or ``y_pred`` is empty.
    """
    labels = list(unique_labels)
    mapping = {label: idx for idx, label in enumerate(labels)}

    y_true_mapped = np.array([mapping.get(value, -1) for value in y_true], dtype=int)
    y_pred_mapped = np.array([mapping.get(value, -1) for value in y_pred], dtype=int)
    if y_true_mapped.size == 0 or y_pred_mapped.size == 0:
        raise ValueError("evaluate() requires non-empty y_true and y_pred")

    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true_mapped, y_pred=y_pred_mapped)
    print(f'Accuracy: {accuracy:.3f}')

    # Per-label accuracy over the rows whose gold label is that class (this is
    # the class recall). Only known ids are visited, so an unmapped gold label
    # (-1) can never be reported under the name of the last label.
    class_ids = list(range(len(labels)))
    for class_id in class_ids:
        rows = y_true_mapped == class_id
        if not rows.any():
            continue
        label_accuracy = accuracy_score(y_true_mapped[rows], y_pred_mapped[rows])
        print(f'Accuracy for label {labels[class_id]}: {label_accuracy:.3f}')

    # Generate classification report
    class_report = classification_report(
        y_true=y_true_mapped,
        y_pred=y_pred_mapped,
        target_names=labels,
        labels=class_ids,
    )
    print('\nClassification Report:')
    print(class_report)

    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true_mapped, y_pred=y_pred_mapped, labels=class_ids)
    print('\nConfusion Matrix:')
    print(conf_matrix)

evaluate(y_true, y_pred)

In [None]:
def tokenize_fn(examples):
    """Turn a batch of examples into tokenized training prompts.

    Pairs each entry of ``examples["sentence"]`` with the matching entry of
    ``examples["label"]``, renders the pair with ``generate_prompt`` and
    tokenizes the prompts with the module-level ``tokenizer``, padding or
    truncating every prompt to 128 tokens.
    """
    prompts = []
    for sentence, label in zip(examples["sentence"], examples["label"]):
        prompts.append(generate_prompt(sentence, label))

    # Fixed-length encoding: every prompt comes back with exactly max_length ids.
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
        "return_tensors": "pt",
    }
    return tokenizer(prompts, **encoding_options)

# Tokenize both splits with tokenize_fn, in batches of 8 across 4 worker processes.
tokenized_dataset = dataset.map(tokenize_fn, batched=True, batch_size=8, num_proc=4)

# Record each example's number of input ids and order the splits by it.
# NOTE(review): tokenize_fn pads/truncates every example to max_length=128, so
# this length is presumably the same for every row and the sort has no real
# effect -- confirm, or count non-pad tokens instead.
tokenized_dataset = tokenized_dataset.map(
    lambda x: {"length": len(x["input_ids"])},
    num_proc=4
).sort("length")

# Drop everything except the two columns passed on to the trainer.
tokenized_dataset = tokenized_dataset.select_columns(["input_ids", "attention_mask"])

# Language-modeling collator with masked-LM disabled (mlm=False).
# NOTE(review): presumably it derives the labels from input_ids -- confirm.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    """Collect the leaf names of every 4-bit linear layer in ``model``.

    Walks ``model.named_modules()``, keeps the last dotted component of the
    name of each ``bnb.nn.Linear4bit`` instance, drops ``'lm_head'`` if it is
    present and returns the distinct names as a list (order not guaranteed).
    """
    target_cls = bnb.nn.Linear4bit
    leaf_names = {
        qualified_name.split('.')[-1]
        for qualified_name, layer in model.named_modules()
        if isinstance(layer, target_cls)
    }
    # needed for 16 bit
    leaf_names.discard('lm_head')
    return list(leaf_names)

modules = find_all_linear_names(model)
modules

In [None]:
# Directory handed to TrainingArguments as output_dir.
output_dir="llama-3.1-fine-tuned-model"

# LoRA adapter settings: rank 4, alpha 16, 5% dropout, no bias training.
peft_config = LoraConfig(
    r=4,
    lora_alpha=16,
    # The original list named 'v_proj' twice; the duplicate is dropped here,
    # which targets the same set of modules.
    # NOTE(review): the repeated entry may have been meant to be 'q_proj' (or the
    # `modules` list from find_all_linear_names) -- confirm the intended targets.
    target_modules=['k_proj', 'v_proj'],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the quantized base model with the adapters and report how many
# parameters are trainable.
peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

In [None]:
# Training configuration: 3 epochs, per-device batch of 2 with 8 gradient
# accumulation steps (16 examples per optimizer step per device), fp16 enabled.
# Logging and evaluation both run every 5 steps; checkpoints use the "steps"
# strategy with no save_steps given here, and the best checkpoint is reloaded
# when training ends. Hub upload and external reporting are disabled.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    fp16=True,
    # fp16_full_eval=False,  # Disable FP16 for evaluation
    logging_steps=5,
    eval_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to=["none"],
    eval_accumulation_steps=4,
    eval_steps=5
)

# Trainer over the LoRA-wrapped model; the held-out "test" split doubles as the
# evaluation set during training.
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator
)

In [None]:
# Run fine-tuning with the Trainer built in the previous cell.
trainer.train()

In [None]:
# Re-score the held-out split with the LoRA-wrapped model after training and
# print the same metrics as the earlier evaluate() call for comparison.
y_pred = predict(X_test, peft_model, tokenizer)
evaluate(y_true, y_pred)

In [None]:
def predict_sentiment(text, max_new_tokens=128):
    """Classify one piece of text with the module-level ``model`` and ``tokenizer``.

    Args:
        text: the sentence to classify.
        max_new_tokens: upper bound on the number of generated tokens.

    Returns:
        ``"Positive"`` or ``"Negative"`` when that word appears in the model's
        first answer, otherwise ``"Neutral"``.
    """
    sentence = generate_test_prompt(text)
    # Follow the model's own device instead of assuming a CUDA device exists.
    inputs = tokenizer(sentence, return_tensors="pt").to(model.device)

    # Make sure dropout is inactive for inference (relevant after training).
    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,  # Adjust max_new_tokens as needed
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the continuation: splitting the full text on "label:" would
    # pick the wrong segment whenever the input text itself contains "label:".
    prompt_length = inputs["input_ids"].shape[1]
    continuation = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    # Keep the first answer only; a long generation may go on to invent further
    # "text: ... label: ..." pairs.
    output = continuation.split("label:")[0]

    if "Positive" in output:
        return "Positive"
    if "Negative" in output:
        return "Negative"
    return "Neutral"

# Example
example_text = "जीवन अच्छा है।"
predicted_sentiment = predict_sentiment(example_text)

print(f"Sentence: {example_text}")
print(f"Predicted Sentiment: {predicted_sentiment}")