In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Upgrade transformers and accelerate in the kernel's environment before they are imported below.
# NOTE(review): versions are unpinned, so a later run may resolve to different releases.
%pip install -U transformers accelerate

In [None]:
# List the contents of the Llama 3.1 8B Instruct checkpoint directory used as base_model below.
!ls /kaggle/input/llama-3.1/transformers/8b-instruct/1/

In [None]:
from transformers import AutoTokenizer,AutoModelForCausalLM,pipeline
import torch

# Local, read-only checkpoint directory of Llama 3.1 8B Instruct on Kaggle.
base_model = "/kaggle/input/llama-3.1/transformers/8b-instruct/1"

tokenizer = AutoTokenizer.from_pretrained(base_model)

# Load the weights in half precision and let accelerate place them on the
# available devices. trust_remote_code is intentionally not enabled: Llama is a
# built-in transformers architecture, so there is no reason to allow code
# shipped inside a checkpoint directory to execute.
model = AutoModelForCausalLM.from_pretrained(
        base_model,
        return_dict=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        device_map="auto",
)

# The model object is already instantiated with its dtype and device map, so the
# pipeline only needs the model and tokenizer; repeating torch_dtype/device_map
# here would not reload or move anything.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

In [None]:
# Report whether PyTorch can see a CUDA device, and which one sits at index 0.
if not torch.cuda.is_available():
    print("GPU is not available, using CPU.")
else:
    print("GPU is available!")
    print(f"Device Name: {torch.cuda.get_device_name(0)}")


In [None]:
# Single-turn chat used as a quick smoke test of the loaded model.
messages = [{"role": "user", "content": "What is the tallest building in the world?"}]

# Render the chat as a plain-text prompt (tokenize=False); add_generation_prompt=True
# asks the chat template to append the opening of the assistant's turn.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# Sampling is enabled, so the generated text can differ between runs.
outputs = pipe(prompt, max_new_tokens=120, do_sample=True)
print(outputs[0]["generated_text"])

In [None]:
# Install/upgrade the fine-tuning stack used by the cells below
# (bitsandbytes, peft and trl are imported in the next import cell).
# NOTE(review): versions are unpinned, and transformers/accelerate were already
# upgraded in an earlier cell; pin versions if this needs to be reproducible.
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U accelerate
%pip install -U peft
%pip install -U trl

In [None]:
import wandb

from kaggle_secrets import UserSecretsClient
# Read the Weights & Biases API key from Kaggle's secret store (secret name "wandb")
# so that the key never appears in the notebook itself.
user_secrets = UserSecretsClient()

wb_token = user_secrets.get_secret("wandb")

# Authenticate, then open the run that the trainer later reports to (report_to="wandb").
wandb.login(key=wb_token)
run = wandb.init(
    project='Fine-tune llama-3.1-8b-it on Sentiment Analysis Dataset', 
    job_type="training", 
    anonymous="allow"
)

In [None]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split

# Reaching this line means every import in this cell succeeded.
print("dependencies installed")

In [None]:
import pandas as pd
import os
import gc

# Filter the raw CSV chunk by chunk and persist the result as one parquet file,
# so later cells can reload the (smaller) filtered data from disk.
input_path = "/kaggle/input/sentiment-analysis-for-mental-health/Combined Data.csv"
output_path = "/kaggle/working/Filtered_Data.parquet"

chunksize = 50000  # rows read from the CSV per iteration

# Collect the filtered chunks and write the file exactly once. The output is
# always rebuilt from scratch, so re-running this cell cannot append duplicate
# rows to a file left over from a previous run, and the parquet file is not
# re-read and re-written for every chunk.
filtered_chunks = []
for chunk in pd.read_csv(input_path, index_col="Unnamed: 0", chunksize=chunksize):
    # Normalise the spelling of the Bipolar label, then drop the excluded statuses.
    chunk["status"] = chunk["status"].str.replace("Bi-Polar", "Bipolar")
    filtered_chunks.append(
        chunk[~chunk["status"].isin(["Personality disorder", "Stress", "Suicidal"])]
    )

if not filtered_chunks:
    raise ValueError(f"No rows were read from {input_path}")

filtered_df = pd.concat(filtered_chunks, ignore_index=True)
filtered_df.to_parquet(output_path, compression="gzip", index=False, engine="pyarrow")

# Release the in-memory copies; the data is reloaded from disk in the next cell.
del chunk, filtered_chunks, filtered_df
gc.collect()

print("df loaded to disk")

In [None]:
import pandas as pd

# Reload the filtered data that the previous cell wrote to disk.
output_path = "/kaggle/working/Filtered_Data.parquet"

df = pd.read_parquet(output_path, engine="pyarrow")

# Quick look at the first rows to confirm the load worked.
print(df.head())


In [None]:
# We should emphasize empathetic care when the status is Depression, Anxiety, or Bipolar.
def map_to_empathy_label(row):
    """Map a record's mental-health status to the binary empathy label.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a 'status' entry.

    Returns:
        'Empathy' when the status is Depression, Anxiety or Bipolar;
        'No Empathy' for any other value.
    """
    needs_empathy = row['status'] in ('Depression', 'Anxiety', 'Bipolar')
    return 'Empathy' if needs_empathy else 'No Empathy'

# Derive the binary target column row by row from each record's 'status'.
df['empathy_label'] = df.apply(map_to_empathy_label, axis=1)

def generate_prompt(data_point):
    """Build the training prompt for one record, including its gold label.

    Args:
        data_point: Mapping-like record with "statement" and "empathy_label" entries.

    Returns:
        str: the instruction line, a "text:" line and a "label:" line, with
        leading and trailing whitespace removed.
    """
    filled_template = f"""
            Classify the text into Empathy or No Empathy, and return the answer as the corresponding label.
text: {data_point["statement"]}
label: {data_point["empathy_label"]}"""
    # strip() removes the newline and indentation that precede the instruction line.
    return filled_template.strip()

def generate_test_prompt(data_point):
    """Build the inference prompt for one record, leaving the label blank.

    Args:
        data_point: Mapping-like record with a "statement" entry.

    Returns:
        str: the instruction line and a "text:" line, followed by "label:".
        The final strip() also removes the space after "label:".
    """
    filled_template = f"""
            Classify the text into Empathy or No Empathy, and return the answer as the corresponding label.
text: {data_point["statement"]}
label: """
    return filled_template.strip()


# Shuffle reproducibly (fixed random_state) and keep the first 1500 rows.
df = df.sample(frac=1, random_state=85).reset_index(drop=True).head(1500)

# 80% train, 10% eval; whatever remains becomes the test set.
train_size = 0.8
eval_size = 0.1

train_end = int(train_size * len(df))
eval_end = train_end + int(eval_size * len(df))

# Copy the train/eval slices so the prompt column is added to independent frames
# instead of to slices of df (which triggers pandas' SettingWithCopyWarning).
X_train = df[:train_end].copy()
X_eval = df[train_end:eval_end].copy()
X_test = df[eval_end:]

# Train/eval prompts contain the gold label; test prompts stop at "label:".
X_train['text'] = X_train.apply(generate_prompt, axis=1)
X_eval['text'] = X_eval.apply(generate_prompt, axis=1)
# Keep the gold test labels before X_test is replaced by a prompt-only frame.
y_true = X_test.loc[:, 'empathy_label']
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])

# Class balance of the training split (displayed as the cell output).
X_train['empathy_label'].value_counts()


In [5]:
# Convert the train and eval splits to datasets.Dataset objects,
# keeping only the prompt column ("text").
train_data = Dataset.from_pandas(X_train[["text"]])
eval_data = Dataset.from_pandas(X_eval[["text"]])

In [None]:
import gc

base_model_name = "/kaggle/input/llama-3.1/transformers/8b-instruct/1"

# The fp16 demo model loaded earlier is still referenced by both `model` and
# `pipe`. Drop those references and empty the CUDA cache before loading the
# 4-bit copy, so two copies of the weights are not resident at the same time.
globals().pop("pipe", None)
globals().pop("model", None)
gc.collect()
torch.cuda.empty_cache()

# 4-bit NF4 quantization with float16 compute and no double quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype="float16",
    quantization_config=bnb_config, 
)

# The cache is switched off for training; it is switched back on after training.
model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token_id = tokenizer.eos_token_id

In [None]:
def predict(test, model, tokenizer, max_new_tokens=2):
    """Generate a label for every prompt in ``test`` and map it to a known category.

    Args:
        test: DataFrame-like object with a "text" column of prompts; rows are
            read positionally via ``test.iloc[i]["text"]``.
        model: Model handed to the text-generation pipeline.
        tokenizer: Tokenizer handed to the text-generation pipeline.
        max_new_tokens: Generation budget per prompt. Defaults to 2, the value
            this notebook has always used.
            NOTE(review): a label that tokenizes to more than ``max_new_tokens``
            tokens would be cut short and fall through to "none" - confirm that
            2 is enough for "No Empathy".

    Returns:
        list[str]: one entry per row - "Empathy", "No Empathy", or "none" when
        the text after the last "label:" contains neither category.
    """
    categories = ["Empathy", "No Empathy"]
    # "empathy" is a substring of "no empathy", so the longest names must be
    # tested first; otherwise every "No Empathy" answer is reported as "Empathy".
    match_order = sorted(categories, key=len, reverse=True)

    # The pipeline does not depend on the row, so build it once, not per prompt.
    pipe = pipeline(task="text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    max_new_tokens=max_new_tokens,
                    temperature=0.1)

    y_pred = []
    for i in tqdm(range(len(test))):
        prompt = test.iloc[i]["text"]
        result = pipe(prompt)
        # The generated text echoes the prompt; the answer follows the last "label:".
        answer = result[0]['generated_text'].split("label:")[-1].strip().lower()
        matched = next(
            (category for category in match_order if category.lower() in answer),
            "none",
        )
        y_pred.append(matched)

    return y_pred

# Baseline: predictions from the model before any fine-tuning has been run.
y_pred = predict(X_test, model, tokenizer)


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

def evaluate(y_true, y_pred):
    """Print accuracy, per-label accuracy, a classification report and a confusion matrix.

    Labels are encoded as 0 ("Empathy") and 1 ("No Empathy"); any other value,
    such as the "none" fallback produced by predict, is encoded as -1.

    Args:
        y_true: Iterable of gold label strings.
        y_pred: Iterable of predicted label strings, aligned with ``y_true``.

    Returns:
        None. All results are printed.
    """
    labels = ["Empathy", "No Empathy"]
    label_to_id = {label: idx for idx, label in enumerate(labels)}
    known_ids = list(range(len(labels)))

    y_true_mapped = np.array([label_to_id.get(value, -1) for value in y_true], dtype=int)
    y_pred_mapped = np.array([label_to_id.get(value, -1) for value in y_pred], dtype=int)

    accuracy = accuracy_score(y_true=y_true_mapped, y_pred=y_pred_mapped)
    print(f'Accuracy: {accuracy:.3f}')

    # Per-label accuracy over the rows whose gold label is that label. Ids are
    # visited in sorted order so the output is stable, and the -1 bucket is
    # named explicitly instead of indexing labels[-1], which is "No Empathy".
    for label_id in sorted(set(y_true_mapped.tolist())):
        mask = y_true_mapped == label_id
        label_accuracy = accuracy_score(y_true_mapped[mask], y_pred_mapped[mask])
        label_name = labels[label_id] if label_id >= 0 else "unknown"
        print(f'Accuracy for label {label_name}: {label_accuracy:.3f}')

    # Only the known label ids are passed on, so the -1 bucket gets no row or
    # column of its own in the report and the matrix.
    class_report = classification_report(y_true=y_true_mapped, y_pred=y_pred_mapped, target_names=labels, labels=known_ids)
    print('\nClassification Report:')
    print(class_report)

    conf_matrix = confusion_matrix(y_true=y_true_mapped, y_pred=y_pred_mapped, labels=known_ids)
    print('\nConfusion Matrix:')
    print(conf_matrix)


# Score the predictions made in the previous cell against the held-out gold labels.
evaluate(y_true, y_pred)


In [None]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    """Collect the leaf names of every bitsandbytes ``Linear4bit`` submodule.

    Args:
        model: Object exposing ``named_modules()`` that yields (name, module) pairs.

    Returns:
        list[str]: the unique last components of the dotted module names,
        with 'lm_head' excluded. Order is not specified.
    """
    target_cls = bnb.nn.Linear4bit
    # The last dotted component is the whole name when there is no dot at all.
    leaf_names = {
        qualified_name.split('.')[-1]
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, target_cls)
    }
    leaf_names.discard('lm_head')
    return list(leaf_names)
# Module names that are later passed to LoraConfig as target_modules.
modules = find_all_linear_names(model)
# Display them as the cell output.
modules

In [None]:
# pre-process data 

# to satisfy max_seq_length req for trainer
def preprocess_function(example):
    """Tokenize one example's "text" field with the notebook-level tokenizer.

    Args:
        example: Mapping with a "text" entry.

    Returns:
        Whatever the tokenizer returns when called with truncation=True and
        max_length=512.
    """
    text = example["text"]
    encoded = tokenizer(text, truncation=True, max_length=512)
    return encoded

# Keep a seeded random half of each split, then tokenize only the rows that are
# kept. The shuffle is applied to a dataset of the same length as before, so
# the selected rows and their order are unchanged; the discarded half is simply
# no longer tokenized first.
train_data = train_data.shuffle(seed=42).select(range(int(len(train_data) * 0.5)))
eval_data = eval_data.shuffle(seed=42).select(range(int(len(eval_data) * 0.5)))

train_data = train_data.map(preprocess_function)
eval_data = eval_data.map(preprocess_function)


# Persist the full (not subsampled) pandas splits for later inspection or reuse.
X_train.to_parquet("/kaggle/working/X_train.parquet", compression='gzip')
X_eval.to_parquet("/kaggle/working/X_eval.parquet", compression='gzip')
print("x_train and x_eval written to disk")

In [None]:
# Directory where the trainer writes its outputs; the fine-tuned model and
# tokenizer are saved to the same place later.
output_dir="llama-3.1-fine-tuned-model"

# LoRA adapter configuration; the adapters target the module names collected
# by find_all_linear_names.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules,
)

training_arguments = TrainingArguments(
    output_dir=output_dir,                    
    num_train_epochs=1,                       
    per_device_train_batch_size=1, 
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,            # with batch size 1: 8 examples per optimizer step per device
    gradient_checkpointing=True,              
    optim="paged_adamw_32bit",
    logging_steps=1,                         
    learning_rate=2e-4,                       
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,                        
    max_steps=-1,                             # presumably "no step cap", leaving num_train_epochs in control - confirm
    warmup_ratio=0.03,                        
    group_by_length=False,
    lr_scheduler_type="cosine",               
    report_to="wandb",                  
    eval_strategy="steps",              
    # NOTE(review): a float below 1 is presumably read as a fraction of the total
    # training steps - confirm against the TrainingArguments documentation.
    eval_steps = 0.2
)

# adjusted for build
# The tokenizer is passed as processing_class; train_data and eval_data were
# already tokenized by preprocess_function.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    processing_class=tokenizer,  
)


In [None]:
# need to free up memory in cuda 
import os
import torch
import gc

# reduce fragmentation to help out with RAM
# NOTE(review): the model has already been loaded onto the GPU by this point, so
# the CUDA allocator may already have read its configuration - confirm whether
# this setting still takes effect here, or set it before torch first uses the GPU.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Collect unreachable Python objects first, so the CUDA blocks they held are
# actually free by the time the cache is emptied.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Run the fine-tuning with the trainer configured above.
trainer.train()
print("training complete")

In [None]:
import sys
import pandas as pd

# Rank the notebook's global names by sys.getsizeof and show the ten largest.
# NOTE(review): sys.getsizeof only reports what each object's own __sizeof__
# returns, so objects whose data lives elsewhere (nested containers, GPU
# tensors behind a model) are under-reported - treat this as a rough guide.
vars_in_mem = {k: sys.getsizeof(v) for k, v in globals().items()}
sorted_vars = sorted(vars_in_mem.items(), key=lambda x: x[1], reverse=True)

pd.DataFrame(sorted_vars[:10], columns=["Variable", "Size (bytes)"])


In [None]:
# Finish the Weights & Biases run.
wandb.finish()
# Re-enable the cache that was switched off (use_cache = False) before training.
model.config.use_cache = True

In [None]:
# Save the trained model and the tokenizer side by side in output_dir.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print("fintuned model and tokenizer written to disk")

In [None]:
# Re-run the same test prompts after training and print the same metrics,
# for comparison with the baseline evaluation earlier in the notebook.
# NOTE(review): this passes the `model` variable rather than `trainer.model`;
# confirm that `model` carries the trained LoRA adapters at this point.
y_pred = predict(X_test, model, tokenizer)
evaluate(y_true, y_pred)