# Obesity Classifier


## Setup

- includes
  - determining the computing device
  - model name
  - csv path
  - destinated json line path

In [4]:
import json
import re
from pprint import pprint
import evaluate
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from huggingface_hub import notebook_login
from peft import LoraConfig, PeftModel, get_peft_model, PeftModel
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    LlamaModel,
    AutoConfig,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from trl import SFTTrainer
from sklearn.model_selection import train_test_split
import numpy as np
 
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
MODEL_NAME = "meta-llama/Llama-3.2-1B" 
CSV_PATH = "ObesityDataSet.csv"
TRAIN_JSON_PATH = "ObesityTrainDataSet.jsonl"
TEST_JSON_PATH = "ObesityTestDataSet.jsonl"
RANDOM_SEED = 42

torch.cuda.empty_cache()
torch.backends.cudnn.benchmark = True
torch.cuda.reset_max_memory_allocated()

## hugging face log in

- hugging face requires login for access

In [2]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svâ€¦

## data processing

- convert the csv file to json line file
- split the data into train data and test data, 8:2 ratio

In [5]:

# label maps
id2label = {0:"Insufficient_Weight", 1:"Normal_Weight" ,2:"Overweight_Level_I" , 3:"Overweight_Level_II" , 4:"Obesity_Type_I" , 5:"Obesity_Type_II" , 6:"Obesity_Type_III" }
label2id = {v:k for k,v in id2label.items()}

df = pd.read_csv(CSV_PATH)
df = df.rename(columns={"NObeyesdad": "label"})
train_df, test_df = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)

# Convert label column from string to integers using label2id
train_df["label"] = train_df["label"].map(label2id)
test_df["label"] = test_df["label"].map(label2id)

train_df.to_json(TRAIN_JSON_PATH, orient="records", lines=True)
test_df.to_json(TEST_JSON_PATH, orient="records", lines=True)

## load dataset

- load both test and train datasets into dataset variable

In [6]:
dataset = load_dataset("json", data_files={"train": TRAIN_JSON_PATH, "test": TEST_JSON_PATH})
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'label'],
        num_rows: 1688
    })
    test: Dataset({
        features: ['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'label'],
        num_rows: 423
    })
})

## loading model & tokenizer & data collator

In [7]:
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, 
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id
    ).to(DEVICE)

model

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
   

In [8]:
# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, add_prefix_space=True)

# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    model.resize_token_embeddings(len(tokenizer))


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [9]:
# Tokenize the dataset
def tokenize_function(examples):
    texts = [
        f"gender: {g}, age: {a}, height: {h}, weight: {w}, family_history_with_overweight: {f}, favc: {favc}, fcvc: {fcvc}, ncp: {ncp}, caec: {caec}, smoke: {s}, ch2o: {ch2o}, scc: {scc}, faf: {faf}, tue: {tue}, calc: {calc}, mtrans: {mtrans}"
        for g, a, h, w, f, favc, fcvc, ncp, caec, s, ch2o, scc, faf, tue, calc, mtrans in zip(
            examples["Gender"], 
            examples["Age"], 
            examples["Height"], 
            examples["Weight"], 
            examples["family_history_with_overweight"], 
            examples["FAVC"], 
            examples["FCVC"], 
            examples["NCP"], 
            examples["CAEC"], 
            examples["SMOKE"], 
            examples["CH2O"], 
            examples["SCC"], 
            examples["FAF"], 
            examples["TUE"], 
            examples["CALC"], 
            examples["MTRANS"]
        )
    ]

    encoding = tokenizer(
        texts, 
        truncation=True, 
        padding="max_length", 
        max_length=512
    )
    
    # Convert labels to PyTorch tensor
    encoding["labels"] = [label for label in examples["label"]]
    
    return encoding


In [10]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [11]:
print(dataset["train"][0])

{'Gender': 'Female', 'Age': 21.0, 'Height': 1.63, 'Weight': 60.0, 'family_history_with_overweight': 'yes', 'FAVC': 'yes', 'FCVC': 3.0, 'NCP': 3.0, 'CAEC': 'Always', 'SMOKE': 'yes', 'CH2O': 2.0, 'SCC': 'no', 'FAF': 2.0, 'TUE': 0.0, 'CALC': 'Sometimes', 'MTRANS': 'Public_Transportation', 'label': 1}


## tokenize the dataset

In [12]:
# tokenize dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/1688 [00:00<?, ? examples/s]

Map:   0%|          | 0/423 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'label', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1688
    })
    test: Dataset({
        features: ['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'label', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 423
    })
})

## evaluate function

In [13]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = torch.tensor(predictions).to(DEVICE)
    labels = torch.tensor(labels).to(DEVICE)

    predictions = torch.argmax(predictions, dim=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

## test data processing

In [24]:
def dict_to_string(data):
    if isinstance(data, str):
        data = json.loads(data)
    
    if not data:
        return ""
    
    formatted_string =  ", ".join([f"{k}: {v}" for k, v in data.items() if k != "label"])
    return {"input": formatted_string}

test_inputs = dataset["test"].map(dict_to_string)
test_inputs

Dataset({
    features: ['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'label', 'input'],
    num_rows: 423
})

## testing untrained model

In [13]:
print("Untrained model predictions:")
print("--------------------------")
isCorret_untrained = 0;
total_untrained = 0;
accuracy_untrained = 0;
for text in test_inputs:
    total_untrained += 1
    inputs = tokenizer.encode(text["input"], return_tensors="pt").to(DEVICE)
    logits = model(inputs).logits
    predictions = torch.argmax(logits, dim=1)
    if predictions == text["label"]:
        isCorret_untrained += 1

accuracy_untrained = isCorret_untrained / total_untrained
print(f"Accuracy: {accuracy_untrained}")

Untrained model predictions:
--------------------------
Accuracy: 0.15130023640661938


## Train model

In [15]:
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj']
)
peft_config

LoraConfig(task_type='SEQ_CLS', peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=8, target_modules={'q_proj', 'o_proj', 'k_proj', 'v_proj'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, eva_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)

In [16]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 1,718,272 || all params: 1,237,549,056 || trainable%: 0.1388


In [17]:
# hyperparameters
lr = 1e-4
batch_size = 1
num_epochs = 2

# Explicitly set padding token in the model config
model.config.pad_token_id = tokenizer.pad_token_id

In [18]:
# define training arguments
training_args = TrainingArguments(
    output_dir= MODEL_NAME + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=8,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    eval_strategy = "epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16=True,
    seed=RANDOM_SEED,
)

In [19]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)

  trainer = Trainer(


In [20]:
model.gradient_checkpointing_enable()

In [21]:
print("Pad token ID in tokenizer:", tokenizer.pad_token_id)
print("Pad token ID in model config:", model.config.pad_token_id)

Pad token ID in tokenizer: 128256
Pad token ID in model config: 128256


In [22]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.786767,{'accuracy': 0.2576832151300236}
2,No log,1.681527,{'accuracy': 0.3877068557919622}




TrainOutput(global_step=422, training_loss=1.7925796870371742, metrics={'train_runtime': 336.2494, 'train_samples_per_second': 10.04, 'train_steps_per_second': 1.255, 'total_flos': 1.011053739638784e+16, 'train_loss': 1.7925796870371742, 'epoch': 2.0})

In [23]:
# save model
model.save_pretrained(MODEL_NAME + "-lora-text-classification")



## testing trained model

In [25]:
print("trained model predictions:")
print("--------------------------")
isCorret_trained = 0;
total_trained = 0;
accuracy_trained = 0;
for text in test_inputs:
    total_trained += 1
    inputs = tokenizer.encode(text["input"], return_tensors="pt").to(DEVICE)
    logits = model(inputs).logits
    predictions = torch.argmax(logits, dim=1)
    if predictions == text["label"]:
        isCorret_trained += 1

accuracy_trained = isCorret_trained / total_trained
print(f"Accuracy: {accuracy_trained}")

trained model predictions:
--------------------------
Accuracy: 0.3049645390070922
