# Obesity Classifier


## Setup

- includes
  - determining the computing device
  - model name
  - csv path
  - destinated json line path

In [88]:
import json
import re
from pprint import pprint
import evaluate
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from huggingface_hub import notebook_login
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    LlamaModel,
    AutoConfig,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments
)
from trl import SFTTrainer
from sklearn.model_selection import train_test_split
import numpy as np
 
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
MODEL_NAME = "meta-llama/Llama-3.2-1B" 
CSV_PATH = "ObesityDataSet.csv"
TRAIN_JSON_PATH = "ObesityTrainDataSet.jsonl"
TEST_JSON_PATH = "ObesityTestDataSet.jsonl"
RANDOM_SEED = 42

## hugging face log in

- hugging face requires login for access

In [9]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## data processing

- convert the csv file to json line file
- split the data into train data and test data, 8:2 ratio

In [10]:

df = pd.read_csv(CSV_PATH)
df = df.rename(columns={"NObeyesdad": "label"})
train_df, test_df = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)

train_df.to_json(TRAIN_JSON_PATH, orient="records", lines=True)
test_df.to_json(TEST_JSON_PATH, orient="records", lines=True)

## load dataset

- load both test and train datasets into dataset variable

In [11]:
dataset = load_dataset("json", data_files={"train": TRAIN_JSON_PATH, "test": TEST_JSON_PATH})
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'label'],
        num_rows: 1688
    })
    test: Dataset({
        features: ['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'label'],
        num_rows: 423
    })
})

## loading model & tokenizer & data collator

In [None]:
# label maps
id2label = {0:"Insufficient_Weight", 1:"Normal_Weight" ,2:"Overweight_Level_I" , 3:"Overweight_Level_II" , 4:"Obesity_Type_I" , 5:"Obesity_Type_II" , 6:"Obesity_Type_III" }
label2id = {"Insufficient_Weight":0, "Normal_Weight":1 ,"Overweight_Level_I":2 , "Overweight_Level_II":3 , "Obesity_Type_I":4 , "Obesity_Type_II":5 , "Obesity_Type_III":6 }

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, 
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id
    )

model

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
   

In [78]:
# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, add_prefix_space=True)

# add pad token if none exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    model.resize_token_embeddings(len(tokenizer))



In [82]:
## tokenize function
def tokenize_function(examples):
    
    # Create a list of formatted text sequences
    texts = [
        f"gender: {g}, age: {a}, height: {h}, weight: {w}, family_history_with_overweight: {f}, favc: {favc}, fcvc: {fcvc}, ncp: {ncp}, caec: {caec}, smoke: {s}, ch2o: {ch2o}, scc: {scc}, faf: {faf}, tue: {tue}, calc: {calc}, mtrans: {mtrans}"
        for g, a, h, w, f, favc, fcvc, ncp, caec, s, ch2o, scc, faf, tue, calc, mtrans in zip(
            examples["Gender"], 
            examples["Age"], 
            examples["Height"], 
            examples["Weight"], 
            examples["family_history_with_overweight"], 
            examples["FAVC"], 
            examples["FCVC"], 
            examples["NCP"], 
            examples["CAEC"], 
            examples["SMOKE"], 
            examples["CH2O"], 
            examples["SCC"], 
            examples["FAF"], 
            examples["TUE"], 
            examples["CALC"], 
            examples["MTRANS"]
        )
    ]

    # Tokenize the batch
    return tokenizer(
        texts, 
        truncation=True, 
        padding="max_length", 
        max_length=512, 
        return_tensors="np" # Return PyTorch tensors
    )


In [83]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## tokenize the train dataset

In [84]:
# tokenize dataset
tokenized_dataset = dataset["train"].map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/1688 [00:00<?, ? examples/s]

Dataset({
    features: ['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1688
})

## evaluate function

In [89]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

## testing untrained model

In [None]:
text_list = ["It was good.", "Not a fan, don't recommed.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]

print("Untrained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt")
    logits = model(inputs).logits
    predictions = torch.argmax(logits)
    print(text + ":", id2label[predictions.tolist()])

Untrained model predictions:
--------------------------


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)