# Obesity Classifier


## Setup

- includes
  - determining the computing device
  - model name
  - csv path
  - destination JSON Lines paths (train and test)

In [1]:
import json
import re
from pprint import pprint
 
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from huggingface_hub import notebook_login
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    LlamaModel
)
from trl import SFTTrainer
from sklearn.model_selection import train_test_split
 
# --- Notebook configuration ---------------------------------------------------
# One seed shared by the train/test split and by torch.
RANDOM_SEED = 42
# Seed torch up front: the sequence-classification head is newly initialized
# when the model is loaded, so without a seed it would differ on every run.
torch.manual_seed(RANDOM_SEED)

# Compute device: the first CUDA GPU when torch can see one, otherwise the CPU.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

# Hugging Face Hub id of the base checkpoint loaded later in the notebook.
MODEL_NAME = "meta-llama/Llama-3.2-1B"

# Source CSV and the JSON Lines files written for the train and test splits.
CSV_PATH = "ObesityDataSet.csv"
TRAIN_JSON_PATH = "ObesityTrainDataSet.jsonl"
TEST_JSON_PATH = "ObesityTestDataSet.jsonl"



## data processing

- convert the CSV file to JSON Lines files (one per split)
- split the data into train and test sets in an 8:2 ratio

In [2]:

# Load the raw CSV and rename the target column to "label".
# errors="raise" makes a missing "NObeyesdad" column fail here instead of
# silently leaving the frame without a "label" column.
df = pd.read_csv(CSV_PATH)
df = df.rename(columns={"NObeyesdad": "label"}, errors="raise")

# 80/20 split, seeded for reproducibility and stratified on the label so both
# splits keep the same class proportions.
# NOTE(review): stratify requires at least two rows per class - confirm this
# holds for the CSV in use.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=RANDOM_SEED,
    stratify=df["label"],
)
assert len(train_df) + len(test_df) == len(df), "split lost or duplicated rows"

# Write one JSON object per line.
train_df.to_json(TRAIN_JSON_PATH, orient="records", lines=True)
test_df.to_json(TEST_JSON_PATH, orient="records", lines=True)

## load dataset

- load both the train and test splits into the `dataset` variable

In [3]:
# Read both JSON Lines files with the "json" loader; the keys of `data_files`
# become the split names of the resulting object.
dataset = load_dataset(
    "json",
    data_files={
        "train": TRAIN_JSON_PATH,
        "test": TEST_JSON_PATH,
    },
)
# Show the loaded splits as the cell output.
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'label'],
        num_rows: 1688
    })
    test: Dataset({
        features: ['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'label'],
        num_rows: 423
    })
})

## hugging face log in

- Hugging Face requires a login before the model can be downloaded

In [6]:
# Show the Hugging Face login widget in the notebook so the session is
# authenticated before the model is downloaded.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## loading model & tokenizer

In [7]:
# Lookup table from class index to class name for the seven weight categories.
id2label = {
    0: "Insufficient_Weight",
    1: "Normal_Weight",
    2: "Overweight_Level_I",
    3: "Overweight_Level_II",
    4: "Obesity_Type_I",
    5: "Obesity_Type_II",
    6: "Obesity_Type_III",
}
# Inverse table (class name -> class index), derived from id2label so the two
# mappings cannot drift apart.
label2id = {name: idx for idx, name in id2label.items()}

# Load the base checkpoint with a sequence-classification head sized to the
# number of labels, and attach both label mappings.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Display the module tree as the cell output.
model

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
   