1. Library yang digunakan

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import pickle

2. Load Dataset

In [3]:
def load_dataset(filepath):
    """Read the CSV file at ``filepath`` into a pandas DataFrame.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.

    Raises
    ------
    Exception
        Whatever ``pd.read_csv`` raised. The error is first reported on
        stdout and then re-raised unchanged.
    """
    try:
        frame = pd.read_csv(filepath)
    except Exception as err:
        # Report the problem, but let the caller decide how to react.
        print(f"Error loading dataset: {err}")
        raise
    else:
        return frame

3. Data Preprocessing

In [4]:
def preprocess_data(data):
    """Impute, encode and scale the dataset, returning the fitted transformers.

    The work is done on a copy of ``data``, so the caller's DataFrame is left
    untouched and the function gives the same result when it is called again
    on the same raw frame.

    Steps, in order:
      1. Fill missing numerical values with each column's median and missing
         categorical values with each column's (first) mode.
      2. Label-encode every categorical column, one encoder per column.
      3. Standardize the numerical columns with a single ``StandardScaler``.
      4. Label-encode the target column ``NObeyesdad``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the categorical and numerical columns listed in the
        body, plus the target column ``NObeyesdad``.

    Returns
    -------
    tuple
        ``(processed, label_encoders, scaler, target_encoder)`` where
        ``processed`` is the transformed copy of ``data``,
        ``label_encoders`` maps each categorical column name to its fitted
        ``LabelEncoder``, ``scaler`` is the fitted ``StandardScaler`` and
        ``target_encoder`` is the ``LabelEncoder`` fitted on ``NObeyesdad``.

    Raises
    ------
    KeyError
        If any of the expected columns is missing from ``data``.
    """
    # Define categorical and numerical columns
    categorical_cols = ['Gender', 'CALC', 'FAVC', 'SMOKE', 'family_history_with_overweight', 'CAEC', 'MTRANS']
    numerical_cols = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']

    # Never modify the caller's frame: every assignment below targets this copy.
    data = data.copy()

    # Handle missing values: median for numerical columns, first mode for categorical ones
    data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].median())
    data[categorical_cols] = data[categorical_cols].fillna(data[categorical_cols].mode().iloc[0])

    # Encode categorical columns, keeping each fitted encoder for later reuse
    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col])
        label_encoders[col] = le

    # Standardize numerical columns.
    # NOTE(review): the scaler (and the encoders above) are fitted on every row
    # passed in; if this runs before a train/test split, test rows contribute
    # to the fitted statistics.
    scaler = StandardScaler()
    data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

    # Encode target column
    target_encoder = LabelEncoder()
    data['NObeyesdad'] = target_encoder.fit_transform(data['NObeyesdad'])

    return data, label_encoders, scaler, target_encoder

4. Split Data

In [5]:
def split_data(data, test_size=0.3, random_state=42):
    """Split the preprocessed frame into train and test features/targets.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the fifteen feature columns listed in the body and
        the target column ``NObeyesdad``.
    test_size : float or int, optional
        Forwarded to ``train_test_split``. Defaults to ``0.3``, the value that
        was previously hard-coded.
    random_state : int or None, optional
        Forwarded to ``train_test_split`` to make the shuffle reproducible.
        Defaults to ``42``, the value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as returned by
        ``train_test_split``.

    Raises
    ------
    KeyError
        If a feature column or the target column is missing from ``data``.
    """
    features = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE', 'Gender', 'CALC', 'FAVC', 'SMOKE',
                'family_history_with_overweight', 'CAEC', 'MTRANS']
    X = data[features]
    y = data['NObeyesdad']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

5. Model Latih

In [6]:
def train_model(X_train, y_train):
    """Fit a random forest classifier on the training split.

    The forest uses 100 trees and a fixed ``random_state`` of 42 so that
    repeated runs on the same data build the same model.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training targets aligned with ``X_train``.

    Returns
    -------
    RandomForestClassifier
        The fitted classifier.
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    # ``fit`` returns the estimator itself, so the fitted forest is handed back.
    return forest.fit(X_train, y_train)

6. Save Model

In [7]:
def save_model_pickle(model, label_encoders, scaler, target_encoder, filepath):
    """Pickle the model together with its preprocessing objects into one file.

    The four objects are stored in a single dictionary under the keys
    ``"model"``, ``"label_encoders"``, ``"scaler"`` and ``"target_encoder"``.

    Parameters
    ----------
    model : object
        Trained estimator to persist.
    label_encoders : object
        Fitted encoders for the categorical feature columns.
    scaler : object
        Fitted scaler for the numerical feature columns.
    target_encoder : object
        Fitted encoder for the target column.
    filepath : str or path-like
        Destination file; it is opened in binary write mode and overwritten.
    """
    bundle = {
        "model": model,
        "label_encoders": label_encoders,
        "scaler": scaler,
        "target_encoder": target_encoder
    }
    with open(filepath, 'wb') as out_file:
        pickle.dump(bundle, out_file)

7. Load Model Pickle

In [8]:
def load_model_pickle(filepath):
    """Load and return the object stored in a pickle file.

    Parameters
    ----------
    filepath : str or path-like
        File to read; it is opened in binary read mode.

    Returns
    -------
    object
        Whatever object was pickled into the file.

    Notes
    -----
    Unpickling can execute arbitrary code, so only call this on files that
    come from a trusted source.
    """
    with open(filepath, 'rb') as in_file:
        bundle = pickle.load(in_file)
    return bundle

8. Main script

In [9]:
# End-to-end pipeline: load the CSV, preprocess it, split it, train the model,
# print the evaluation on the held-out split, and persist the model together
# with its fitted preprocessing objects.
if __name__ == "__main__":
    # Load data
    filepath = "ObesityDataSet_raw_and_data_sinthetic.csv"  # Replace with the correct path to your file
    try:
        data = load_dataset(filepath)
    except Exception as e:
        print(f"Dataset could not be loaded: {e}")
        # Stop with a non-zero status so the failure is visible to whatever
        # launched the script. The `exit()` helper is injected by the `site`
        # module for interactive use, and called without arguments it reports
        # success.
        raise SystemExit(1)

    # Preprocess data
    processed_data, label_encoders, scaler, target_encoder = preprocess_data(data)

    # Split data
    X_train, X_test, y_train, y_test = split_data(processed_data)

    # Train model
    model = train_model(X_train, y_train)

    # Evaluate model on the held-out split
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy: {accuracy * 100:.2f}%")
    print("Classification Report:")
    # target_encoder.classes_ supplies the original class labels for the report rows
    print(classification_report(y_test, y_pred, target_names=target_encoder.classes_))

    # Save model using pickle
    save_model_pickle(model, label_encoders, scaler, target_encoder, "obesity_model.pkl")
    print("Model saved as obesity_model.pkl")

Model Accuracy: 93.53%
Classification Report:
                     precision    recall  f1-score   support

Insufficient_Weight       0.95      0.94      0.95        86
      Normal_Weight       0.80      0.89      0.84        93
     Obesity_Type_I       0.98      0.95      0.97       102
    Obesity_Type_II       0.97      0.99      0.98        88
   Obesity_Type_III       1.00      0.99      0.99        98
 Overweight_Level_I       0.89      0.84      0.87        88
Overweight_Level_II       0.97      0.94      0.95        79

           accuracy                           0.94       634
          macro avg       0.94      0.93      0.94       634
       weighted avg       0.94      0.94      0.94       634

Model saved as obesity_model.pkl
