# Import libraries

In [14]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import pickle
import plotly.express as px
import streamlit as st

import time
from tqdm import tqdm  # For progress bar
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score  
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load data

In [17]:
csv_file = "/Users/shilpashribharath/Documents/MSDSP/498_Capstone/en.openfoodfacts.org.products.csv"

# Start timer
start_time = time.time()

chunk_size = 5000  # Number of rows per chunk
chunks = []  # Store processed chunks

# Estimate the total number of rows in the file (optional for progress bar)
total_lines = sum(1 for _ in open(csv_file, "r", encoding="utf-8"))
num_chunks = total_lines // chunk_size

# Read CSV in chunks with tqdm progress bar
for chunk in tqdm(pd.read_csv(csv_file, sep="\t", on_bad_lines="skip", encoding="utf-8", 
                              low_memory=False, chunksize=chunk_size), total=num_chunks, desc="Processing"):
    chunks.append(chunk)  # Store chunks

# Combine all chunks into a single DataFrame (optional)
df = pd.concat(chunks, ignore_index=True)

# End timer
end_time = time.time()

print(f"Total rows processed: {len(df)}")
print(f"Time taken: {end_time - start_time:.2f} seconds")
print(df.head())


Processing: 731it [01:12, 10.13it/s]                         


Total rows processed: 3651753
Time taken: 130.02 seconds
  code                                                url  creator  \
0   54  http://world-en.openfoodfacts.org/product/0000...  kiliweb   
1   63  http://world-en.openfoodfacts.org/product/0000...  kiliweb   
2  114  http://world-en.openfoodfacts.org/product/0000...  kiliweb   
3    1  http://world-en.openfoodfacts.org/product/0000...      inf   
4  105  http://world-en.openfoodfacts.org/product/0000...  kiliweb   

    created_t      created_datetime  last_modified_t last_modified_datetime  \
0  1582569031  2020-02-24T18:30:31Z       1733085204   2024-12-01T20:33:24Z   
1  1673620307  2023-01-13T14:31:47Z       1732913331   2024-11-29T20:48:51Z   
2  1580066482  2020-01-26T19:21:22Z       1737247862   2025-01-19T00:51:02Z   
3  1634745456  2021-10-20T15:57:36Z       1738676541   2025-02-04T13:42:21Z   
4  1572117743  2019-10-26T19:22:23Z       1738073570   2025-01-28T14:12:50Z   

   last_modified_by  last_updated_t last_update

## Data Pre-processing

In [18]:
# Define the features for outlier removal
features_clean = ['energy_100g', 'fat_100g', 'saturated-fat_100g', 'trans-fat_100g', 'cholesterol_100g', 
                  'carbohydrates_100g', 'fiber_100g', 'proteins_100g', 'salt_100g', 
                  'sodium_100g', 'potassium_100g', 'calcium_100g', 'iron_100g', 
                  'fruits-vegetables-nuts-estimate-from-ingredients_100g', 'sugars_100g']

# Compute IQR for each feature
Q1 = df[features_clean].quantile(0.25)
Q3 = df[features_clean].quantile(0.75)
IQR = Q3 - Q1

# Define bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Remove outliers
df_cleaned = df[~((df[features_clean] < lower_bound) | (df[features_clean] > upper_bound)).any(axis=1)]

print(f"Original dataset size: {df.shape[0]}")
print(f"Dataset size after outlier removal: {df_cleaned.shape[0]}")

Original dataset size: 3651753
Dataset size after outlier removal: 2708861


In [20]:
df_model = df_cleaned[~df_cleaned['nutriscore_grade'].isin(['unknown', '25', '1531'])]

# Random forest classifier model

In [21]:
features = ['energy_100g', 'fat_100g', 'saturated-fat_100g', 'fiber_100g', 'proteins_100g', 'salt_100g', 
            'sodium_100g', 'fruits-vegetables-nuts-estimate-from-ingredients_100g', 'sugars_100g']

# Define target variable
target = 'nutriscore_grade'

# Fill missing values in features with 0
df_filled = df_model.copy()  # Create a copy to avoid modifying the original DataFrame
df_filled[features] = df_filled[features].fillna(0)  # Replace NaNs with 0

# Replace 'not-applicable' with 'NA' as a string
df_filled[target] = df[target].replace("not-applicable", "NA")

In [22]:
df_filled = df_filled.dropna(subset=[target])

In [23]:
df_filled.shape

(788487, 206)

In [24]:
# Encode the categorical target variable
label_encoder = LabelEncoder()
df_filled[target] = label_encoder.fit_transform(df_filled[target])  # Converts a, b, c, d, e -> 0, 1, 2, 3, 4

In [25]:
# Check mapping of encoded values to original labels
mapping = dict(zip(label_encoder.transform(label_encoder.classes_), label_encoder.classes_))
print(mapping)

{0: 'NA', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e'}


In [None]:
df_filled.to_pickle('/Users/shilpashribharath/Documents/MSDSP/498_Capstone/model_test_model_df.pkl')

In [3]:
# Option code to load from pickle file instead loading from csv. Comment this code if you are loading from csv
df_filled = pd.read_pickle("/Users/shilpashribharath/Documents/MSDSP/498_Capstone/model_test_model_df.pkl")

In [5]:
features = ['energy_100g', 'fat_100g', 'saturated-fat_100g', 'fiber_100g', 'proteins_100g', 'salt_100g', 
            'sodium_100g', 'fruits-vegetables-nuts-estimate-from-ingredients_100g', 'sugars_100g']

# Define target variable
target = 'nutriscore_grade'
df_filled[features + [target]].head()

Unnamed: 0,energy_100g,fat_100g,saturated-fat_100g,fiber_100g,proteins_100g,salt_100g,sodium_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g,sugars_100g,nutriscore_grade
6,295.0,0.3,0.2,0.0,12.0,0.1,0.04,0.0,3.9,1
10,962.0,11.0,2.0,9.0,22.0,0.95,0.38,0.0,0.98,3
16,1852.0,0.5,2.6,1.4,0.5,0.53,0.212,22.666667,25.0,5
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
19,908.0,13.0,6.7,0.0,8.8,0.8,0.32,8.2,3.6,4


In [6]:
# Define features
features = ['energy_100g', 'fat_100g', 'saturated-fat_100g', 'fiber_100g', 'proteins_100g',  
            'fruits-vegetables-nuts-estimate-from-ingredients_100g', 'sugars_100g']

# Fill missing salt_100g with calculated value from sodium_100g / 400
df_filled['salt_100g'] = df_filled['salt_100g'].fillna(df_filled['sodium_100g'] / 0.4)

# Add salt_100g to the feature list instead of sodium_100g
features.append('salt_100g')

# Define target variable
target = 'nutriscore_grade'

# Fill remaining missing values in features with 0
df_filled[features] = df_filled[features].fillna(0)

# Replace 'not-applicable' with 'NA' in the target column
df_filled[target] = df_filled[target].replace("not-applicable", "NA")

In [7]:
df_filled[features + [target]].head()

Unnamed: 0,energy_100g,fat_100g,saturated-fat_100g,fiber_100g,proteins_100g,fruits-vegetables-nuts-estimate-from-ingredients_100g,sugars_100g,salt_100g,nutriscore_grade
6,295.0,0.3,0.2,0.0,12.0,0.0,3.9,0.1,1
10,962.0,11.0,2.0,9.0,22.0,0.0,0.98,0.95,3
16,1852.0,0.5,2.6,1.4,0.5,22.666667,25.0,0.53,5
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
19,908.0,13.0,6.7,0.0,8.8,8.2,3.6,0.8,4


In [8]:
# Split dataset into training and testing sets
X = df_filled[features]
y = df_filled[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Normalize the features for better performance
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Train a Random Forest model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_scaled, y_train)

# Predictions
y_pred = clf.predict(X_test_scaled)

# Hyperparameter tuning

In [9]:
# Define optimized hyperparameter grid
param_dist = {
    'n_estimators': [50, 100, 200],  
    'max_depth': [10, 20, None],  
    'min_samples_split': [2, 5],  
    'min_samples_leaf': [1, 2],  
    'max_features': ['sqrt'],  
    'bootstrap': [True]
}

# Initialize Random Forest
rf = RandomForestClassifier(random_state=42, warm_start=True)

# Optimized RandomizedSearchCV (Faster)
random_search = RandomizedSearchCV(
    rf, param_distributions=param_dist, 
    n_iter=10,  # Reduce from 20 to 10
    cv=3,  # Reduce from 5 to 3 folds
    verbose=2, 
    n_jobs=-1,  
    random_state=42
)

# Train the model
random_search.fit(X_train_scaled, y_train)

# Best model and parameters
best_rf = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)

# Make predictions
y_pred = best_rf.predict(X_test_scaled)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Optimized Accuracy:", accuracy)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 1.1min
[CV] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 1.1min
[CV] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time= 1.1min
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=  34.5s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=  35.8s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=  37.9s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=5, n_estimators=200; t



[CV] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=  35.4s
[CV] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=  35.0s
[CV] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=  33.8s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.2min
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.2min
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time= 1.3min
[CV] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time= 1.7min
[CV] END bootstrap=True, max_depth=20, max_features

In [16]:
import pickle

# Define the filename
filename = "random_forest_model.pkl"

# Save the trained model
with open(filename, "wb") as file:
    pickle.dump(clf, file)

print(f"Model saved as {filename}")

Model saved as random_forest_model.pkl


## Storing Model & Results

In [None]:
# Storing the Randomforest Model pickle file
with open('/Users/shilpashribharath/Documents/MSDSP/498_Capstone/random_forest_model.pkl', 'wb') as f:
    pickle.dump(clf, f)

# XGBoost

In [43]:
# Option code to load from pickle file instead loading from csv. Comment this code if you are loading from csv
df_filled = pd.read_pickle("/Users/shilpashribharath/Documents/MSDSP/498_Capstone/model_test_model_df.pkl")

In [45]:
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Define feature columns
features = ['energy_100g', 'fat_100g', 'saturated-fat_100g', 'fiber_100g', 'proteins_100g', 'salt_100g', 
            'sodium_100g', 'fruits-vegetables-nuts-estimate-from-ingredients_100g', 'sugars_100g']

# Define target variable
target = 'nutriscore_grade'

# Fill missing values (if any)
df_filled[features] = df_filled[features].fillna(0)

# Split dataset
X = df_filled[features]
y = df_filled[target]  # Already label-encoded (0 to 4)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define hyperparameter grid for tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3]
}

# Initialize XGBoost model
xgb_model = xgb.XGBClassifier(objective="multi:softmax", num_class=5, random_state=42, use_label_encoder=False)

# Hyperparameter tuning using RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb_model, param_distributions=param_grid, 
    n_iter=20, scoring='accuracy', cv=5, verbose=1, n_jobs=-1, random_state=42
)
random_search.fit(X_train, y_train)

# Get the best XGBoost model
best_xgb = random_search.best_estimator_

# Make predictions
y_pred = best_xgb.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"✅ XGBoost Model Accuracy: {accuracy:.4f}")
print("\n📌 Classification Report:\n", classification_report(y_test, y_pred))

# Save the best model
best_xgb.save_model("xgboost_nutriscore_model.json")

# Print predicted labels
print("🔹 Predicted NutriScore Grades (Encoded):", y_pred)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

✅ XGBoost Model Accuracy: 0.8747

📌 Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.88      0.77     13213
           1       0.89      0.79      0.84     20580
           2       0.81      0.74      0.77     20050
           3       0.88      0.90      0.89     38796
           4       0.94      0.94      0.94     38840
           5       0.94      0.91      0.92     26219

    accuracy                           0.87    157698
   macro avg       0.85      0.86      0.85    157698
weighted avg       0.88      0.87      0.88    157698

🔹 Predicted NutriScore Grades (Encoded): [3 2 2 ... 3 3 3]


# LightGBM model

In [46]:
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV

# Initialize LightGBM model
lgb_model = lgb.LGBMClassifier(objective="multiclass", num_class=5, random_state=42)

# Hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [-1, 5, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

random_search = RandomizedSearchCV(lgb_model, param_grid, n_iter=20, scoring='accuracy', cv=5, verbose=1, n_jobs=-1)
random_search.fit(X_train, y_train)

# Evaluate performance
best_lgb = random_search.best_estimator_
y_pred = best_lgb.predict(X_test)
print(f"LightGBM Accuracy: {accuracy_score(y_test, y_pred):.4f}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.079725 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2230
[LightGBM] [Info] Number of data points in the train set: 504631, number of used features: 9
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.069834 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2233
[LightGBM] [Info] Number of data points in the train set: 504632, number of used features: 9
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.110268 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2234
[LightGBM] [Info] Start training from score -2.479515
[LightGBM] [Info] Number of data points in the