<a href="https://colab.research.google.com/github/trieuhaivo/cap5771-project/blob/main/t06_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.cluster import KMeans
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

In [12]:
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', 1000)        # Show table width
pd.set_option('display.max_colwidth', None) # Show full content of each column
pd.set_option('display.max_rows', None)

In [13]:
TeamID = 6
np.random.seed(TeamID)

# 1. Exploratory Data Analysis - EDA

## 1.1 Data Structure

### Import data

In [14]:
URL_THO = 'https://github.com/trieuhaivo/cap5771-project/blob/702c1a607f8afde178bbf36b815cbf588e63a056/t06_data.csv'

In [15]:
!wget $URL_THO

--2025-11-14 03:38:34--  https://github.com/trieuhaivo/cap5771-project/blob/702c1a607f8afde178bbf36b815cbf588e63a056/t06_data.csv
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘t06_data.csv.1’

t06_data.csv.1          [ <=>                ] 176.83K  --.-KB/s    in 0.1s    

2025-11-14 03:38:34 (1.31 MB/s) - ‘t06_data.csv.1’ saved [181078]



In [16]:
df = pd.read_csv('t06_data.csv')

ParserError: Error tokenizing data. C error: Expected 1 fields in line 38, saw 2


In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.sample(5)

### Data Shape

In [None]:
df.shape

### Data Types

In [None]:
df.dtypes

### Attribute Values

In [None]:
df.info()

### Missing Values

In [None]:
df.isnull().sum()

=> Have few missing values

## 1.2 Target Distribution

In [None]:
df['f_FPro_class'].unique()

In [None]:
df['f_FPro_class'].value_counts().sort_index()

## 1.3 Descriptive Statistics

### Summary Statistics

In [None]:
df.describe().transpose().round(3)

=> different variables have different scales, all of them have outliers (max)

### Boxplots

In [None]:
ax = df.boxplot(figsize=(15, 10))

ax.tick_params(axis='x', rotation=15)

plt.tight_layout()

plt.show()

In [None]:
numeric_columns = df.select_dtypes(include=['number']).columns
n_cols = 2
n_rows = (len(numeric_columns) + n_cols - 1) // n_cols

fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 9*n_rows))
axes = axes.flatten() if n_rows > 1 else axes

for i, col in enumerate(numeric_columns):
    if i < len(axes):
        sns.boxplot(y=df[col], ax=axes[i], color='cyan')
        axes[i].set_title(f'Boxplot of {col}')

for j in range(i+1, len(axes)):
    axes[j].set_visible(False)

plt.tight_layout()
plt.show()

=> A lot of outliers

### Histograms

In [None]:
axes = df.hist(figsize=(16, 16), color='crimson')

for ax in axes.flatten():
    ax.tick_params(axis='x', rotation=15)

plt.tight_layout()
plt.show()

=> highly skewed data, most variables have small values, from 0 to 10

# 2. Data Preprocessing

## 2.1 Target Construction

### Convert target variable to binary values: 0, 1, 2 -> 1 ; 3 -> 0

In [None]:
df['target'] = np.where(df['f_FPro_class'] == 3, 0, 1)

In [None]:
df.head()

## 2.2 Feature Creation & Selection

In [None]:
df['store'].unique()

In [None]:
df['store'].value_counts()

In [None]:
df['food category'].unique()

In [None]:
df['food category'].value_counts()

In [None]:
df['brand'].unique()

In [None]:
df['brand'].value_counts()

### Drop unnecessary features: original_ID, name, brand, f_FPro_class

In [None]:
columns_to_drop = ['original_ID', 'name', 'brand', 'f_FPro_class']
df = df.drop(columns = columns_to_drop)

In [None]:
df.head()

### Clean column names: lowercase

In [None]:
!pip install skimpy

In [None]:
from skimpy import clean_columns

df = clean_columns(df)
df.columns.tolist()

In [None]:
df.head()

### Handle missing values

In [None]:
df.isnull().sum()

In [None]:
df[df.isnull().any(axis=1)][['food_category', 'sugars_total', 'fiber_total_dietary', 'fatty_acids_total_saturated']].head(10)

#### Fill by median of each category

In [None]:
# Fill missing values ​​according to each food_category
df_filled = df.copy()

for col in ['sugars_total', 'fiber_total_dietary', 'fatty_acids_total_saturated']:
    # Calculate median for each category
    category_medians = df.groupby('food_category')[col].median()

    # Fill missing values ​​with the median of the corresponding category
    for category in df['food_category'].unique():
        mask = (df['food_category'] == category) & (df[col].isnull())
        if mask.any():
            df_filled.loc[mask, col] = category_medians[category]

#### Check distributions and values after imputation

In [None]:
# Compare distribution before and after filling missing values
original_data = df.dropna()
filled_data = df_filled

fig, axes = plt.subplots(3, 2, figsize=(15, 12))

for i, col in enumerate(['sugars_total', 'fiber_total_dietary', 'fatty_acids_total_saturated']):
    # Original Distribution (no missing values)
    axes[i, 0].hist(original_data[col], bins=30, alpha=0.7, color='blue', label='Original')
    axes[i, 0].set_title(f'Original {col} Distribution')

    # Distribution after filling missing values
    axes[i, 1].hist(filled_data[col], bins=30, alpha=0.7, color='green', label='After Fill')
    axes[i, 1].set_title(f'Filled {col} Distribution')

plt.tight_layout()
plt.show()

In [None]:
# Visualize the filled values
filled_indices = df[df['sugars_total'].isnull()].index

plt.figure(figsize=(12, 8))
for i, col in enumerate(['sugars_total', 'fiber_total_dietary', 'fatty_acids_total_saturated'], 1):
    plt.subplot(2, 2, i)

    # Plot all data
    plt.scatter(df_filled.index, df_filled[col], alpha=0.3, label='All data')

    # Highlight the filled values
    plt.scatter(filled_indices, df_filled.loc[filled_indices, col],
                color='red', s=50, label='Filled values', zorder=5)

    plt.title(f'{col} - Filled Values Highlighted')
    plt.legend()

plt.tight_layout()
plt.show()

### Encode categorical features

In [None]:
df_filled.head()

#### One Hot Encoding for store

In [None]:
store_encoded_df = pd.get_dummies(df_filled[['store']], prefix='store', drop_first=True)

store_encoded_df.columns = store_encoded_df.columns.str.lower()

df_filled = pd.concat([df_filled, store_encoded_df], axis=1)

df_filled = df_filled.drop('store', axis=1)

In [None]:
df_filled.head()

### Extract text features

### Normalize features

## 2.3 Handling Class Imbalance

### Oversampling

### Undersampling

### SMOTE

# 3. Classification models

## Baseline Model

## Model Building

### Decision Tree

### Random Forest

## Model Selection & Evaluation

### Data Split

### Random Seed

### Hyperparameter Tuning

## Evaluation metrics

### Accuracy

### Precision

### Recall

### F1 Score

### ROC-AUC

# 4. Outlier Detection

## Clustering

## Elbow Method