<a href="https://colab.research.google.com/github/syedokun/HAD7001/blob/main/Datathon_1/Datathon_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import

In [1]:
import pandas as pd

# Path of the Datathon 1 CSV as uploaded to the Colab runtime.
csv_path = '/content/Datathon1-Dataset2 - HAD7001.csv'
source_df = pd.read_csv(csv_path)

# Bare last expression so the notebook renders the loaded frame.
source_df

Unnamed: 0,Age,Gender,Calorie_Intake,Sugar_Intake,Physical_Activity,Fast_Food_Frequency,Screen_Time,Height,Weight,BMI
0,56,Female,3091,90,160,4,1,1.569070,81.199394,32.981343
1,69,Female,2090,106,27,0,9,1.629449,74.273566,27.973877
2,46,Male,2938,11,45,2,3,1.637726,71.453466,26.640417
3,32,Female,3022,48,112,1,8,1.514198,98.163045,42.813720
4,60,Female,2350,83,115,0,7,1.410559,57.323914,28.810685
...,...,...,...,...,...,...,...,...,...,...
23530,51,Female,2858,65,37,1,9,1.774514,56.917788,18.075460
23531,20,Female,2573,27,27,1,1,1.677173,57.277270,20.362283
23532,26,Female,2246,37,44,0,3,1.661042,80.314959,29.109537
23533,22,Male,1588,24,101,4,7,1.577156,43.365424,17.433880


## EDA and preprocessing

In [2]:
# Summary statistics of the numeric columns, displayed as the cell output.
numeric_summary = source_df.describe()
numeric_summary

Unnamed: 0,Age,Calorie_Intake,Sugar_Intake,Physical_Activity,Fast_Food_Frequency,Screen_Time,Height,Weight,BMI
count,23535.0,23535.0,23535.0,23535.0,23535.0,23535.0,23535.0,23535.0,23535.0
mean,48.532993,2495.845634,64.824559,89.496707,2.018441,5.486127,1.650279,70.019142,26.127641
std,17.872736,577.798752,32.004657,52.12081,1.414634,3.454036,0.119714,15.016866,6.87551
min,18.0,1500.0,10.0,0.0,0.0,0.0,1.196086,10.010016,3.442184
25%,33.0,1991.0,37.0,45.0,1.0,2.0,1.569815,59.859242,21.337576
50%,48.0,2495.0,65.0,90.0,2.0,6.0,1.650122,70.022501,25.661507
75%,64.0,2994.0,93.0,135.0,3.0,8.0,1.73147,80.038009,30.329032
max,79.0,3499.0,119.0,179.0,4.0,11.0,2.154243,126.493504,70.475419


In [6]:
# Every column whose dtype is not numeric is treated as categorical.
categorical_cols = source_df.select_dtypes(exclude=['number']).columns

separator = "-" * 20
# Report the distinct values and their frequencies for each such column.
for col, values in source_df[categorical_cols].items():
    print(f"Column: {col}")
    print(f"Unique values: {values.unique()}")
    print(f"Value counts:\n{values.value_counts()}")
    print(separator)

Column: Gender
Unique values: ['Female' 'Male']
Value counts:
Gender
Female    14896
Male       8639
Name: count, dtype: int64
--------------------


In [8]:
# Number of null entries in each column.
missing_values = source_df.isna().sum()

print("Missing Values:", missing_values, sep="\n")

Missing Values:
Age                    0
Gender                 0
Calorie_Intake         0
Sugar_Intake           0
Physical_Activity      0
Fast_Food_Frequency    0
Screen_Time            0
Height                 0
Weight                 0
BMI                    0
dtype: int64


In [11]:
# 1. Underweight where BMI < 18.5
# 2. Normal where 18.5 <= BMI <= 25
# 3. Overweight where BMI > 25

def categorize_bmi(bmi):
    """Map a single BMI value to a weight-category label.

    Parameters
    ----------
    bmi : float
        Body-mass index value.

    Returns
    -------
    str or None
        'Underweight' when bmi < 18.5, 'Normal' when 18.5 <= bmi <= 25,
        'Overweight' when bmi > 25, and None when bmi is missing
        (None or NaN).
    """
    # NaN fails every comparison, so without this guard a missing BMI would
    # fall through to the last branch and be silently labelled 'Overweight'.
    if pd.isna(bmi):
        return None
    if bmi < 18.5:
        return 'Underweight'
    # The lower bound is already guaranteed by the check above.
    if bmi <= 25:
        return 'Normal'
    return 'Overweight'

# Label every row by applying categorize_bmi element-wise to its BMI value.
bmi_labels = source_df['BMI'].apply(categorize_bmi)
source_df['BMI_Category'] = bmi_labels

## Fitting KNN

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [13]:
# Encode the Gender strings as integer codes, overwriting the column in place.
le = LabelEncoder()
le.fit(source_df['Gender'])
source_df['Gender'] = le.transform(source_df['Gender'])

In [15]:
# Predictors for the first model: all nine feature columns, including
# Height and Weight.
knn_features = [
    'Age',
    'Gender',
    'Calorie_Intake',
    'Sugar_Intake',
    'Physical_Activity',
    'Fast_Food_Frequency',
    'Screen_Time',
    'Height',
    'Weight',
]
X = source_df[knn_features]
y = source_df['BMI_Category']

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [16]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# 5-nearest-neighbour classifier trained on the standardized features.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

In [17]:
y_pred = knn.predict(X_test)

# Evaluate the model on the held-out split: confusion matrix first, then the
# per-class precision / recall / F1 report.
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print(test_confusion)
print(test_report)

[[1311  226   25]
 [ 170 2362    0]
 [ 226    1  386]]
              precision    recall  f1-score   support

      Normal       0.77      0.84      0.80      1562
  Overweight       0.91      0.93      0.92      2532
 Underweight       0.94      0.63      0.75       613

    accuracy                           0.86      4707
   macro avg       0.87      0.80      0.83      4707
weighted avg       0.87      0.86      0.86      4707



## Ablation study

In [19]:
# Full feature set for the ablation study; each feature is dropped in turn.
feature_columns = [
    'Age',
    'Gender',
    'Calorie_Intake',
    'Sugar_Intake',
    'Physical_Activity',
    'Fast_Food_Frequency',
    'Screen_Time',
    'Height',
    'Weight',
]

In [None]:
# Leave-one-feature-out ablation: refit the same scaled 5-NN model once per
# feature, each time with that feature removed, and print its test metrics.
#
# Every intermediate uses an ablation_-prefixed name so the loop does not
# overwrite the notebook-level X / y / X_train / X_test / y_train / y_test /
# scaler / knn / y_pred variables with the values of its last iteration.
for i, removed_column in enumerate(feature_columns):
    temp_feature_columns = feature_columns[:i] + feature_columns[i+1:]
    print("Feature columns used:")
    print(temp_feature_columns)
    print("-" * 20)

    ablation_X = source_df[temp_feature_columns]
    ablation_y = source_df['BMI_Category']

    # Same test_size and random_state on every iteration, so each ablated
    # model is split identically.
    (ablation_X_train, ablation_X_test,
     ablation_y_train, ablation_y_test) = train_test_split(
        ablation_X, ablation_y, test_size=0.2, random_state=42
    )

    # Scaling statistics come from the training split only.
    ablation_scaler = StandardScaler()
    ablation_X_train = ablation_scaler.fit_transform(ablation_X_train)
    ablation_X_test = ablation_scaler.transform(ablation_X_test)

    ablation_knn = KNeighborsClassifier(n_neighbors=5)
    ablation_knn.fit(ablation_X_train, ablation_y_train)

    ablation_y_pred = ablation_knn.predict(ablation_X_test)

    print(f"Ablation study removing column: {removed_column}")
    print(confusion_matrix(ablation_y_test, ablation_y_pred))
    print(classification_report(ablation_y_test, ablation_y_pred))
    print("-" * 20)
    print("-" * 20)

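Height and Weight predictably have the largest effect on the prediction: BMI is defined as weight / height², so the BMI categories being predicted are derived directly from these two features.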

## Fitting KNN discarding Height and Weight

In [23]:
# Predictors for the second model: the same columns as before minus Height
# and Weight.
lifestyle_features = [
    'Age',
    'Gender',
    'Calorie_Intake',
    'Sugar_Intake',
    'Physical_Activity',
    'Fast_Food_Frequency',
    'Screen_Time',
]
X = source_df[lifestyle_features]
y = source_df['BMI_Category']

# 80/20 split with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

# Confusion matrix first, then the per-class precision / recall / F1 report.
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print(test_confusion)
print(test_report)

[[ 579  963   20]
 [ 982 1514   36]
 [ 227  377    9]]
              precision    recall  f1-score   support

      Normal       0.32      0.37      0.35      1562
  Overweight       0.53      0.60      0.56      2532
 Underweight       0.14      0.01      0.03       613

    accuracy                           0.45      4707
   macro avg       0.33      0.33      0.31      4707
weighted avg       0.41      0.45      0.42      4707



## Ablation study removing Height and Weight

In [21]:
# Feature set for the second ablation study: Height and Weight are excluded.
feature_columns = [
    'Age',
    'Gender',
    'Calorie_Intake',
    'Sugar_Intake',
    'Physical_Activity',
    'Fast_Food_Frequency',
    'Screen_Time',
]

In [22]:
# Leave-one-feature-out ablation: refit the same scaled 5-NN model once per
# feature, each time with that feature removed, and print its test metrics.
#
# Every intermediate uses an ablation_-prefixed name so the loop does not
# overwrite the notebook-level X / y / X_train / X_test / y_train / y_test /
# scaler / knn / y_pred variables. With the shared names, a top-to-bottom run
# would leave those globals holding the final iteration's values (the model
# without the last feature) for any cell that reads them afterwards.
for i, removed_column in enumerate(feature_columns):
    temp_feature_columns = feature_columns[:i] + feature_columns[i+1:]
    print("Feature columns used:")
    print(temp_feature_columns)
    print("-" * 20)

    ablation_X = source_df[temp_feature_columns]
    ablation_y = source_df['BMI_Category']

    # Same test_size and random_state on every iteration, so each ablated
    # model is split identically.
    (ablation_X_train, ablation_X_test,
     ablation_y_train, ablation_y_test) = train_test_split(
        ablation_X, ablation_y, test_size=0.2, random_state=42
    )

    # Scaling statistics come from the training split only.
    ablation_scaler = StandardScaler()
    ablation_X_train = ablation_scaler.fit_transform(ablation_X_train)
    ablation_X_test = ablation_scaler.transform(ablation_X_test)

    ablation_knn = KNeighborsClassifier(n_neighbors=5)
    ablation_knn.fit(ablation_X_train, ablation_y_train)

    ablation_y_pred = ablation_knn.predict(ablation_X_test)

    print(f"Ablation study removing column: {removed_column}")
    print(confusion_matrix(ablation_y_test, ablation_y_pred))
    print(classification_report(ablation_y_test, ablation_y_pred))
    print("-" * 20)
    print("-" * 20)

Feature columns used:
['Gender', 'Calorie_Intake', 'Sugar_Intake', 'Physical_Activity', 'Fast_Food_Frequency', 'Screen_Time']
--------------------
Ablation study removing column: Age
[[ 582  957   23]
 [ 914 1583   35]
 [ 236  373    4]]
              precision    recall  f1-score   support

      Normal       0.34      0.37      0.35      1562
  Overweight       0.54      0.63      0.58      2532
 Underweight       0.06      0.01      0.01       613

    accuracy                           0.46      4707
   macro avg       0.31      0.33      0.32      4707
weighted avg       0.41      0.46      0.43      4707

--------------------
--------------------
Feature columns used:
['Age', 'Calorie_Intake', 'Sugar_Intake', 'Physical_Activity', 'Fast_Food_Frequency', 'Screen_Time']
--------------------
Ablation study removing column: Gender
[[ 561  977   24]
 [ 939 1545   48]
 [ 220  383   10]]
              precision    recall  f1-score   support

      Normal       0.33      0.36      0.34   

## Optimal K testing