In [58]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("vijayashreer/food-preferences")

print("Path to dataset files:", path)

Path to dataset files: /Users/jeongho/.cache/kagglehub/datasets/vijayashreer/food-preferences/versions/1


In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os


df = pd.read_csv(os.path.join(path, "Food_Preference.csv"))

In [60]:
df = df.dropna()

In [61]:
df.isna().sum()

Timestamp         0
Participant_ID    0
Gender            0
Nationality       0
Age               0
Food              0
Juice             0
Dessert           0
dtype: int64

In [62]:
df = df.drop(["Timestamp", "Participant_ID"], axis=1)

In [63]:
df["Age"]

0      24
1      22
2      31
3      25
4      27
       ..
283    27
284    24
285    25
286    27
287    27
Name: Age, Length: 284, dtype: int64

In [64]:
age_bins = pd.qcut(df["Age"], q=2, labels=[0, 1])

In [65]:
pd.concat([df["Age"], age_bins], axis=1)

Unnamed: 0,Age,Age.1
0,24,0
1,22,0
2,31,1
3,25,0
4,27,0
...,...,...
283,27,0
284,24,0
285,25,0
286,27,0


In [66]:
df["Age"] = age_bins

In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 284 entries, 0 to 287
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   Gender       284 non-null    object  
 1   Nationality  284 non-null    object  
 2   Age          284 non-null    category
 3   Food         284 non-null    object  
 4   Juice        284 non-null    object  
 5   Dessert      284 non-null    object  
dtypes: category(1), object(5)
memory usage: 13.7+ KB


In [68]:
categorical_feature = list(df.select_dtypes(["object"]).columns)

{column: list(df[column].unique()) for column in categorical_feature}

{'Gender': ['Male', 'Female'],
 'Nationality': ['Indian',
  'Pakistani ',
  'Tanzanian',
  'Indonesia',
  'Pakistan',
  'Maldivian ',
  'MY',
  'Malaysian',
  'Malaysian ',
  'Indonesian ',
  'Maldivian',
  'MALAYSIAN',
  'Malaysia ',
  'Pakistani',
  'Canadian',
  'Nigerian ',
  'Algerian ',
  'Korean ',
  'Seychellois',
  'Indonesain',
  'Indonesian',
  'Malaysia',
  'Japan',
  'China',
  'Mauritian',
  'Yemen'],
 'Food': ['Traditional food', 'Western Food'],
 'Juice': ['Fresh Juice', 'Carbonated drinks'],
 'Dessert': ['Maybe', 'Yes', 'No']}

In [69]:
binary_features = ["Gender", "Food", "Juice"]

ordinal_feature = ["Dessert"]

nominal_features = ["Nationality"]


def binary_encode(df, column, positive_label):
    df = df.copy()
    df[column] = df[column].apply(lambda x: 1 if x == positive_label else 0)
    return df


def ordinal_encode(df, column, ordering):
    df = df.copy()
    df[column] = df[column].apply(lambda x: ordering.index(x))
    return df


def onehot_encode(df, column):
    dummies = pd.get_dummies(df[column], dtype=int)
    df = pd.concat([dummies, df], axis=1)
    df = df.drop(column, axis=1)
    return df

In [70]:
df = binary_encode(df, "Gender", "Male")
df = binary_encode(df, "Food", "Traditional food")
df = binary_encode(df, "Juice", "Fresh Juice")

dessert_ordering = ["No", "Maybe", "Yes"]
df = ordinal_encode(df, "Dessert", dessert_ordering)

df = onehot_encode(df, "Nationality")

In [76]:
y = df["Age"]
X = df.drop(["Age"], axis=1)

In [77]:
y

0      0
1      0
2      1
3      0
4      0
      ..
283    0
284    0
285    0
286    0
287    0
Name: Age, Length: 284, dtype: category
Categories (2, int64): [0 < 1]

In [78]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [79]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)

In [80]:
model.score(X_train, y_train)

0.6960352422907489

In [1]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error

LR = LinearRegression()
DTR = DecisionTreeRegressor()
RFR = RandomForestRegressor()
KNR = KNeighborsRegressor()
MLP = MLPRegressor()
XGB = XGBRegressor()