In [1]:
# Import libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from pyspark.sql import SparkSession

In [2]:
# Initialize Spark Session
spark = SparkSession.builder \
    .appName("PySpark and Neural Network") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/08 12:04:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/01/08 12:04:48 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Load the dataset
dataset_path = "../data/obesity_dataset.csv"
df = spark.read.csv(dataset_path, header = True, inferSchema = True)

                                                                                

## 1. Food between meals as predictor of obesity and overweight

In [4]:
# Select columns 'Food_Between_Meals' and 'Obese/Overweight'
data = df[['Food_Between_Meals', 'Obese/Overweight']]
data.head()

Row(Food_Between_Meals='Sometimes', Obese/Overweight='No')

In [5]:
# Convert to pandas dataframe
pandas_df = data.select("Food_Between_Meals", "Obese/Overweight").toPandas()
pandas_df.head()

Unnamed: 0,Food_Between_Meals,Obese/Overweight
0,Sometimes,No
1,Sometimes,No
2,Sometimes,No
3,Sometimes,Yes
4,Sometimes,Yes


In [6]:
# Encode categorical data
label_encoder = LabelEncoder()
pandas_df['Food_Between_Meals'] = label_encoder.fit_transform(pandas_df['Food_Between_Meals'])
pandas_df['Obese/Overweight'] = label_encoder.fit_transform(pandas_df['Obese/Overweight'])

In [7]:
# Define features (X) and target (y)
X = pandas_df[['Food_Between_Meals']] 
y = pandas_df['Obese/Overweight']   

In [8]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2, stratify = y)

In [9]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [10]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
input_features = len(X_train_scaled[0])

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units = 20, activation = "relu", input_dim = input_features))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units = 10, activation = "relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units = 1, activation = "sigmoid"))

# Check the structure of the model
nn.summary()

In [11]:
# Compile the neural network
nn.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [12]:
# Train the neural network
nn.fit(X_train_scaled, y_train, epochs = 100, verbose = 1, validation_split = 0.2)

Epoch 1/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5530 - loss: 0.6812 - val_accuracy: 0.8373 - val_loss: 0.6217
Epoch 2/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 692us/step - accuracy: 0.8236 - loss: 0.6073 - val_accuracy: 0.8373 - val_loss: 0.5647
Epoch 3/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 683us/step - accuracy: 0.8211 - loss: 0.5555 - val_accuracy: 0.8373 - val_loss: 0.5169
Epoch 4/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 688us/step - accuracy: 0.7982 - loss: 0.5373 - val_accuracy: 0.8373 - val_loss: 0.4883
Epoch 5/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 681us/step - accuracy: 0.7997 - loss: 0.5110 - val_accuracy: 0.8373 - val_loss: 0.4743
Epoch 6/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 737us/step - accuracy: 0.8234 - loss: 0.4752 - val_accuracy: 0.8373 - val_loss: 0.4691
Epoch 7/100
[1m43/43[0

<keras.src.callbacks.history.History at 0x17769d940>

In [13]:
# Evaluate the neural network
nn_loss, nn_accuracy = nn.evaluate(X_test_scaled, y_test)
print(f"Neural Network Accuracy: {nn_accuracy}")

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 710us/step - accuracy: 0.8436 - loss: 0.4374
Neural Network Accuracy: 0.8368794322013855


## 2. Mode of transportation as predictor of obesity/overweight

In [14]:
# Select columns
data2 = df[['Mode_Transportation', 'Obese/Overweight']]
data2.head()

Row(Mode_Transportation='Public_Transportation', Obese/Overweight='No')

In [15]:
# Convert to pandas dataframe
pandas_df2 = data2.select("Mode_Transportation", "Obese/Overweight").toPandas()
pandas_df2.head()

Unnamed: 0,Mode_Transportation,Obese/Overweight
0,Public_Transportation,No
1,Public_Transportation,No
2,Public_Transportation,No
3,Walking,Yes
4,Public_Transportation,Yes


In [16]:
# Encode categorical data
label_encoder = LabelEncoder()
pandas_df2['Mode_Transportation'] = label_encoder.fit_transform(pandas_df2['Mode_Transportation'])
pandas_df2['Obese/Overweight'] = label_encoder.fit_transform(pandas_df2['Obese/Overweight'])

In [17]:
# Define features (X) and target (y)
X = pandas_df2[['Mode_Transportation']]
y = pandas_df2['Obese/Overweight']   

In [18]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2, stratify = y)

In [19]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [20]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
input_features = len(X_train_scaled[0])

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units = 20, activation = "relu", input_dim = input_features))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units = 10, activation = "relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units = 1, activation = "sigmoid"))

# Check the structure of the model
nn.summary()

In [21]:
# Compile the neural network
nn.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [22]:
# Train the neural network
nn.fit(X_train_scaled, y_train, epochs = 100, verbose = 1, validation_split = 0.2)

Epoch 1/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3744 - loss: 0.7196 - val_accuracy: 0.5888 - val_loss: 0.6748
Epoch 2/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 734us/step - accuracy: 0.6528 - loss: 0.6605 - val_accuracy: 0.7544 - val_loss: 0.6116
Epoch 3/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 685us/step - accuracy: 0.7420 - loss: 0.6073 - val_accuracy: 0.7544 - val_loss: 0.5691
Epoch 4/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 682us/step - accuracy: 0.7174 - loss: 0.6002 - val_accuracy: 0.7544 - val_loss: 0.5581
Epoch 5/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 685us/step - accuracy: 0.7106 - loss: 0.6051 - val_accuracy: 0.7544 - val_loss: 0.5565
Epoch 6/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 685us/step - accuracy: 0.7236 - loss: 0.5930 - val_accuracy: 0.7544 - val_loss: 0.5553
Epoch 7/100
[1m43/43[0

<keras.src.callbacks.history.History at 0x178212f30>

In [23]:
# Evaluate the neural network
nn_loss, nn_accuracy = nn.evaluate(X_test_scaled, y_test)
print(f"Neural Network Accuracy: {nn_accuracy}")

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 655us/step - accuracy: 0.7567 - loss: 0.5533
Neural Network Accuracy: 0.73758864402771


## 3. Alcohol intake as predictor of obesity/overweight

In [24]:
# Select columns
data3 = df[['Alcohol_Intake', 'Obese/Overweight']]
data3.head()

Row(Alcohol_Intake='no', Obese/Overweight='No')

In [25]:
# Convert to pandas dataframe
pandas_df3 = data3.select("Alcohol_Intake", "Obese/Overweight").toPandas()
pandas_df3.head()

Unnamed: 0,Alcohol_Intake,Obese/Overweight
0,no,No
1,Sometimes,No
2,Frequently,No
3,Frequently,Yes
4,Sometimes,Yes


In [26]:
# Encode categorical data
label_encoder = LabelEncoder()
pandas_df3['Alcohol_Intake'] = label_encoder.fit_transform(pandas_df3['Alcohol_Intake'])
pandas_df3['Obese/Overweight'] = label_encoder.fit_transform(pandas_df3['Obese/Overweight'])

In [27]:
# Define features (X) and target (y)
X = pandas_df3[['Alcohol_Intake']]
y = pandas_df3['Obese/Overweight']   

In [28]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2, stratify = y)

In [29]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [30]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
input_features = len(X_train_scaled[0])

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units = 20, activation = "relu", input_dim = input_features))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units = 10, activation = "relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units = 1, activation = "sigmoid"))

# Check the structure of the model
nn.summary()

In [31]:
# Compile the neural network
nn.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [32]:
# Train the neural network
nn.fit(X_train_scaled, y_train, epochs = 100, verbose = 1, validation_split = 0.2)

Epoch 1/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7309 - loss: 0.6643 - val_accuracy: 0.7544 - val_loss: 0.6162
Epoch 2/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 743us/step - accuracy: 0.7216 - loss: 0.6197 - val_accuracy: 0.7544 - val_loss: 0.5788
Epoch 3/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 692us/step - accuracy: 0.7345 - loss: 0.5856 - val_accuracy: 0.7544 - val_loss: 0.5620
Epoch 4/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 681us/step - accuracy: 0.7460 - loss: 0.5754 - val_accuracy: 0.7544 - val_loss: 0.5585
Epoch 5/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7292 - loss: 0.5806 - val_accuracy: 0.7544 - val_loss: 0.5572
Epoch 6/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 705us/step - accuracy: 0.7394 - loss: 0.5734 - val_accuracy: 0.7544 - val_loss: 0.5558
Epoch 7/100
[1m43/43[0m 

<keras.src.callbacks.history.History at 0x1795e2a50>

In [33]:
# Evaluate the neural network
nn_loss, nn_accuracy = nn.evaluate(X_test_scaled, y_test)
print(f"Neural Network Accuracy: {nn_accuracy}")

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 707us/step - accuracy: 0.7515 - loss: 0.5554
Neural Network Accuracy: 0.7352246046066284


## 4. Smoking as predictor of obesity/overweight

In [34]:
# Select columns
data4 = df[['SMOKE', 'Obese/Overweight']]
data4.head()

Row(SMOKE='no', Obese/Overweight='No')

In [35]:
# Convert to pandas dataframe
pandas_df4 = data4.select("SMOKE", "Obese/Overweight").toPandas()
pandas_df4.head()

Unnamed: 0,SMOKE,Obese/Overweight
0,no,No
1,yes,No
2,no,No
3,no,Yes
4,no,Yes


In [36]:
# Encode categorical data
label_encoder = LabelEncoder()
pandas_df4['SMOKE'] = label_encoder.fit_transform(pandas_df4['SMOKE'])
pandas_df4['Obese/Overweight'] = label_encoder.fit_transform(pandas_df4['Obese/Overweight'])

In [37]:
# Define features (X) and target (y)
X = pandas_df4[['SMOKE']]
y = pandas_df4['Obese/Overweight']   

In [38]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2, stratify = y)

In [39]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [40]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
input_features = len(X_train_scaled[0])

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units = 20, activation = "relu", input_dim = input_features))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units = 10, activation = "relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units = 1, activation = "sigmoid"))

# Check the structure of the model
nn.summary()

In [41]:
# Compile the neural network
nn.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [42]:
# Train the neural network
nn.fit(X_train_scaled, y_train, epochs = 100, verbose = 1, validation_split = 0.2)

Epoch 1/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7517 - loss: 0.6686 - val_accuracy: 0.7544 - val_loss: 0.6238
Epoch 2/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 723us/step - accuracy: 0.7283 - loss: 0.6287 - val_accuracy: 0.7544 - val_loss: 0.5866
Epoch 3/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 687us/step - accuracy: 0.7074 - loss: 0.6114 - val_accuracy: 0.7544 - val_loss: 0.5683
Epoch 4/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 705us/step - accuracy: 0.7397 - loss: 0.5776 - val_accuracy: 0.7544 - val_loss: 0.5608
Epoch 5/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 679us/step - accuracy: 0.7251 - loss: 0.5880 - val_accuracy: 0.7544 - val_loss: 0.5594
Epoch 6/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 683us/step - accuracy: 0.7289 - loss: 0.5846 - val_accuracy: 0.7544 - val_loss: 0.5593
Epoch 7/100
[1m43/43[0

<keras.src.callbacks.history.History at 0x17b488e00>

In [43]:
# Evaluate the neural network
nn_loss, nn_accuracy = nn.evaluate(X_test_scaled, y_test)
print(f"Neural Network Accuracy: {nn_accuracy}")

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 707us/step - accuracy: 0.7515 - loss: 0.5619
Neural Network Accuracy: 0.7352246046066284


## 5. Family history as predictor of obesity/overweight

In [44]:
# Select columns
data5 = df[['family_history_with_overweight', 'Obese/Overweight']]
data5.head()

Row(family_history_with_overweight='yes', Obese/Overweight='No')

In [45]:
# Convert to pandas dataframe
pandas_df5 = data5.select("family_history_with_overweight", "Obese/Overweight").toPandas()
pandas_df5.head()

Unnamed: 0,family_history_with_overweight,Obese/Overweight
0,yes,No
1,yes,No
2,yes,No
3,no,Yes
4,no,Yes


In [46]:
# Encode categorical data
label_encoder = LabelEncoder()
pandas_df5['family_history_with_overweight'] = label_encoder.fit_transform(pandas_df5['family_history_with_overweight'])
pandas_df5['Obese/Overweight'] = label_encoder.fit_transform(pandas_df5['Obese/Overweight'])

In [47]:
# Define features (X) and target (y)
X = pandas_df5[['family_history_with_overweight']]
y = pandas_df5['Obese/Overweight']   

In [48]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2, stratify = y)

In [49]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [50]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
input_features = len(X_train_scaled[0])

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units = 20, activation = "relu", input_dim = input_features))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units = 10, activation = "relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units = 1, activation = "sigmoid"))

# Check the structure of the model
nn.summary()

In [51]:
# Compile the neural network
nn.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [52]:
# Train the neural network
nn.fit(X_train_scaled, y_train, epochs = 100, verbose = 1, validation_split = 0.2)

Epoch 1/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5905 - loss: 0.6882 - val_accuracy: 0.8047 - val_loss: 0.6446
Epoch 2/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 748us/step - accuracy: 0.8203 - loss: 0.6292 - val_accuracy: 0.8047 - val_loss: 0.6083
Epoch 3/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 702us/step - accuracy: 0.7841 - loss: 0.6069 - val_accuracy: 0.8047 - val_loss: 0.5736
Epoch 4/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 664us/step - accuracy: 0.7968 - loss: 0.5690 - val_accuracy: 0.8047 - val_loss: 0.5409
Epoch 5/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 728us/step - accuracy: 0.8146 - loss: 0.5264 - val_accuracy: 0.8047 - val_loss: 0.5153
Epoch 6/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 692us/step - accuracy: 0.7980 - loss: 0.5170 - val_accuracy: 0.8047 - val_loss: 0.4992
Epoch 7/100
[1m43/43[0

<keras.src.callbacks.history.History at 0x17a3e2750>

In [53]:
# Evaluate the neural network
nn_loss, nn_accuracy = nn.evaluate(X_test_scaled, y_test)
print(f"Neural Network Accuracy: {nn_accuracy}")

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 645us/step - accuracy: 0.8514 - loss: 0.4196
Neural Network Accuracy: 0.8368794322013855


## 6. High-caloric food as predictor of obesity/overweight

In [54]:
# Select columns
data6 = df[['High_Caloric_Food', 'Obese/Overweight']]
data6.head()

Row(High_Caloric_Food='no', Obese/Overweight='No')

In [55]:
# Convert to pandas dataframe
pandas_df6 = data6.select("High_Caloric_Food", "Obese/Overweight").toPandas()
pandas_df6.head()

Unnamed: 0,High_Caloric_Food,Obese/Overweight
0,no,No
1,no,No
2,no,No
3,no,Yes
4,no,Yes


In [56]:
# Encode categorical data
label_encoder = LabelEncoder()
pandas_df6['High_Caloric_Food'] = label_encoder.fit_transform(pandas_df6['High_Caloric_Food'])
pandas_df6['Obese/Overweight'] = label_encoder.fit_transform(pandas_df6['Obese/Overweight'])

In [57]:
# Define features (X) and target (y)
X = pandas_df6[['High_Caloric_Food']]
y = pandas_df6['Obese/Overweight']   

In [58]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2, stratify = y)

In [59]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [60]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
input_features = len(X_train_scaled[0])

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units = 20, activation = "relu", input_dim = input_features))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units = 10, activation = "relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units = 1, activation = "sigmoid"))

# Check the structure of the model
nn.summary()

In [61]:
# Compile the neural network
nn.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [62]:
# Train the neural network
nn.fit(X_train_scaled, y_train, epochs = 100, verbose = 1, validation_split = 0.2)

Epoch 1/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3694 - loss: 0.7128 - val_accuracy: 0.7840 - val_loss: 0.6793
Epoch 2/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 715us/step - accuracy: 0.7301 - loss: 0.6834 - val_accuracy: 0.7840 - val_loss: 0.6720
Epoch 3/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 696us/step - accuracy: 0.7269 - loss: 0.6743 - val_accuracy: 0.7544 - val_loss: 0.6641
Epoch 4/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 681us/step - accuracy: 0.7419 - loss: 0.6662 - val_accuracy: 0.7544 - val_loss: 0.6559
Epoch 5/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 700us/step - accuracy: 0.7335 - loss: 0.6592 - val_accuracy: 0.7544 - val_loss: 0.6485
Epoch 6/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 691us/step - accuracy: 0.7441 - loss: 0.6510 - val_accuracy: 0.7544 - val_loss: 0.6412
Epoch 7/100
[1m43/43[0

<keras.src.callbacks.history.History at 0x17c1986b0>

In [63]:
# Evaluate the neural network
nn_loss, nn_accuracy = nn.evaluate(X_test_scaled, y_test)
print(f"Neural Network Accuracy: {nn_accuracy}")

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 715us/step - accuracy: 0.7515 - loss: 0.5327
Neural Network Accuracy: 0.7352246046066284


## 7. Monitoring calories as predictor of obesity/overweight

In [64]:
# Select columns
data7 = df[['Monitor_Calories', 'Obese/Overweight']]
data7.head()

Row(Monitor_Calories='no', Obese/Overweight='No')

In [65]:
# Convert to pandas dataframe
pandas_df7 = data7.select("Monitor_Calories", "Obese/Overweight").toPandas()
pandas_df7.head()

Unnamed: 0,Monitor_Calories,Obese/Overweight
0,no,No
1,yes,No
2,no,No
3,no,Yes
4,no,Yes


In [66]:
# Encode categorical data
label_encoder = LabelEncoder()
pandas_df7['Monitor_Calories'] = label_encoder.fit_transform(pandas_df7['Monitor_Calories'])
pandas_df7['Obese/Overweight'] = label_encoder.fit_transform(pandas_df7['Obese/Overweight'])

In [67]:
# Define features (X) and target (y)
X = pandas_df7[['Monitor_Calories']]
y = pandas_df7['Obese/Overweight']   

In [68]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2, stratify = y)

In [69]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [70]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
input_features = len(X_train_scaled[0])

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units = 20, activation = "relu", input_dim = input_features))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units = 10, activation = "relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units = 1, activation = "sigmoid"))

# Check the structure of the model
nn.summary()

In [71]:
# Compile the neural network
nn.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [72]:
# Train the neural network
nn.fit(X_train_scaled, y_train, epochs = 100, verbose = 1, validation_split = 0.2)

Epoch 1/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5433 - loss: 0.6924 - val_accuracy: 0.7426 - val_loss: 0.6812
Epoch 2/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 712us/step - accuracy: 0.7281 - loss: 0.6789 - val_accuracy: 0.7426 - val_loss: 0.6668
Epoch 3/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 721us/step - accuracy: 0.7211 - loss: 0.6658 - val_accuracy: 0.7426 - val_loss: 0.6486
Epoch 4/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 700us/step - accuracy: 0.7161 - loss: 0.6507 - val_accuracy: 0.7426 - val_loss: 0.6270
Epoch 5/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 687us/step - accuracy: 0.7324 - loss: 0.6269 - val_accuracy: 0.7426 - val_loss: 0.6028
Epoch 6/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 697us/step - accuracy: 0.7272 - loss: 0.6082 - val_accuracy: 0.7426 - val_loss: 0.5823
Epoch 7/100
[1m43/43[0

<keras.src.callbacks.history.History at 0x17a108710>

In [73]:
# Evaluate the neural network
nn_loss, nn_accuracy = nn.evaluate(X_test_scaled, y_test)
print(f"Neural Network Accuracy: {nn_accuracy}")

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 658us/step - accuracy: 0.7547 - loss: 0.5537
Neural Network Accuracy: 0.7494089603424072


## 8. Gender as predictor of obesity

In [74]:
# Select columns
data8 = df[['Gender', 'Obese/Overweight']]
data8.head()

Row(Gender='Female', Obese/Overweight='No')

In [75]:
# Convert to pandas dataframe
pandas_df8 = data8.select("Gender", "Obese/Overweight").toPandas()
pandas_df8.head()

Unnamed: 0,Gender,Obese/Overweight
0,Female,No
1,Female,No
2,Male,No
3,Male,Yes
4,Male,Yes


In [76]:
# Encode categorical data
label_encoder = LabelEncoder()
pandas_df8['Gender'] = label_encoder.fit_transform(pandas_df8['Gender'])
pandas_df8['Obese/Overweight'] = label_encoder.fit_transform(pandas_df8['Obese/Overweight'])

In [77]:
# Define features (X) and target (y)
X = pandas_df8[['Gender']]
y = pandas_df8['Obese/Overweight']   

In [78]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2, stratify = y)

In [79]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [80]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
input_features = len(X_train_scaled[0])

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units = 20, activation = "relu", input_dim = input_features))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units = 10, activation = "relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units = 1, activation = "sigmoid"))

# Check the structure of the model
nn.summary()

In [81]:
# Compile the neural network
nn.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [82]:
# Train the neural network
nn.fit(X_train_scaled, y_train, epochs = 100, verbose = 1, validation_split = 0.2)

Epoch 1/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5311 - loss: 0.6919 - val_accuracy: 0.7544 - val_loss: 0.6416
Epoch 2/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 721us/step - accuracy: 0.7080 - loss: 0.6384 - val_accuracy: 0.7544 - val_loss: 0.5919
Epoch 3/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 661us/step - accuracy: 0.7348 - loss: 0.5918 - val_accuracy: 0.7544 - val_loss: 0.5671
Epoch 4/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 664us/step - accuracy: 0.7273 - loss: 0.5849 - val_accuracy: 0.7544 - val_loss: 0.5595
Epoch 5/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 666us/step - accuracy: 0.7061 - loss: 0.6005 - val_accuracy: 0.7544 - val_loss: 0.5577
Epoch 6/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 689us/step - accuracy: 0.7318 - loss: 0.5795 - val_accuracy: 0.7544 - val_loss: 0.5568
Epoch 7/100
[1m43/43[0

<keras.src.callbacks.history.History at 0x17c49a2a0>

In [83]:
# Evaluate the neural network
nn_loss, nn_accuracy = nn.evaluate(X_test_scaled, y_test)
print(f"Neural Network Accuracy: {nn_accuracy}")

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 743us/step - accuracy: 0.7515 - loss: 0.5647
Neural Network Accuracy: 0.7352246046066284
