In [1]:
# Import libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from pyspark.sql import SparkSession

In [2]:
# Start the Spark session used to read the CSV (getOrCreate reuses one if it already exists)
spark = (
    SparkSession.builder
    .appName("PySpark and Neural Network")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/09 17:50:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/01/09 17:50:08 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Read the obesity CSV into a Spark DataFrame.
# The first row supplies the column names and column types are inferred from the data.
dataset_path = "../data/obesity_dataset.csv"
df = spark.read.csv(
    dataset_path,
    header=True,
    inferSchema=True,
)

                                                                                

## 1. Height as predictor of obesity and overweight

In [4]:
# Keep only the predictor under study (Height) and the target label
data = df.select('Height', 'Obese/Overweight')
# head() with no argument returns just the first Row, as a quick sanity check
data.head()

Row(Height=1.62, Obese/Overweight='No')

In [5]:
# Bring the two selected Spark columns into a pandas DataFrame
pandas_df = (
    data
    .select("Height", "Obese/Overweight")
    .toPandas()
)
pandas_df.head()

Unnamed: 0,Height,Obese/Overweight
0,1.62,No
1,1.52,No
2,1.8,No
3,1.8,Yes
4,1.78,Yes


In [6]:
# Replace the string target with integer class codes (the column is overwritten).
# LabelEncoder numbers classes in sorted label order; for the 'No'/'Yes' values
# shown in the preview above that would be No -> 0, Yes -> 1 (assumes no other labels occur).
label_encoder = LabelEncoder()
pandas_df['Obese/Overweight'] = label_encoder.fit_transform(
    pandas_df['Obese/Overweight']
)

In [7]:
# Separate the model input from the label to predict.
# Double brackets keep X as a one-column DataFrame; y is a Series.
X, y = pandas_df[['Height']], pandas_df['Obese/Overweight']

In [11]:
# Hold out 20% of the rows for testing, with a fixed seed and stratification on y.
# Author's note: a previous iteration used the same call without stratify = y;
# the model improved with the addition of this parameter.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [12]:
# Standardise the feature with statistics learned from the training rows only,
# then apply that same transformation to both partitions.
scaler = StandardScaler()

# fit() returns the fitted estimator itself, so X_scaler is the same object as scaler
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [13]:
# Define the model - a small fully connected network for the binary target:
# input -> 20 ReLU units -> 10 ReLU units -> 1 sigmoid unit.

# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
# start from the same state on every run; unseeded, the reported accuracy may
# differ between otherwise identical executions.
tf.keras.utils.set_random_seed(42)

# Number of input features = number of columns in the scaled training matrix
input_features = X_train_scaled.shape[1]

# The input is declared with an explicit Input object instead of the legacy
# `input_dim` argument on the first Dense layer, which current Keras warns against
# (that warning is hidden here because warnings are globally ignored).
nn = tf.keras.models.Sequential([
    tf.keras.Input(shape=(input_features,)),
    # First hidden layer - doubling the amount of units did not improve accuracy
    tf.keras.layers.Dense(units=20, activation="relu"),
    # Second hidden layer - doubling the amount of units did not improve accuracy
    tf.keras.layers.Dense(units=10, activation="relu"),
    # Output layer - a single sigmoid unit
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
])

# Check the structure of the model
nn.summary()

In [14]:
# Configure training: Adam optimiser, binary cross-entropy loss,
# and accuracy as the reported metric.
nn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [15]:
# Fit for 100 epochs with per-epoch progress output. Keras sets aside the last
# 20% of the training rows as a validation set and does not train on them.
nn.fit(
    X_train_scaled,
    y_train,
    validation_split=0.2,
    epochs=100,
    verbose=1,
)

Epoch 1/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5840 - loss: 0.6786 - val_accuracy: 0.7544 - val_loss: 0.6233
Epoch 2/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 717us/step - accuracy: 0.7405 - loss: 0.6126 - val_accuracy: 0.7544 - val_loss: 0.5793
Epoch 3/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 653us/step - accuracy: 0.7552 - loss: 0.5749 - val_accuracy: 0.7544 - val_loss: 0.5695
Epoch 4/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 670us/step - accuracy: 0.7183 - loss: 0.5912 - val_accuracy: 0.7544 - val_loss: 0.5662
Epoch 5/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 680us/step - accuracy: 0.7074 - loss: 0.6075 - val_accuracy: 0.7544 - val_loss: 0.5635
Epoch 6/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 688us/step - accuracy: 0.7406 - loss: 0.5666 - val_accuracy: 0.7544 - val_loss: 0.5622
Epoch 7/100
[1m43/43[0

<keras.src.callbacks.history.History at 0x1691c32c0>

In [16]:
# Score the trained network on the held-out test partition.
# evaluate() returns the loss followed by the compiled metric (accuracy).
# NOTE(review): the accuracy recorded for this run is identical to the one recorded
# for several other single-feature models in this notebook, which would be consistent
# with the network predicting one class for every row - compare with the class
# balance of y_test to confirm.
nn_loss, nn_accuracy = nn.evaluate(x=X_test_scaled, y=y_test)
print(f"Neural Network Accuracy: {nn_accuracy}")

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 708us/step - accuracy: 0.7515 - loss: 0.5550
Neural Network Accuracy: 0.7352246046066284


## 2. Age as predictor of obesity/overweight

In [17]:
# Keep only the predictor under study (Age) and the target label
data2 = df.select('Age', 'Obese/Overweight')
# head() with no argument returns just the first Row, as a quick sanity check
data2.head()

Row(Age=21.0, Obese/Overweight='No')

In [18]:
# Bring the two selected Spark columns into a pandas DataFrame
pandas_df2 = (
    data2
    .select("Age", "Obese/Overweight")
    .toPandas()
)
pandas_df2.head()

Unnamed: 0,Age,Obese/Overweight
0,21.0,No
1,21.0,No
2,23.0,No
3,27.0,Yes
4,22.0,Yes


In [19]:
# Replace the string target with integer class codes (the column is overwritten).
# LabelEncoder numbers classes in sorted label order; for the 'No'/'Yes' values
# shown in the preview above that would be No -> 0, Yes -> 1 (assumes no other labels occur).
label_encoder = LabelEncoder()
pandas_df2['Obese/Overweight'] = label_encoder.fit_transform(
    pandas_df2['Obese/Overweight']
)

In [20]:
# Separate the model input from the label to predict.
# Double brackets keep X as a one-column DataFrame; y is a Series.
X, y = pandas_df2[['Age']], pandas_df2['Obese/Overweight']

In [21]:
# Hold out 20% of the rows for testing, with a fixed seed and stratification on y.
# Author's note: a previous iteration used the same call without stratify = y;
# the model improved with the addition of this parameter.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [22]:
# Standardise the feature with statistics learned from the training rows only,
# then apply that same transformation to both partitions.
scaler = StandardScaler()

# fit() returns the fitted estimator itself, so X_scaler is the same object as scaler
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [23]:
# Define the model - a small fully connected network for the binary target:
# input -> 20 ReLU units -> 10 ReLU units -> 1 sigmoid unit.

# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
# start from the same state on every run; unseeded, the reported accuracy may
# differ between otherwise identical executions.
tf.keras.utils.set_random_seed(42)

# Number of input features = number of columns in the scaled training matrix
input_features = X_train_scaled.shape[1]

# The input is declared with an explicit Input object instead of the legacy
# `input_dim` argument on the first Dense layer, which current Keras warns against
# (that warning is hidden here because warnings are globally ignored).
nn = tf.keras.models.Sequential([
    tf.keras.Input(shape=(input_features,)),
    # First hidden layer - doubling the amount of units did not improve accuracy
    tf.keras.layers.Dense(units=20, activation="relu"),
    # Second hidden layer - doubling the amount of units did not improve accuracy
    tf.keras.layers.Dense(units=10, activation="relu"),
    # Output layer - a single sigmoid unit
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
])

# Check the structure of the model
nn.summary()

In [24]:
# Configure training: Adam optimiser, binary cross-entropy loss,
# and accuracy as the reported metric.
nn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [25]:
# Fit for 100 epochs with per-epoch progress output. Keras sets aside the last
# 20% of the training rows as a validation set and does not train on them.
nn.fit(
    X_train_scaled,
    y_train,
    validation_split=0.2,
    epochs=100,
    verbose=1,
)

Epoch 1/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7595 - loss: 0.6041 - val_accuracy: 0.7544 - val_loss: 0.5557
Epoch 2/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 716us/step - accuracy: 0.7362 - loss: 0.5661 - val_accuracy: 0.7544 - val_loss: 0.5262
Epoch 3/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 665us/step - accuracy: 0.7454 - loss: 0.5263 - val_accuracy: 0.7544 - val_loss: 0.5130
Epoch 4/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 653us/step - accuracy: 0.7137 - loss: 0.5291 - val_accuracy: 0.7544 - val_loss: 0.5066
Epoch 5/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 664us/step - accuracy: 0.7508 - loss: 0.4799 - val_accuracy: 0.7544 - val_loss: 0.5039
Epoch 6/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 674us/step - accuracy: 0.7342 - loss: 0.4972 - val_accuracy: 0.7574 - val_loss: 0.5012
Epoch 7/100
[1m43/43[0

<keras.src.callbacks.history.History at 0x169c079e0>

In [26]:
# Score the trained network on the held-out test partition.
# evaluate() returns the loss followed by the compiled metric (accuracy).
nn_loss, nn_accuracy = nn.evaluate(x=X_test_scaled, y=y_test)
print(f"Neural Network Accuracy: {nn_accuracy}")

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 664us/step - accuracy: 0.7904 - loss: 0.4354
Neural Network Accuracy: 0.758865237236023


## 3. Meals per day as predictor of obesity/overweight

In [27]:
# Keep only the predictor under study (Meals_Per_Day) and the target label
data3 = df.select('Meals_Per_Day', 'Obese/Overweight')
# head() with no argument returns just the first Row, as a quick sanity check
data3.head()

Row(Meals_Per_Day=3.0, Obese/Overweight='No')

In [28]:
# Bring the two selected Spark columns into a pandas DataFrame
pandas_df3 = (
    data3
    .select("Meals_Per_Day", "Obese/Overweight")
    .toPandas()
)
pandas_df3.head()

Unnamed: 0,Meals_Per_Day,Obese/Overweight
0,3.0,No
1,3.0,No
2,3.0,No
3,3.0,Yes
4,1.0,Yes


In [29]:
# Replace the string target with integer class codes (the column is overwritten).
# LabelEncoder numbers classes in sorted label order; for the 'No'/'Yes' values
# shown in the preview above that would be No -> 0, Yes -> 1 (assumes no other labels occur).
label_encoder = LabelEncoder()
pandas_df3['Obese/Overweight'] = label_encoder.fit_transform(
    pandas_df3['Obese/Overweight']
)

In [30]:
# Separate the model input from the label to predict.
# Double brackets keep X as a one-column DataFrame; y is a Series.
X, y = pandas_df3[['Meals_Per_Day']], pandas_df3['Obese/Overweight']

In [31]:
# Hold out 20% of the rows for testing, with a fixed seed and stratification on y.
# Author's note: a previous iteration used the same call without stratify = y;
# the model improved with the addition of this parameter.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [32]:
# Standardise the feature with statistics learned from the training rows only,
# then apply that same transformation to both partitions.
scaler = StandardScaler()

# fit() returns the fitted estimator itself, so X_scaler is the same object as scaler
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [33]:
# Define the model - a small fully connected network for the binary target:
# input -> 20 ReLU units -> 10 ReLU units -> 1 sigmoid unit.

# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
# start from the same state on every run; unseeded, the reported accuracy may
# differ between otherwise identical executions.
tf.keras.utils.set_random_seed(42)

# Number of input features = number of columns in the scaled training matrix
input_features = X_train_scaled.shape[1]

# The input is declared with an explicit Input object instead of the legacy
# `input_dim` argument on the first Dense layer, which current Keras warns against
# (that warning is hidden here because warnings are globally ignored).
nn = tf.keras.models.Sequential([
    tf.keras.Input(shape=(input_features,)),
    # First hidden layer - doubling the amount of units did not improve accuracy
    tf.keras.layers.Dense(units=20, activation="relu"),
    # Second hidden layer - doubling the amount of units did not improve accuracy
    tf.keras.layers.Dense(units=10, activation="relu"),
    # Output layer - a single sigmoid unit
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
])

# Check the structure of the model
nn.summary()

In [34]:
# Configure training: Adam optimiser, binary cross-entropy loss,
# and accuracy as the reported metric.
nn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [35]:
# Fit for 100 epochs with per-epoch progress output. Keras sets aside the last
# 20% of the training rows as a validation set and does not train on them.
nn.fit(
    X_train_scaled,
    y_train,
    validation_split=0.2,
    epochs=100,
    verbose=1,
)

Epoch 1/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7272 - loss: 0.6675 - val_accuracy: 0.7544 - val_loss: 0.6264
Epoch 2/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 733us/step - accuracy: 0.7260 - loss: 0.6298 - val_accuracy: 0.7544 - val_loss: 0.5960
Epoch 3/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 668us/step - accuracy: 0.7314 - loss: 0.6038 - val_accuracy: 0.7544 - val_loss: 0.5801
Epoch 4/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 660us/step - accuracy: 0.7604 - loss: 0.5632 - val_accuracy: 0.7544 - val_loss: 0.5690
Epoch 5/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 672us/step - accuracy: 0.7429 - loss: 0.5699 - val_accuracy: 0.7574 - val_loss: 0.5556
Epoch 6/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 688us/step - accuracy: 0.7660 - loss: 0.5454 - val_accuracy: 0.7604 - val_loss: 0.5500
Epoch 7/100
[1m43/43[0

<keras.src.callbacks.history.History at 0x169324b30>

In [36]:
# Score the trained network on the held-out test partition.
# evaluate() returns the loss followed by the compiled metric (accuracy).
nn_loss, nn_accuracy = nn.evaluate(x=X_test_scaled, y=y_test)
print(f"Neural Network Accuracy: {nn_accuracy}")

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 705us/step - accuracy: 0.7942 - loss: 0.4906
Neural Network Accuracy: 0.7777777910232544


## 4. Frequency of physical activity as predictor of obesity/overweight

In [37]:
# Keep only the predictor under study (Frequency_Physical_Activity) and the target label
data4 = df.select('Frequency_Physical_Activity', 'Obese/Overweight')
# head() with no argument returns just the first Row, as a quick sanity check
data4.head()

Row(Frequency_Physical_Activity=0.0, Obese/Overweight='No')

In [38]:
# Bring the two selected Spark columns into a pandas DataFrame
pandas_df4 = (
    data4
    .select("Frequency_Physical_Activity", "Obese/Overweight")
    .toPandas()
)
pandas_df4.head()

Unnamed: 0,Frequency_Physical_Activity,Obese/Overweight
0,0.0,No
1,3.0,No
2,2.0,No
3,2.0,Yes
4,0.0,Yes


In [39]:
# Replace the string target with integer class codes (the column is overwritten).
# LabelEncoder numbers classes in sorted label order; for the 'No'/'Yes' values
# shown in the preview above that would be No -> 0, Yes -> 1 (assumes no other labels occur).
label_encoder = LabelEncoder()
pandas_df4['Obese/Overweight'] = label_encoder.fit_transform(
    pandas_df4['Obese/Overweight']
)

In [40]:
# Separate the model input from the label to predict.
# Double brackets keep X as a one-column DataFrame; y is a Series.
X, y = pandas_df4[['Frequency_Physical_Activity']], pandas_df4['Obese/Overweight']

In [41]:
# Hold out 20% of the rows for testing, with a fixed seed and stratification on y.
# Author's note: a previous iteration used the same call without stratify = y;
# the model improved with the addition of this parameter.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [42]:
# Standardise the feature with statistics learned from the training rows only,
# then apply that same transformation to both partitions.
scaler = StandardScaler()

# fit() returns the fitted estimator itself, so X_scaler is the same object as scaler
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [43]:
# Define the model - a small fully connected network for the binary target:
# input -> 20 ReLU units -> 10 ReLU units -> 1 sigmoid unit.

# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
# start from the same state on every run; unseeded, the reported accuracy may
# differ between otherwise identical executions.
tf.keras.utils.set_random_seed(42)

# Number of input features = number of columns in the scaled training matrix
input_features = X_train_scaled.shape[1]

# The input is declared with an explicit Input object instead of the legacy
# `input_dim` argument on the first Dense layer, which current Keras warns against
# (that warning is hidden here because warnings are globally ignored).
nn = tf.keras.models.Sequential([
    tf.keras.Input(shape=(input_features,)),
    # First hidden layer - doubling the amount of units did not improve accuracy
    tf.keras.layers.Dense(units=20, activation="relu"),
    # Second hidden layer - doubling the amount of units did not improve accuracy
    tf.keras.layers.Dense(units=10, activation="relu"),
    # Output layer - a single sigmoid unit
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
])

# Check the structure of the model
nn.summary()

In [44]:
# Configure training: Adam optimiser, binary cross-entropy loss,
# and accuracy as the reported metric.
nn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [45]:
# Fit for 100 epochs with per-epoch progress output. Keras sets aside the last
# 20% of the training rows as a validation set and does not train on them.
nn.fit(
    X_train_scaled,
    y_train,
    validation_split=0.2,
    epochs=100,
    verbose=1,
)

Epoch 1/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7267 - loss: 0.6680 - val_accuracy: 0.7544 - val_loss: 0.6203
Epoch 2/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 702us/step - accuracy: 0.7435 - loss: 0.6170 - val_accuracy: 0.7544 - val_loss: 0.5828
Epoch 3/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 670us/step - accuracy: 0.7351 - loss: 0.5924 - val_accuracy: 0.7544 - val_loss: 0.5673
Epoch 4/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 702us/step - accuracy: 0.7386 - loss: 0.5783 - val_accuracy: 0.7544 - val_loss: 0.5613
Epoch 5/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 699us/step - accuracy: 0.7287 - loss: 0.5764 - val_accuracy: 0.7544 - val_loss: 0.5575
Epoch 6/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 665us/step - accuracy: 0.7418 - loss: 0.5685 - val_accuracy: 0.7544 - val_loss: 0.5548
Epoch 7/100
[1m43/43[0

<keras.src.callbacks.history.History at 0x169ad7920>

In [46]:
# Score the trained network on the held-out test partition.
# evaluate() returns the loss followed by the compiled metric (accuracy).
nn_loss, nn_accuracy = nn.evaluate(x=X_test_scaled, y=y_test)
print(f"Neural Network Accuracy: {nn_accuracy}")

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 706us/step - accuracy: 0.7276 - loss: 0.5392
Neural Network Accuracy: 0.7210401892662048


## 5. Water intake as predictor of obesity/overweight

In [47]:
# Keep only the predictor under study (Water_Intake) and the target label
data5 = df.select('Water_Intake', 'Obese/Overweight')
# head() with no argument returns just the first Row, as a quick sanity check
data5.head()

Row(Water_Intake=2.0, Obese/Overweight='No')

In [48]:
# Bring the two selected Spark columns into a pandas DataFrame
pandas_df5 = (
    data5
    .select("Water_Intake", "Obese/Overweight")
    .toPandas()
)
pandas_df5.head()

Unnamed: 0,Water_Intake,Obese/Overweight
0,2.0,No
1,3.0,No
2,2.0,No
3,2.0,Yes
4,2.0,Yes


In [49]:
# Replace the string target with integer class codes (the column is overwritten).
# LabelEncoder numbers classes in sorted label order; for the 'No'/'Yes' values
# shown in the preview above that would be No -> 0, Yes -> 1 (assumes no other labels occur).
label_encoder = LabelEncoder()
pandas_df5['Obese/Overweight'] = label_encoder.fit_transform(
    pandas_df5['Obese/Overweight']
)

In [50]:
# Separate the model input from the label to predict.
# Double brackets keep X as a one-column DataFrame; y is a Series.
X, y = pandas_df5[['Water_Intake']], pandas_df5['Obese/Overweight']

In [51]:
# Hold out 20% of the rows for testing, with a fixed seed and stratification on y.
# Author's note: a previous iteration used the same call without stratify = y;
# the model improved with the addition of this parameter.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [52]:
# Standardise the feature with statistics learned from the training rows only,
# then apply that same transformation to both partitions.
scaler = StandardScaler()

# fit() returns the fitted estimator itself, so X_scaler is the same object as scaler
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [53]:
# Define the model - a small fully connected network for the binary target:
# input -> 20 ReLU units -> 10 ReLU units -> 1 sigmoid unit.

# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
# start from the same state on every run; unseeded, the reported accuracy may
# differ between otherwise identical executions.
tf.keras.utils.set_random_seed(42)

# Number of input features = number of columns in the scaled training matrix
input_features = X_train_scaled.shape[1]

# The input is declared with an explicit Input object instead of the legacy
# `input_dim` argument on the first Dense layer, which current Keras warns against
# (that warning is hidden here because warnings are globally ignored).
nn = tf.keras.models.Sequential([
    tf.keras.Input(shape=(input_features,)),
    # First hidden layer - doubling the amount of units did not improve accuracy
    tf.keras.layers.Dense(units=20, activation="relu"),
    # Second hidden layer - doubling the amount of units did not improve accuracy
    tf.keras.layers.Dense(units=10, activation="relu"),
    # Output layer - a single sigmoid unit
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
])

# Check the structure of the model
nn.summary()

In [54]:
# Configure training: Adam optimiser, binary cross-entropy loss,
# and accuracy as the reported metric.
nn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [55]:
# Fit for 100 epochs with per-epoch progress output. Keras sets aside the last
# 20% of the training rows as a validation set and does not train on them.
nn.fit(
    X_train_scaled,
    y_train,
    validation_split=0.2,
    epochs=100,
    verbose=1,
)

Epoch 1/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3632 - loss: 0.7446 - val_accuracy: 0.6272 - val_loss: 0.6881
Epoch 2/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 723us/step - accuracy: 0.6833 - loss: 0.6776 - val_accuracy: 0.7544 - val_loss: 0.6557
Epoch 3/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 665us/step - accuracy: 0.7229 - loss: 0.6571 - val_accuracy: 0.7544 - val_loss: 0.6320
Epoch 4/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 656us/step - accuracy: 0.7266 - loss: 0.6354 - val_accuracy: 0.7544 - val_loss: 0.6065
Epoch 5/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 703us/step - accuracy: 0.7303 - loss: 0.6140 - val_accuracy: 0.7544 - val_loss: 0.5838
Epoch 6/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 708us/step - accuracy: 0.7182 - loss: 0.6039 - val_accuracy: 0.7544 - val_loss: 0.5663
Epoch 7/100
[1m43/43[0

<keras.src.callbacks.history.History at 0x16c36d310>

In [56]:
# Score the trained network on the held-out test partition.
# evaluate() returns the loss followed by the compiled metric (accuracy).
# NOTE(review): the accuracy recorded for this run is identical to the one recorded
# for several other single-feature models in this notebook, which would be consistent
# with the network predicting one class for every row - compare with the class
# balance of y_test to confirm.
nn_loss, nn_accuracy = nn.evaluate(x=X_test_scaled, y=y_test)
print(f"Neural Network Accuracy: {nn_accuracy}")

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 726us/step - accuracy: 0.7515 - loss: 0.4813
Neural Network Accuracy: 0.7352246046066284


## 6. Vegetable intake as predictor of obesity/overweight

In [57]:
# Keep only the predictor under study (Vegetable_Intake) and the target label
data6 = df.select('Vegetable_Intake', 'Obese/Overweight')
# head() with no argument returns just the first Row, as a quick sanity check
data6.head()

Row(Vegetable_Intake=2.0, Obese/Overweight='No')

In [58]:
# Bring the two selected Spark columns into a pandas DataFrame
pandas_df6 = (
    data6
    .select("Vegetable_Intake", "Obese/Overweight")
    .toPandas()
)
pandas_df6.head()

Unnamed: 0,Vegetable_Intake,Obese/Overweight
0,2.0,No
1,3.0,No
2,2.0,No
3,3.0,Yes
4,2.0,Yes


In [59]:
# Replace the string target with integer class codes (the column is overwritten).
# LabelEncoder numbers classes in sorted label order; for the 'No'/'Yes' values
# shown in the preview above that would be No -> 0, Yes -> 1 (assumes no other labels occur).
label_encoder = LabelEncoder()
pandas_df6['Obese/Overweight'] = label_encoder.fit_transform(
    pandas_df6['Obese/Overweight']
)

In [60]:
# Separate the model input from the label to predict.
# Double brackets keep X as a one-column DataFrame; y is a Series.
X, y = pandas_df6[['Vegetable_Intake']], pandas_df6['Obese/Overweight']

In [61]:
# Hold out 20% of the rows for testing, with a fixed seed and stratification on y.
# Author's note: a previous iteration used the same call without stratify = y;
# the model improved with the addition of this parameter.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [62]:
# Standardise the feature with statistics learned from the training rows only,
# then apply that same transformation to both partitions.
scaler = StandardScaler()

# fit() returns the fitted estimator itself, so X_scaler is the same object as scaler
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [63]:
# Define the model - a small fully connected network for the binary target:
# input -> 20 ReLU units -> 10 ReLU units -> 1 sigmoid unit.

# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
# start from the same state on every run; unseeded, the reported accuracy may
# differ between otherwise identical executions.
tf.keras.utils.set_random_seed(42)

# Number of input features = number of columns in the scaled training matrix
input_features = X_train_scaled.shape[1]

# The input is declared with an explicit Input object instead of the legacy
# `input_dim` argument on the first Dense layer, which current Keras warns against
# (that warning is hidden here because warnings are globally ignored).
nn = tf.keras.models.Sequential([
    tf.keras.Input(shape=(input_features,)),
    # First hidden layer - doubling the amount of units did not improve accuracy
    tf.keras.layers.Dense(units=20, activation="relu"),
    # Second hidden layer - doubling the amount of units did not improve accuracy
    tf.keras.layers.Dense(units=10, activation="relu"),
    # Output layer - a single sigmoid unit
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
])

# Check the structure of the model
nn.summary()

In [64]:
# Configure training: Adam optimiser, binary cross-entropy loss,
# and accuracy as the reported metric.
nn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [65]:
# Fit for 100 epochs with per-epoch progress output. Keras sets aside the last
# 20% of the training rows as a validation set and does not train on them.
nn.fit(
    X_train_scaled,
    y_train,
    validation_split=0.2,
    epochs=100,
    verbose=1,
)

Epoch 1/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6123 - loss: 0.6617 - val_accuracy: 0.7544 - val_loss: 0.6202
Epoch 2/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 721us/step - accuracy: 0.7288 - loss: 0.6313 - val_accuracy: 0.7544 - val_loss: 0.5959
Epoch 3/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 661us/step - accuracy: 0.7282 - loss: 0.6122 - val_accuracy: 0.7544 - val_loss: 0.5784
Epoch 4/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 672us/step - accuracy: 0.7335 - loss: 0.5999 - val_accuracy: 0.7544 - val_loss: 0.5712
Epoch 5/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 678us/step - accuracy: 0.7360 - loss: 0.5906 - val_accuracy: 0.7544 - val_loss: 0.5679
Epoch 6/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 691us/step - accuracy: 0.7327 - loss: 0.5896 - val_accuracy: 0.7544 - val_loss: 0.5664
Epoch 7/100
[1m43/43[0

<keras.src.callbacks.history.History at 0x16c7da870>

In [66]:
# Score the trained network on the held-out test partition.
# evaluate() returns the loss followed by the compiled metric (accuracy).
# NOTE(review): the accuracy recorded for this run is identical to the one recorded
# for several other single-feature models in this notebook, which would be consistent
# with the network predicting one class for every row - compare with the class
# balance of y_test to confirm.
nn_loss, nn_accuracy = nn.evaluate(x=X_test_scaled, y=y_test)
print(f"Neural Network Accuracy: {nn_accuracy}")

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 734us/step - accuracy: 0.7507 - loss: 0.5614
Neural Network Accuracy: 0.7352246046066284


## 7. Technology use as predictor of obesity/overweight

In [67]:
# Keep only the predictor under study (Technology_Use) and the target label
data7 = df.select('Technology_Use', 'Obese/Overweight')
# head() with no argument returns just the first Row, as a quick sanity check
data7.head()

Row(Technology_Use=1.0, Obese/Overweight='No')

In [68]:
# Bring the two selected Spark columns into a pandas DataFrame
pandas_df7 = (
    data7
    .select("Technology_Use", "Obese/Overweight")
    .toPandas()
)
pandas_df7.head()

Unnamed: 0,Technology_Use,Obese/Overweight
0,1.0,No
1,0.0,No
2,1.0,No
3,0.0,Yes
4,0.0,Yes


In [69]:
# Replace the string target with integer class codes (the column is overwritten).
# LabelEncoder numbers classes in sorted label order; for the 'No'/'Yes' values
# shown in the preview above that would be No -> 0, Yes -> 1 (assumes no other labels occur).
label_encoder = LabelEncoder()
pandas_df7['Obese/Overweight'] = label_encoder.fit_transform(
    pandas_df7['Obese/Overweight']
)

In [70]:
# Separate the model input from the label to predict.
# Double brackets keep X as a one-column DataFrame; y is a Series.
X, y = pandas_df7[['Technology_Use']], pandas_df7['Obese/Overweight']

In [71]:
# Hold out 20% of the rows for testing, with a fixed seed and stratification on y.
# Author's note: a previous iteration used the same call without stratify = y;
# the model improved with the addition of this parameter.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [72]:
# Standardise the feature with statistics learned from the training rows only,
# then apply that same transformation to both partitions.
scaler = StandardScaler()

# fit() returns the fitted estimator itself, so X_scaler is the same object as scaler
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [73]:
# Define the model - a small fully connected network for the binary target:
# input -> 20 ReLU units -> 10 ReLU units -> 1 sigmoid unit.

# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
# start from the same state on every run; unseeded, the reported accuracy may
# differ between otherwise identical executions.
tf.keras.utils.set_random_seed(42)

# Number of input features = number of columns in the scaled training matrix
input_features = X_train_scaled.shape[1]

# The input is declared with an explicit Input object instead of the legacy
# `input_dim` argument on the first Dense layer, which current Keras warns against
# (that warning is hidden here because warnings are globally ignored).
nn = tf.keras.models.Sequential([
    tf.keras.Input(shape=(input_features,)),
    # First hidden layer - doubling the amount of units did not improve accuracy
    tf.keras.layers.Dense(units=20, activation="relu"),
    # Second hidden layer - doubling the amount of units did not improve accuracy
    tf.keras.layers.Dense(units=10, activation="relu"),
    # Output layer - a single sigmoid unit
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
])

# Check the structure of the model
nn.summary()

In [74]:
# Configure training: Adam optimiser, binary cross-entropy loss,
# and accuracy as the reported metric.
nn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [75]:
# Fit for 100 epochs with per-epoch progress output. Keras sets aside the last
# 20% of the training rows as a validation set and does not train on them.
nn.fit(
    X_train_scaled,
    y_train,
    validation_split=0.2,
    epochs=100,
    verbose=1,
)

Epoch 1/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6485 - loss: 0.6652 - val_accuracy: 0.7544 - val_loss: 0.6309
Epoch 2/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 730us/step - accuracy: 0.7442 - loss: 0.6147 - val_accuracy: 0.7544 - val_loss: 0.5927
Epoch 3/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 661us/step - accuracy: 0.7139 - loss: 0.6092 - val_accuracy: 0.7544 - val_loss: 0.5760
Epoch 4/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 658us/step - accuracy: 0.7093 - loss: 0.5966 - val_accuracy: 0.7544 - val_loss: 0.5659
Epoch 5/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 650us/step - accuracy: 0.7319 - loss: 0.5776 - val_accuracy: 0.7544 - val_loss: 0.5583
Epoch 6/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 651us/step - accuracy: 0.7224 - loss: 0.5837 - val_accuracy: 0.7544 - val_loss: 0.5547
Epoch 7/100
[1m43/43[0

<keras.src.callbacks.history.History at 0x16ce50a10>

In [76]:
# Score the trained network on the held-out test partition.
# evaluate() returns the loss followed by the compiled metric (accuracy).
# NOTE(review): the accuracy recorded for this run is identical to the one recorded
# for several other single-feature models in this notebook, which would be consistent
# with the network predicting one class for every row - compare with the class
# balance of y_test to confirm.
nn_loss, nn_accuracy = nn.evaluate(x=X_test_scaled, y=y_test)
print(f"Neural Network Accuracy: {nn_accuracy}")

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 713us/step - accuracy: 0.7515 - loss: 0.5595
Neural Network Accuracy: 0.7352246046066284
