In [13]:
# Install necessary packages if not available (uncomment if needed)
# !pip install tensorflow pandas scikit-learn numpy imbalanced-learn

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Load dataset
file_path = "depression_data.csv"  # Adjust if necessary
df = pd.read_csv(file_path)

# Drop non-relevant columns
df = df.drop(columns=['Name'], errors='ignore')  # Ignore errors if 'Name' column doesn't exist

# Convert categorical columns to numerical using one-hot encoding
categorical_cols = df.select_dtypes(include=['object']).columns
df[categorical_cols] = df[categorical_cols].astype(str)  # Ensure all categorical columns are strings

# One-hot encode categorical variables
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

print(df_encoded)

        Age  Number of Children     Income  Marital Status_Married  \
0        31                   2   26265.67                    True   
1        55                   1   42710.36                    True   
2        78                   1  125332.79                   False   
3        58                   3    9992.78                   False   
4        18                   0    8595.08                   False   
...     ...                 ...        ...                     ...   
413763   68                   0  109233.43                    True   
413764   26                   0   96760.97                   False   
413765   57                   0   77353.26                    True   
413766   71                   2   24557.08                    True   
413767   62                   0  107125.74                   False   

        Marital Status_Single  Marital Status_Widowed  \
0                       False                   False   
1                       False                

In [14]:
# # Ensure target column exists
# if 'Chronic Medical Conditions' not in df_encoded.columns:
#     raise ValueError("Target variable 'Chronic Medical Conditions' not found. Check dataset formatting.")

# Separate features and target
X = df_encoded.drop(columns=['Chronic Medical Conditions_Yes'])
y = df_encoded['Chronic Medical Conditions_Yes']

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Normalize numerical features
scaler = StandardScaler()
X_resampled = pd.DataFrame(scaler.fit_transform(X_resampled), columns=X.columns)

In [15]:
X.head()

Unnamed: 0,Age,Number of Children,Income,Marital Status_Married,Marital Status_Single,Marital Status_Widowed,Education Level_Bachelor's Degree,Education Level_High School,Education Level_Master's Degree,Education Level_PhD,...,Employment Status_Unemployed,Alcohol Consumption_Low,Alcohol Consumption_Moderate,Dietary Habits_Moderate,Dietary Habits_Unhealthy,Sleep Patterns_Good,Sleep Patterns_Poor,History of Mental Illness_Yes,History of Substance Abuse_Yes,Family History of Depression_Yes
0,31,2,26265.67,True,False,False,True,False,False,False,...,True,False,True,True,False,False,False,True,False,True
1,55,1,42710.36,True,False,False,False,True,False,False,...,False,False,False,False,True,False,False,True,False,False
2,78,1,125332.79,False,False,True,False,False,True,False,...,False,True,False,False,True,True,False,False,False,True
3,58,3,9992.78,False,False,False,False,False,True,False,...,True,False,True,True,False,False,True,False,False,False
4,18,0,8595.08,False,True,False,False,True,False,False,...,True,True,False,True,False,False,False,True,False,True


In [18]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Build the improved model with Batch Normalization and L2 Regularization
model = Sequential([
    Dense(256, activation='relu', kernel_regularizer=l2(0.001), input_shape=(X_train.shape[1],)),
    BatchNormalization(),
    Dropout(0.4),

    Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.3),

    Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.2),

    Dense(1, activation='sigmoid')  # Binary classification output
])

# Compile the model with AdamW optimizer and learning rate scheduling
initial_learning_rate = 0.001
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate, decay_steps=10000, decay_rate=0.9, staircase=True
)

model.compile(optimizer=AdamW(learning_rate=lr_schedule),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
epochs = 25  # More epochs for better learning
batch_size = 64  # Larger batch size for stability
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test))

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.4f}")

# Save the model
model.save("chronic_medical_conditions_model.h5")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/25
[1m6940/6940[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 4ms/step - accuracy: 0.6996 - loss: 0.6853 - val_accuracy: 0.7282 - val_loss: 0.5248
Epoch 2/25
[1m6940/6940[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 4ms/step - accuracy: 0.7242 - loss: 0.5291 - val_accuracy: 0.7285 - val_loss: 0.5140
Epoch 3/25
[1m6940/6940[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 4ms/step - accuracy: 0.7244 - loss: 0.5209 - val_accuracy: 0.7293 - val_loss: 0.5102
Epoch 4/25
[1m6940/6940[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 6ms/step - accuracy: 0.7265 - loss: 0.5165 - val_accuracy: 0.7298 - val_loss: 0.5078
Epoch 5/25
[1m6940/6940[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 4ms/step - accuracy: 0.7272 - loss: 0.5134 - val_accuracy: 0.7303 - val_loss: 0.5050
Epoch 6/25
[1m6940/6940[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 4ms/step - accuracy: 0.7281 - loss: 0.5096 - val_accuracy: 0.7306 - val_loss: 0.5030
Epoch 7/25



Test Accuracy: 0.7318


In [19]:
# Function for making predictions
def predict_condition(input_data):
    input_df = pd.DataFrame([input_data], columns=X.columns)
    input_scaled = scaler.transform(input_df)
    prediction = model.predict(input_scaled)[0][0]
    return 'Likely Depressed' if prediction > 0.5 else 'Unlikely Depressed'

# Example prediction
sample_input = X_test.iloc[0].to_dict()
print(f"Predicted Condition: {predict_condition(sample_input)}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 521ms/step
Predicted Condition: Unlikely Chronic Medical Condition
