In [22]:
# Install necessary packages if not available (uncomment if needed)
# !pip install tensorflow pandas scikit-learn numpy imbalanced-learn

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Load dataset
file_path = "depression_data.csv"  # Adjust if necessary
df = pd.read_csv(file_path)

# Drop non-relevant columns
df = df.drop(columns=['Name'], errors='ignore')  # Ignore errors if 'Name' column doesn't exist

df = df.drop(columns=['Dietary Habits', 'Sleep Patterns', 'History of Mental Illness', 'History of Substance Abuse', 'Family History of Depression'])

# Convert categorical columns to numerical using one-hot encoding
categorical_cols = df.select_dtypes(include=['object']).columns
df[categorical_cols] = df[categorical_cols].astype(str)  # Ensure all categorical columns are strings

# One-hot encode categorical variables
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

print(df_encoded)

        Age  Number of Children     Income  Marital Status_Married  \
0        31                   2   26265.67                    True   
1        55                   1   42710.36                    True   
2        78                   1  125332.79                   False   
3        58                   3    9992.78                   False   
4        18                   0    8595.08                   False   
...     ...                 ...        ...                     ...   
413763   68                   0  109233.43                    True   
413764   26                   0   96760.97                   False   
413765   57                   0   77353.26                    True   
413766   71                   2   24557.08                    True   
413767   62                   0  107125.74                   False   

        Marital Status_Single  Marital Status_Widowed  \
0                       False                   False   
1                       False                

In [33]:
# # Ensure target column exists
# if 'Chronic Medical Conditions' not in df_encoded.columns:
#     raise ValueError("Target variable 'Chronic Medical Conditions' not found. Check dataset formatting.")

# Separate features and target
X = df_encoded.drop(columns=['Chronic Medical Conditions_Yes'])
y = df_encoded['Chronic Medical Conditions_Yes']

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Normalize numerical features
scaler = StandardScaler()
X_resampled = pd.DataFrame(scaler.fit_transform(X_resampled), columns=X.columns)

In [34]:
X.head()
np.savetxt("test.csv",X_test,delimiter=",")

In [25]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Build the improved model with Batch Normalization and L2 Regularization
model = Sequential([
    Dense(256, activation='relu', kernel_regularizer=l2(0.001), input_shape=(X_train.shape[1],)),
    BatchNormalization(),
    Dropout(0.4),

    Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.3),

    Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.2),

    Dense(1, activation='sigmoid')  # Binary classification output
])

# Compile the model with AdamW optimizer and learning rate scheduling
initial_learning_rate = 0.001
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate, decay_steps=10000, decay_rate=0.9, staircase=True
)

model.compile(optimizer=AdamW(learning_rate=lr_schedule),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
epochs = 25  # More epochs for better learning
batch_size = 64  # Larger batch size for stability
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test))

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.4f}")

# Save the model
model.save("chronic_medical_conditions_model.h5")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/25
[1m6940/6940[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 5ms/step - accuracy: 0.6815 - loss: 0.6989 - val_accuracy: 0.7092 - val_loss: 0.5340
Epoch 2/25
[1m6940/6940[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 4ms/step - accuracy: 0.7068 - loss: 0.5387 - val_accuracy: 0.7089 - val_loss: 0.5297
Epoch 3/25
[1m6940/6940[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 4ms/step - accuracy: 0.7067 - loss: 0.5349 - val_accuracy: 0.7073 - val_loss: 0.5285
Epoch 4/25
[1m6940/6940[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 4ms/step - accuracy: 0.7077 - loss: 0.5305 - val_accuracy: 0.7093 - val_loss: 0.5267
Epoch 5/25
[1m6940/6940[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 4ms/step - accuracy: 0.7068 - loss: 0.5295 - val_accuracy: 0.7060 - val_loss: 0.5261
Epoch 6/25
[1m6940/6940[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 5ms/step - accuracy: 0.7083 - loss: 0.5287 - val_accuracy: 0.7094 - val_loss: 0.5242
Epoch 7/25



Test Accuracy: 0.7094


In [31]:
# Function for making predictions
def predict_condition(input_data):
    input_df = pd.DataFrame([input_data], columns=X.columns)
    input_scaled = scaler.transform(input_df)
    print(input_scaled)
    np.savetxt("test.csv",input_scaled,delimiter=",")
    prediction = model.predict(input_scaled)[0][0]
    return 'Likely Depressed' if prediction > 0.5 else 'Unlikely Depressed'

# Example prediction
sample_input = X_test.iloc[0].to_dict()
print(f"Predicted Condition: {predict_condition(sample_input)}")

[[-2.60856067 -1.17267125 -1.23673376 -4.07604198 -1.72967426  4.72342283
  -2.20884662  2.46044362 -1.75735792 -1.27089286 -2.18841079  0.16633913
   1.34298722 -2.870721    1.89826349 -2.44098462  1.13057284]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
Predicted Condition: Unlikely Depressed


In [28]:
from sklearn.metrics import classification_report, f1_score
# Generate classification report
y_pred_probs = model.predict(X_test)
y_pred_classes = (y_pred_probs > 0.5).astype(int)  # Convert probabilities to binary classes

classification_rep = classification_report(y_test, y_pred_classes, digits=2)
f1 = f1_score(y_test, y_pred_classes, average="weighted")

# Display classification report and F1 score
print("="*50)
print(f"F1 Score: {f1:.16f}\n")
print(classification_rep)

[1m3470/3470[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step
F1 Score: 0.6823414626402392

              precision    recall  f1-score   support

       False       0.63      1.00      0.78     55617
        True       1.00      0.42      0.59     55408

    accuracy                           0.71    111025
   macro avg       0.82      0.71      0.68    111025
weighted avg       0.82      0.71      0.68    111025

