In [1]:
import numpy as np
import pandas as pd
import re
import os
import zipfile

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Mount Google Drive to access your files
from google.colab import drive
drive.mount('/content/drive')

print("Setup Complete. Libraries imported and Google Drive mounted.")

Mounted at /content/drive
Setup Complete. Libraries imported and Google Drive mounted.


In [2]:
# Download the GloVe embeddings file
# This is a ~822MB file, so it might take a few minutes depending on Colab's network speed.
print("Downloading GloVe embeddings...")
!wget https://nlp.stanford.edu/data/glove.6B.zip

# Unzip the downloaded file
print("\nUnzipping the file...")
!unzip glove.6B.zip

print("\nDownload and unzipping complete. You should now see 'glove.6B.100d.txt' in your file list.")

Downloading GloVe embeddings...
--2025-10-04 00:41:00--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-10-04 00:41:00--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2025-10-04 00:43:39 (5.18 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]


Unzipping the file...
Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt     

In [3]:
# Create a dictionary to store the word vectors
embeddings_index = {}

# Path to the GloVe file
glove_file_path = 'glove.6B.100d.txt'

print(f"Loading word vectors from {glove_file_path}...")

# Open the file and load the data
with open(glove_file_path, encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print(f"Loaded {len(embeddings_index)} word vectors.")

Loading word vectors from glove.6B.100d.txt...
Loaded 400000 word vectors.


In [4]:
# 1. Load your dataset
# Make sure this path is correct for your Google Drive setup
file_path = '/content/drive/MyDrive/Depression detection dataset/BSMDD_main.xlsx'
df = pd.read_excel(file_path)

# Drop any rows with missing text to be safe
df.dropna(subset=['text_banglish'], inplace=True)
df.reset_index(drop=True, inplace=True)

# 2. Define the same text preprocessing function
def preprocess_text(text):
    text = str(text).lower()  # Convert to lowercase, ensure it's a string
    text = re.sub(r'[^a-z\s]', '', text) # Remove punctuation, numbers, and special characters
    text = re.sub(r'\s+', ' ', text).strip() # Remove extra whitespace
    return text

# 3. Apply the preprocessing function to create a 'cleaned_text' column
df['cleaned_text'] = df['text_banglish'].apply(preprocess_text)

# 4. Tokenize the cleaned text
df['tokenized_text'] = df['cleaned_text'].apply(lambda x: x.split())

# Display the first few rows to verify the output
print("DataFrame after preprocessing and tokenization:")
print(df[['text_banglish', 'cleaned_text', 'tokenized_text', 'label']].head())

DataFrame after preprocessing and tokenization:
                                       text_banglish  \
0  manasika sharirikabhabe asustha klanta puro ji...   
1  daya sathe thakuna atyanta dirgha apanake pada...   
2  janatama sathe bhula loka kharapa jibana katiy...   
3  anetibha imreji spikarera anusarana biraktikar...   
4  anetibha imreji spikarera anusarana biraktikar...   

                                        cleaned_text  \
0  manasika sharirikabhabe asustha klanta puro ji...   
1  daya sathe thakuna atyanta dirgha apanake pada...   
2  janatama sathe bhula loka kharapa jibana katiy...   
3  anetibha imreji spikarera anusarana biraktikar...   
4  anetibha imreji spikarera anusarana biraktikar...   

                                      tokenized_text  label  
0  [manasika, sharirikabhabe, asustha, klanta, pu...      1  
1  [daya, sathe, thakuna, atyanta, dirgha, apanak...      1  
2  [janatama, sathe, bhula, loka, kharapa, jibana...      1  
3  [anetibha, imreji, spikarer

In [5]:
# --- 1. Use Keras Tokenizer to convert texts to integer sequences ---
texts = df['cleaned_text'].tolist()
labels = df['label'].values

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
vocab_size = len(word_index) + 1 # +1 for the padding token
print(f"Found {len(word_index)} unique tokens in the dataset.")

# --- 2. Pad sequences to a uniform length ---
lengths = [len(s) for s in sequences]
maxlen = int(np.percentile(lengths, 95))
print(f"Padding sequences to a max length of: {maxlen}")

X = pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='post')
y = labels

# --- 3. Create the embedding matrix using GloVe vectors ---
embedding_dim = 100 # This must match the dimensionality of the GloVe vectors we loaded
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words found in embedding index will be put into the matrix.
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        # Words not found in embedding index will be all-zeros.
        misses += 1

print(f"\nEmbedding Matrix Shape: {embedding_matrix.shape}")
print(f"Converted {hits} words ({misses} misses)")
print(f"Coverage: {100 * hits / (hits + misses):.2f}% of the vocabulary is covered by GloVe.")


# --- 4. Split data into training and testing sets ---
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=y)

print(f"\nTraining data shape (X_train): {X_train.shape}")
print(f"Testing data shape (X_test): {X_test.shape}")

Found 56826 unique tokens in the dataset.
Padding sequences to a max length of: 192

Embedding Matrix Shape: (56827, 100)
Converted 3578 words (53248 misses)
Coverage: 6.30% of the vocabulary is covered by GloVe.

Training data shape (X_train): (17528, 192)
Testing data shape (X_test): (4382, 192)


In [6]:
# --- Build the BiGRU Model ---

model = Sequential()

# 1. Embedding Layer
# We load our GloVe-based embedding matrix as the weights.
# We set trainable=False to keep the GloVe vectors frozen.
model.add(Embedding(input_dim=vocab_size,
                    output_dim=embedding_dim, # Should be 100
                    weights=[embedding_matrix],
                    input_length=maxlen,
                    trainable=False))

# 2. Bidirectional GRU Layer
model.add(Bidirectional(GRU(units=64)))

# 3. Dropout for regularization
model.add(Dropout(0.3))

# 4. A standard Dense layer
model.add(Dense(32, activation='relu'))

# 5. Final Output Layer for binary classification
model.add(Dense(1, activation='sigmoid'))


# --- Compile the Model ---
# We use the same settings as the previous model for a fair comparison.
from tensorflow.keras.metrics import Precision, Recall

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy', Precision(name='precision'), Recall(name='recall')])

# --- Print Model Summary ---
model.summary()



In [7]:
import time

# --- Train the Model ---

print("Starting model training with GloVe embeddings...")

# Use EarlyStopping to prevent overfitting and save the best model
# It will monitor the validation loss and stop if it doesn't improve for 3 epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Define number of epochs and batch size
epochs = 20
batch_size = 32

# Start the timer
start_time = time.time()

# Train the model
history = model.fit(X_train, y_train,
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_data=(X_test, y_test),
                    callbacks=[early_stopping])

# Stop the timer
end_time = time.time()

# Calculate and print the training time
training_time = end_time - start_time
print(f"\nTraining finished in {training_time:.2f} seconds.")

Starting model training with GloVe embeddings...
Epoch 1/20
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 235ms/step - accuracy: 0.5955 - loss: 0.6549 - precision: 0.6011 - recall: 0.5470 - val_accuracy: 0.7188 - val_loss: 0.5622 - val_precision: 0.7230 - val_recall: 0.7099
Epoch 2/20
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 233ms/step - accuracy: 0.7196 - loss: 0.5535 - precision: 0.7089 - recall: 0.7371 - val_accuracy: 0.7414 - val_loss: 0.5440 - val_precision: 0.7934 - val_recall: 0.6533
Epoch 3/20
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 238ms/step - accuracy: 0.7561 - loss: 0.5128 - precision: 0.7501 - recall: 0.7658 - val_accuracy: 0.7545 - val_loss: 0.5245 - val_precision: 0.7956 - val_recall: 0.6852
Epoch 4/20
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 233ms/step - accuracy: 0.7706 - loss: 0.4944 - precision: 0.7602 - recall: 0.7897 - val_accuracy: 0.7718 - val_loss: 0.4933 - va

In [8]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import time
import os
import numpy as np

# --- 1. Performance Evaluation on Test Set ---
print("--- Final Performance Evaluation (GloVe Model) ---")

# Get model predictions (probabilities are needed for ROC-AUC)
y_pred_probs = model.predict(X_test)

# Convert probabilities to binary class labels (0 or 1) for other metrics
y_pred = (y_pred_probs > 0.5).astype("int32")

# Calculate and print the metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_probs) # ROC-AUC is calculated on the prediction probabilities

print(f"\nOverall Test Metrics:")
print(f"Accuracy:      {accuracy:.4f}")
print(f"Precision:     {precision:.4f}")
print(f"Recall:        {recall:.4f}")
print(f"F1 Score:      {f1:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}\n")

# Print a detailed classification report
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Not Depressed (0)', 'Depressed (1)']))

# Print the confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\n" + "="*50 + "\n")


# --- 2. Practical Time Complexity Analysis ---
print("--- Time Complexity (GloVe Model) ---")

# We already have the training time
print(f"Total Training Time: {training_time:.2f} seconds (approx. {training_time/60:.1f} minutes)")

# Measure Inference Time
inference_start_time = time.time()
_ = model.predict(X_test)
inference_end_time = time.time()

total_inference_time = inference_end_time - inference_start_time
avg_inference_time_per_sample = total_inference_time / len(X_test)

print(f"Total Inference Time for {len(X_test)} samples: {total_inference_time:.4f} seconds")
print(f"Average Inference Time per Sample: {avg_inference_time_per_sample * 1000:.4f} milliseconds\n")
print("="*50 + "\n")


# --- 3. Practical Space Complexity Analysis ---
print("--- Space Complexity (GloVe Model) ---")

# Number of trainable parameters
total_params = model.count_params()
print(f"Total Model Parameters: {total_params:,}")

# Model size on disk (using a new name to avoid overwriting the Word2Vec model)
model_filename = "depression_detection_bigru_glove.keras"
model.save(model_filename)
model_size_bytes = os.path.getsize(model_filename)
model_size_mb = model_size_bytes / (1024 * 1024)

print(f"Model Size on Disk: {model_size_mb:.2f} MB")

--- Final Performance Evaluation (GloVe Model) ---
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 73ms/step

Overall Test Metrics:
Accuracy:      0.8069
Precision:     0.7774
Recall:        0.8604
F1 Score:      0.8168
ROC-AUC Score: 0.8879

Classification Report:
                   precision    recall  f1-score   support

Not Depressed (0)       0.84      0.75      0.80      2190
    Depressed (1)       0.78      0.86      0.82      2192

         accuracy                           0.81      4382
        macro avg       0.81      0.81      0.81      4382
     weighted avg       0.81      0.81      0.81      4382

Confusion Matrix:
[[1650  540]
 [ 306 1886]]


--- Time Complexity (GloVe Model) ---
Total Training Time: 1740.48 seconds (approx. 29.0 minutes)
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 47ms/step
Total Inference Time for 4382 samples: 6.5849 seconds
Average Inference Time per Sample: 1.5027 milliseconds


--- Space Complexity (GloV

In [9]:
# The model is already trained, so let's print its summary again.
# This will now show the complete, built version of the model.
model.summary()

In [10]:
import pickle

# Save the tokenizer object used for the GloVe model
with open('tokenizer_glove.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

print("Tokenizer for GloVe model saved to 'tokenizer_glove.pickle'")

Tokenizer for GloVe model saved to 'tokenizer_glove.pickle'
