In [None]:
import pandas as pd
import numpy as np
import re
import os
import time
import pickle

# For Keras model building
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import Precision, Recall

# For data splitting and evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, roc_auc_score

# Mount Google Drive so the dataset stored there is reachable from Colab
from google.colab import drive
drive.mount('/content/drive')

# Location of the Excel workbook on the mounted Drive
file_path = '/content/drive/MyDrive/Depression detection dataset/BSMDD_main.xlsx'

# Read the workbook, discard rows that have no 'text_banglish' value,
# and renumber the index from 0 so it is contiguous again.
df = (
    pd.read_excel(file_path)
    .dropna(subset=['text_banglish'])
    .reset_index(drop=True)
)

print("Libraries imported, Drive mounted, and dataset loaded successfully.")
df.info()

Mounted at /content/drive
Libraries imported, Drive mounted, and dataset loaded successfully.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21910 entries, 0 to 21909
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   text_bengali   21910 non-null  object
 1   text_banglish  21910 non-null  object
 2   label          21910 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 513.6+ KB


In [None]:
# --- 1. Define the text preprocessing function ---
def preprocess_text(text):
    """Normalise one raw text value into lowercase, letters-only form.

    The input is coerced with ``str()`` (so non-string values are accepted),
    lowercased, stripped of every character that is neither ``a``-``z`` nor
    whitespace, and finally has each whitespace run collapsed to a single
    space with leading/trailing whitespace removed.

    Returns:
        The cleaned string; it is empty when nothing but removed characters
        and whitespace was present.
    """
    lowered = str(text).lower()
    # Keep only ASCII letters and whitespace (drops digits, punctuation, etc.)
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    # Squash whitespace runs, then trim both ends
    collapsed = re.sub(r'\s+', ' ', letters_only)
    return collapsed.strip()

# --- 2. Apply the function to create a 'cleaned_text' column ---
# Each 'text_banglish' value is cleaned independently.
df['cleaned_text'] = df['text_banglish'].map(preprocess_text)

# --- 3. Safeguard: Remove any rows that became empty after cleaning ---
# Cleaning can leave an empty string (e.g. a text made only of digits or
# punctuation); such rows are filtered out here.
original_rows = len(df)
has_content = df['cleaned_text'].ne('')
df = df.loc[has_content].reset_index(drop=True)
if len(df) < original_rows:
    print(f"Removed {original_rows - len(df)} rows that were empty after preprocessing.")

# --- 4. Display the result to verify ---
print("DataFrame after preprocessing:")
preview_columns = ['text_banglish', 'cleaned_text', 'label']
print(df[preview_columns].head())

Removed 2 rows that were empty after preprocessing.
DataFrame after preprocessing:
                                       text_banglish  \
0  manasika sharirikabhabe asustha klanta puro ji...   
1  daya sathe thakuna atyanta dirgha apanake pada...   
2  janatama sathe bhula loka kharapa jibana katiy...   
3  anetibha imreji spikarera anusarana biraktikar...   
4  anetibha imreji spikarera anusarana biraktikar...   

                                        cleaned_text  label  
0  manasika sharirikabhabe asustha klanta puro ji...      1  
1  daya sathe thakuna atyanta dirgha apanake pada...      1  
2  janatama sathe bhula loka kharapa jibana katiy...      1  
3  anetibha imreji spikarera anusarana biraktikar...      1  
4  anetibha imreji spikarera anusarana biraktikar...      1  


In [None]:
# --- 1. Download the GloVe Embeddings ---
# We'll use the 100-dimensional version trained on 6 billion tokens.
print("Downloading GloVe embeddings... (This is an 822MB file and may take several minutes)")
!wget --quiet http://nlp.stanford.edu/data/glove.6B.zip

print("\nUnzipping the file...")
!unzip -q glove.6B.zip # The -q flag makes the output less verbose


# --- 2. Parse the GloVe File and Load into a Dictionary ---
# We create a dictionary that maps words (strings) to their embedding vectors (numpy arrays).
glove_file = 'glove.6B.100d.txt'
embedding_dim = 100 # This must match the GloVe file we are using (e.g., 100d)
glove_embeddings = {}

print(f"\nLoading word vectors from '{glove_file}' into memory...")
with open(glove_file, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove_embeddings[word] = vector

print(f"Successfully loaded {len(glove_embeddings):,} word vectors.")

Downloading GloVe embeddings... (This is an 822MB file and may take several minutes)

Unzipping the file...

Loading word vectors from 'glove.6B.100d.txt' into memory...
Successfully loaded 400,000 word vectors.


In [None]:
# --- 1. Use Keras Tokenizer to convert texts to integer sequences ---
# NOTE(review): the tokenizer is fit on the full corpus, i.e. before the
# train/test split performed further down in this cell, so the vocabulary
# also contains words that occur only in the test portion. Confirm that this
# is intended.
labels = df['label'].to_numpy()
texts = df['cleaned_text'].tolist()

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Mapping word -> integer index learned from `texts`
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)

# One extra row on top of the learned indices; this value sizes the embedding
# matrix and the Embedding layer's input_dim.
keras_vocab_size = 1 + len(word_index)
print(f"Vocabulary size for Keras Tokenizer (our dataset): {keras_vocab_size}")


# --- 2. Pad sequences to a uniform length ---
# Use the 95th percentile of the token counts as the common length, so the
# longest ~5% of texts are truncated.
lengths = list(map(len, sequences))
maxlen = int(np.percentile(lengths, 95))
print(f"Padding sequences to a max length of: {maxlen}")

# Shorter sequences are padded at the end, longer ones are cut at the end.
y = labels
X = pad_sequences(
    sequences,
    maxlen=maxlen,
    padding='post',
    truncating='post',
)


# --- 3. Create the embedding matrix from the GloVe model ---
# Row i of this matrix is the initial embedding for the word whose tokenizer
# index is i; the matrix is passed as the initial weights of the Keras
# Embedding layer.
embedding_matrix = np.zeros((keras_vocab_size, embedding_dim)) # embedding_dim is 100
hits = 0

for token, row in word_index.items():
    glove_vector = glove_embeddings.get(token)
    if glove_vector is None:
        # No GloVe entry for this word: its row stays all-zeros.
        continue
    embedding_matrix[row] = glove_vector
    hits += 1

# Every vocabulary word that was not found counts as a miss.
misses = len(word_index) - hits

print(f"\nShape of the embedding matrix: {embedding_matrix.shape}")
print(f"Converted {hits} words from our vocabulary ({misses} misses).")


# --- 4. Split data into training and testing sets ---
# 80/20 split with a fixed seed; stratifying on y keeps the label proportions
# the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"\nTraining data shape (X_train): {X_train.shape}")
print(f"Testing data shape (X_test): {X_test.shape}")

Vocabulary size for Keras Tokenizer (our dataset): 56827
Padding sequences to a max length of: 192

Shape of the embedding matrix: (56827, 100)
Converted 3578 words from our vocabulary (53248 misses).

Training data shape (X_train): (17526, 192)
Testing data shape (X_test): (4382, 192)


In [None]:
# --- 1. Build the GRU Model ---

# Seed the Python, NumPy and TensorFlow RNGs so that weight initialisation,
# dropout masks and batch shuffling are repeatable across "Restart & Run All".
tf.keras.utils.set_random_seed(42)

model = Sequential()

# Explicit input spec: one padded integer sequence of length `maxlen` per
# sample. Declaring it up front builds the model immediately, so the summary
# below can report real output shapes and parameter counts. It replaces the
# Embedding `input_length` argument, which recent Keras versions deprecate.
model.add(tf.keras.Input(shape=(maxlen,), dtype='int32'))

# Embedding Layer (initialised with our GloVe-based weights)
# trainable=True: the embeddings are fine-tuned during training. Words that
# have no GloVe vector start from an all-zero row, so they can only become
# useful if the layer is allowed to update.
# NOTE(review): sequences are post-padded with 0 and no mask is used, so the
# GRU also steps over the padding; consider mask_zero=True - confirm the
# effect on training before changing it.
model.add(Embedding(input_dim=keras_vocab_size,
                    output_dim=embedding_dim,
                    weights=[embedding_matrix],
                    trainable=True))

# GRU Layer
model.add(GRU(units=64))

# Dropout for regularization to help prevent overfitting
model.add(Dropout(0.3))

# A standard fully-connected Dense layer
model.add(Dense(32, activation='relu'))

# Final Output Layer for binary classification (using sigmoid for probabilities)
model.add(Dense(1, activation='sigmoid'))


# --- 2. Compile the Model ---
# We configure the model for training with an optimizer, loss function, and metrics.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy', Precision(name='precision'), Recall(name='recall')])


# --- 3. Print Model Summary ---
# This summary is useful for understanding the model's structure and parameter count.
model.summary()



In [None]:
# --- 1. Set up Training Parameters ---
print("Starting model training...")

# Add a callback for Early Stopping to prevent overfitting.
# It will stop training if the validation loss doesn't improve for 3 consecutive epochs
# and will restore the weights from the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Define number of epochs and batch size
epochs = 20
batch_size = 32

# Fraction of the TRAINING data held out for validation. The test set must not
# be used here: early stopping picks the best epoch from the validation loss,
# so validating on X_test would tune the model on the same data that is later
# used to report the final metrics.
validation_split = 0.1


# --- 2. Train the Model and Measure Time ---
# Start the timer (perf_counter is monotonic, so the duration cannot be
# distorted by system clock adjustments)
start_time = time.perf_counter()

# Train the model on the training data; Keras holds out the last
# `validation_split` fraction of X_train/y_train for validation
# (train_test_split has already shuffled these arrays).
history = model.fit(X_train, y_train,
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_split=validation_split,
                    callbacks=[early_stopping])

# Stop the timer
end_time = time.perf_counter()

# Calculate and print the training time
training_time = end_time - start_time
print(f"\nTraining finished in {training_time:.2f} seconds (approx {training_time/60:.1f} minutes).")

Starting model training...
Epoch 1/20
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m132s[0m 232ms/step - accuracy: 0.4966 - loss: 0.6929 - precision: 0.4957 - recall: 0.5239 - val_accuracy: 0.4993 - val_loss: 0.6914 - val_precision: 0.4998 - val_recall: 0.9922
Epoch 2/20
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 237ms/step - accuracy: 0.5157 - loss: 0.6850 - precision: 0.5415 - recall: 0.1990 - val_accuracy: 0.5016 - val_loss: 0.6898 - val_precision: 0.5009 - val_recall: 0.9922
Epoch 3/20
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 228ms/step - accuracy: 0.5307 - loss: 0.6639 - precision: 0.5509 - recall: 0.3543 - val_accuracy: 0.8432 - val_loss: 0.3858 - val_precision: 0.8388 - val_recall: 0.8499
Epoch 4/20
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 221ms/step - accuracy: 0.8872 - loss: 0.2976 - precision: 0.8749 - recall: 0.9038 - val_accuracy: 0.8733 - val_loss: 0.3058 - val_precision: 0.8791 - 

In [None]:
# --- 1. Performance Evaluation on Test Set ---
print("--- Final Performance Evaluation (Fine-Tuned GRU with GloVe) ---")

# Predicted probabilities for the test set (output of the sigmoid layer)
y_pred_probs = model.predict(X_test)

# Hard 0/1 labels: probabilities strictly above 0.5 become class 1
y_pred = (y_pred_probs > 0.5).astype("int32")

# AUC-ROC is computed from the probabilities; accuracy and F1 from the labels
auc_roc = roc_auc_score(y_test, y_pred_probs)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"\nOverall Test Metrics:")
print(f"Accuracy:  {accuracy:.4f}")
print(f"F1 Score:  {f1:.4f}")
print(f"AUC-ROC:   {auc_roc:.4f}\n")

# Per-class precision / recall / F1 table
class_names = ['Not Depressed (0)', 'Depressed (1)']
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion matrix as computed by sklearn for (y_test, y_pred)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\n" + "="*50 + "\n")


# --- 2. Practical Time Complexity Analysis ---
print("--- Time Complexity ---")

# Training time was measured in the training cell
print(f"Total Training Time: {training_time:.2f} seconds (approx. {training_time/60:.1f} minutes)")

# Wall-clock time of one prediction pass over the whole test set;
# the predictions themselves are discarded.
inference_start_time = time.time()
_ = model.predict(X_test)
inference_end_time = time.time()

total_inference_time = inference_end_time - inference_start_time
# Mean cost per sample, in seconds (converted to milliseconds when printed)
avg_inference_time_per_sample = total_inference_time / len(X_test)

print(f"Total Inference Time for {len(X_test)} samples: {total_inference_time:.4f} seconds")
print(f"Average Inference Time per Sample: {avg_inference_time_per_sample * 1000:.4f} milliseconds\n")
print("="*50 + "\n")


# --- 3. Practical Space Complexity Analysis ---
print("--- Space Complexity ---")

# Parameter count as reported by the model
total_params = model.count_params()
print(f"Total Model Parameters: {total_params:,}")

# Write the model to disk and read back the size of the resulting file
model_filename = "depression_detection_gru_glove_finetuned.keras"
model.save(model_filename)

model_size_bytes = os.stat(model_filename).st_size
bytes_per_mb = 1024 * 1024
model_size_mb = model_size_bytes / bytes_per_mb

print(f"Model Size on Disk: {model_size_mb:.2f} MB")

--- Final Performance Evaluation (Fine-Tuned GRU with GloVe) ---
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 37ms/step

Overall Test Metrics:
Accuracy:  0.8733
F1 Score:  0.8724
AUC-ROC:   0.9446

Classification Report:
                   precision    recall  f1-score   support

Not Depressed (0)       0.87      0.88      0.87      2190
    Depressed (1)       0.88      0.87      0.87      2192

         accuracy                           0.87      4382
        macro avg       0.87      0.87      0.87      4382
     weighted avg       0.87      0.87      0.87      4382

Confusion Matrix:
[[1929  261]
 [ 294 1898]]


--- Time Complexity ---
Total Training Time: 902.00 seconds (approx. 15.0 minutes)
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 28ms/step
Total Inference Time for 4382 samples: 5.1718 seconds
Average Inference Time per Sample: 1.1802 milliseconds


--- Space Complexity ---
Total Model Parameters: 5,716,685
Model Size on Disk: 65.46

In [None]:
# Print the layer-by-layer summary of the model again (after training and saving).
model.summary()

In [None]:
import os
import pickle
import shutil

# --- 1. Save the Tokenizer to a file in the Colab environment ---
# This tokenizer is essential for preprocessing new data with this fine-tuned model.
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# this file from a location you trust.
tokenizer_filename = 'tokenizer_glove_finetuned.pickle'
with open(tokenizer_filename, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
print(f"Keras Tokenizer saved to '{tokenizer_filename}'")


# --- 2. Copy the model artifacts to your Google Drive ---
# Define the destination folder in your Google Drive
destination_folder = '/content/drive/MyDrive/Depression detection dataset/saved_models_finetuned_gru_glove/'

# Create the folder if it doesn't already exist
os.makedirs(destination_folder, exist_ok=True)

# List of all the files we want to save from this experiment
files_to_copy = [
    'depression_detection_gru_glove_finetuned.keras', # Saved in the evaluation step
    tokenizer_filename,                               # Just saved now
]

# Copy each file with shutil instead of a shell `cp`: a failed copy raises an
# exception here, whereas the shell command's exit status was never checked
# and the success message was printed regardless.
missing_files = []
for filename in files_to_copy:
    source_path = f'./{filename}'
    destination_path = os.path.join(destination_folder, filename)
    if os.path.exists(source_path):
        shutil.copyfile(source_path, destination_path)
        print(f"Successfully copied '{filename}' to your Google Drive.")
    else:
        missing_files.append(filename)
        print(f"Warning: '{filename}' not found. Please ensure the previous cells were run.")

# Only claim success when every artifact was actually copied.
if missing_files:
    print(f"\nNot all files were copied to your Google Drive; missing: {missing_files}")
else:
    print("\nYour fine-tuned model files are now safely stored in your Google Drive!")

Keras Tokenizer saved to 'tokenizer_glove_finetuned.pickle'
Successfully copied 'depression_detection_gru_glove_finetuned.keras' to your Google Drive.
Successfully copied 'tokenizer_glove_finetuned.pickle' to your Google Drive.

Your fine-tuned model files are now safely stored in your Google Drive!
