In [None]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m76.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━

In [None]:
import pandas as pd
import numpy as np
import re
import time
import os
import pickle

# For Word2Vec model
from gensim.models import Word2Vec

# For Keras model building
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import Precision, Recall

# For data splitting and evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

# Mount Google Drive so the workbook stored there can be read from this runtime
from google.colab import drive
drive.mount('/content/drive')

# Read the dataset workbook into a DataFrame
file_path = '/content/drive/MyDrive/Depression detection dataset/BSMDD_main.xlsx'
df = pd.read_excel(file_path)

# Discard rows whose 'text_banglish' value is missing, then renumber the index from 0
df = df.dropna(subset=['text_banglish']).reset_index(drop=True)

print("Libraries imported, Drive mounted, and dataset loaded successfully.")
df.info()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Libraries imported, Drive mounted, and dataset loaded successfully.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21910 entries, 0 to 21909
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   text_bengali   21910 non-null  object
 1   text_banglish  21910 non-null  object
 2   label          21910 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 513.6+ KB


In [None]:
# --- 1. Define the text preprocessing function ---
def preprocess_text(text):
    """Normalise one raw value into lowercase, letters-only, single-spaced text.

    The value is coerced with ``str()`` and lowercased; every character that is
    not ``a``-``z`` or whitespace is deleted (not replaced by a space, so
    ``"don't"`` becomes ``"dont"``); finally each whitespace run is collapsed to
    one space and the ends are trimmed.

    Args:
        text: The raw value to clean; anything ``str()`` accepts.

    Returns:
        str: The cleaned text, which may be the empty string.
    """
    lowered = str(text).lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)  # digits, punctuation, non-ASCII are dropped
    return re.sub(r'\s+', ' ', letters_only).strip()

# --- 2. Clean every 'text_banglish' entry into a new 'cleaned_text' column ---
df['cleaned_text'] = df['text_banglish'].map(preprocess_text)

# --- 3. Split each cleaned string on whitespace into a list of word tokens ---
df['tokenized_text'] = df['cleaned_text'].map(str.split)

# --- 4. Safeguard: discard rows whose text is empty once cleaned ---
original_rows = len(df)
df = df.loc[df['cleaned_text'].ne('')].reset_index(drop=True)
if len(df) < original_rows:
    print(f"Removed {original_rows - len(df)} rows that were empty after preprocessing.")

# --- 5. Preview the first rows to verify the new columns ---
print("DataFrame after preprocessing and tokenization:")
print(df[['text_banglish', 'cleaned_text', 'tokenized_text', 'label']].head())

Removed 2 rows that were empty after preprocessing.
DataFrame after preprocessing and tokenization:
                                       text_banglish  \
0  manasika sharirikabhabe asustha klanta puro ji...   
1  daya sathe thakuna atyanta dirgha apanake pada...   
2  janatama sathe bhula loka kharapa jibana katiy...   
3  anetibha imreji spikarera anusarana biraktikar...   
4  anetibha imreji spikarera anusarana biraktikar...   

                                        cleaned_text  \
0  manasika sharirikabhabe asustha klanta puro ji...   
1  daya sathe thakuna atyanta dirgha apanake pada...   
2  janatama sathe bhula loka kharapa jibana katiy...   
3  anetibha imreji spikarera anusarana biraktikar...   
4  anetibha imreji spikarera anusarana biraktikar...   

                                      tokenized_text  label  
0  [manasika, sharirikabhabe, asustha, klanta, pu...      1  
1  [daya, sathe, thakuna, atyanta, dirgha, apanak...      1  
2  [janatama, sathe, bhula, loka, kharap

In [None]:
# Tokenized sentences (one list of words per row) used as the Word2Vec corpus.
# NOTE(review): this is built from the whole DataFrame, so it also contains the rows
# that the later train/test split assigns to the test set -- confirm this is intended.
sentences = df['tokenized_text'].tolist()

# --- Word2Vec Model Training ---
embedding_dim = 100  # Dimensionality of the word vectors
window_size = 5      # Context window size
min_word_count = 1   # Minimum word count to be included (1 keeps every word)
w2v_seed = 42        # Fixed RNG seed so the embeddings do not change arbitrarily between runs

# sg=1 selects the Skip-Gram training algorithm.
# The seed fixes the random initialisation; gensim documents that a fully deterministic
# run additionally needs workers=1 (and a fixed PYTHONHASHSEED). workers=4 is kept for speed.
print("Training Word2Vec model... (This might take a minute or two)")
w2v_model = Word2Vec(sentences=sentences,
                     vector_size=embedding_dim,
                     window=window_size,
                     min_count=min_word_count,
                     workers=4, # Use 4 worker threads for training
                     sg=1,
                     seed=w2v_seed)

print("Word2Vec model trained successfully.")

# --- Sanity Check ---
# Number of distinct words the model learned a vector for
vocab_size = len(w2v_model.wv.index_to_key)
print(f"\nVocabulary size: {vocab_size}")

# List the nearest neighbours of a sample word as a quick quality check.
# The word is defined before the lookup and membership is tested explicitly, so the
# fallback message never depends on a name that was assigned inside a try block.
sample_word = 'ami'
if sample_word in w2v_model.wv:
    similar_words = w2v_model.wv.most_similar(sample_word)
    print(f"\nWords most similar to '{sample_word}':")
    print(similar_words)
else:
    print(f"\nCould not perform similarity check. The word '{sample_word}' was not in the vocabulary.")

Training Word2Vec model... (This might take a minute or two)
Word2Vec model trained successfully.

Vocabulary size: 56826

Words most similar to 'ami':
[('intaraayakashanera', 0.9455628991127014), ('kishorakishorike', 0.9404802918434143), ('penapala', 0.9387423992156982), ('asamrakshita', 0.9377596974372864), ('brendana', 0.9375109076499939), ('skulakaleja', 0.9345955848693848), ('anugamidera', 0.9344710111618042), ('partiibhente', 0.933462381362915), ('ranadauna', 0.932837963104248), ('narbha', 0.9325851202011108)]


In [None]:
# --- 1. Map every cleaned text to a sequence of integer word ids ---
# The Keras Tokenizer takes one string per sample, hence 'cleaned_text' rather than the token lists.
texts = df['cleaned_text'].tolist()
labels = df['label'].to_numpy()

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)

# One extra row on top of the distinct words, because id 0 is the padding value.
keras_vocab_size = 1 + len(word_index)
print(f"Vocabulary size for Keras Tokenizer: {keras_vocab_size}")


# --- 2. Bring all sequences to one common length ---
# The 95th-percentile length is used so a handful of very long texts do not force heavy padding.
lengths = list(map(len, sequences))
maxlen = int(np.percentile(lengths, 95))
print(f"Padding sequences to a max length of: {maxlen}")

# Shorter sequences are zero-padded at the end; longer ones are cut at the end.
X = pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='post')
y = labels


# --- 3. Build the embedding matrix from the Word2Vec vectors ---
# Row i holds the vector of the word whose tokenizer id is i; it seeds the Keras Embedding layer.
embedding_matrix = np.zeros((keras_vocab_size, embedding_dim))

for word, i in word_index.items():
    if word not in w2v_model.wv:
        continue  # no Word2Vec vector: the row stays all-zero
    embedding_matrix[i] = w2v_model.wv[word]

print(f"Shape of the embedding matrix: {embedding_matrix.shape}")


# --- 4. Hold out 20% of the samples as the test set ---
# stratify=y keeps the class proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"\nTraining data shape (X_train): {X_train.shape}")
print(f"Testing data shape (X_test): {X_test.shape}")

Vocabulary size for Keras Tokenizer: 56827
Padding sequences to a max length of: 192
Shape of the embedding matrix: (56827, 100)

Training data shape (X_train): (17526, 192)
Testing data shape (X_test): (4382, 192)


In [None]:
# --- 1. Build the GRU Model ---

model = Sequential()

# Declare the input shape explicitly so the model is built as layers are added and
# model.summary() below can report real output shapes and parameter counts.
model.add(tf.keras.Input(shape=(maxlen,)))

# Embedding layer initialised with the Word2Vec matrix and frozen (trainable=False).
# mask_zero=True marks id 0 -- the value pad_sequences appends after each text -- as
# padding, so the GRU skips those timesteps and its final state reflects the last real
# token instead of a long run of padding.
# The deprecated `input_length` argument is dropped; the Input layer carries the length.
model.add(Embedding(input_dim=keras_vocab_size,
                    output_dim=embedding_dim,
                    weights=[embedding_matrix],
                    mask_zero=True,
                    trainable=False))

# Single unidirectional GRU; returns only its final hidden state (64 values per sample)
model.add(GRU(units=64))

# Dropout for regularization to help prevent overfitting
model.add(Dropout(0.3))

# A standard fully-connected Dense layer
model.add(Dense(32, activation='relu'))

# Final output layer for binary classification (sigmoid yields a probability in [0, 1])
model.add(Dense(1, activation='sigmoid'))


# --- 2. Compile the Model ---
# Adam optimizer, binary cross-entropy loss; accuracy, precision and recall are tracked per epoch.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy', Precision(name='precision'), Recall(name='recall')])


# --- 3. Print Model Summary ---
# Shows each layer with its output shape and parameter count.
model.summary()



In [None]:
# --- 1. Set up Training Parameters ---
print("Starting model training...")

# Early stopping: halt when the validation loss has not improved for 3 consecutive
# epochs and roll the model back to the weights of the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Define number of epochs and batch size
epochs = 20
batch_size = 32


# --- 2. Train the Model and Measure Time ---
# Start the timer
start_time = time.time()

# Train on the training data. Validation uses a 10% slice held out from the training
# set, NOT the test set: early stopping picks the best epoch from the validation loss,
# so validating on X_test would tune the model on the very data used for the final
# report and inflate the test metrics. Keras takes the last 10% of the arrays before
# shuffling; X_train is already shuffled by train_test_split.
history = model.fit(X_train, y_train,
                    epochs=epochs,
                    batch_size=batch_size,
                    validation_split=0.1,
                    callbacks=[early_stopping])

# Stop the timer
end_time = time.time()

# Calculate and print the training time
training_time = end_time - start_time
print(f"\nTraining finished in {training_time:.2f} seconds (approx {training_time/60:.1f} minutes).")

Starting model training...
Epoch 1/20
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 140ms/step - accuracy: 0.5201 - loss: 0.6914 - precision: 0.5706 - recall: 0.1436 - val_accuracy: 0.5087 - val_loss: 0.6852 - val_precision: 0.5045 - val_recall: 0.9886
Epoch 2/20
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 136ms/step - accuracy: 0.5333 - loss: 0.6809 - precision: 0.5365 - recall: 0.5067 - val_accuracy: 0.5043 - val_loss: 0.6894 - val_precision: 0.5023 - val_recall: 0.9786
Epoch 3/20
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 141ms/step - accuracy: 0.5384 - loss: 0.6766 - precision: 0.5497 - recall: 0.4226 - val_accuracy: 0.8517 - val_loss: 0.3670 - val_precision: 0.7945 - val_recall: 0.9489
Epoch 4/20
[1m548/548[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 135ms/step - accuracy: 0.8771 - loss: 0.3136 - precision: 0.8627 - recall: 0.8948 - val_accuracy: 0.8875 - val_loss: 0.2818 - val_precision: 0.8610 - val_

In [None]:
from sklearn.metrics import roc_auc_score

# --- 1. Performance Evaluation on Test Set ---
print("--- Final Performance Evaluation ---")

# Predicted probabilities for the test samples, thresholded at 0.5 into 0/1 labels
y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype("int32")

# Headline metrics; AUC is computed from the probabilities, the others from the labels
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, y_pred_probs)

print(f"\nOverall Test Metrics:")
print(f"Accuracy:  {accuracy:.4f}")
print(f"F1 Score:  {f1:.4f}")
print(f"AUC-ROC:   {auc_roc:.4f}\n")

# Per-class precision / recall / F1
print("Classification Report:")
class_names = ['Not Depressed (0)', 'Depressed (1)']
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion matrix, laid out as [[TN, FP], [FN, TP]]
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\n" + "="*50 + "\n")


# --- 2. Practical Time Complexity Analysis ---
print("--- Time Complexity ---")

# Training time was measured in the training cell
print(f"Total Training Time: {training_time:.2f} seconds (approx. {training_time/60:.1f} minutes)")

# Time a second full pass over the test set (the predictions themselves are discarded)
inference_start_time = time.time()
_ = model.predict(X_test)
inference_end_time = time.time()

total_inference_time = inference_end_time - inference_start_time
avg_inference_time_per_sample = total_inference_time / len(X_test)

print(f"Total Inference Time for {len(X_test)} samples: {total_inference_time:.4f} seconds")
print(f"Average Inference Time per Sample: {avg_inference_time_per_sample * 1000:.4f} milliseconds\n")
print("="*50 + "\n")


# --- 3. Practical Space Complexity Analysis ---
print("--- Space Complexity ---")

# Parameter count as reported by the model
total_params = model.count_params()
print(f"Total Model Parameters: {total_params:,}")

# Write the model to disk and read back the file size, converted from bytes to MiB
model_filename = "depression_detection_gru.keras"
model.save(model_filename)
model_size_bytes = os.path.getsize(model_filename)
model_size_mb = model_size_bytes / 2**20

print(f"Model Size on Disk: {model_size_mb:.2f} MB")

--- Final Performance Evaluation ---
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 54ms/step

Overall Test Metrics:
Accuracy:  0.9096
F1 Score:  0.9100
AUC-ROC:   0.9670

Classification Report:
                   precision    recall  f1-score   support

Not Depressed (0)       0.91      0.91      0.91      2190
    Depressed (1)       0.91      0.91      0.91      2192

         accuracy                           0.91      4382
        macro avg       0.91      0.91      0.91      4382
     weighted avg       0.91      0.91      0.91      4382

Confusion Matrix:
[[1983  207]
 [ 189 2003]]


--- Time Complexity ---
Total Training Time: 778.66 seconds (approx. 13.0 minutes)
[1m137/137[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 26ms/step
Total Inference Time for 4382 samples: 3.6408 seconds
Average Inference Time per Sample: 0.8309 milliseconds


--- Space Complexity ---
Total Model Parameters: 5,716,685
Model Size on Disk: 22.10 MB


In [None]:
# Print the layer-by-layer summary (output shapes and parameter counts) once more, after training.
model.summary()