In [4]:
#pip install transformers dataset

import os

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

# Load the dataset
#
# The CSV location defaults to the original local path, but it can be
# overridden without editing this cell by setting the MENTAL_HEALTH_DATA_PATH
# environment variable before the kernel starts. An unset or empty variable
# falls back to the default.
DATA_PATH = (
    os.environ.get('MENTAL_HEALTH_DATA_PATH')
    or 'D:\\Mental-Health-Classification\\data\\mental_heath_unbanlanced.csv'
)
data = pd.read_csv(DATA_PATH)


# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(data.head())

# Names of the columns holding the input text and the class label.
TEXT_COL = 'text'
TARGET_COL = 'status'

# Validate that the dataset has the required columns before proceeding, so a
# wrong or renamed CSV fails here with a clear message instead of in a later cell.
for required_col in (TEXT_COL, TARGET_COL):
    assert required_col in data.columns, f"{required_col} not in columns: {data.columns}"

First few rows of the dataset:
   Unique_ID                                               text   status
0        0.0                                         oh my gosh  Anxiety
1        1.0  trouble sleeping, confused mind, restless hear...  Anxiety
2        2.0  All wrong, back off dear, forward doubt. Stay ...  Anxiety
3        3.0  I've shifted my focus to something else but I'...  Anxiety
4        4.0  I'm restless and restless, it's been a month n...  Anxiety


In [8]:
# Class-balance inspection: separate the input text from the labels and show
# how many rows each label has.
X, y = data[TEXT_COL], data[TARGET_COL]
print("\nClass distribution in the dataset:")
print(y.value_counts())

# Hold out 20% of the rows for testing. Stratifying on the labels keeps the
# class proportions the same in both partitions; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Map the string labels to integer ids. The encoder is fitted on the training
# labels only and then reused unchanged for the test labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# 'balanced' weights are computed from the training labels only; the printed
# summary below shows that rarer classes receive larger weights.
classes, class_counts = np.unique(y_train_encoded, return_counts=True)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train_encoded)

# Lookup table: encoded class id -> weight.
class_weight_dict = {class_id: weight for class_id, weight in zip(classes, class_weights)}

# One summary line per class: name, encoded id, weight and training-set count.
for class_id, weight, count in zip(classes, class_weights, class_counts):
    class_name = label_encoder.classes_[class_id]
    print(f"  {class_name:12s} (class {class_id}): weight={weight:.4f}, count={count:,}")


Class distribution in the dataset:
status
Normal        18391
Depression    14506
Suicidal      11212
Anxiety        5503
Name: count, dtype: int64
  Anxiety      (class 0): weight=2.2540, count=4,402
  Depression   (class 1): weight=0.8550, count=11,605
  Normal       (class 2): weight=0.6744, count=14,713
  Suicidal     (class 3): weight=1.1063, count=8,969
