In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


Loading Preprocessed Data

In [2]:
df = pd.read_csv(r"C:\Users\Zoya\Desktop\DISSERTATION\notebooks\multi_label_mental_health_data.csv")
df["text"] = df["text"].fillna("")
analyzer = SentimentIntensityAnalyzer()
df["vader_score"] = df["text"].apply(lambda x: analyzer.polarity_scores(x)["compound"])

Defining Features and Labels

In [3]:
print(df.columns.tolist())

['text', 'depression', 'anxiety', 'suicide', 'casual', 'labels', 'vader_score']


In [4]:
text_data = df["text"]
labels = df[["depression", "anxiety", "suicide", "casual"]]
vader_score = df[["vader_score"]]

In [5]:
X_train_text, X_val_text, y_train, y_val, X_train_vader, X_val_vader = train_test_split(
    text_data, labels, vader_score, test_size=0.2, random_state=42
)

In [6]:
X_train = pd.DataFrame({'text': X_train_text}).join(X_train_vader.reset_index(drop=True))
X_val = pd.DataFrame({'text': X_val_text}).join(X_val_vader.reset_index(drop=True))

Feature Transformer for TF-IDF + Vader

In [7]:
text_vectorizer = TfidfVectorizer(max_features=5000)
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_vectorizer, 'text'),
        ('vader', Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value=0.0)),
            ('scaler', StandardScaler())
        ]), ['vader_score'])
    ]
)


In [8]:
classifier = Pipeline([
    ('features', preprocessor),
    ('clf', MultiOutputClassifier(LogisticRegression(solver='liblinear')))
])
classifier.fit(X_train, y_train)

In [10]:
y_pred_prob = classifier.predict_proba(X_val)
y_pred_prob = np.array([prob[:, 1] for prob in y_pred_prob]).T
y_pred = (y_pred_prob >= 0.5).astype(int)

In [11]:
print("Classification Report:\n", classification_report(y_val, y_pred, target_names=labels.columns))
print("F1 Score (macro):", f1_score(y_val, y_pred, average='macro'))
print("Accuracy:", accuracy_score(y_val, y_pred))

Classification Report:
               precision    recall  f1-score   support

  depression       0.72      0.35      0.47      2204
     anxiety       0.91      0.68      0.78      2191
     suicide       0.73      0.52      0.60      2166
      casual       0.80      0.67      0.73      2239

   micro avg       0.79      0.56      0.65      8800
   macro avg       0.79      0.56      0.65      8800
weighted avg       0.79      0.56      0.65      8800
 samples avg       0.54      0.56      0.55      8800

F1 Score (macro): 0.6455624382984463
Accuracy: 0.5320454545454546


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
