In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack


In [2]:
# Step 1: Load Data
# Read the pre-split train / validation / test CSV files, in that order.
df_train, df_val, df_test = map(pd.read_csv, (
    '../resource/Mental-Health-Twitter-Tokenized/train.csv',
    '../resource/Mental-Health-Twitter-Tokenized/val.csv',
    '../resource/Mental-Health-Twitter-Tokenized/test.csv',
))

# Prepare the features and target variable
# Text input: the 'processed_tokens' column of each split.
X_train_text, X_val_text, X_test_text = (
    split['processed_tokens'] for split in (df_train, df_val, df_test)
)

# Target: the 'label' column of each split.
y_train, y_val, y_test = (
    split['label'] for split in (df_train, df_val, df_test)
)


In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

# Make sure the NLTK stop-word corpus is available locally
# (the downloader only reports "already up-to-date" when it is).
nltk.download('stopwords')

# Step 1: Load Data
# Re-read the three pre-split CSV files so this cell does not rely on earlier cells.
df_train, df_val, df_test = map(pd.read_csv, (
    '../resource/Mental-Health-Twitter-Tokenized/train.csv',
    '../resource/Mental-Health-Twitter-Tokenized/val.csv',
    '../resource/Mental-Health-Twitter-Tokenized/test.csv',
))

# Combine train and val for RandomizedSearchCV tuning:
# the search cross-validates on the pooled rows, the test split stays held out.
df_full = pd.concat([df_train, df_val], ignore_index=True)

X_full_text, y_full = df_full['processed_tokens'], df_full['label']
X_test_text, y_test = df_test['processed_tokens'], df_test['label']

# Step 2: Improved Text Preprocessing
# Module-level resources read by clean_tokens(): a Porter stemmer and the
# English stop-word list held as a set for fast membership tests.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def clean_tokens(token_str):
    """Convert one stored token list into a cleaned, space-joined string.

    Parameters
    ----------
    token_str : str, list, tuple, None or float
        Normally the text of a Python list literal such as
        "['Feeling', 'sad']" (presumably how `processed_tokens` is stored in
        the CSV -- confirm against the data). An already-parsed list/tuple is
        used as-is. ``None`` or a float NaN (a blank cell) means "no tokens".

    Returns
    -------
    str
        The surviving tokens, lower-cased and stemmed with the module-level
        ``stemmer``, joined by single spaces. A token survives when it is a
        purely alphabetic string whose lower-cased form is not in the
        module-level ``stop_words``.

    Raises
    ------
    ValueError, SyntaxError
        If a string input is not a valid Python literal.
    """
    import ast  # function-scope import keeps the cell self-contained

    # Blank cells arrive as None or as float NaN (NaN is the only value that
    # is not equal to itself); treat them as an empty document.
    if token_str is None or (isinstance(token_str, float) and token_str != token_str):
        return ''

    if isinstance(token_str, str):
        # literal_eval only accepts Python literals, so text coming from the
        # data file can never execute code the way eval() would.
        tokens = ast.literal_eval(token_str)
    else:
        tokens = token_str

    cleaned = [
        stemmer.stem(w.lower())
        for w in tokens
        # Skip non-string items instead of failing on .isalpha().
        if isinstance(w, str) and w.isalpha() and w.lower() not in stop_words
    ]
    return ' '.join(cleaned)

# Run clean_tokens over every document of the tuning pool and of the held-out
# test split, replacing each raw column with its cleaned strings.
X_full_text, X_test_text = (
    X_full_text.apply(clean_tokens),
    X_test_text.apply(clean_tokens),
)

# Step 3: Build a Pipeline
# Vectorizer and classifier are chained so the search below fits them together
# on each training fold.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', RandomForestClassifier(random_state=42))
])

# Step 4: Hyperparameter Tuning
# Keys use the '<step>__<parameter>' form so each value is routed to the
# 'clf' step of the pipeline. The space holds 3 * 3 * 3 * 2 = 54 combinations.
param_dist = {
    'clf__n_estimators': [100, 300, 500],
    'clf__max_depth': [10, 30, None],
    'clf__min_samples_split': [2, 5, 10],
    'clf__max_features': ['sqrt', 'log2']
}

search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=10,             # sample 10 of the 54 combinations
    cv=3,
    scoring='f1_weighted',
    verbose=2,
    n_jobs=-1,
    # Pin which candidates are sampled; without this every run tries a
    # different subset and the reported scores are not reproducible.
    random_state=42
)

# Step 5: Train with Hyperparameter Tuning
# Cross-validates every sampled candidate on the pooled train+val documents.
search.fit(X_full_text, y_full)

# Step 6: Evaluate on Test Set
# Predictions come from the fitted search object; the test split took no part in tuning.
y_test_pred = search.predict(X_test_text)
print(
    "Final Test Set Evaluation:",
    classification_report(y_test, y_test_pred),
    sep="\n",
)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nanphattongsirisukool/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END clf__max_depth=10, clf__max_features=log2, clf__min_samples_split=5, clf__n_estimators=100; total time=   1.2s
[CV] END clf__max_depth=10, clf__max_features=log2, clf__min_samples_split=5, clf__n_estimators=100; total time=   1.1s
[CV] END clf__max_depth=10, clf__max_features=log2, clf__min_samples_split=5, clf__n_estimators=100; total time=   1.2s
[CV] END clf__max_depth=10, clf__max_features=log2, clf__min_samples_split=2, clf__n_estimators=500; total time=   3.4s
[CV] END clf__max_depth=10, clf__max_features=log2, clf__min_samples_split=2, clf__n_estimators=500; total time=   3.5s
[CV] END clf__max_depth=10, clf__max_features=log2, clf__min_samples_split=2, clf__n_estimators=500; total time=   3.6s
[CV] END clf__max_depth=10, clf__max_features=sqrt, clf__min_samples_split=2, clf__n_estimators=300; total time=   2.4s
[CV] END clf__max_depth=10, clf__max_features=sqrt, clf__min_samples_split=2, clf__n_estimators=300

In [23]:
# Bind the model-input names directly to the *_tfidf objects (no extra features added).
# NOTE(review): X_train_tfidf / X_val_tfidf / X_test_tfidf are not assigned in any
# cell shown in this notebook, so this cell depends on earlier kernel state --
# confirm where they are built before relying on Restart & Run All.
X_train_combined, X_val_combined, X_test_combined = (
    X_train_tfidf,
    X_val_tfidf,
    X_test_tfidf,
)

In [27]:
# Step 3: Numeric Features (followers, retweets, etc.)
# Select the same seven columns from each split.
numeric_features_train, numeric_features_val, numeric_features_test = (
    split[['followers', 'friends', 'favourites', 'statuses', 'retweets', 'URLs', 'Mentions']]
    for split in (df_train, df_val, df_test)
)

# Standardize the numeric features
# The scaler is fitted on the training split only; validation and test rows
# are transformed with those training statistics.
scaler = StandardScaler()
X_train_numeric = scaler.fit_transform(numeric_features_train)
X_val_numeric, X_test_numeric = (
    scaler.transform(features)
    for features in (numeric_features_val, numeric_features_test)
)

# Step 4: Combine Text and Numeric Features
# NOTE(review): despite the step title, the scaled numeric arrays above are not
# merged in here -- the "combined" names are bound to the *_tfidf objects alone,
# so the numeric features currently never reach the model. Confirm the intent.
X_train_combined, X_val_combined, X_test_combined = (
    X_train_tfidf,
    X_val_tfidf,
    X_test_tfidf,
)


In [28]:
# Step 5: Train the Model
# Random forest with the seed pinned and every other hyperparameter left at
# its default; fit() returns the estimator itself, so it can be chained.
model = RandomForestClassifier(random_state=42).fit(X_train_combined, y_train)

# Step 6: Validation - Evaluate on the validation set
y_val_pred = model.predict(X_val_combined)
print(
    "Validation Set Evaluation:",
    classification_report(y_val, y_val_pred),
    sep="\n",
)

# Disabled: optional re-training on train + validation before the final test.
# NOTE(review): hstack joins matrices column-wise; appending the validation rows
# to the training rows would need scipy.sparse.vstack -- fix before re-enabling.
# # Step 7: After Validation, Train on the Full Training + Validation Data
# X_full_train = hstack([X_train_combined, X_val_combined])
# y_full_train = pd.concat([y_train, y_val], axis=0)

# # Re-train the model on the full training + validation data
# model.fit(X_full_train, y_full_train)

Validation Set Evaluation:
              precision    recall  f1-score   support

           0       0.73      0.72      0.72      1481
           1       0.72      0.73      0.73      1484

    accuracy                           0.72      2965
   macro avg       0.72      0.72      0.72      2965
weighted avg       0.72      0.72      0.72      2965



In [29]:
# Step 8: Final Testing - Evaluate on the test set
# Score the fitted model on the held-out test features and print the
# per-class precision / recall / F1 table under a heading.
y_test_pred = model.predict(X_test_combined)
print(
    "Test Set Evaluation (Final Model):",
    classification_report(y_test, y_test_pred),
    sep="\n",
)

Test Set Evaluation (Final Model):
              precision    recall  f1-score   support

           0       0.71      0.71      0.71      1482
           1       0.71      0.72      0.71      1483

    accuracy                           0.71      2965
   macro avg       0.71      0.71      0.71      2965
weighted avg       0.71      0.71      0.71      2965

