In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


# Load the training and validation splits from the preprocessed dataset folder.
# The paths are relative, so they resolve against the notebook's working directory.
# Later cells read the 'post_text' and 'label' columns from both frames.
df_train = pd.read_csv('../resource/Mental-Health-Twitter-Preprocessed/train.csv')
df_val = pd.read_csv('../resource/Mental-Health-Twitter-Preprocessed/val.csv')

# BERT

In [1]:
from transformers import BertTokenizer, BertModel
from sklearn.linear_model import LogisticRegression
import torch
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def extract_bert_features(df, tokenizer=None, model=None, max_length=128):
    """
    Extracts BERT first-token ([CLS]) embeddings and VADER sentiment scores.

    Numeric metadata columns are currently NOT part of the returned features.

    Parameters:
        df (pd.DataFrame): Input DataFrame; must contain a 'post_text' column.
            Missing values (NaN/None) are treated as empty strings and any
            other non-string value is converted with str().
        tokenizer (BertTokenizer, optional): Pre-loaded BERT tokenizer. Loaded
            from 'bert-base-uncased' only when not supplied.
        model (BertModel, optional): Pre-loaded BERT model. Loaded from
            'bert-base-uncased' (and moved to CUDA when available) only when
            not supplied; a supplied model is used on the device it is already on.
        max_length (int): Maximum sequence length for BERT; longer inputs are truncated.

    Returns:
        combined_df (pd.DataFrame): One row per row of df (same index) with
            columns bert_0 .. bert_{H-1} (H = embedding width of the model)
            followed by vader_neg, vader_neu, vader_pos, vader_compound.
        tokenizer (BertTokenizer): Tokenizer for reuse
        model (BertModel): Model for reuse
    """
    # 1. Load only the component(s) the caller did not provide, so that a
    #    supplied tokenizer or model is never silently replaced.
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    if model is None:
        model = BertModel.from_pretrained('bert-base-uncased')
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model = model.to(device)

    # Normalise the text column once: both the tokenizer and VADER need real strings.
    texts = ['' if pd.isna(text) else str(text) for text in df['post_text']]

    # 2. Get BERT embeddings (one forward pass per text)
    def get_bert_embeddings(text):
        inputs = tokenizer(text, return_tensors='pt',
                           truncation=True, padding=True,
                           max_length=max_length)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        with torch.no_grad():  # inference only, no gradients needed
            outputs = model(**inputs)
        # Hidden state of the first token, flattened to a 1-D vector
        return outputs.last_hidden_state[:, 0, :].cpu().numpy().flatten()

    embeddings = [get_bert_embeddings(text) for text in texts]
    # Derive the width from the data instead of assuming 768; fall back to the
    # model config when there are no rows to inspect.
    hidden_size = len(embeddings[0]) if embeddings else model.config.hidden_size
    bert_df = pd.DataFrame(embeddings,
                           columns=[f'bert_{i}' for i in range(hidden_size)],
                           index=df.index)

    # 3. VADER sentiment: pick scores by key so column labels cannot drift
    #    if the analyzer ever returns its dict in a different order.
    analyzer = SentimentIntensityAnalyzer()
    vader_keys = ('neg', 'neu', 'pos', 'compound')
    vader_rows = []
    for text in texts:
        scores = analyzer.polarity_scores(text)
        vader_rows.append([scores[key] for key in vader_keys])
    sentiment_scores = pd.DataFrame(vader_rows,
                                    columns=[f'vader_{key}' for key in vader_keys],
                                    index=df.index)

    # 4. Combine features (both frames share df.index, so rows align one-to-one)
    combined_df = pd.concat([bert_df, sentiment_scores], axis=1)

    return combined_df, tokenizer, model

def train_logistic_regression(train_features, train_labels):
    """
    Fit a logistic regression classifier on a precomputed feature matrix.

    Parameters:
        train_features (pd.DataFrame): Feature matrix, one row per sample
        train_labels (pd.Series): Target label for each row of train_features

    Returns:
        model (LogisticRegression): The fitted classifier (max_iter=1000)
    """
    # fit() returns the estimator itself, so construction and training chain.
    return LogisticRegression(max_iter=1000).fit(train_features, train_labels)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# Separate labels
y_train = df_train['label']
y_val = df_val['label']

# Extract BERT + VADER features (slow: one BERT forward pass per post).
# The tokenizer and model loaded for the training set are reused for validation
# so the weights are only loaded once.
X_train, tokenizer, bert_model = extract_bert_features(df_train)
X_val, _, _ = extract_bert_features(df_val, tokenizer=tokenizer, model=bert_model)

# Sanity check: every label must have exactly one feature row.
assert len(X_train) == len(y_train), "train features/labels are misaligned"
assert len(X_val) == len(y_val), "validation features/labels are misaligned"

# Scale features; the scaler is fitted on the training split only and then
# applied unchanged to the validation split to avoid leaking validation statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Train via the shared helper (same LogisticRegression(max_iter=1000) setup)
# and evaluate on the held-out validation split.
model = train_logistic_regression(X_train_scaled, y_train)
y_pred = model.predict(X_val_scaled)

print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.70      0.71      1481
           1       0.71      0.73      0.72      1484

    accuracy                           0.72      2965
   macro avg       0.72      0.72      0.72      2965
weighted avg       0.72      0.72      0.72      2965

