### False Positive Test Set Evaluation
[Go to BERT](#1)      
[Go to RoBERTa](#2)       
[Go to Mental-RoBERTa](#3)      

In [2]:
import pandas as pd
import torch
from transformers import RobertaForSequenceClassification, RobertaTokenizer, BertForSequenceClassification, BertTokenizer, AutoModel, AutoTokenizer, BertModel, AutoConfig, RobertaPreTrainedModel, AutoModelForSequenceClassification
from sklearn.metrics import classification_report, confusion_matrix
from transformers.modeling_outputs import SequenceClassifierOutput

# ===== Load and clean dataset =====
df = pd.read_csv('../dataset/false_positive_test_set_with_personality.csv')

df['Post'] = df['Post'].str.strip("[]'")


  from .autonotebook import tqdm as notebook_tqdm


#### 1

In [24]:
# BERT with raw 2000 samples

model_dir = "../saved_models/bert_raw"

model = BertForSequenceClassification.from_pretrained(model_dir)
tokenizer = BertTokenizer.from_pretrained(model_dir)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# ===== Tokenize =====
encodings = tokenizer(
    df['Post'].tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

input_ids = encodings['input_ids'].to(device)
attention_mask = encodings['attention_mask'].to(device)

# ===== Inference =====
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1).cpu().numpy()

# ===== All ground-truth labels are 0 (non-suicide) =====
true_labels = [0] * len(df)

# ===== Add predictions to DataFrame =====
# Map numeric labels to target names
label_map = {0: "non-suicide", 1: "suicide"}
df['Predicted_Label'] = [label_map[p] for p in predictions]

# ===== Metrics =====
cm = confusion_matrix(true_labels, predictions)
tn, fp, fn, tp = cm.ravel()  # Here fn, tp will be 0 because true_labels are all 0

fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

print("Confusion Matrix:")
print(cm)
print(f"\nFalse Positive Rate (FPR): {fpr:.4f}")
print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=["Non-suicide", "Suicide"], zero_division=0))

# ===== Save results =====
df.to_csv(model_dir+ '/false_positive_test_results.csv', index=False)
print("Evaluation completed and results saved.")


Confusion Matrix:
[[15 74]
 [ 0  0]]

False Positive Rate (FPR): 0.8315

Classification Report:
              precision    recall  f1-score   support

 Non-suicide       1.00      0.17      0.29        89
     Suicide       0.00      0.00      0.00         0

    accuracy                           0.17        89
   macro avg       0.50      0.08      0.14        89
weighted avg       1.00      0.17      0.29        89

Evaluation completed and results saved.


In [6]:
# BERT with original labels 2000 samples + personality (benchmark)
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel, BertConfig, BertPreTrainedModel
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score, precision_score, recall_score, roc_curve, auc

# ===== Load and clean dataset =====
df = pd.read_csv('../dataset/false_positive_test_set_with_personality.csv')

personality_cols = ["extraversion", "agreeableness", "neuroticism"]
personality_feats = torch.tensor(df[personality_cols].values, dtype=torch.float)

model_dir = "../saved_models/bert_benchmark"

# Custom model combining BERT + personality features
class BertWithPersonality(BertPreTrainedModel):
    def __init__(self, config, personality_feat_dim=3, num_labels=2):
        super().__init__(config)
        self.bert = BertModel.from_pretrained("bert-base-uncased", config=config)
        bert_hidden_size = config.hidden_size

        self.classifier = nn.Sequential(
            nn.Linear(bert_hidden_size + personality_feat_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_labels)
        )

        self.init_weights()

    def forward(self, input_ids, attention_mask, personality_feats, labels=None):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = bert_outputs.last_hidden_state[:, 0, :]  # CLS token embedding
        combined = torch.cat((cls_output, personality_feats), dim=1)
        logits = self.classifier(combined)

        return logits

config = BertConfig.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    hidden_dropout_prob=0.3,          
    attention_probs_dropout_prob=0.3  
)
model = BertWithPersonality.from_pretrained(model_dir, config=config)
#model.load_state_dict(torch.load(f"{model_dir}/pytorch_model.bin")) 
tokenizer = BertTokenizer.from_pretrained(model_dir)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# ===== Tokenize =====
encodings = tokenizer(
    df['Post'].tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

input_ids = encodings['input_ids'].to(device)
attention_mask = encodings['attention_mask'].to(device)
personality_feats = personality_feats.to(device)

# ===== Inference =====
with torch.no_grad():
    logits = model(input_ids, attention_mask=attention_mask, personality_feats=personality_feats)
    predictions = torch.argmax(logits, dim=1).cpu().numpy()

# ===== All ground-truth labels are 0 (non-suicide) =====
true_labels = [0] * len(df)

# ===== Add predictions to DataFrame =====
# Map numeric labels to target names
label_map = {0: "non-suicide", 1: "suicide"}
df['Predicted_Label'] = [label_map[p] for p in predictions]

# ===== Metrics =====
cm = confusion_matrix(true_labels, predictions)
tn, fp, fn, tp = cm.ravel()  # Here fn, tp will be 0 because true_labels are all 0

fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

print("Confusion Matrix:")
print(cm)
print(f"\nFalse Positive Rate (FPR): {fpr:.4f}")
print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=["Non-suicide", "Suicide"], zero_division=0))

# ===== Save results =====
df.to_csv(model_dir+ '/false_positive_test_results.csv', index=False)
print("Evaluation completed and results saved.")


Confusion Matrix:
[[14 75]
 [ 0  0]]

False Positive Rate (FPR): 0.8427

Classification Report:
              precision    recall  f1-score   support

 Non-suicide       1.00      0.16      0.27        89
     Suicide       0.00      0.00      0.00         0

    accuracy                           0.16        89
   macro avg       0.50      0.08      0.14        89
weighted avg       1.00      0.16      0.27        89

Evaluation completed and results saved.


In [7]:
# BERT with relabeled 2000 samples

model_dir = "../saved_models/bert_relabeled"

model = BertForSequenceClassification.from_pretrained(model_dir)
tokenizer = BertTokenizer.from_pretrained(model_dir)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# ===== Tokenize =====
encodings = tokenizer(
    df['Post'].tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

input_ids = encodings['input_ids'].to(device)
attention_mask = encodings['attention_mask'].to(device)

# ===== Inference =====
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1).cpu().numpy()

# ===== All ground-truth labels are 0 (non-suicide) =====
true_labels = [0] * len(df)

# ===== Add predictions to DataFrame =====
# Map numeric labels to target names
label_map = {0: "non-suicide", 1: "suicide"}
df['Predicted_Label'] = [label_map[p] for p in predictions]

# ===== Metrics =====
cm = confusion_matrix(true_labels, predictions)
tn, fp, fn, tp = cm.ravel()  # Here fn, tp will be 0 because true_labels are all 0

fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

print("Confusion Matrix:")
print(cm)
print(f"\nFalse Positive Rate (FPR): {fpr:.4f}")
print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=["Non-suicide", "Suicide"], zero_division=0))

# ===== Save results =====
df.to_csv(model_dir+ '/false_positive_test_results.csv', index=False)
print("Evaluation completed and results saved.")


Confusion Matrix:
[[57 32]
 [ 0  0]]

False Positive Rate (FPR): 0.3596

Classification Report:
              precision    recall  f1-score   support

 Non-suicide       1.00      0.64      0.78        89
     Suicide       0.00      0.00      0.00         0

    accuracy                           0.64        89
   macro avg       0.50      0.32      0.39        89
weighted avg       1.00      0.64      0.78        89

Evaluation completed and results saved.


In [23]:
# BERT with relabeled 2000 samples + personality
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel, BertConfig, BertPreTrainedModel
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score, precision_score, recall_score, roc_curve, auc

personality_cols = ["extraversion", "agreeableness", "neuroticism"]
personality_feats = torch.tensor(df[personality_cols].values, dtype=torch.float)

model_dir = "../saved_models/bert_personality"

# Custom model combining BERT + personality features
class BertWithPersonality(BertPreTrainedModel):
    def __init__(self, config, personality_feat_dim=3, num_labels=2):
        super().__init__(config)
        self.bert = BertModel.from_pretrained("bert-base-uncased", config=config)
        bert_hidden_size = config.hidden_size

        self.classifier = nn.Sequential(
            nn.Linear(bert_hidden_size + personality_feat_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_labels)
        )

        self.init_weights()

    def forward(self, input_ids, attention_mask, personality_feats, labels=None):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = bert_outputs.last_hidden_state[:, 0, :]  # CLS token embedding
        combined = torch.cat((cls_output, personality_feats), dim=1)
        logits = self.classifier(combined)

        return logits
    
config = BertConfig.from_pretrained(
    'bert-base-uncased',
    num_labels=2
)
model = BertWithPersonality.from_pretrained(model_dir, config=config)
model.load_state_dict(torch.load(f"{model_dir}/pytorch_model.bin")) 
tokenizer = BertTokenizer.from_pretrained(model_dir)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# ===== Tokenize =====
encodings = tokenizer(
    df['Post'].tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

input_ids = encodings['input_ids'].to(device)
attention_mask = encodings['attention_mask'].to(device)
personality_feats = personality_feats.to(device)

# ===== Inference =====
with torch.no_grad():
    logits = model(input_ids, attention_mask=attention_mask, personality_feats=personality_feats)
    predictions = torch.argmax(logits, dim=1).cpu().numpy()

# ===== All ground-truth labels are 0 (non-suicide) =====
true_labels = [0] * len(df)

# ===== Add predictions to DataFrame =====
# Map numeric labels to target names
label_map = {0: "non-suicide", 1: "suicide"}
df['Predicted_Label'] = [label_map[p] for p in predictions]

# ===== Metrics =====
cm = confusion_matrix(true_labels, predictions)
tn, fp, fn, tp = cm.ravel()  # Here fn, tp will be 0 because true_labels are all 0

fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

print("Confusion Matrix:")
print(cm)
print(f"\nFalse Positive Rate (FPR): {fpr:.4f}")
print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=["Non-suicide", "Suicide"], zero_division=0))

# ===== Save results =====
df.to_csv(model_dir+ '/false_positive_test_results.csv', index=False)
print("Evaluation completed and results saved.")



Confusion Matrix:
[[48 41]
 [ 0  0]]

False Positive Rate (FPR): 0.4607

Classification Report:
              precision    recall  f1-score   support

 Non-suicide       1.00      0.54      0.70        89
     Suicide       0.00      0.00      0.00         0

    accuracy                           0.54        89
   macro avg       0.50      0.27      0.35        89
weighted avg       1.00      0.54      0.70        89

Evaluation completed and results saved.


In [22]:
# BERT with NLI
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

# ===== Load BERT suicide classifier =====
model_dir = "../saved_models/bert_relabeled"
bert_model = BertForSequenceClassification.from_pretrained(model_dir)
bert_tokenizer = BertTokenizer.from_pretrained(model_dir)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device).eval()

# ===== Load pretrained NLI model =====
nli_model_name = "tasksource/deberta-small-long-nli"
nli_model = AutoModelForSequenceClassification.from_pretrained(nli_model_name).to(device).eval()
nli_tokenizer = AutoTokenizer.from_pretrained(nli_model_name)

# ===== Helper: NLI filtering =====
negative_hypotheses = [
    "The author discusses suicide in general, awareness, or prevention, not personal suicidal thoughts.",
    "The post shares support, resources, or hotlines, not the author's own suicidal intent.",
    "The author reflects to inspire or thank others, not describing current suicidal thoughts."
]

def nli_negative_filter(post, threshold=0.65):
    max_neg_entail_prob = 0
    for hypo in negative_hypotheses:
        inputs = nli_tokenizer(
            post,
            hypo,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512
        ).to(device)
        with torch.no_grad():
            outputs = nli_model(**inputs)
            probs = torch.softmax(outputs.logits, dim=-1)[0]  # [entail, neutral, contradict]
            entail_prob = probs[0].item()  # entailment
        max_neg_entail_prob = max(max_neg_entail_prob, entail_prob)
    
    # If the max negative entailment is high, the post is likely non-suicidal
    return max_neg_entail_prob >= threshold

# ===== Run inference with BERT classifier =====
encodings = bert_tokenizer(
    df['Post'].tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

input_ids = encodings['input_ids'].to(device)
attention_mask = encodings['attention_mask'].to(device)

with torch.no_grad():
    outputs = bert_model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1).cpu().numpy()

# ===== Apply NLI filtering for BERT suicide predictions =====
final_predictions = []
for text, pred in zip(df['Post'], predictions):
    if pred == 1:  # initially predicted as suicide
        is_negative = nli_negative_filter(text, threshold=0.62)
        final_predictions.append(0 if is_negative else 1)
    else:
        final_predictions.append(0)

# ===== Evaluation =====
true_labels = [0] * len(df)  # all ground truth non-suicide
label_map = {0: "non-suicide", 1: "suicide"}
df['Predicted_Label'] = [label_map[p] for p in final_predictions]

cm = confusion_matrix(true_labels, final_predictions)
tn, fp, fn, tp = cm.ravel()

fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

print("Confusion Matrix:")
print(cm)
print(f"\nFalse Positive Rate (FPR): {fpr:.4f}")
print("\nClassification Report:")
print(classification_report(true_labels, final_predictions, target_names=["Non-suicide", "Suicide"], zero_division=0))

# ===== Save results =====
df.to_csv(model_dir + '/false_positive_test_with_nli.csv', index=False)
print("Evaluation completed and results saved with NLI filtering.")



Confusion Matrix:
[[65 24]
 [ 0  0]]

False Positive Rate (FPR): 0.2697

Classification Report:
              precision    recall  f1-score   support

 Non-suicide       1.00      0.73      0.84        89
     Suicide       0.00      0.00      0.00         0

    accuracy                           0.73        89
   macro avg       0.50      0.37      0.42        89
weighted avg       1.00      0.73      0.84        89

Evaluation completed and results saved with NLI filtering.


In [21]:
# BERT with relabeled 2000 samples + personality + NLI
import torch
import torch.nn as nn
from transformers import BertTokenizer, AutoTokenizer, AutoModelForSequenceClassification, BertPreTrainedModel, BertConfig
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score, precision_score, recall_score, roc_curve, auc

personality_cols = ["extraversion", "agreeableness", "neuroticism"]
personality_feats = torch.tensor(df[personality_cols].values, dtype=torch.float)

model_dir = "../saved_models/bert_personality"

# Custom model combining BERT + personality features
class BertWithPersonality(BertPreTrainedModel):
    def __init__(self, config, personality_feat_dim=3, num_labels=2):
        super().__init__(config)
        self.bert = BertModel.from_pretrained("bert-base-uncased", config=config)
        bert_hidden_size = config.hidden_size

        self.classifier = nn.Sequential(
            nn.Linear(bert_hidden_size + personality_feat_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_labels)
        )

        self.init_weights()

    def forward(self, input_ids, attention_mask, personality_feats, labels=None):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = bert_outputs.last_hidden_state[:, 0, :]  # CLS token embedding
        combined = torch.cat((cls_output, personality_feats), dim=1)
        logits = self.classifier(combined)

        return logits
    
config = BertConfig.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    hidden_dropout_prob=0.3,          
    attention_probs_dropout_prob=0.3  
)
model = BertWithPersonality.from_pretrained("bert-base-uncased", config=config)
model.load_state_dict(torch.load(f"{model_dir}/pytorch_model.bin")) 
tokenizer = BertTokenizer.from_pretrained(model_dir)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# ===== Load pretrained NLI model =====
nli_model_name = "tasksource/deberta-small-long-nli"
nli_model = AutoModelForSequenceClassification.from_pretrained(nli_model_name).to(device).eval()
nli_tokenizer = AutoTokenizer.from_pretrained(nli_model_name)

# ===== Helper: NLI filtering =====
negative_hypotheses = [
    "The author discusses suicide in general, awareness, or prevention, not personal suicidal thoughts.",
    "The post shares support, resources, or hotlines, not the author's own suicidal intent.",
    "The author reflects to inspire or thank others, not describing current suicidal thoughts."
]

def nli_negative_filter(post, threshold=0.65):
    max_neg_entail_prob = 0
    for hypo in negative_hypotheses:
        inputs = nli_tokenizer(
            post,
            hypo,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512
        ).to(device)
        with torch.no_grad():
            outputs = nli_model(**inputs)
            probs = torch.softmax(outputs.logits, dim=-1)[0]  # [entail, neutral, contradict]
            entail_prob = probs[0].item()  # entailment
        max_neg_entail_prob = max(max_neg_entail_prob, entail_prob)
    
    # If the max negative entailment is high, the post is likely non-suicidal
    return max_neg_entail_prob >= threshold

# ===== Tokenize =====
encodings = tokenizer(
    df['Post'].tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

input_ids = encodings['input_ids'].to(device)
attention_mask = encodings['attention_mask'].to(device)
personality_feats = personality_feats.to(device)

# ===== Inference with NLI filtering =====
filtered_predictions = []

for i, post in enumerate(df['Post'].tolist()):
    # BERT + personality prediction
    with torch.no_grad():
        bert_logits = model(
            input_ids[i].unsqueeze(0),
            attention_mask=attention_mask[i].unsqueeze(0),
            personality_feats=personality_feats[i].unsqueeze(0)
        )
        bert_pred = torch.argmax(bert_logits, dim=1).item()

    if bert_pred == 1:  # initially predicted as suicide
        is_negative = nli_negative_filter(post, threshold=0.62)
        filtered_predictions.append(0 if is_negative else 1)
    else:
        filtered_predictions.append(0)


# ===== Metrics =====
true_labels = [0] * len(df)  # all non-suicidal in this FP test set

cm = confusion_matrix(true_labels, filtered_predictions)
tn, fp, fn, tp = cm.ravel()

fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

print("Confusion Matrix after NLI filtering:")
print(cm)
print(f"\nFalse Positive Rate (FPR): {fpr:.4f}")
print("\nClassification Report:")
print(classification_report(true_labels, filtered_predictions,
                            target_names=["Non-suicide", "Suicide"],
                            zero_division=0))

# ===== Save results =====
df['Final_Predicted_Label'] = ["suicide" if p == 1 else "non-suicide" for p in filtered_predictions]
df.to_csv(model_dir + '/false_positive_test_results_with_nli.csv', index=False)
print("Evaluation completed and results saved.")

Some weights of BertWithPersonality were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.0.bias', 'classifier.0.weight', 'classifier.3.bias', 'classifier.3.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Confusion Matrix after NLI filtering:
[[58 31]
 [ 0  0]]

False Positive Rate (FPR): 0.3483

Classification Report:
              precision    recall  f1-score   support

 Non-suicide       1.00      0.65      0.79        89
     Suicide       0.00      0.00      0.00         0

    accuracy                           0.65        89
   macro avg       0.50      0.33      0.39        89
weighted avg       1.00      0.65      0.79        89

Evaluation completed and results saved.


#### 2

In [3]:
# roBERTa with raw 2000 samples

model_dir = "../saved_models/roberta_raw"

model = RobertaForSequenceClassification.from_pretrained(model_dir)
tokenizer = RobertaTokenizer.from_pretrained(model_dir)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# ===== Tokenize =====
encodings = tokenizer(
    df['Post'].tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

input_ids = encodings['input_ids'].to(device)
attention_mask = encodings['attention_mask'].to(device)

# ===== Inference =====
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1).cpu().numpy()

# ===== All ground-truth labels are 0 (non-suicide) =====
true_labels = [0] * len(df)

# ===== Add predictions to DataFrame =====
# Map numeric labels to target names
label_map = {0: "non-suicide", 1: "suicide"}
df['Predicted_Label'] = [label_map[p] for p in predictions]

# ===== Metrics =====
cm = confusion_matrix(true_labels, predictions)
tn, fp, fn, tp = cm.ravel()  # Here fn, tp will be 0 because true_labels are all 0

fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

print("Confusion Matrix:")
print(cm)
print(f"\nFalse Positive Rate (FPR): {fpr:.4f}")
print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=["Non-suicide", "Suicide"], zero_division=0))

# ===== Save results =====
df.to_csv(model_dir+ '/false_positive_test_results.csv', index=False)
print("Evaluation completed and results saved.")


Confusion Matrix:
[[10 79]
 [ 0  0]]

False Positive Rate (FPR): 0.8876

Classification Report:
              precision    recall  f1-score   support

 Non-suicide       1.00      0.11      0.20        89
     Suicide       0.00      0.00      0.00         0

    accuracy                           0.11        89
   macro avg       0.50      0.06      0.10        89
weighted avg       1.00      0.11      0.20        89

Evaluation completed and results saved.


In [9]:
# RoBERTa with original labels 2000 samples + personality (benchmark)
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score, precision_score, recall_score, roc_curve, auc

# ===== Load and clean dataset =====
df = pd.read_csv('../dataset/false_positive_test_set_with_personality.csv')

personality_cols = ["extraversion", "agreeableness", "neuroticism"]
personality_feats = torch.tensor(df[personality_cols].values, dtype=torch.float)

model_dir = "../saved_models/roberta_raw_personality"

# Custom model combining BERT + personality features
class RoBertaWithPersonality(RobertaPreTrainedModel):
    def __init__(self, config, personality_feat_dim=3, num_labels=2):
        super().__init__(config)
        self.bert = BertModel.from_pretrained(model_dir, config=config)
        bert_hidden_size = config.hidden_size

        self.classifier = nn.Sequential(
            nn.Linear(bert_hidden_size + personality_feat_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_labels)
        )

        self.init_weights()

    def forward(self, input_ids, attention_mask, personality_feats, labels=None):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = bert_outputs.last_hidden_state[:, 0, :]  # CLS token embedding
        combined = torch.cat((cls_output, personality_feats), dim=1)
        logits = self.classifier(combined)

        return logits

config = AutoConfig.from_pretrained(
    model_dir,
    num_labels=2,
    hidden_dropout_prob=0.3,          
    attention_probs_dropout_prob=0.3  
)
model = RoBertaWithPersonality.from_pretrained(model_dir, config=config)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# ===== Tokenize =====
encodings = tokenizer(
    df['Post'].tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

input_ids = encodings['input_ids'].to(device)
attention_mask = encodings['attention_mask'].to(device)
personality_feats = personality_feats.to(device)

# ===== Inference =====
with torch.no_grad():
    logits = model(input_ids, attention_mask=attention_mask, personality_feats=personality_feats)
    predictions = torch.argmax(logits, dim=1).cpu().numpy()

# ===== All ground-truth labels are 0 (non-suicide) =====
true_labels = [0] * len(df)

# ===== Add predictions to DataFrame =====
# Map numeric labels to target names
label_map = {0: "non-suicide", 1: "suicide"}
df['Predicted_Label'] = [label_map[p] for p in predictions]

# ===== Metrics =====
cm = confusion_matrix(true_labels, predictions)
tn, fp, fn, tp = cm.ravel()  # Here fn, tp will be 0 because true_labels are all 0

fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

print("Confusion Matrix:")
print(cm)
print(f"\nFalse Positive Rate (FPR): {fpr:.4f}")
print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=["Non-suicide", "Suicide"], zero_division=0))

# ===== Save results =====
df.to_csv(model_dir+ '/false_positive_test_results.csv', index=False)
print("Evaluation completed and results saved.")


Confusion Matrix:
[[ 6 83]
 [ 0  0]]

False Positive Rate (FPR): 0.9326

Classification Report:
              precision    recall  f1-score   support

 Non-suicide       1.00      0.07      0.13        89
     Suicide       0.00      0.00      0.00         0

    accuracy                           0.07        89
   macro avg       0.50      0.03      0.06        89
weighted avg       1.00      0.07      0.13        89

Evaluation completed and results saved.


In [10]:
# roBERTa with relabeled 2000 samples

model_dir = "../saved_models/roberta_relabeled"

model = RobertaForSequenceClassification.from_pretrained(model_dir)
tokenizer = RobertaTokenizer.from_pretrained(model_dir)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# ===== Tokenize =====
encodings = tokenizer(
    df['Post'].tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

input_ids = encodings['input_ids'].to(device)
attention_mask = encodings['attention_mask'].to(device)

# ===== Inference =====
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1).cpu().numpy()

# ===== All ground-truth labels are 0 (non-suicide) =====
true_labels = [0] * len(df)

# ===== Add predictions to DataFrame =====
# Map numeric labels to target names
label_map = {0: "non-suicide", 1: "suicide"}
df['Predicted_Label'] = [label_map[p] for p in predictions]

# ===== Metrics =====
cm = confusion_matrix(true_labels, predictions)
tn, fp, fn, tp = cm.ravel()  # Here fn, tp will be 0 because true_labels are all 0

fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

print("Confusion Matrix:")
print(cm)
print(f"\nFalse Positive Rate (FPR): {fpr:.4f}")
print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=["Non-suicide", "Suicide"], zero_division=0))

# ===== Save results =====
df.to_csv(model_dir+ '/false_positive_test_results.csv', index=False)
print("Evaluation completed and results saved.")


Confusion Matrix:
[[41 48]
 [ 0  0]]

False Positive Rate (FPR): 0.5393

Classification Report:
              precision    recall  f1-score   support

 Non-suicide       1.00      0.46      0.63        89
     Suicide       0.00      0.00      0.00         0

    accuracy                           0.46        89
   macro avg       0.50      0.23      0.32        89
weighted avg       1.00      0.46      0.63        89

Evaluation completed and results saved.


In [11]:
# RoBERTa with relabeled 2000 samples + personality
import torch
import torch.nn as nn
from transformers import RobertaTokenizerFast, RobertaModel, RobertaConfig
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score, precision_score, recall_score, roc_curve, auc

personality_cols = ["extraversion", "agreeableness", "neuroticism"]
personality_feats = torch.tensor(df[personality_cols].values, dtype=torch.float)

model_dir = "../saved_models/roberta_personality"

class RoBertaWithPersonality(RobertaPreTrainedModel):
    def __init__(self, config, personality_feat_dim=3, num_labels=2):
        super().__init__(config)
        self.bert = RobertaModel.from_pretrained(model_dir, config=config)
        bert_hidden_size = config.hidden_size

        self.classifier = nn.Sequential(
            nn.Linear(bert_hidden_size + personality_feat_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_labels)
        )

    def forward(self, input_ids, attention_mask, personality_feats, labels=None):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = bert_outputs.last_hidden_state[:, 0, :]  # CLS token embedding
        combined = torch.cat((cls_output, personality_feats), dim=1)
        logits = self.classifier(combined)

        return logits
    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config = RobertaConfig.from_pretrained(model_dir, num_labels=2)
model = RoBertaWithPersonality.from_pretrained(model_dir, config=config)
model.to(device).eval()

# ===== Tokenize =====
encodings = tokenizer(
    df['Post'].tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

input_ids = encodings['input_ids'].to(device)
attention_mask = encodings['attention_mask'].to(device)
personality_feats = personality_feats.to(device)

# ===== Inference =====
with torch.no_grad():
    logits = model(input_ids, attention_mask=attention_mask, personality_feats=personality_feats)
    predictions = torch.argmax(logits, dim=1).cpu().numpy()

# ===== All ground-truth labels are 0 (non-suicide) =====
true_labels = [0] * len(df)

# ===== Add predictions to DataFrame =====
# Map numeric labels to target names
label_map = {0: "non-suicide", 1: "suicide"}
df['Predicted_Label'] = [label_map[p] for p in predictions]

# ===== Metrics =====
cm = confusion_matrix(true_labels, predictions)
tn, fp, fn, tp = cm.ravel()  # Here fn, tp will be 0 because true_labels are all 0

fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

print("Confusion Matrix:")
print(cm)
print(f"\nFalse Positive Rate (FPR): {fpr:.4f}")
print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=["Non-suicide", "Suicide"], zero_division=0))

# ===== Save results =====
df.to_csv(model_dir+ '/false_positive_test_results.csv', index=False)
print("Evaluation completed and results saved.")


Some weights of RobertaModel were not initialized from the model checkpoint at ../saved_models/roberta_personality and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bi

Confusion Matrix:
[[73 16]
 [ 0  0]]

False Positive Rate (FPR): 0.1798

Classification Report:
              precision    recall  f1-score   support

 Non-suicide       1.00      0.82      0.90        89
     Suicide       0.00      0.00      0.00         0

    accuracy                           0.82        89
   macro avg       0.50      0.41      0.45        89
weighted avg       1.00      0.82      0.90        89

Evaluation completed and results saved.


In [3]:
# RoBERTa with NLI
import torch
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

# ===== Load BERT suicide classifier =====
model_dir = "../saved_models/roberta_relabeled"
bert_model = AutoModelForSequenceClassification.from_pretrained(model_dir)
bert_tokenizer = AutoTokenizer.from_pretrained(model_dir)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device).eval()

# ===== Load pretrained NLI model =====
nli_model_name = "tasksource/deberta-small-long-nli"
nli_model = AutoModelForSequenceClassification.from_pretrained(nli_model_name).to(device).eval()
nli_tokenizer = AutoTokenizer.from_pretrained(nli_model_name)

# ===== Helper: NLI filtering =====
negative_hypotheses = [
    "The author discusses suicide in general, awareness, or prevention, not personal suicidal thoughts.",
    "The post shares support, resources, or hotlines, not the author's own suicidal intent.",
    "The author reflects to inspire or thank others, not describing current suicidal thoughts."
]

def nli_negative_filter(post, threshold=0.65):
    max_neg_entail_prob = 0
    for hypo in negative_hypotheses:
        inputs = nli_tokenizer(
            post,
            hypo,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512
        ).to(device)
        with torch.no_grad():
            outputs = nli_model(**inputs)
            probs = torch.softmax(outputs.logits, dim=-1)[0]  # [entail, neutral, contradict]
            entail_prob = probs[0].item()  # entailment
        max_neg_entail_prob = max(max_neg_entail_prob, entail_prob)
    
    # If the max negative entailment is high, the post is likely non-suicidal
    return max_neg_entail_prob >= threshold

# ===== Run inference with BERT classifier =====
encodings = bert_tokenizer(
    df['Post'].tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

input_ids = encodings['input_ids'].to(device)
attention_mask = encodings['attention_mask'].to(device)

with torch.no_grad():
    outputs = bert_model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1).cpu().numpy()

# ===== Apply NLI filtering for BERT suicide predictions =====
final_predictions = []
for text, pred in zip(df['Post'], predictions):
    if pred == 1:  # initially predicted as suicide
        is_negative = nli_negative_filter(text, threshold=0.65)
        final_predictions.append(0 if is_negative else 1)
    else:
        final_predictions.append(0)

# ===== Evaluation =====
true_labels = [0] * len(df)  # all ground truth non-suicide
label_map = {0: "non-suicide", 1: "suicide"}
df['Predicted_Label'] = [label_map[p] for p in final_predictions]

cm = confusion_matrix(true_labels, final_predictions)
tn, fp, fn, tp = cm.ravel()

fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

print("Confusion Matrix:")
print(cm)
print(f"\nFalse Positive Rate (FPR): {fpr:.4f}")
print("\nClassification Report:")
print(classification_report(true_labels, final_predictions, target_names=["Non-suicide", "Suicide"], zero_division=0))

# ===== Save results =====
df.to_csv(model_dir + '/false_positive_test_with_nli.csv', index=False)
print("Evaluation completed and results saved with NLI filtering.")



Confusion Matrix:
[[52 37]
 [ 0  0]]

False Positive Rate (FPR): 0.4157

Classification Report:
              precision    recall  f1-score   support

 Non-suicide       1.00      0.58      0.74        89
     Suicide       0.00      0.00      0.00         0

    accuracy                           0.58        89
   macro avg       0.50      0.29      0.37        89
weighted avg       1.00      0.58      0.74        89

Evaluation completed and results saved with NLI filtering.


In [None]:
# RoBERTa with relabeled 2000 samples + personality + NLI
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score, precision_score, recall_score, roc_curve, auc

personality_cols = ["extraversion", "agreeableness", "neuroticism"]
personality_feats = torch.tensor(df[personality_cols].values, dtype=torch.float)

model_dir = "../saved_models/roberta_personality"

class RoBertaWithPersonality(RobertaPreTrainedModel):
    def __init__(self, config, personality_feat_dim=3, num_labels=2):
        super().__init__(config)
        self.bert = AutoModel.from_pretrained(model_dir, config=config)
        bert_hidden_size = config.hidden_size

        self.classifier = nn.Sequential(
            nn.Linear(bert_hidden_size + personality_feat_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_labels)
        )

    def forward(self, input_ids, attention_mask, personality_feats, labels=None):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = bert_outputs.last_hidden_state[:, 0, :]  # CLS token embedding
        combined = torch.cat((cls_output, personality_feats), dim=1)
        logits = self.classifier(combined)

        return logits

tokenizer = AutoTokenizer.from_pretrained(model_dir)    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config = AutoConfig.from_pretrained(model_dir, num_labels=2)
model = RoBertaWithPersonality.from_pretrained(model_dir, config=config)
model.to(device).eval()

# ===== Load pretrained NLI model =====
nli_model_name = "tasksource/deberta-small-long-nli"
nli_model = AutoModelForSequenceClassification.from_pretrained(nli_model_name).to(device).eval()
nli_tokenizer = AutoTokenizer.from_pretrained(nli_model_name)

# ===== Helper: NLI filtering =====
negative_hypotheses = [
    "The author discusses suicide in general, awareness, or prevention, not personal suicidal thoughts.",
    "The post shares support, resources, or hotlines, not the author's own suicidal intent.",
    "The author reflects to inspire or thank others, not describing current suicidal thoughts."
]

def nli_negative_filter(post, threshold=0.65):
    max_neg_entail_prob = 0
    for hypo in negative_hypotheses:
        inputs = nli_tokenizer(
            post,
            hypo,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512
        ).to(device)
        with torch.no_grad():
            outputs = nli_model(**inputs)
            probs = torch.softmax(outputs.logits, dim=-1)[0]  # [entail, neutral, contradict]
            entail_prob = probs[0].item()  # entailment
        max_neg_entail_prob = max(max_neg_entail_prob, entail_prob)
    
    # If the max negative entailment is high, the post is likely non-suicidal
    return max_neg_entail_prob >= threshold

# ===== Tokenize =====
encodings = tokenizer(
    df['Post'].tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

input_ids = encodings['input_ids'].to(device)
attention_mask = encodings['attention_mask'].to(device)
personality_feats = personality_feats.to(device)

# ===== Inference with NLI filtering =====
filtered_predictions = []

for i, post in enumerate(df['Post'].tolist()):
    # BERT + personality prediction
    with torch.no_grad():
        bert_logits = model(
            input_ids[i].unsqueeze(0),
            attention_mask=attention_mask[i].unsqueeze(0),
            personality_feats=personality_feats[i].unsqueeze(0)
        )
        bert_pred = torch.argmax(bert_logits, dim=1).item()

    if bert_pred == 1:  # initially predicted as suicide
        is_negative = nli_negative_filter(post, threshold=0.65)
        filtered_predictions.append(0 if is_negative else 1)
    else:
        filtered_predictions.append(0)


# ===== Metrics =====
true_labels = [0] * len(df)  # all non-suicidal in this FP test set

cm = confusion_matrix(true_labels, filtered_predictions)
tn, fp, fn, tp = cm.ravel()

fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

print("Confusion Matrix after NLI filtering:")
print(cm)
print(f"\nFalse Positive Rate (FPR): {fpr:.4f}")
print("\nClassification Report:")
print(classification_report(true_labels, filtered_predictions,
                            target_names=["Non-suicide", "Suicide"],
                            zero_division=0))

# ===== Save results =====
df['Final_Predicted_Label'] = ["suicide" if p == 1 else "non-suicide" for p in filtered_predictions]
df.to_csv(model_dir + '/false_positive_test_results_with_nli.csv', index=False)
print("Evaluation completed and results saved.")

Some weights of RobertaModel were not initialized from the model checkpoint at ../saved_models/roberta_personality and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bi

Confusion Matrix after NLI filtering:
[[76 13]
 [ 0  0]]

False Positive Rate (FPR): 0.1461

Classification Report:
              precision    recall  f1-score   support

 Non-suicide       1.00      0.85      0.92        89
     Suicide       0.00      0.00      0.00         0

    accuracy                           0.85        89
   macro avg       0.50      0.43      0.46        89
weighted avg       1.00      0.85      0.92        89

Evaluation completed and results saved.


#### 3

In [12]:
# Mental-RoBERTa with raw 2000 samples

model_dir = "../saved_models/mental_roberta_raw"

model = RobertaForSequenceClassification.from_pretrained(model_dir)
tokenizer = RobertaTokenizer.from_pretrained(model_dir)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# ===== Tokenize =====
encodings = tokenizer(
    df['Post'].tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

input_ids = encodings['input_ids'].to(device)
attention_mask = encodings['attention_mask'].to(device)

# ===== Inference =====
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1).cpu().numpy()

# ===== All ground-truth labels are 0 (non-suicide) =====
true_labels = [0] * len(df)

# ===== Add predictions to DataFrame =====
# Map numeric labels to target names
label_map = {0: "non-suicide", 1: "suicide"}
df['Predicted_Label'] = [label_map[p] for p in predictions]

# ===== Metrics =====
cm = confusion_matrix(true_labels, predictions)
tn, fp, fn, tp = cm.ravel()  # Here fn, tp will be 0 because true_labels are all 0

fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

print("Confusion Matrix:")
print(cm)
print(f"\nFalse Positive Rate (FPR): {fpr:.4f}")
print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=["Non-suicide", "Suicide"], zero_division=0))

# ===== Save results =====
df.to_csv(model_dir+ '/false_positive_test_results.csv', index=False)
print("Evaluation completed and results saved.")


Confusion Matrix:
[[ 6 83]
 [ 0  0]]

False Positive Rate (FPR): 0.9326

Classification Report:
              precision    recall  f1-score   support

 Non-suicide       1.00      0.07      0.13        89
     Suicide       0.00      0.00      0.00         0

    accuracy                           0.07        89
   macro avg       0.50      0.03      0.06        89
weighted avg       1.00      0.07      0.13        89

Evaluation completed and results saved.


In [15]:
# Mental-RoBERTa with original labels 2000 samples + personality (benchmark)
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score, precision_score, recall_score, roc_curve, auc

# ===== Load and clean dataset =====
df = pd.read_csv('../dataset/false_positive_test_set_with_personality.csv')

personality_cols = ["extraversion", "agreeableness", "neuroticism"]
personality_feats = torch.tensor(df[personality_cols].values, dtype=torch.float)

model_dir = "../saved_models/mental_roberta_raw_personality"

class RoBertaWithPersonality(RobertaPreTrainedModel):
    def __init__(self, config, personality_feat_dim=3, num_labels=2):
        super().__init__(config)
        self.bert = RobertaModel.from_pretrained(model_dir, config=config)
        bert_hidden_size = config.hidden_size

        self.classifier = nn.Sequential(
            nn.Linear(bert_hidden_size + personality_feat_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_labels)
        )

    def forward(self, input_ids, attention_mask, personality_feats, labels=None):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = bert_outputs.last_hidden_state[:, 0, :]  # CLS token embedding
        combined = torch.cat((cls_output, personality_feats), dim=1)
        logits = self.classifier(combined)

        return logits

config = AutoConfig.from_pretrained(
    model_dir,
    num_labels=2,
    hidden_dropout_prob=0.3,          
    attention_probs_dropout_prob=0.3  
)
model = RoBertaWithPersonality.from_pretrained(model_dir, config=config)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# ===== Tokenize =====
encodings = tokenizer(
    df['Post'].tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

input_ids = encodings['input_ids'].to(device)
attention_mask = encodings['attention_mask'].to(device)
personality_feats = personality_feats.to(device)

# ===== Inference =====
with torch.no_grad():
    logits = model(input_ids, attention_mask=attention_mask, personality_feats=personality_feats)
    predictions = torch.argmax(logits, dim=1).cpu().numpy()

# ===== All ground-truth labels are 0 (non-suicide) =====
true_labels = [0] * len(df)

# ===== Add predictions to DataFrame =====
# Map numeric labels to target names
label_map = {0: "non-suicide", 1: "suicide"}
df['Predicted_Label'] = [label_map[p] for p in predictions]

# ===== Metrics =====
cm = confusion_matrix(true_labels, predictions)
tn, fp, fn, tp = cm.ravel()  # Here fn, tp will be 0 because true_labels are all 0

fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

print("Confusion Matrix:")
print(cm)
print(f"\nFalse Positive Rate (FPR): {fpr:.4f}")
print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=["Non-suicide", "Suicide"], zero_division=0))

# ===== Save results =====
df.to_csv(model_dir+ '/false_positive_test_results.csv', index=False)
print("Evaluation completed and results saved.")


Some weights of RobertaModel were not initialized from the model checkpoint at ../saved_models/mental_roberta_raw_personality and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.outp

Confusion Matrix:
[[ 6 83]
 [ 0  0]]

False Positive Rate (FPR): 0.9326

Classification Report:
              precision    recall  f1-score   support

 Non-suicide       1.00      0.07      0.13        89
     Suicide       0.00      0.00      0.00         0

    accuracy                           0.07        89
   macro avg       0.50      0.03      0.06        89
weighted avg       1.00      0.07      0.13        89

Evaluation completed and results saved.


In [13]:
# Mental-RoBERTa with relabeled 2000 samples

model_dir = "../saved_models/mental_roberta_relabeled"

model = RobertaForSequenceClassification.from_pretrained(model_dir)
tokenizer = RobertaTokenizer.from_pretrained(model_dir)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# ===== Tokenize =====
encodings = tokenizer(
    df['Post'].tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

input_ids = encodings['input_ids'].to(device)
attention_mask = encodings['attention_mask'].to(device)

# ===== Inference =====
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1).cpu().numpy()

# ===== All ground-truth labels are 0 (non-suicide) =====
true_labels = [0] * len(df)

# ===== Add predictions to DataFrame =====
# Map numeric labels to target names
label_map = {0: "non-suicide", 1: "suicide"}
df['Predicted_Label'] = [label_map[p] for p in predictions]

# ===== Metrics =====
cm = confusion_matrix(true_labels, predictions)
tn, fp, fn, tp = cm.ravel()  # Here fn, tp will be 0 because true_labels are all 0

fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

print("Confusion Matrix:")
print(cm)
print(f"\nFalse Positive Rate (FPR): {fpr:.4f}")
print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=["Non-suicide", "Suicide"], zero_division=0))

# ===== Save results =====
df.to_csv(model_dir+ '/false_positive_test_results.csv', index=False)
print("Evaluation completed and results saved.")


Confusion Matrix:
[[64 25]
 [ 0  0]]

False Positive Rate (FPR): 0.2809

Classification Report:
              precision    recall  f1-score   support

 Non-suicide       1.00      0.72      0.84        89
     Suicide       0.00      0.00      0.00         0

    accuracy                           0.72        89
   macro avg       0.50      0.36      0.42        89
weighted avg       1.00      0.72      0.84        89

Evaluation completed and results saved.


In [16]:
# Mental-RoBERTa with relabeld 2000 samples + personality (benchmark)
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score, precision_score, recall_score, roc_curve, auc

# ===== Load and clean dataset =====
df = pd.read_csv('../dataset/false_positive_test_set_with_personality.csv')

personality_cols = ["extraversion", "agreeableness", "neuroticism"]
personality_feats = torch.tensor(df[personality_cols].values, dtype=torch.float)

model_dir = "../saved_models/mental_roberta_personality"

class RoBertaWithPersonality(RobertaPreTrainedModel):
    def __init__(self, config, personality_feat_dim=3, num_labels=2):
        super().__init__(config)
        self.bert = RobertaModel.from_pretrained(model_dir, config=config)
        bert_hidden_size = config.hidden_size

        self.classifier = nn.Sequential(
            nn.Linear(bert_hidden_size + personality_feat_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_labels)
        )

    def forward(self, input_ids, attention_mask, personality_feats, labels=None):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = bert_outputs.last_hidden_state[:, 0, :]  # CLS token embedding
        combined = torch.cat((cls_output, personality_feats), dim=1)
        logits = self.classifier(combined)

        return logits

config = AutoConfig.from_pretrained(
    model_dir,
    num_labels=2,
    hidden_dropout_prob=0.3,          
    attention_probs_dropout_prob=0.3  
)
model = RoBertaWithPersonality.from_pretrained(model_dir, config=config)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# ===== Tokenize =====
encodings = tokenizer(
    df['Post'].tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

input_ids = encodings['input_ids'].to(device)
attention_mask = encodings['attention_mask'].to(device)
personality_feats = personality_feats.to(device)

# ===== Inference =====
with torch.no_grad():
    logits = model(input_ids, attention_mask=attention_mask, personality_feats=personality_feats)
    predictions = torch.argmax(logits, dim=1).cpu().numpy()

# ===== All ground-truth labels are 0 (non-suicide) =====
true_labels = [0] * len(df)

# ===== Add predictions to DataFrame =====
# Map numeric labels to target names
label_map = {0: "non-suicide", 1: "suicide"}
df['Predicted_Label'] = [label_map[p] for p in predictions]

# ===== Metrics =====
cm = confusion_matrix(true_labels, predictions)
tn, fp, fn, tp = cm.ravel()  # Here fn, tp will be 0 because true_labels are all 0

fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

print("Confusion Matrix:")
print(cm)
print(f"\nFalse Positive Rate (FPR): {fpr:.4f}")
print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=["Non-suicide", "Suicide"], zero_division=0))

# ===== Save results =====
df.to_csv(model_dir+ '/false_positive_test_results.csv', index=False)
print("Evaluation completed and results saved.")


Some weights of RobertaModel were not initialized from the model checkpoint at ../saved_models/mental_roberta_personality and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.d

Confusion Matrix:
[[53 36]
 [ 0  0]]

False Positive Rate (FPR): 0.4045

Classification Report:
              precision    recall  f1-score   support

 Non-suicide       1.00      0.60      0.75        89
     Suicide       0.00      0.00      0.00         0

    accuracy                           0.60        89
   macro avg       0.50      0.30      0.37        89
weighted avg       1.00      0.60      0.75        89

Evaluation completed and results saved.


In [20]:
# Mental-RaBERTa with NLI
import torch
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

# ===== Load Mental-RaBERTa suicide classifier =====
model_dir = "../saved_models/mental_roberta_relabeled"
bert_model = AutoModelForSequenceClassification.from_pretrained(model_dir)
bert_tokenizer = AutoTokenizer.from_pretrained(model_dir)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device).eval()

# ===== Load pretrained NLI model =====
nli_model_name = "tasksource/deberta-small-long-nli"
nli_model = AutoModelForSequenceClassification.from_pretrained(nli_model_name).to(device).eval()
nli_tokenizer = AutoTokenizer.from_pretrained(nli_model_name)

# ===== Helper: NLI filtering =====
negative_hypotheses = [
    "The author discusses suicide in general, awareness, or prevention, not personal suicidal thoughts.",
    "The post shares support, resources, or hotlines, not the author's own suicidal intent.",
    "The author reflects to inspire or thank others, not describing current suicidal thoughts."
]

def nli_negative_filter(post, threshold=0.65):
    max_neg_entail_prob = 0
    for hypo in negative_hypotheses:
        inputs = nli_tokenizer(
            post,
            hypo,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512
        ).to(device)
        with torch.no_grad():
            outputs = nli_model(**inputs)
            probs = torch.softmax(outputs.logits, dim=-1)[0]  # [entail, neutral, contradict]
            entail_prob = probs[0].item()  # entailment
        max_neg_entail_prob = max(max_neg_entail_prob, entail_prob)
    
    # If the max negative entailment is high, the post is likely non-suicidal
    return max_neg_entail_prob >= threshold

# ===== Run inference with BERT classifier =====
encodings = bert_tokenizer(
    df['Post'].tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

input_ids = encodings['input_ids'].to(device)
attention_mask = encodings['attention_mask'].to(device)

with torch.no_grad():
    outputs = bert_model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=1).cpu().numpy()

# ===== Apply NLI filtering for BERT suicide predictions =====
final_predictions = []
for text, pred in zip(df['Post'], predictions):
    if pred == 1:  # initially predicted as suicide
        is_negative = nli_negative_filter(text, threshold=0.65)
        final_predictions.append(0 if is_negative else 1)
    else:
        final_predictions.append(0)

# ===== Evaluation =====
true_labels = [0] * len(df)  # all ground truth non-suicide
label_map = {0: "non-suicide", 1: "suicide"}
df['Predicted_Label'] = [label_map[p] for p in final_predictions]

cm = confusion_matrix(true_labels, final_predictions)
tn, fp, fn, tp = cm.ravel()

fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

print("Confusion Matrix:")
print(cm)
print(f"\nFalse Positive Rate (FPR): {fpr:.4f}")
print("\nClassification Report:")
print(classification_report(true_labels, final_predictions, target_names=["Non-suicide", "Suicide"], zero_division=0))

# ===== Save results =====
df.to_csv(model_dir + '/false_positive_test_with_nli.csv', index=False)
print("Evaluation completed and results saved with NLI filtering.")



Confusion Matrix:
[[67 22]
 [ 0  0]]

False Positive Rate (FPR): 0.2472

Classification Report:
              precision    recall  f1-score   support

 Non-suicide       1.00      0.75      0.86        89
     Suicide       0.00      0.00      0.00         0

    accuracy                           0.75        89
   macro avg       0.50      0.38      0.43        89
weighted avg       1.00      0.75      0.86        89

Evaluation completed and results saved with NLI filtering.


In [21]:
# Mental-RoBERTa with relabeled 2000 samples + personality + NLI
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_auc_score, precision_score, recall_score, roc_curve, auc

personality_cols = ["extraversion", "agreeableness", "neuroticism"]
personality_feats = torch.tensor(df[personality_cols].values, dtype=torch.float)

model_dir = "../saved_models/mental_roberta_personality"

class RoBertaWithPersonality(RobertaPreTrainedModel):
    def __init__(self, config, personality_feat_dim=3, num_labels=2):
        super().__init__(config)
        self.bert = RobertaModel.from_pretrained(model_dir, config=config)
        bert_hidden_size = config.hidden_size

        self.classifier = nn.Sequential(
            nn.Linear(bert_hidden_size + personality_feat_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_labels)
        )

    def forward(self, input_ids, attention_mask, personality_feats, labels=None):
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = bert_outputs.last_hidden_state[:, 0, :]  # CLS token embedding
        combined = torch.cat((cls_output, personality_feats), dim=1)
        logits = self.classifier(combined)

        return logits
    
config = AutoConfig.from_pretrained(
    model_dir,
    num_labels=2,
    hidden_dropout_prob=0.3,          
    attention_probs_dropout_prob=0.3  
)
model = RoBertaWithPersonality.from_pretrained(model_dir, config=config)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# ===== Load pretrained NLI model =====
nli_model_name = "tasksource/deberta-small-long-nli"
nli_model = AutoModelForSequenceClassification.from_pretrained(nli_model_name).to(device).eval()
nli_tokenizer = AutoTokenizer.from_pretrained(nli_model_name)

# ===== Helper: NLI filtering =====
negative_hypotheses = [
    "The author discusses suicide in general, awareness, or prevention, not personal suicidal thoughts.",
    "The post shares support, resources, or hotlines, not the author's own suicidal intent.",
    "The author reflects to inspire or thank others, not describing current suicidal thoughts."
]

def nli_negative_filter(post, threshold=0.65):
    max_neg_entail_prob = 0
    for hypo in negative_hypotheses:
        inputs = nli_tokenizer(
            post,
            hypo,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512
        ).to(device)
        with torch.no_grad():
            outputs = nli_model(**inputs)
            probs = torch.softmax(outputs.logits, dim=-1)[0]  # [entail, neutral, contradict]
            entail_prob = probs[0].item()  # entailment
        max_neg_entail_prob = max(max_neg_entail_prob, entail_prob)
    
    # If the max negative entailment is high, the post is likely non-suicidal
    return max_neg_entail_prob >= threshold

# ===== Tokenize =====
encodings = tokenizer(
    df['Post'].tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

input_ids = encodings['input_ids'].to(device)
attention_mask = encodings['attention_mask'].to(device)
personality_feats = personality_feats.to(device)

# ===== Inference with NLI filtering =====
filtered_predictions = []

for i, post in enumerate(df['Post'].tolist()):
    # BERT + personality prediction
    with torch.no_grad():
        bert_logits = model(
            input_ids[i].unsqueeze(0),
            attention_mask=attention_mask[i].unsqueeze(0),
            personality_feats=personality_feats[i].unsqueeze(0)
        )
        bert_pred = torch.argmax(bert_logits, dim=1).item()

    if bert_pred == 1:  # initially predicted as suicide
        is_negative = nli_negative_filter(post, threshold=0.65)
        filtered_predictions.append(0 if is_negative else 1)
    else:
        filtered_predictions.append(0)


# ===== Metrics =====
true_labels = [0] * len(df)  # all non-suicidal in this FP test set

cm = confusion_matrix(true_labels, filtered_predictions)
tn, fp, fn, tp = cm.ravel()

fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

print("Confusion Matrix after NLI filtering:")
print(cm)
print(f"\nFalse Positive Rate (FPR): {fpr:.4f}")
print("\nClassification Report:")
print(classification_report(true_labels, filtered_predictions,
                            target_names=["Non-suicide", "Suicide"],
                            zero_division=0))

# ===== Save results =====
df['Final_Predicted_Label'] = ["suicide" if p == 1 else "non-suicide" for p in filtered_predictions]
df.to_csv(model_dir + '/false_positive_test_results_with_nli.csv', index=False)
print("Evaluation completed and results saved.")

Some weights of RobertaModel were not initialized from the model checkpoint at ../saved_models/mental_roberta_personality and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.d

Confusion Matrix after NLI filtering:
[[60 29]
 [ 0  0]]

False Positive Rate (FPR): 0.3258

Classification Report:
              precision    recall  f1-score   support

 Non-suicide       1.00      0.67      0.81        89
     Suicide       0.00      0.00      0.00         0

    accuracy                           0.67        89
   macro avg       0.50      0.34      0.40        89
weighted avg       1.00      0.67      0.81        89

Evaluation completed and results saved.
