In [32]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
import re
import pandas as pd

df = pd.read_csv("Suicide_Ideation_Dataset(Twitter-based).csv")
df.columns


Index(['Tweet', 'Suicide'], dtype='object')

In [33]:
def clean_text(text):
    text = text.lower()  
    text = re.sub(r'http\S+', '', text)  
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()  
    return text

df.dropna(subset=['Tweet'], inplace=True)
df.rename(columns={"Tweet": "text", "Suicide": "label"}, inplace=True)
df["text"] = df["text"].apply(clean_text)
df["label"] = df["label"].apply(lambda x: 1 if "Potential Suicide" in x else 0)


In [34]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_texts(texts, labels, max_length=128):
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
    return encodings, labels


In [35]:
class SuicideDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_texts, val_texts, train_labels, val_labels = train_test_split(df["text"].tolist(), df["label"].tolist(), test_size=0.2, random_state=42)

In [36]:
train_encodings, train_labels = tokenize_texts(train_texts, train_labels)
val_encodings, val_labels = tokenize_texts(val_texts, val_labels)

train_dataset = SuicideDataset(train_encodings, train_labels)
val_dataset = SuicideDataset(val_encodings, val_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
optimizer = AdamW(model.parameters(), lr=5e-5)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")



In [38]:
text = "I am feeling very happy."
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)
output=outputs.logits
print(output)  

tensor([[ 0.1151, -0.1131]], grad_fn=<AddmmBackward0>)


In [None]:
import torch
import torch.nn.functional as F
logits = output  
probs = F.softmax(logits, dim=1)  
print("Probabilities:", probs)


Probabilities: tensor([[0.5568, 0.4432]], grad_fn=<SoftmaxBackward0>)


In [40]:
predicted_class = torch.argmax(probs, dim=1).item()  

print("Predicted Class:", predicted_class)

label_map = {0: "No Risk", 1: "High Risk"}  

print("Prediction:", label_map[predicted_class])


Predicted Class: 0
Prediction: No Risk
