In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import joblib
import torch
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report, accuracy_score
from torch.utils.data import Dataset, DataLoader

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

df = pd.read_csv("reddit_depression_suicidewatch.csv")

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

# Apply cleaning
df['clean_text'] = df['text'].apply(clean_text)

# Map labels to three categories
def categorize_label(label, text):
    if label == 'SuicideWatch':
        return 'High'
    elif any(word in text for word in ['hopeless', 'worthless', 'no purpose', 'depressed','lonely']):
        return 'Medium'
    else:
        return 'Low'

df['risk_level'] = df.apply(lambda row: categorize_label(row['label'], row['clean_text']), axis=1)

# Encode labels
df['risk_level'] = df['risk_level'].map({'Low': 0, 'Medium': 1, 'High': 2})

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df['risk_level'], test_size=0.2, random_state=42)

# Load RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        encoding = tokenizer(self.texts[idx], padding='max_length', truncation=True, max_length=512, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Create datasets
train_dataset = TextDataset(X_train.tolist(), y_train.tolist())
test_dataset = TextDataset(X_test.tolist(), y_test.tolist())

# Load pre-trained RoBERTa model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train model
trainer.train()

# Save Model
model.save_pretrained('roberta_depression_model')
tokenizer.save_pretrained('roberta_depression_model')

# Function for Prediction
def predict_risk_level(text):
    model = RobertaForSequenceClassification.from_pretrained('roberta_depression_model')
    tokenizer = RobertaTokenizer.from_pretrained('roberta_depression_model')
    encoding = tokenizer(text, padding='max_length', truncation=True, max_length=512, return_tensors='pt')
    outputs = model(**encoding)
    prediction = torch.argmax(outputs.logits, dim=1).item()
    return ['Low', 'Medium', 'High'][prediction]

# Example Usage
example_text = "I feel hopeless and I don't see a way out."
print("Prediction:", predict_risk_level(example_text))




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sheet\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


In [6]:
import pandas as pd
import numpy as np
import re
import nltk
import joblib
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

df = pd.read_csv("reddit_depression_suicidewatch.csv")

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

df['clean_text'] = df['text'].apply(clean_text)

def categorize_label(label, text):
    if label == 'SuicideWatch':
        return 'High'
    elif any(word in text for word in ['hopeless', 'worthless', 'no purpose', 'depressed','lonely']):
        return 'Medium'
    else:
        return 'Low'

df['risk_level'] = df.apply(lambda row: categorize_label(row['label'], row['clean_text']), axis=1)

df['risk_level'] = df['risk_level'].map({'Low': 0, 'Medium': 1, 'High': 2})

MAX_NUM_WORDS = 10000
MAX_SEQUENCE_LENGTH = 200
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(df['clean_text'])
X_sequences = tokenizer.texts_to_sequences(df['clean_text'])
X_padded = pad_sequences(X_sequences, maxlen=MAX_SEQUENCE_LENGTH)

X_train, X_test, y_train, y_test = train_test_split(X_padded, df['risk_level'], test_size=0.2, random_state=42)

model = Sequential([
    Embedding(MAX_NUM_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH),
    LSTM(128, return_sequences=True),
    Dropout(0.3),
    LSTM(64),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax') 
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))
model.save('depression_risk_nn_model.h5')
joblib.dump(tokenizer, 'tokenizer.pkl')

def predict_risk_level(text):
    model = tf.keras.models.load_model('depression_risk_nn_model.h5')
    tokenizer = joblib.load('tokenizer.pkl')
    sequence = tokenizer.texts_to_sequences([clean_text(text)])
    padded = pad_sequences(sequence, maxlen=MAX_SEQUENCE_LENGTH)
    prediction = np.argmax(model.predict(padded))
    return ['Low', 'Medium', 'High'][prediction]

# Example Usage
example_text = "I am very happy ."
print("Prediction:", predict_risk_level(example_text))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sheet\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/5
[1m510/510[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 217ms/step - accuracy: 0.5543 - loss: 0.8692 - val_accuracy: 0.7137 - val_loss: 0.6046
Epoch 2/5
[1m510/510[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 214ms/step - accuracy: 0.7449 - loss: 0.5807 - val_accuracy: 0.7125 - val_loss: 0.5959
Epoch 3/5
[1m510/510[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 263ms/step - accuracy: 0.7944 - loss: 0.4893 - val_accuracy: 0.7127 - val_loss: 0.6355
Epoch 4/5
[1m510/510[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 253ms/step - accuracy: 0.8232 - loss: 0.4437 - val_accuracy: 0.7007 - val_loss: 0.7114
Epoch 5/5
[1m510/510[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 232ms/step - accuracy: 0.8572 - loss: 0.3793 - val_accuracy: 0.6865 - val_loss: 0.7247




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 360ms/step
Prediction: High
