In [2]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset, random_split
from tqdm import tqdm

df = pd.read_csv('subreddits(3).csv')

# Map string labels to numerical values
label_map = {"depression": 0, "Anxiety": 1}
df['Subreddit'] = df['Subreddit'].map(label_map)


In [3]:

# Split the dataset into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)


In [4]:

# Load the pre-trained Sentence-BERT tokenizer and model
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:

# Tokenize the text and create DataLoader for training and validation sets
def tokenize_data(data_frame, max_length=128, batch_size=16):
    input_texts = data_frame['Text'].tolist()
    labels = data_frame['Subreddit'].tolist()

    inputs = tokenizer(input_texts, padding='max_length', truncation=True, max_length=max_length, return_tensors='pt', return_attention_mask=True)
    labels = torch.tensor(labels)

    dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    return data_loader

train_dataloader = tokenize_data(train_df)
val_dataloader = tokenize_data(val_df)



In [6]:
# Define training parameters
epochs = 5
lr = 3e-5
warmup_steps = 500
total_steps = len(train_dataloader) * epochs

optimizer = AdamW(model.parameters(), lr=lr)
scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, total_steps)




In [7]:

# Training and Validation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
best_val_accuracy = 0

for epoch in range(epochs):
    model.train()
    train_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1} Training"):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    average_train_loss = train_loss / len(train_dataloader)

    # Validation
    model.eval()
    val_loss = 0
    correct_predictions = 0
    total_samples = 0

    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc=f"Epoch {epoch + 1} Validation"):
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)
            correct_predictions += torch.sum(predictions == labels).item()
            total_samples += len(labels)

    average_val_loss = val_loss / len(val_dataloader)
    val_accuracy = correct_predictions / total_samples

    print(f"Epoch {epoch + 1}:")
  
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        torch.save(model.state_dict(), 'best_model.pth')

print(f"*********Accuracy: {best_val_accuracy:.2%}")


Epoch 1 Training: 100%|████████████████████████████████████████████████████████████| 1000/1000 [30:01<00:00,  1.80s/it]
Epoch 1 Validation: 100%|████████████████████████████████████████████████████████████| 250/250 [02:32<00:00,  1.63it/s]


Epoch 1:


Epoch 2 Training: 100%|████████████████████████████████████████████████████████████| 1000/1000 [29:29<00:00,  1.77s/it]
Epoch 2 Validation: 100%|████████████████████████████████████████████████████████████| 250/250 [02:35<00:00,  1.61it/s]


Epoch 2:


Epoch 3 Training: 100%|████████████████████████████████████████████████████████████| 1000/1000 [29:49<00:00,  1.79s/it]
Epoch 3 Validation: 100%|████████████████████████████████████████████████████████████| 250/250 [02:33<00:00,  1.62it/s]


Epoch 3:


Epoch 4 Training: 100%|████████████████████████████████████████████████████████████| 1000/1000 [28:49<00:00,  1.73s/it]
Epoch 4 Validation: 100%|████████████████████████████████████████████████████████████| 250/250 [02:18<00:00,  1.81it/s]


Epoch 4:


Epoch 5 Training: 100%|██████████████████████████████████████████████████████████| 1000/1000 [1:18:42<00:00,  4.72s/it]
Epoch 5 Validation: 100%|████████████████████████████████████████████████████████████| 250/250 [08:00<00:00,  1.92s/it]

Epoch 5:
*********Accuracy: 87.12%



