In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [1]:
pip install transformers


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install torch

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the CSV data
data = pd.read_csv('subreddits(3).csv')  
texts = data['Text'].tolist()
labels = data['Subreddit'].tolist()



In [2]:
# Preprocess the data and split it into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)



In [3]:
# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # 2 classes: depression, anxiety



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from sklearn.preprocessing import LabelEncoder
# Tokenize the input texts and convert them into tensors
train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors='pt', max_length=128)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors='pt', max_length=128)

#  Encode labels to numbers
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
test_labels = label_encoder.transform(test_labels)

# Convert to tensors
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)


In [6]:
train_labels

tensor([0, 0, 0,  ..., 0, 0, 1])

In [7]:
from torch.cuda.amp import autocast, GradScaler

# Fine-tune the BERT model 
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
model.train()

# Reduce batch size
batch_size = 16
num_batches = len(train_labels) // batch_size

# Gradient Accumulation
accumulation_steps = 4
total_steps = 3 * num_batches // accumulation_steps

scaler = GradScaler()  # Mixed Precision Training

for step in range(total_steps):
    start_idx = step * batch_size * accumulation_steps
    end_idx = start_idx + batch_size * accumulation_steps

    optimizer.zero_grad()
    for idx in range(start_idx, end_idx, batch_size):
        input_batch = {k: v[idx:idx + batch_size].to(model.device) for k, v in train_encodings.items()}
        labels_batch = train_labels[idx:idx + batch_size].to(model.device)

        with autocast():  # Mixed Precision Training
            outputs = model(**input_batch, labels=labels_batch)
            loss = outputs.loss / accumulation_steps

        scaler.scale(loss).backward()

    scaler.step(optimizer)
    scaler.update()





In [8]:
# : Evluate the model on the test set
model.eval()
with torch.no_grad():
    predictions = []
    for idx in range(0, len(test_labels), batch_size):
        input_batch = {k: v[idx:idx + batch_size].to(model.device) for k, v in test_encodings.items()}
        logits = model(**input_batch).logits
        batch_predictions = torch.argmax(logits, dim=1).cpu().tolist()
        predictions.extend(batch_predictions)

accuracy = accuracy_score(test_labels.cpu().tolist(), predictions)
print(f"Test accuracy: {accuracy:.2f}")

Test accuracy: 0.86
