In [None]:
!pip install transformers accelerate

In [2]:
import inspect
import json

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EvalPrediction

In [3]:
# Lift the display limits so DataFrames render every column and the full width of each cell
for display_option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [4]:
# Load root_path

In [5]:
# Load train_df and test_df (the cells below read their 'comment_text' and 'Conceptual' columns)

In [6]:
# Comment text of each split, with every entry coerced to str
train_texts = list(map(str, train_df['comment_text'].tolist()))
test_texts = list(map(str, test_df['comment_text'].tolist()))

# 'Conceptual' label of each split, with every entry coerced to int
train_labels = list(map(int, train_df['Conceptual'].tolist()))
test_labels = list(map(int, test_df['Conceptual'].tolist()))

In [None]:
# Display the first test comment as a quick sanity check of the extracted text
test_texts[0]

In [None]:
# Print the length of each list, in the order: train texts, train labels, test texts, test labels
for split_values in (train_texts, train_labels, test_texts, test_labels):
    print(len(split_values))

In [None]:
# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint
# (the same checkpoint name is used when the model is loaded)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [10]:
# Tokenize both splits with the same settings: truncation and padding enabled, max_length of 512
tokenizer_options = {'truncation': True, 'padding': True, 'max_length': 512}
train_encodings = tokenizer(train_texts, **tokenizer_options)
test_encodings = tokenizer(test_texts, **tokenizer_options)

In [11]:
# Dataset wrapper that serves tokenized examples together with their labels
class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    Args:
        encodings: Mapping (anything exposing ``.items()``) from feature name
            to an indexable collection holding one entry per example.
        labels: Indexable collection holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict of tensors, including a ``'labels'`` entry."""
        example = {}
        for feature_name, feature_values in self.encodings.items():
            example[feature_name] = torch.tensor(feature_values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

In [12]:
# Wrap the tokenized splits and their labels in TextDataset objects for the Trainer
train_dataset = TextDataset(encodings=train_encodings, labels=train_labels)
test_dataset = TextDataset(encodings=test_encodings, labels=test_labels)

In [None]:
# Load the pre-trained BERT model for sequence classification
# (same 'bert-base-uncased' checkpoint name as the tokenizer, configured with num_labels=2)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

In [15]:
# Define a function to compute metrics
def compute_metrics(p):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        p: Object that unpacks into ``(predictions, labels)``. ``predictions``
            holds per-class scores with one row per example (or a tuple whose
            first element does); ``labels`` holds the true class ids.

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``. The last three are computed with binary averaging.
    """
    predictions, labels = p
    # Some models return extra outputs next to the class scores; the scores come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predicted_classes = np.argmax(predictions, axis=1)
    # zero_division=0 yields the same 0.0 scores as the default when the positive class
    # is never predicted or never present, but without a warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predicted_classes, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, predicted_classes)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [16]:
# Define training arguments
# Newer transformers releases call the evaluation-schedule argument `eval_strategy`, while
# older ones only accept `evaluation_strategy`. The install cell does not pin a version,
# so pick whichever name the installed TrainingArguments accepts.
eval_strategy_arg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)
training_args = TrainingArguments(
    output_dir='output_dir',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='logging_dir',
    logging_steps=10,
    **{eval_strategy_arg: "epoch"},  # Evaluate at the end of each epoch
)

In [None]:
# Initialize the Trainer with the compute_metrics function
# NOTE(review): the test split is passed as the evaluation set, so the metrics reported
# during training are measured on test data; the visible cells create no separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,  # Use test dataset for evaluation here
    compute_metrics=compute_metrics
)

In [None]:
# Train the model using the settings in training_args; the call's return value is
# displayed as this cell's output
trainer.train()

In [None]:
# Evaluate the model on the Trainer's eval_dataset (the test set) and print the result;
# `results` stays bound so it can be saved afterwards
results = trainer.evaluate()
print(results)

In [20]:
# Save the model

In [21]:
# Save results