In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import logging as hf_logging

# Limit Hugging Face transformers log output to errors only.
hf_logging.set_verbosity_error()

class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Args:
        texts: Indexable collection of raw texts (each is passed through
            ``str`` before tokenizing).
        labels: Indexable collection of integer class ids, aligned with
            ``texts``.
        tokenizer: Object exposing ``encode_plus``.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of samples (one per text)."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize sample ``item`` and return it as a dict of 1-D tensors.

        Returns:
            dict with ``input_ids``, ``attention_mask`` and ``labels``.
        """
        raw_text = str(self.texts[item])
        target = self.labels[item]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(raw_text, **tokenizer_options)

        # The tokenizer output carries a leading batch axis; flatten it away
        # so each field is a flat vector.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

def get_model(num_labels):
    """Load ``bert-base-uncased`` with a classification head.

    Args:
        num_labels: Number of output classes for the classification head.

    Returns:
        The ``BertForSequenceClassification`` instance.
    """
    checkpoint = 'bert-base-uncased'
    classifier = BertForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=num_labels,
    )
    return classifier

def get_tokenizer():
    """Load the pretrained ``bert-base-uncased`` tokenizer."""
    checkpoint = 'bert-base-uncased'
    return BertTokenizer.from_pretrained(checkpoint)

def get_dataset(file_path):
    """Read the headerless sentiment CSV and encode its labels as integers.

    The file is read as latin-1 with two columns, ``label`` and ``text``.
    Labels are encoded as positive -> 0, neutral -> 1, negative -> 2.

    Args:
        file_path: Path of the CSV file to load.

    Returns:
        DataFrame with an integer ``label`` column and a ``text`` column.

    Raises:
        ValueError: If a row carries a label outside the three known
            sentiments (including a missing label).
    """
    label_to_id = {'neutral': 1, 'positive': 0, 'negative': 2}

    data = pd.read_csv(file_path, header=None, names=['label', 'text'], encoding='latin-1')
    encoded = data['label'].map(label_to_id)

    # Series.map leaves NaN for anything not in the mapping; fail here
    # instead of letting NaN labels reach tensor construction later on.
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(data.loc[unmapped, 'label'].astype(str).unique())
        raise ValueError(
            f"Unrecognized sentiment label(s) in {file_path!r}: {unknown}"
        )

    data['label'] = encoded.astype('int64')
    return data

def get_trainer(model, train_dataset, eval_dataset):
    """Wrap the model and both datasets in a configured ``Trainer``.

    Training runs for a single epoch with batches of 64 for both training
    and evaluation and a weight decay of 0.01. Checkpoints go to
    ``./results``; logs are written to ``./logs`` every 10 steps. No warm-up
    steps are configured.

    Args:
        model: The Transformers model to train.
        train_dataset: Dataset used for training.
        eval_dataset: Dataset used for evaluation.

    Returns:
        The ``Trainer`` instance, not yet started.
    """
    hyperparameters = dict(
        output_dir='./results',
        num_train_epochs=1,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
    )

    return Trainer(
        model=model,
        args=TrainingArguments(**hyperparameters),
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

def main(data_path=None, *, max_len=128, test_size=0.2, random_state=None):
    """Fine-tune BERT on the sentiment CSV and save the result to ``model``.

    All parameters default to the values that used to be hard-coded, so
    calling ``main()`` with no arguments behaves as before.

    Args:
        data_path: CSV file to train on. ``None`` selects the original
            hard-coded Windows location.
        max_len: Token length every sample is padded or truncated to.
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed forwarded to ``train_test_split``; ``None`` keeps
            the split non-deterministic.
    """
    if data_path is None:
        data_path = r'D:\Vegeta\Projects\DL projects\NewsGenius\Data\all-data.csv'

    data = get_dataset(data_path)
    train_data, eval_data = train_test_split(
        data, test_size=test_size, random_state=random_state
    )

    tokenizer = get_tokenizer()

    def build_split(frame):
        # Both splits share the tokenizer and sequence length.
        return CustomDataset(
            texts=frame.text.to_numpy(),
            labels=frame.label.to_numpy(),
            tokenizer=tokenizer,
            max_len=max_len,
        )

    train_dataset = build_split(train_data)
    eval_dataset = build_split(eval_data)

    # Three classes: positive, neutral, negative.
    model = get_model(3)
    trainer = get_trainer(model, train_dataset, eval_dataset)

    trainer.train()
    trainer.save_model('model')

# Run the training pipeline only when this file is executed directly.
if __name__ == '__main__':
    main()


  0%|          | 0/48 [05:31<?, ?it/s]


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch
from transformers import logging as hf_logging
from tqdm.auto import tqdm

# Limit Hugging Face transformers log output to errors only.
hf_logging.set_verbosity_error()

class CustomDataset(Dataset):
    """Tokenize-on-access dataset of (text, label) pairs.

    Samples are returned as CPU tensors and device placement is left to the
    consumer. The Hugging Face ``Trainer`` transfers each batch to the
    training device itself, and CUDA tensors produced inside
    ``__getitem__`` cannot be pinned by the DataLoader and are discouraged
    with worker processes. Returning CPU tensors also removes the reliance
    on a module-level ``device`` global that only exists when the file is
    run as a script.

    Args:
        texts: Indexable collection of raw texts (each is passed through
            ``str`` before tokenizing).
        labels: Indexable collection of integer class ids, aligned with
            ``texts``.
        tokenizer: Object exposing ``encode_plus``.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples (one per text)."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize sample ``item``.

        Returns:
            dict of CPU tensors: ``input_ids`` and ``attention_mask``
            (flattened to 1-D) and ``labels`` (scalar ``torch.long``).
        """
        text = str(self.texts[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # No .to(device) here: batches are moved to the training device by
        # whoever consumes the DataLoader.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
        }

def get_model(num_labels):
    """Load ``bert-base-uncased`` with a classification head on the best device.

    Args:
        num_labels: Number of output classes for the classification head.

    Returns:
        The ``BertForSequenceClassification`` instance, moved to CUDA when
        torch reports it as available and to the CPU otherwise.
    """
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
    # Resolve the device locally instead of reading a module-level `device`
    # global, which is only defined when the file is run as a script.
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return model.to(target_device)

def get_tokenizer():
    """Return a ``BertTokenizer`` loaded from the ``bert-base-uncased`` checkpoint."""
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    return tokenizer

def get_dataset(file_path):
    """Read the headerless sentiment CSV and encode its labels as integers.

    The file is read as latin-1 with two columns, ``label`` and ``text``.
    Labels are encoded as positive -> 0, neutral -> 1, negative -> 2.

    Args:
        file_path: Path of the CSV file to load.

    Returns:
        DataFrame with an integer ``label`` column and a ``text`` column.

    Raises:
        ValueError: If a row carries a label outside the three known
            sentiments (including a missing label).
    """
    label_to_id = {'neutral': 1, 'positive': 0, 'negative': 2}

    data = pd.read_csv(file_path, header=None, names=['label', 'text'], encoding='latin-1')
    encoded = data['label'].map(label_to_id)

    # Series.map leaves NaN for anything not in the mapping; fail here
    # instead of letting NaN labels reach tensor construction later on.
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(data.loc[unmapped, 'label'].astype(str).unique())
        raise ValueError(
            f"Unrecognized sentiment label(s) in {file_path!r}: {unknown}"
        )

    data['label'] = encoded.astype('int64')
    return data

def compute_metrics(pred):
    """Score a prediction batch for the ``Trainer``.

    Args:
        pred: Object exposing ``label_ids`` (true classes) and
            ``predictions`` (per-class scores; the argmax over the last
            axis is taken as the predicted class).

    Returns:
        dict with ``accuracy`` plus support-weighted ``precision``,
        ``recall`` and ``f1``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    scores = {'accuracy': accuracy_score(y_true, y_pred)}

    # The fourth element (support) is not reported.
    weighted = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    scores['precision'], scores['recall'], scores['f1'] = weighted[:3]
    return scores

def get_trainer(model, train_dataset, eval_dataset):
    """Wrap the model and both datasets in a configured ``Trainer``.

    Training runs for a single epoch with batches of 64 for both training
    and evaluation and a weight decay of 0.01. Evaluation is triggered once
    per epoch and scored with ``compute_metrics``. Checkpoints go to
    ``./results``; logs are written to ``./logs`` every 10 steps.

    Args:
        model: The Transformers model to train.
        train_dataset: Dataset used for training.
        eval_dataset: Dataset used for evaluation.

    Returns:
        The ``Trainer`` instance, not yet started.
    """
    run_config = TrainingArguments(
        output_dir='./results',
        logging_dir='./logs',
        logging_steps=10,
        num_train_epochs=1,
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        weight_decay=0.01,
        eval_strategy="epoch",
    )

    trainer_kwargs = {
        'model': model,
        'args': run_config,
        'train_dataset': train_dataset,
        'eval_dataset': eval_dataset,
        'compute_metrics': compute_metrics,
    }
    return Trainer(**trainer_kwargs)

def main(data_path=None, *, max_len=128, test_size=0.2, random_state=None):
    """Fine-tune BERT on the sentiment CSV, save it, and report eval metrics.

    All parameters default to the values that used to be hard-coded, so
    calling ``main()`` with no arguments behaves as before.

    Args:
        data_path: CSV file to train on. ``None`` selects the original
            hard-coded Windows location.
        max_len: Token length every sample is padded or truncated to.
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed forwarded to ``train_test_split``; ``None`` keeps
            the split non-deterministic.
    """
    if data_path is None:
        data_path = r'D:\Vegeta\Projects\DL projects\NewsGenius\Data\all-data.csv'

    data = get_dataset(data_path)
    train_data, eval_data = train_test_split(
        data, test_size=test_size, random_state=random_state
    )

    tokenizer = get_tokenizer()

    def build_split(frame):
        # Both splits share the tokenizer and sequence length.
        return CustomDataset(
            texts=frame.text.to_numpy(),
            labels=frame.label.to_numpy(),
            tokenizer=tokenizer,
            max_len=max_len,
        )

    train_dataset = build_split(train_data)
    eval_dataset = build_split(eval_data)

    # Class ids follow get_dataset's encoding: 0=positive, 1=neutral,
    # 2=negative.
    class_names = ['positive', 'neutral', 'negative']
    class_ids = list(range(len(class_names)))

    model = get_model(len(class_names))
    trainer = get_trainer(model, train_dataset, eval_dataset)

    trainer.train()
    trainer.save_model('model')

    # Evaluate the model
    results = trainer.evaluate()
    print(f"Evaluation Results: {results}")

    # Generate classification report
    predictions, labels, _ = trainer.predict(eval_dataset)
    preds = predictions.argmax(-1)
    # Passing `labels` explicitly keeps the three target names aligned with
    # their ids even when a class is missing from the eval split; without
    # it the report fails on a name/class count mismatch.
    print(classification_report(labels, preds, labels=class_ids, target_names=class_names))

# Run the pipeline only when this file is executed directly.
if __name__ == '__main__':
    # Use CUDA when torch reports it as available, otherwise fall back to
    # the CPU. `device` becomes a module-level global before main() runs.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    main()
