In [1]:
!pip install transformers torch scikit-learn




In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
import torch

# Load dataset
DATASET_COLUMNS = ['target', 'ids', 'date', 'flag', 'user', 'text']
DATASET_ENCODING = "ISO-8859-1"
df = pd.read_csv('./Project_Data.csv', encoding=DATASET_ENCODING, names=DATASET_COLUMNS)

# Filter relevant columns
df = df[['target', 'text']]

# Map target values to class labels
df['target'] = df['target'].map({0: 0, 2: 1, 4: 2})

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['target'], test_size=0.2, random_state=42)


In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and encode sequences
def encode_data(texts, labels, max_length=128):
    encodings = tokenizer(
        texts.tolist(),
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt'
    )
    return encodings, torch.tensor(labels.values)

train_encodings, train_labels = encode_data(X_train, y_train)
test_encodings, test_labels = encode_data(X_test, y_test)




vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
from torch.utils.data import Dataset, DataLoader

class TwitterDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TwitterDataset(train_encodings, train_labels)
test_dataset = TwitterDataset(test_encodings, test_labels)


In [6]:
from transformers import BertForSequenceClassification, AdamW, Trainer, TrainingArguments

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)




model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from accelerate import Accelerator

accelerator = Accelerator()

# Wrap the model and data loader with accelerator.prepare() if necessary
model, train_dataloader = accelerator.prepare(model, train_dataloader)


NameError: name 'train_dataloader' is not defined

In [10]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

In [12]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from accelerate import Accelerator


In [14]:
# Create DataLoader for training
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),  # Shuffle the data
    batch_size=8  # Adjust batch size as needed
)

# Create DataLoader for testing/validation
test_dataloader = DataLoader(
    test_dataset,
    sampler=SequentialSampler(test_dataset),  # No shuffle for evaluation
    batch_size=8  # Same as training batch size
)

accelerator = Accelerator()

# Prepare model and DataLoaders
model, train_dataloader, test_dataloader = accelerator.prepare(
    model, train_dataloader, test_dataloader
)


In [21]:
from transformers import AdamW, get_linear_schedule_with_warmup
import torch
from tqdm.auto import tqdm

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Total number of training steps is [number of batches] x [number of epochs]
total_steps = len(train_dataloader) * 3  # Adjust the number of epochs as needed

# Set up the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Prepare optimizer and scheduler with Accelerator
optimizer, scheduler = accelerator.prepare(optimizer, scheduler)

# Training loop
epochs = 3  # Number of epochs to train
for epoch in range(epochs):
    model.train()
    total_loss = 0

    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}")

    for batch in progress_bar:
        # Move the batch to the device managed by Accelerator
        batch = {k: v.to(accelerator.device) for k, v in batch.items()}

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss

        # Accumulate the loss for reporting
        total_loss += loss.item()

        # Backward pass
        accelerator.backward(loss)

        # Step the optimizer and scheduler
        optimizer.step()
        scheduler.step()

        # Update the progress bar
        progress_bar.set_postfix({"loss": loss.item()})

    # Print epoch loss
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1} finished with average loss: {avg_loss:.4f}")

# Save the model after training
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')


  optimizer: Optimizer,


Epoch 1:   0%|          | 0/160000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [17]:
! pip install -U accelerate
! pip install -U transformers

Collecting transformers
  Downloading transformers-4.46.1-py3-none-any.whl.metadata (44 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.2-cp311-none-win_amd64.whl.metadata (6.9 kB)
Downloading transformers-4.46.1-py3-none-any.whl (10.0 MB)
   ---------------------------------------- 0.0/10.0 MB ? eta -:--:--
   ------ --------------------------------- 1.6/10.0 MB 7.6 MB/s eta 0:00:02
   --------------- ------------------------ 3.9/10.0 MB 9.8 MB/s eta 0:00:01
   ------------------------- -------------- 6.3/10.0 MB 10.2 MB/s eta 0:00:01
   --------------------------------- ------ 8.4/10.0 MB 10.2 MB/s eta 0:00:01
   ---------------------------------------- 10.0/10.0 MB 10.1 MB/s eta 0:00:00
Downloading tokenizers-0.20.2-cp311-none-win_amd64.whl (2.4 MB)
   ---------------------------------------- 0.0/2.4 MB ? eta -:--:--
   ----------------------------------- ---- 2.1/2.4 MB 9.8 MB/s eta 0:00:01
   ---------------------------------------- 2.4/2.4 

  You can safely remove it manually.


In [19]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`