In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import BertTokenizer, BertForSequenceClassification, BertModel
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import torch
from torch import nn

sns.set()
%matplotlib inline
seed = 1212
seed2 = 2121

#### Read in the data set using pandas



In [2]:
def load_reviews(folder_path):
    reviews = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):  # Process only .txt files
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                reviews.extend(file.readlines())  # Add lines from the file
    return reviews

neg_train_paths = 'aclImdb/train/neg'
pos_train_paths = 'aclImdb/train/pos'

neg_test_paths = 'aclImdb/test/neg'
pos_test_paths = 'aclImdb/test/pos'

negative_train_reviews = load_reviews(neg_train_paths)
positive_train_reviews = load_reviews(pos_train_paths)

negative_test_reviews = load_reviews(neg_test_paths)
positive_test_reviews = load_reviews(pos_test_paths)

training_data = pd.DataFrame({
    'review': negative_train_reviews + positive_train_reviews,
    'label': [0] * len(negative_train_reviews) + [1] * len(positive_train_reviews)
})
#training_data['review'] = training_data['review'].str.strip() #Uncomment to clean
#Shuffle
training_data = training_data.sample(frac=1, random_state=seed).reset_index(drop=True)

testing_data = pd.DataFrame({
    'review': negative_test_reviews + positive_test_reviews,
    'label': [0] * len(negative_test_reviews) + [1] * len(positive_test_reviews)
})
#testing_data['review'] = testing_data['review'].str.strip()
#Shuffle
testing_data = testing_data.sample(frac=1, random_state=seed2).reset_index(drop=True)

#Batch if wanted
#training_data = training_data[:5000]
#testing_data = testing_data[:5000]

X_train = training_data['review']
y_train = training_data['label']

X_test = testing_data['review']
y_test = testing_data['label']



In [3]:
#loading model and tokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-cased')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
model.classifier = nn.Linear(768, 1)
model.num_labels = 1
#model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [18]:
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
labels = torch.tensor([0]).unsqueeze(0)  # Batch size 1
inputs['labels'] = labels
outputs = model(**inputs)
loss = outputs.loss
logits = outputs.logits

print(outputs)
print(loss)
print(logits)

SequenceClassifierOutput(loss=tensor(0.0734, grad_fn=<MseLossBackward0>), logits=tensor([[-0.2709]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
tensor(0.0734, grad_fn=<MseLossBackward0>)
tensor([[-0.2709]], grad_fn=<AddmmBackward0>)


In [5]:
# Convert Pandas Data Frame to hugging_face Dataset
train_data = Dataset.from_pandas(training_data)
test_data = Dataset.from_pandas(testing_data)

In [6]:
def tokenize_review(examples):
    return tokenizer(examples['review'], padding='max_length', truncation=True, max_length=512)

f_train_data = train_data.map(tokenize_review, batched=True)
f_train_data = f_train_data.remove_columns(['review'])
f_train_data = f_train_data.shuffle(seed=seed)

f_test_data = test_data.map(tokenize_review, batched=True)
f_test_data = f_test_data.remove_columns(['review'])



Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [7]:
# Split for eval data
n_samples = len(f_train_data)
n_train = int(0.9 * n_samples)

f_train = f_train_data.select(range(n_train))
f_eval  = f_train_data.select(range(n_train, n_samples))

In [12]:
def RMSE(predictions):
    targets, labels = predictions
    targets, labels = logits.squeeze(), labels.squeeze()
    rmse = np.sqrt(np.mean((labels - targets) ** 2))
    return {'RMSE': rmse}


In [13]:
# Modify Our trainer here
training_args = TrainingArguments(
    'training_args',
    num_train_epochs = 5,
    logging_steps = 200,
    learning_rate = 1e-4,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    eval_strategy = 'steps'
)

trainer = Trainer(
    model = model,
    train_dataset = f_train,
    eval_dataset  = f_eval,
    compute_metrics = RMSE,
    args = training_args
)

In [14]:
trainer.train()


RuntimeError: Found dtype Long but expected Float