In [3]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Load dataset
df = pd.read_csv("pitch_data.csv")

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Custom Dataset Class
class PitchDeckDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors='pt')

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Prepare dataset
texts = df['text'].tolist()
labels = df['label'].tolist()
dataset = PitchDeckDataset(texts, labels)

# Split data
train_size = int(0.8 * len(dataset))
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, len(dataset) - train_size])

# Load BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=101)  # 0-100 scoring

# Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()

# Save the model
model.save_pretrained('./fine_tuned_bert')
tokenizer.save_pretrained('./fine_tuned_bert')


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Epoch,Training Loss,Validation Loss
1,No log,4.245443
2,No log,4.242662
3,No log,4.237604


Attempted to log scalar metric eval_loss:
4.245443344116211
Attempted to log scalar metric eval_runtime:
0.1433
Attempted to log scalar metric eval_samples_per_second:
13.953
Attempted to log scalar metric eval_steps_per_second:
6.977
Attempted to log scalar metric epoch:
1.0
Attempted to log scalar metric eval_loss:
4.24266242980957
Attempted to log scalar metric eval_runtime:
0.168
Attempted to log scalar metric eval_samples_per_second:
11.906
Attempted to log scalar metric eval_steps_per_second:
5.953
Attempted to log scalar metric epoch:
2.0
Attempted to log scalar metric eval_loss:
4.237604141235352
Attempted to log scalar metric eval_runtime:
0.1829
Attempted to log scalar metric eval_samples_per_second:
10.932
Attempted to log scalar metric eval_steps_per_second:
5.466
Attempted to log scalar metric epoch:
3.0
Attempted to log scalar metric train_runtime:
5.315
Attempted to log scalar metric train_samples_per_second:
2.258
Attempted to log scalar metric train_steps_per_second:
0

('./fine_tuned_bert\\tokenizer_config.json',
 './fine_tuned_bert\\special_tokens_map.json',
 './fine_tuned_bert\\vocab.txt',
 './fine_tuned_bert\\added_tokens.json')