In [None]:
!pip install transformers

In [None]:
!pip install --upgrade accelerate

In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import torch
from torch.utils.data import Dataset
import numpy as np

In [None]:
# Custom dataset class
class ComplaintsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Function for encoding data
def encode_data(tokenizer, texts, labels, max_length=512):
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
    return ComplaintsDataset(encodings, labels)

In [None]:
# Load dataset
file_path = 'complaints-official-2-classes.xlsx'
df = pd.read_excel(file_path)
df = df[['Consumer complaint narrative', 'Label']]
df.dropna(inplace=True)

# Split dataset
train_texts, temp_texts, train_labels, temp_labels = train_test_split(df['Consumer complaint narrative'], df['Label'], test_size=0.3)
val_texts, test_texts, val_labels, test_labels = train_test_split(temp_texts, temp_labels, test_size=0.5)

In [None]:
# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode data
train_dataset = encode_data(tokenizer, train_texts.tolist(), train_labels.tolist())
val_dataset = encode_data(tokenizer, val_texts.tolist(), val_labels.tolist())
test_dataset = encode_data(tokenizer, test_texts.tolist(), test_labels.tolist())

# Load BERT model
base_model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Define a function to create the Trainer object for each set of hyperparameters
def create_trainer(model, train_dataset, val_dataset, args):
    return Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

# Define the hyperparameters grid for manual search
learning_rates = [5e-5, 3e-5, 2e-5]
num_epochs = [2, 3, 4]
best_accuracy = 0.0
best_hyperparams = {}

# Loop through hyperparameters
for lr in learning_rates:
    for epoch in num_epochs:
        # Update training arguments with current hyperparameters
        current_training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=epoch,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=16,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=10,
            learning_rate=lr,  # Set the learning rate
        )

        # Create Trainer object for current hyperparameters
        current_trainer = create_trainer(base_model, train_dataset, val_dataset, current_training_args)

        # Train the model
        current_trainer.train()

        # Evaluate on the validation set
        test_results = current_trainer.predict(test_dataset)
        predictions = np.argmax(test_results.predictions, axis=-1)
        accuracy = accuracy_score(test_labels, predictions)

        # Track the best hyperparameters based on accuracy
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_hyperparams = {'learning_rate': lr, 'num_train_epochs': epoch}

# Use the best hyperparameters found during the search
best_hyperparams['output_dir'] = './best_results'
best_training_args = TrainingArguments(**best_hyperparams)

# Create Trainer object for the best hyperparameters
best_trainer = create_trainer(base_model, train_dataset, val_dataset, best_training_args)

# Train the final model with the best hyperparameters
best_trainer.train()

In [None]:
# Evaluate the model
validation_results = best_trainer.evaluate(val_dataset)
print(f"Validation Results: {validation_results}")

# Predictions on test set
test_results = best_trainer.predict(test_dataset)
preds = np.argmax(test_results.predictions, axis=-1)

# Metrics calculation
accuracy_2_classes = accuracy_score(test_labels, preds)
f1_2_classes = f1_score(test_labels, preds, average='weighted')

print(f"Test Accuracy for 2 classes: {accuracy_2_classes}")
print(f"Test F1 Score for 2 classes: {f1_2_classes}")

In [None]:
# Four classes
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import numpy as np
from sklearn.model_selection import GridSearchCV

In [None]:
# Custom dataset class
class ComplaintsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [None]:
# Function for encoding data
def encode_data(tokenizer, texts, labels, max_length=512):
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
    return ComplaintsDataset(encodings, labels)

In [None]:
# Load dataset
file_path = 'complaints-official-4-classes.xlsx'
df = pd.read_excel(file_path)
df = df[['Consumer complaint narrative', 'Label']]
df.dropna(inplace=True)

# Split dataset
train_texts, temp_texts, train_labels, temp_labels = train_test_split(df['Consumer complaint narrative'], df['Label'], test_size=0.3)
val_texts, test_texts, val_labels, test_labels = train_test_split(temp_texts, temp_labels, test_size=0.5)

In [None]:
# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode data and create datasets (assuming these functions are defined)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = encode_data(tokenizer, train_texts.tolist(), train_labels.tolist())
val_dataset = encode_data(tokenizer, val_texts.tolist(), val_labels.tolist())
test_dataset = encode_data(tokenizer, test_texts.tolist(), test_labels.tolist())

# Load BERT model for sequence classification with 4 classes
base_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

In [None]:
# Define a function to create the Trainer object for each set of hyperparameters
def create_trainer(model, train_dataset, val_dataset, args):
    return Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

# Define the hyperparameters grid for manual search
learning_rates = [5e-5, 3e-5, 2e-5]
num_epochs = [2, 3, 4]
best_accuracy = 0.0
best_hyperparams = {}

# Loop through hyperparameters
for lr in learning_rates:
    for epoch in num_epochs:
        # Update training arguments with current hyperparameters
        current_training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=epoch,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=16,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=10,
            learning_rate=lr,  # Set the learning rate
        )

        # Create Trainer object for current hyperparameters
        current_trainer = create_trainer(base_model, train_dataset, val_dataset, current_training_args)

        # Train the model
        current_trainer.train()

        # Evaluate on the validation set
        test_results = current_trainer.predict(test_dataset)
        predictions = np.argmax(test_results.predictions, axis=-1)
        accuracy = accuracy_score(test_labels, predictions)

        # Track the best hyperparameters based on accuracy
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_hyperparams = {'learning_rate': lr, 'num_train_epochs': epoch}

# Use the best hyperparameters found during the search
best_hyperparams['output_dir'] = './best_results'
best_training_args = TrainingArguments(**best_hyperparams)

# Create Trainer object for the best hyperparameters
best_trainer = create_trainer(base_model, train_dataset, val_dataset, best_training_args)

# Train the final model with the best hyperparameters
best_trainer.train()

In [None]:
# Evaluate the final model on the test set
test_results = best_trainer.predict(test_dataset)
predictions = np.argmax(test_results.predictions, axis=-1)

# Metrics
accuracy_4_classes = accuracy_score(test_labels, predictions)
f1_4_classes = f1_score(test_labels, predictions, average='weighted')

print(f"Test Accuracy for 4 classes: {accuracy_4_classes}")
print(f"Test F1 Score for 4 classes: {f1_4_classes}")

In [None]:
def train_and_evaluate(num_samples, num_classes):
    # Subset your dataset based on num_samples and num_classes
    # This is just a template. You'll need to adapt it to your dataset.
    subset_df = df.sample(n=num_samples)
    subset_df = subset_df[subset_df['Label'] < num_classes]

    # Split the data into training and testing
    train_texts, test_texts, train_labels, test_labels = train_test_split(subset_df['Consumer complaint narrative'], subset_df['Label'], test_size=0.2)

    # Encode the data
    train_dataset = encode_data(tokenizer, train_texts.tolist(), train_labels.tolist())
    test_dataset = encode_data(tokenizer, test_texts.tolist(), test_labels.tolist())

    # Initialize the model for the specific number of classes
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,  # Or any other number of epochs you'd like to use
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
    )

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    test_results = trainer.predict(test_dataset)
    predictions = np.argmax(test_results.predictions, axis=-1)

    # Calculate accuracy
    accuracy = accuracy_score(test_labels, predictions)

    return accuracy

In [None]:
import matplotlib.pyplot as plt

num_samples_range = [50, 100, 150, 200]
num_classes_options = [2, 4]
accuracies = {num_classes: [] for num_classes in num_classes_options}

for num_samples in num_samples_range:
    for num_classes in num_classes_options:
        accuracy = train_and_evaluate(num_samples, num_classes)  # Your function to train and get accuracy
        accuracies[num_classes].append(accuracy)

# Plotting
plt.figure(figsize=(10, 6))
for num_classes, acc_values in accuracies.items():
    plt.plot(num_samples_range, acc_values, label=f'{num_classes} Classes', marker='o')

plt.title('Accuracy vs Number of Training Samples for Different Classes')
plt.xlabel('Number of Training Samples')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
## BERT base model with manually tuning hyperparameters for 2 classes
import torch
from torch.utils.data import DataLoader
from transformers import BertForSequenceClassification, BertTokenizer, AdamW
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

class ComplaintsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def encode_data(tokenizer, texts, labels, max_length=512):
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
    return ComplaintsDataset(encodings, labels)

# Load and preprocess the dataset
file_path = 'complaints-official-2-classes.xlsx'  # Update file path
df = pd.read_excel(file_path)
df = df[['Consumer complaint narrative', 'Label']]
df.dropna(inplace=True)

# Split the dataset
train_texts, temp_texts, train_labels, temp_labels = train_test_split(df['Consumer complaint narrative'], df['Label'], test_size=0.3)
val_texts, test_texts, val_labels, test_labels = train_test_split(temp_texts, temp_labels, test_size=0.5)

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode the data
train_dataset = encode_data(tokenizer, train_texts.tolist(), train_labels.tolist())
val_dataset = encode_data(tokenizer, val_texts.tolist(), val_labels.tolist())
test_dataset = encode_data(tokenizer, test_texts.tolist(), test_labels.tolist())

# Load the BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

# Data loaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

# Optimizer
optimizer = AdamW(model.parameters(), weight_decay=1e-5, lr=2e-5)

# Training loop
num_epochs = 10
patience = 3  # Number of epochs to wait for a decrease in validation loss
no_improvement = 0  # Counter to track epochs without improvement
best_val_loss = float('inf')  # Initialize best validation loss

for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    
    # Training phase
    for batch in train_loader:
        batch_input_ids = batch['input_ids']
        batch_attention_mask = batch['attention_mask']
        batch_labels = batch['labels']

        model.zero_grad()

        outputs = model(batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_train_loss = total_train_loss / len(train_loader)
    # print(f'Epoch {epoch + 1}/{num_epochs} | Train Loss: {avg_train_loss}')

    # Validation phase
    model.eval()
    total_val_loss = 0

    with torch.no_grad():
        for val_batch in val_loader:
            val_input_ids = val_batch['input_ids']
            val_attention_mask = val_batch['attention_mask']
            val_labels = val_batch['labels']

            val_outputs = model(val_input_ids, attention_mask=val_attention_mask, labels=val_labels)
            val_loss = val_outputs.loss
            total_val_loss += val_loss.item()

    avg_val_loss = total_val_loss / len(val_loader)
    print(f'Epoch {epoch + 1}/{num_epochs} | Train Loss: {avg_train_loss} | Validation Loss: {avg_val_loss}')
    
    # Early stopping based on validation loss
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        no_improvement = 0
    else:
        no_improvement += 1
        if no_improvement >= patience:
            print(f'Validation loss did not decrease for {patience} consecutive epochs. Stopping training...')
            break

# Evaluation on the test set
model.eval()
test_predictions = []
test_references = []
with torch.no_grad():
    for batch in test_loader:
        batch_input_ids = batch['input_ids']
        batch_attention_mask = batch['attention_mask']
        batch_labels = batch['labels']
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1).tolist()
        test_predictions.extend(predictions)
        test_references.extend(batch_labels.tolist())

accuracy_2_classes = accuracy_score(test_references, test_predictions)
f1_2_classes = f1_score(test_references, test_predictions, average='weighted')

print(f"Test Accuracy for 2 classes: {accuracy_2_classes}")
print(f"Test F1 Score for 2 classes: {f1_2_classes}")

In [1]:
## BERT base model with manually tuning hyperparameters for 4 classes
import torch
from torch.utils.data import DataLoader
from transformers import BertForSequenceClassification, BertTokenizer, AdamW
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

class ComplaintsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def encode_data(tokenizer, texts, labels, max_length=512):
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
    return ComplaintsDataset(encodings, labels)

# Load and preprocess the dataset
file_path = 'complaints-official-4-classes.xlsx'  # Update file path
df = pd.read_excel(file_path)
df = df[['Consumer complaint narrative', 'Label']]
df.dropna(inplace=True)

# Split the dataset
train_texts, temp_texts, train_labels, temp_labels = train_test_split(df['Consumer complaint narrative'], df['Label'], test_size=0.3)
val_texts, test_texts, val_labels, test_labels = train_test_split(temp_texts, temp_labels, test_size=0.5)

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode the data
train_dataset = encode_data(tokenizer, train_texts.tolist(), train_labels.tolist())
val_dataset = encode_data(tokenizer, val_texts.tolist(), val_labels.tolist())
test_dataset = encode_data(tokenizer, test_texts.tolist(), test_labels.tolist())

# Load the BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

# Data loaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

# Optimizer
optimizer = AdamW(model.parameters(), weight_decay=1e-5, lr=2e-5)

# Training loop
num_epochs = 10
patience = 3  # Number of epochs to wait for a decrease in validation loss
no_improvement = 0  # Counter to track epochs without improvement
best_val_loss = float('inf')  # Initialize best validation loss

for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    
    # Training phase
    for batch in train_loader:
        batch_input_ids = batch['input_ids']
        batch_attention_mask = batch['attention_mask']
        batch_labels = batch['labels']

        model.zero_grad()

        outputs = model(batch_input_ids, attention_mask=batch_attention_mask, labels=batch_labels)
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_train_loss = total_train_loss / len(train_loader)
    # print(f'Epoch {epoch + 1}/{num_epochs} | Train Loss: {avg_train_loss}')

    # Validation phase
    model.eval()
    total_val_loss = 0

    with torch.no_grad():
        for val_batch in val_loader:
            val_input_ids = val_batch['input_ids']
            val_attention_mask = val_batch['attention_mask']
            val_labels = val_batch['labels']

            val_outputs = model(val_input_ids, attention_mask=val_attention_mask, labels=val_labels)
            val_loss = val_outputs.loss
            total_val_loss += val_loss.item()

    avg_val_loss = total_val_loss / len(val_loader)
    print(f'Epoch {epoch + 1}/{num_epochs} | Train Loss: {avg_train_loss} | Validation Loss: {avg_val_loss}')
    
    # Early stopping based on validation loss
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        no_improvement = 0
    else:
        no_improvement += 1
        if no_improvement >= patience:
            print(f'Validation loss did not decrease for {patience} consecutive epochs. Stopping training...')
            break

# Evaluation on the test set
model.eval()
test_predictions = []
test_references = []
with torch.no_grad():
    for batch in test_loader:
        batch_input_ids = batch['input_ids']
        batch_attention_mask = batch['attention_mask']
        batch_labels = batch['labels']
        outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1).tolist()
        test_predictions.extend(predictions)
        test_references.extend(batch_labels.tolist())

# Calculate test set metrics
accuracy_4_classes = accuracy_score(test_references, test_predictions)
f1_4_classes = f1_score(test_references, test_predictions, average='weighted')

print(f"Test Accuracy for 4 classes: {accuracy_4_classes}")
print(f"Test F1 Score for 4 classes: {f1_4_classes}")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch 1/10 | Train Loss: 1.3457395076751708 | Validation Loss: 1.151762346426646
Epoch 2/10 | Train Loss: 1.1305241852998733 | Validation Loss: 1.1591932376225789
Epoch 3/10 | Train Loss: 0.9258066594600678 | Validation Loss: 0.9870468576749166
Epoch 4/10 | Train Loss: 0.7849193990230561 | Validation Loss: 1.2622058590253193
Epoch 5/10 | Train Loss: 0.6725334733724594 | Validation Loss: 1.1726154883702595
Epoch 6/10 | Train Loss: 0.5642437353730202 | Validation Loss: 1.1422935326894124
Validation loss did not decrease for 3 consecutive epochs. Stopping training...
Test Accuracy for 4 classes: 0.6666666666666666
Test F1 Score for 4 classes: 0.6651515151515153
