In [None]:
import pandas as pd
from transformers import BertModel, BertTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import torch
import numpy as np

In [None]:
# Function to extract features using BERT
def extract_features(texts, tokenizer, model, batch_size=32):
    """Embed texts as the hidden state at sequence position 0 (the [CLS] token).

    The texts are tokenized and passed through ``model`` in mini-batches so
    that memory use is bounded by ``batch_size`` instead of growing with the
    size of the whole dataset. Padding is applied per mini-batch.

    Args:
        texts: A list of strings, or a single string (treated as one text).
        tokenizer: Callable that returns a mapping of PyTorch tensors when
            called with ``return_tensors="pt"``.
        model: Callable that accepts that mapping as keyword arguments and
            returns an object exposing a ``last_hidden_state`` tensor.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        A NumPy array with one row per input text.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if isinstance(texts, str):
        # Slicing a bare string would cut it into character chunks.
        texts = [texts]

    cls_chunks = []
    # max(..., 1) keeps exactly one tokenizer call for empty input, so that
    # case behaves the same as a single un-batched call.
    for start in range(0, max(len(texts), 1), batch_size):
        inputs = tokenizer(texts[start:start + batch_size], padding=True, truncation=True, return_tensors="pt", max_length=512)
        with torch.no_grad():  # inference only, no gradients needed
            outputs = model(**inputs)
        cls_chunks.append(outputs.last_hidden_state[:, 0, :])
    return torch.cat(cls_chunks, dim=0).numpy()

In [None]:
# Tokenizer and encoder are both loaded from the same pre-trained checkpoint
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

In [None]:
# Read the workbook, keep only the text and label columns, and drop incomplete rows
file_path = 'complaints-official-2-classes.xlsx'
df = (
    pd.read_excel(file_path)
    [['Consumer complaint narrative', 'Label']]
    .dropna()
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Consumer complaint narrative'],
    df['Label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Embed the training split first, then the test split, with the same tokenizer/encoder
train_features, test_features = (
    extract_features(split.tolist(), tokenizer, model)
    for split in (train_texts, test_texts)
)

In [None]:
# Bare expression: shows the extracted training features as the cell output
train_features

In [None]:
# Bare expression: shows the extracted test features as the cell output
test_features

In [None]:
# Fit a logistic-regression classifier on the training embeddings
# (fit() returns the estimator itself, so construction and fitting can be chained)
log_reg_model = LogisticRegression(max_iter=1000).fit(train_features, train_labels)

# Predict labels for the held-out embeddings and show them as the cell output
predictions = log_reg_model.predict(test_features)
predictions

In [None]:
# Score the held-out predictions; F1 uses the 'weighted' average across classes
accuracy, f1 = (
    accuracy_score(test_labels, predictions),
    f1_score(test_labels, predictions, average='weighted'),
)

# One line per metric
print(f"Accuracy: {accuracy}", f"F1 Score: {f1}", sep="\n")

In [None]:
### Using Bert and SK-Learn Logistic Regression for 2 classes with grid search
import pandas as pd
from transformers import BertModel, BertTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
import torch
import numpy as np

# Function to extract features using BERT
def extract_features(texts, tokenizer, model, batch_size=32):
    """Embed texts as the hidden state at sequence position 0 (the [CLS] token).

    The texts are tokenized and passed through ``model`` in mini-batches so
    that memory use is bounded by ``batch_size`` instead of growing with the
    size of the whole dataset. Padding is applied per mini-batch.

    Args:
        texts: A list of strings, or a single string (treated as one text).
        tokenizer: Callable that returns a mapping of PyTorch tensors when
            called with ``return_tensors="pt"``.
        model: Callable that accepts that mapping as keyword arguments and
            returns an object exposing a ``last_hidden_state`` tensor.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        A NumPy array with one row per input text.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if isinstance(texts, str):
        # Slicing a bare string would cut it into character chunks.
        texts = [texts]

    cls_chunks = []
    # max(..., 1) keeps exactly one tokenizer call for empty input, so that
    # case behaves the same as a single un-batched call.
    for start in range(0, max(len(texts), 1), batch_size):
        inputs = tokenizer(texts[start:start + batch_size], padding=True, truncation=True, return_tensors="pt", max_length=512)
        with torch.no_grad():  # inference only, no gradients needed
            outputs = model(**inputs)
        cls_chunks.append(outputs.last_hidden_state[:, 0, :])
    return torch.cat(cls_chunks, dim=0).numpy()

# Tokenizer and encoder are both loaded from the same pre-trained checkpoint
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Read the 2-class workbook, keep the text and label columns, drop incomplete rows
file_path = 'complaints-official-2-classes.xlsx'
df = (
    pd.read_excel(file_path)
    [['Consumer complaint narrative', 'Label']]
    .dropna()
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Consumer complaint narrative'],
    df['Label'],
    test_size=0.2,
    random_state=42,
)

# Embed the training split first, then the test split
train_features, test_features = (
    extract_features(split.tolist(), tokenizer, model)
    for split in (train_texts, test_texts)
)

# Candidate regularisation strengths and solvers for the search
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    solver=['lbfgs', 'newton-cg', 'sag', 'saga'],
)

# 5-fold cross-validated grid search, ranked by accuracy
log_reg = LogisticRegression(max_iter=1000)
grid_search = GridSearchCV(estimator=log_reg, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(train_features, train_labels)

# Estimator selected by the search
best_model = grid_search.best_estimator_

# Predict on the held-out embeddings and score them
predictions = best_model.predict(test_features)
accuracy_2_classes = accuracy_score(test_labels, predictions)
f1_2_classes = f1_score(test_labels, predictions, average='weighted')

# Report the winning parameters and both metrics
print("Best Parameters:", grid_search.best_params_)
print(f"Accuracy over 2 classes: {accuracy_2_classes}")
print(f"F1 Score over 2 classes: {f1_2_classes}")


In [None]:
### Using Bert and SK-Learn Logistic Regression for 4 classes with grid search
from transformers import BertModel, BertTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
import torch
import numpy as np

# Function to extract features using BERT
def extract_features(texts, tokenizer, model, batch_size=32):
    """Embed texts as the hidden state at sequence position 0 (the [CLS] token).

    The texts are tokenized and passed through ``model`` in mini-batches so
    that memory use is bounded by ``batch_size`` instead of growing with the
    size of the whole dataset. Padding is applied per mini-batch.

    Args:
        texts: A list of strings, or a single string (treated as one text).
        tokenizer: Callable that returns a mapping of PyTorch tensors when
            called with ``return_tensors="pt"``.
        model: Callable that accepts that mapping as keyword arguments and
            returns an object exposing a ``last_hidden_state`` tensor.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        A NumPy array with one row per input text.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if isinstance(texts, str):
        # Slicing a bare string would cut it into character chunks.
        texts = [texts]

    cls_chunks = []
    # max(..., 1) keeps exactly one tokenizer call for empty input, so that
    # case behaves the same as a single un-batched call.
    for start in range(0, max(len(texts), 1), batch_size):
        inputs = tokenizer(texts[start:start + batch_size], padding=True, truncation=True, return_tensors="pt", max_length=512)
        with torch.no_grad():  # inference only, no gradients needed
            outputs = model(**inputs)
        cls_chunks.append(outputs.last_hidden_state[:, 0, :])
    return torch.cat(cls_chunks, dim=0).numpy()

# Tokenizer and encoder are both loaded from the same pre-trained checkpoint
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Read the 4-class workbook, keep the text and label columns, drop incomplete rows
file_path = 'complaints-official-4-classes.xlsx'
df = (
    pd.read_excel(file_path)
    [['Consumer complaint narrative', 'Label']]
    .dropna()
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Consumer complaint narrative'],
    df['Label'],
    test_size=0.2,
    random_state=42,
)

# Embed the training split first, then the test split
train_features, test_features = (
    extract_features(split.tolist(), tokenizer, model)
    for split in (train_texts, test_texts)
)

# Candidate regularisation strengths and solvers for the search
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    solver=['lbfgs', 'newton-cg', 'sag', 'saga'],
)

# 5-fold cross-validated grid search, ranked by accuracy
log_reg = LogisticRegression(max_iter=1000)
grid_search = GridSearchCV(estimator=log_reg, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(train_features, train_labels)

# Estimator selected by the search
best_model = grid_search.best_estimator_

# Predict on the held-out embeddings and score them
predictions = best_model.predict(test_features)
accuracy_4_classes = accuracy_score(test_labels, predictions)
f1_4_classes = f1_score(test_labels, predictions, average='weighted')

# Report the winning parameters and both metrics
print("Best Parameters:", grid_search.best_params_)
print(f"Accuracy over 4 classes: {accuracy_4_classes}")
print(f"F1 Score over 4 classes: {f1_4_classes}")


In [None]:
import matplotlib.pyplot as plt

# x positions: the number of classes in each experiment
classes = [2, 4]

# y values, listed in the same order as `classes`
accuracies = [accuracy_2_classes, accuracy_4_classes]
f1_scores = [f1_2_classes, f1_4_classes]

# One figure with two side-by-side panels: accuracy (left) and F1 score (right)
plt.figure(figsize=(10, 5))
panels = (
    ('Accuracy', accuracies, {}),
    ('F1 Score', f1_scores, {'color': 'orange'}),
)
for position, (metric, values, line_style) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(classes, values, marker='o', **line_style)
    plt.title(f'{metric} over Number of Classes')
    plt.xlabel('Number of Classes')
    plt.ylabel(metric)
    plt.xticks(classes)

# Tidy the spacing between panels and render
plt.tight_layout()
plt.show()

In [None]:
### Use Bert base model and logistic regression in PyTorch with manual hyperparameter tuning (2 classes)
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertModel, BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, TensorDataset

# Custom Logistic Regression model in PyTorch
class LogisticRegressionModel(nn.Module):
    """Single linear layer mapping a feature vector to one raw score.

    No activation is applied inside the module; callers receive the linear
    layer's output unchanged.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Return the linear layer's output for ``x``."""
        return self.linear(x)

# Function to extract features using BERT
def extract_features(texts, tokenizer, model, batch_size=32):
    """Embed texts as the hidden state at sequence position 0 (the [CLS] token).

    The texts are tokenized and passed through ``model`` in mini-batches so
    that memory use is bounded by ``batch_size`` instead of growing with the
    size of the whole dataset. Padding is applied per mini-batch.

    Args:
        texts: A list of strings, or a single string (treated as one text).
        tokenizer: Callable that returns a mapping of PyTorch tensors when
            called with ``return_tensors="pt"``.
        model: Callable that accepts that mapping as keyword arguments and
            returns an object exposing a ``last_hidden_state`` tensor.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        A tensor with one row per input text.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if isinstance(texts, str):
        # Slicing a bare string would cut it into character chunks.
        texts = [texts]

    cls_chunks = []
    # max(..., 1) keeps exactly one tokenizer call for empty input, so that
    # case behaves the same as a single un-batched call.
    for start in range(0, max(len(texts), 1), batch_size):
        inputs = tokenizer(texts[start:start + batch_size], padding=True, truncation=True, return_tensors="pt", max_length=512)
        with torch.no_grad():  # inference only, no gradients needed
            outputs = model(**inputs)
        cls_chunks.append(outputs.last_hidden_state[:, 0, :])
    return torch.cat(cls_chunks, dim=0)

# Setup: tokenizer and encoder from the same checkpoint; the encoder is put in eval mode
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model.eval()

# Load and preprocess dataset: keep the text and label columns, drop incomplete rows
file_path = 'complaints-official-2-classes.xlsx'
df = pd.read_excel(file_path)
df = df[['Consumer complaint narrative', 'Label']].dropna()
train_texts, test_texts, train_labels, test_labels = train_test_split(df['Consumer complaint narrative'], df['Label'], test_size=0.2, random_state=42)

# Extract features
train_features = extract_features(train_texts.tolist(), tokenizer, bert_model)
test_features = extract_features(test_texts.tolist(), tokenizer, bert_model)

# Convert to PyTorch datasets.
# The features are already tensors, so torch.as_tensor is used: wrapping an
# existing tensor in torch.tensor() emits a copy-construct warning.
# Labels become float column vectors of shape (N, 1).
train_dataset = TensorDataset(
    torch.as_tensor(train_features, dtype=torch.float32),
    torch.tensor(train_labels.values, dtype=torch.float32).view(-1, 1),
)
test_dataset = TensorDataset(
    torch.as_tensor(test_features, dtype=torch.float32),
    torch.tensor(test_labels.values, dtype=torch.float32).view(-1, 1),
)

# Hyperparameters
learning_rate = 0.001
weight_decay = 0.001
batch_size = 16
step_size = 30 # Number of epochs after which to reduce learning rate
gamma = 0.1 # Reduction factor for learning rate
num_epochs = 500 # Upper bound on epochs; early stopping may end training sooner

# DataLoader with batch size
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

# Logistic Regression Model
input_dim = train_features.shape[1]
model = LogisticRegressionModel(input_dim)

# Loss and Optimizer
# BCEWithLogitsLoss applies the sigmoid internally, which is numerically more
# stable than a separate torch.sigmoid followed by BCELoss. The model's raw
# outputs (logits) are therefore passed to the criterion directly.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

# Training Loop with Early Stopping
patience = 10
best_val_loss = float('inf')
patience_counter = 0

# NOTE(review): the validation loss below is computed on test_dataset, so the
# stopping point is chosen using the same data that is later used for the final
# metrics. Consider a separate validation split.
for epoch in range(num_epochs):
    model.train()
    for features, labels in train_loader:
        # Forward pass: flatten logits and labels to shape (batch,)
        outputs = model(features).view(-1)
        loss = criterion(outputs, labels.view(-1))

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Update learning rate
    scheduler.step()

    # Validation loss (eval mode, no gradient tracking)
    model.eval()
    with torch.no_grad():
        val_features, val_labels = test_dataset.tensors
        val_outputs = model(val_features).view(-1)
        val_loss = criterion(val_outputs, val_labels.view(-1))

    # `loss` is the loss of the last training batch of this epoch
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}, Validation Loss: {val_loss.item()}')

    # Early Stopping Check
    if val_loss < best_val_loss:
        best_val_loss = val_loss.item()
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter > patience:
        print("Early stopping triggered")
        break
        
# Evaluate the model
model.eval()
with torch.no_grad():
    # torch.as_tensor avoids the copy-construct warning that torch.tensor()
    # emits when it is handed an existing tensor
    logits = model(torch.as_tensor(test_features, dtype=torch.float32))
    # Sigmoid gives probabilities, round() turns them into 0/1, and view(-1)
    # flattens the (N, 1) column so the metrics receive 1-D predictions
    predictions = logits.sigmoid().round().view(-1)
    accuracy = accuracy_score(test_labels, predictions.cpu().numpy())
    f1 = f1_score(test_labels, predictions.cpu().numpy(), average='weighted')

print(f"Accuracy over 2 classes: {accuracy}")
print(f"F1 Score over 2 classes: {f1}")


In [None]:
### Use Bert base model and logistic regression in PyTorch with manual hyperparameter tuning (4 classes)
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertModel, BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, TensorDataset

# Custom Logistic Regression model in PyTorch
class LogisticRegressionModel(nn.Module):
    """Single linear layer mapping a feature vector to ``num_classes`` raw scores.

    No activation is applied inside the module; callers receive the linear
    layer's output unchanged.
    """

    def __init__(self, input_dim, num_classes=4):
        super().__init__()
        self.linear = nn.Linear(input_dim, num_classes)

    def forward(self, x):
        """Return the linear layer's output for ``x``."""
        return self.linear(x)

# Function to extract features using BERT
def extract_features(texts, tokenizer, model, batch_size=32):
    """Embed texts as the hidden state at sequence position 0 (the [CLS] token).

    The texts are tokenized and passed through ``model`` in mini-batches so
    that memory use is bounded by ``batch_size`` instead of growing with the
    size of the whole dataset. Padding is applied per mini-batch.

    Args:
        texts: A list of strings, or a single string (treated as one text).
        tokenizer: Callable that returns a mapping of PyTorch tensors when
            called with ``return_tensors="pt"``.
        model: Callable that accepts that mapping as keyword arguments and
            returns an object exposing a ``last_hidden_state`` tensor.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        A tensor with one row per input text.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if isinstance(texts, str):
        # Slicing a bare string would cut it into character chunks.
        texts = [texts]

    cls_chunks = []
    # max(..., 1) keeps exactly one tokenizer call for empty input, so that
    # case behaves the same as a single un-batched call.
    for start in range(0, max(len(texts), 1), batch_size):
        inputs = tokenizer(texts[start:start + batch_size], padding=True, truncation=True, return_tensors="pt", max_length=512)
        with torch.no_grad():  # inference only, no gradients needed
            outputs = model(**inputs)
        cls_chunks.append(outputs.last_hidden_state[:, 0, :])
    return torch.cat(cls_chunks, dim=0)

# Setup: tokenizer and encoder from the same checkpoint; the encoder is put in eval mode
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model.eval()

# Load and preprocess dataset: keep the text and label columns, drop incomplete rows
file_path = 'complaints-official-4-classes.xlsx'
df = pd.read_excel(file_path)
df = df[['Consumer complaint narrative', 'Label']].dropna()
train_texts, test_texts, train_labels, test_labels = train_test_split(df['Consumer complaint narrative'], df['Label'], test_size=0.2, random_state=42)

# Extract features
train_features = extract_features(train_texts.tolist(), tokenizer, bert_model)
test_features = extract_features(test_texts.tolist(), tokenizer, bert_model)

# Convert labels for multi-class classification (integer class ids for CrossEntropyLoss)
train_labels = torch.tensor(train_labels.values, dtype=torch.long)
test_labels = torch.tensor(test_labels.values, dtype=torch.long)

# Convert to PyTorch datasets for multi-class.
# The features are already tensors, so torch.as_tensor is used: wrapping an
# existing tensor in torch.tensor() emits a copy-construct warning.
train_dataset = TensorDataset(torch.as_tensor(train_features, dtype=torch.float32), train_labels)
test_dataset = TensorDataset(torch.as_tensor(test_features, dtype=torch.float32), test_labels)

# Optimisation settings
learning_rate, weight_decay = 0.001, 0.001
batch_size = 16
step_size, gamma = 30, 0.1  # every `step_size` epochs the learning rate is multiplied by `gamma`

# Early-stopping bookkeeping
patience, patience_counter = 10, 0
best_val_loss = float('inf')

# Shuffled mini-batches over the training set
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Linear classifier sized to the width of the extracted features
input_dim = train_features.shape[1]
model = LogisticRegressionModel(input_dim)

# Cross-entropy loss, Adam with weight decay, and a step learning-rate schedule
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

for epoch in range(500):  # Number of epochs
    # One pass over the shuffled training batches
    model.train()
    for features, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Advance the learning-rate schedule once per epoch
    scheduler.step()

    # Loss on the held-out tensors, in eval mode and without gradient tracking
    model.eval()
    val_features, val_labels = test_dataset.tensors
    with torch.no_grad():
        val_outputs = model(val_features)
        val_loss = criterion(val_outputs, val_labels)

    # `loss` is the loss of the last training batch of this epoch
    print(f'Epoch [{epoch+1}/500], Loss: {loss.item()}, Validation Loss: {val_loss.item()}')

    # Early stopping: reset the counter on improvement, otherwise count up
    improved = bool(val_loss < best_val_loss)
    if improved:
        best_val_loss = val_loss
    patience_counter = 0 if improved else patience_counter + 1

    if patience_counter > patience:
        print("Early stopping triggered")
        break

# Evaluate the model for multi-class classification
model.eval()
with torch.no_grad():
    # torch.as_tensor avoids the copy-construct warning that torch.tensor()
    # emits when it is handed an existing tensor
    predictions = model(torch.as_tensor(test_features, dtype=torch.float32))
    # The predicted class is the index of the largest score in each row
    predicted_classes = torch.argmax(predictions, dim=1)
    accuracy = accuracy_score(test_labels.cpu().numpy(), predicted_classes.cpu().numpy())
    f1 = f1_score(test_labels.cpu().numpy(), predicted_classes.cpu().numpy(), average='weighted')

print(f"Accuracy over 4 classes: {accuracy}")
print(f"F1 Score over 4 classes: {f1}")

In [None]:
### Use Bert base model and MLPClassifier
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertModel, BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, TensorDataset

# Seed PyTorch's random number generator with a fixed value
torch.manual_seed(42)

# Custom MLP Classifier model in PyTorch
class MLPClassifier(nn.Module):
    """Feed-forward classifier: input -> 512 -> 128 -> ``num_classes`` raw scores.

    ReLU follows each of the two hidden layers; the output layer is left
    un-activated.
    """

    def __init__(self, input_dim, num_classes=4):
        super().__init__()
        # Layers are created in forward order: two hidden layers, then the output layer
        self.fc1 = nn.Linear(input_dim, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return un-normalised class scores for ``x``."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)

# Function to extract features using BERT
def extract_features(texts, tokenizer, model, batch_size=32):
    """Embed texts as the hidden state at sequence position 0 (the [CLS] token).

    The texts are tokenized and passed through ``model`` in mini-batches so
    that memory use is bounded by ``batch_size`` instead of growing with the
    size of the whole dataset. Padding is applied per mini-batch.

    Args:
        texts: A list of strings, or a single string (treated as one text).
        tokenizer: Callable that returns a mapping of PyTorch tensors when
            called with ``return_tensors="pt"``.
        model: Callable that accepts that mapping as keyword arguments and
            returns an object exposing a ``last_hidden_state`` tensor.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        A tensor with one row per input text.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if isinstance(texts, str):
        # Slicing a bare string would cut it into character chunks.
        texts = [texts]

    cls_chunks = []
    # max(..., 1) keeps exactly one tokenizer call for empty input, so that
    # case behaves the same as a single un-batched call.
    for start in range(0, max(len(texts), 1), batch_size):
        inputs = tokenizer(texts[start:start + batch_size], padding=True, truncation=True, return_tensors="pt", max_length=512)
        with torch.no_grad():  # inference only, no gradients needed
            outputs = model(**inputs)
        cls_chunks.append(outputs.last_hidden_state[:, 0, :])
    return torch.cat(cls_chunks, dim=0)

# Setup: tokenizer and encoder from the same checkpoint; the encoder is put in eval mode
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model.eval()

# Load and preprocess dataset: keep the text and label columns, drop incomplete rows
file_path = 'complaints-official-4-classes.xlsx'
df = pd.read_excel(file_path)
df = df[['Consumer complaint narrative', 'Label']].dropna()
train_texts, test_texts, train_labels, test_labels = train_test_split(df['Consumer complaint narrative'], df['Label'], test_size=0.2, random_state=42)

# Extract features
train_features = extract_features(train_texts.tolist(), tokenizer, bert_model)
test_features = extract_features(test_texts.tolist(), tokenizer, bert_model)

# Integer class ids for CrossEntropyLoss
train_labels_tensor = torch.tensor(train_labels.values, dtype=torch.long)
test_labels_tensor = torch.tensor(test_labels.values, dtype=torch.long)

# Convert to PyTorch datasets for multi-class.
# The features are already tensors, so torch.as_tensor is used: wrapping an
# existing tensor in torch.tensor() emits a copy-construct warning.
train_dataset = TensorDataset(torch.as_tensor(train_features, dtype=torch.float32), train_labels_tensor)
test_dataset = TensorDataset(torch.as_tensor(test_features, dtype=torch.float32), test_labels_tensor)

# MLP Classifier Model
# This is the model that gets trained. It must not be reassigned before the
# optimizer is built, because the optimizer captures model.parameters().
input_dim = train_features.shape[1]
model = MLPClassifier(input_dim)

# Hyperparameters
learning_rate = 0.001
weight_decay = 0.001
batch_size = 16
step_size = 30 # Number of epochs after which to reduce learning rate
gamma = 0.1 # Reduction factor for learning rate

# DataLoader with batch size
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

# Training Loop with Early Stopping
patience = 50
best_val_loss = float('inf')
patience_counter = 0

for epoch in range(500):  # Number of epochs
    # One pass over the shuffled training batches
    model.train()
    for features, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Advance the learning-rate schedule once per epoch
    scheduler.step()

    # Loss on the held-out tensors, in eval mode and without gradient tracking
    model.eval()
    val_features, val_labels = test_dataset.tensors
    with torch.no_grad():
        val_outputs = model(val_features)
        val_loss = criterion(val_outputs, val_labels)

    # `loss` is the loss of the last training batch of this epoch
    print(f'Epoch [{epoch+1}/500], Loss: {loss.item()}, Validation Loss: {val_loss.item()}')

    # Early stopping: reset the counter on improvement, otherwise count up
    improved = bool(val_loss < best_val_loss)
    if improved:
        best_val_loss = val_loss
    patience_counter = 0 if improved else patience_counter + 1

    if patience_counter > patience:
        print("Early stopping triggered")
        break

# Evaluate the model for multi-class classification
model.eval()
with torch.no_grad():
    # torch.as_tensor avoids the copy-construct warning that torch.tensor()
    # emits when it is handed an existing tensor
    predictions = model(torch.as_tensor(test_features, dtype=torch.float32))
    # The predicted class is the index of the largest score in each row;
    # it is converted to NumPy before being handed to scikit-learn
    predicted_classes = torch.argmax(predictions, dim=1).cpu().numpy()
    accuracy = accuracy_score(test_labels, predicted_classes)
    f1 = f1_score(test_labels, predicted_classes, average='weighted')

print(f"Accuracy over 4 classes: {accuracy}")
print(f"F1 Score over 4 classes: {f1}")

In [None]:
### BERT base ([CLS] embeddings) with scikit-learn MLPClassifier and grid search for 4 classes
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
import pandas as pd
from transformers import BertModel, BertTokenizer
from sklearn.model_selection import train_test_split
import torch

# Seed PyTorch's random number generator with a fixed value
torch.manual_seed(42)

# Read the 4-class workbook, keep the text and label columns, drop incomplete rows
file_path = 'complaints-official-4-classes.xlsx'
df = (
    pd.read_excel(file_path)
    [['Consumer complaint narrative', 'Label']]
    .dropna()
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Consumer complaint narrative'],
    df['Label'],
    test_size=0.2,
    random_state=42,
)

# Tokenizer and encoder from the bert-base-uncased checkpoint; encoder in eval mode
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)
bert_model.eval()

# Extract features
def extract_features(texts, tokenizer, model, batch_size=32):
    """Embed texts as the hidden state at sequence position 0 (the [CLS] token).

    The texts are tokenized and passed through ``model`` in mini-batches so
    that memory use is bounded by ``batch_size`` instead of growing with the
    size of the whole dataset. Padding is applied per mini-batch.

    Args:
        texts: A list of strings, or a single string (treated as one text).
        tokenizer: Callable that returns a mapping of PyTorch tensors when
            called with ``return_tensors="pt"``.
        model: Callable that accepts that mapping as keyword arguments and
            returns an object exposing a ``last_hidden_state`` tensor.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        A NumPy array with one row per input text.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    if isinstance(texts, str):
        # Slicing a bare string would cut it into character chunks.
        texts = [texts]

    cls_chunks = []
    # max(..., 1) keeps exactly one tokenizer call for empty input, so that
    # case behaves the same as a single un-batched call.
    for start in range(0, max(len(texts), 1), batch_size):
        inputs = tokenizer(texts[start:start + batch_size], padding=True, truncation=True, return_tensors="pt", max_length=512)
        with torch.no_grad():  # inference only, no gradients needed
            outputs = model(**inputs)
        cls_chunks.append(outputs.last_hidden_state[:, 0, :])
    return torch.cat(cls_chunks, dim=0).cpu().numpy()

# Embed the training split first, then the test split
train_features, test_features = (
    extract_features(split.tolist(), tokenizer, bert_model)
    for split in (train_texts, test_texts)
)

# Replace the label Series with their underlying arrays
train_labels, test_labels = train_labels.values, test_labels.values

# Define MLPClassifier
# random_state makes scikit-learn's weight initialisation and shuffling
# repeatable; torch.manual_seed only seeds PyTorch, not scikit-learn.
mlp_classifier = MLPClassifier(random_state=42)

# Define hyperparameters grid for grid search
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (150,)],
    'alpha': [0.0001, 0.001, 0.01],
    'max_iter': [1000, 3000, 6000]
}

# Perform grid search with cross-validation (3 folds, ranked by accuracy, all cores)
grid_search = GridSearchCV(mlp_classifier, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(train_features, train_labels)

# Get best hyperparameters
best_params = grid_search.best_params_

# GridSearchCV refits the winning configuration on the full training data by
# default, so that estimator is reused here instead of training a second,
# differently initialised MLPClassifier from best_params.
best_mlp_classifier = grid_search.best_estimator_

# Predict using best model
predictions = best_mlp_classifier.predict(test_features)

# Calculate accuracy and F1 score
accuracy = accuracy_score(test_labels, predictions)
f1 = f1_score(test_labels, predictions, average='weighted')

print(f"Accuracy over 4 classes: {accuracy}")
print(f"F1 Score over 4 classes: {f1}")