In [3]:
from transformers import BertModel, BertTokenizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn.svm import LinearSVC
import pandas as pd
import torch

# Setup for stsb-bert-base model
tokenizer = BertTokenizer.from_pretrained('sentence-transformers/stsb-bert-base')
bert_model = BertModel.from_pretrained('sentence-transformers/stsb-bert-base')
bert_model.eval()

# Function to extract features using BERT
def extract_features(texts, tokenizer, model):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].cpu().numpy()

# Load and preprocess dataset
file_path = 'complaints-official-2-classes.xlsx'
df = pd.read_excel(file_path)
df = df[['Consumer complaint narrative', 'Label']]
df.dropna(inplace=True)
train_texts, test_texts, train_labels, test_labels = train_test_split(df['Consumer complaint narrative'], df['Label'], test_size=0.2, random_state=42)

# Extract features
train_features = extract_features(train_texts.tolist(), tokenizer, bert_model)
test_features = extract_features(test_texts.tolist(), tokenizer, bert_model)

# Hyperparameters grid for grid search
param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'max_iter': [1000]
}

# Initialize LinearSVC
linear_svm = LinearSVC()

# Grid search with cross-validation
grid_search = GridSearchCV(linear_svm, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(train_features, train_labels)

# Get best hyperparameters
best_params = grid_search.best_params_

# Train LinearSVC with best hyperparameters
best_linear_svm = LinearSVC(**best_params)
best_linear_svm.fit(train_features, train_labels)

# Predict using best LinearSVC
predictions = best_linear_svm.predict(test_features)

# Calculate accuracy and F1 score
accuracy = accuracy_score(test_labels, predictions)
f1 = f1_score(test_labels, predictions, average='weighted')

print(f"Best Hyperparameters: {best_params}")
print(f"Accuracy over 2 classes: {accuracy}")
print(f"F1 Score over 2 classes: {f1}")

9 fits failed out of a total of 18.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/xiaojingzhang/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/xiaojingzhang/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/xiaojingzhang/anaconda3/lib/python3.11/site-packages/sklearn/svm/_classes.py", line 326, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
                       

Best Hyperparameters: {'C': 0.1, 'max_iter': 1000, 'penalty': 'l2'}
Accuracy over 2 classes: 0.7954545454545454
F1 Score over 2 classes: 0.795137420718816


In [4]:
### Sentence Transformer with LinearSVM for 4 classes
import torch
from transformers import BertModel, BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.svm import LinearSVC
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV

# Function to extract features using BERT
def extract_features(texts, tokenizer, model):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].detach().numpy()

# Setup for stsb-bert-base model
tokenizer = BertTokenizer.from_pretrained('sentence-transformers/stsb-bert-base')
bert_model = BertModel.from_pretrained('sentence-transformers/stsb-bert-base')
bert_model.eval()

# Load and preprocess dataset
file_path = 'complaints-official-4-classes.xlsx'
df = pd.read_excel(file_path)
df = df[['Consumer complaint narrative', 'Label']]
df.dropna(inplace=True)
train_texts, test_texts, train_labels, test_labels = train_test_split(df['Consumer complaint narrative'], df['Label'], test_size=0.2, random_state=42)

# Extract features
train_features = extract_features(train_texts.tolist(), tokenizer, bert_model)
test_features = extract_features(test_texts.tolist(), tokenizer, bert_model)

# Define the parameter grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],  # Expanded range
    'tol': [1e-5, 1e-4, 1e-3, 1e-2],
    'max_iter': [500, 1000, 2000, 5000]
}

# Create GridSearchCV object
grid_search = GridSearchCV(LinearSVC(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(train_features, train_labels)

# Best model
best_model = grid_search.best_estimator_

# Make predictions on the test set
predictions = best_model.predict(test_features)

# Calculate accuracy and F1 score
accuracy = accuracy_score(test_labels, predictions)
f1 = f1_score(test_labels, predictions, average='weighted')

print("Best Parameters:", grid_search.best_params_)
print(f"Accuracy over 4 classes: {accuracy}")
print(f"F1 Score over 4 classes: {f1}")





































Best Parameters: {'C': 0.01, 'max_iter': 500, 'tol': 1e-05}
Accuracy over 4 classes: 0.5909090909090909
F1 Score over 4 classes: 0.5727476445123504


