In [None]:
### Sentence Transformer with LinearSVC for 2 classes
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix as sk_confusion_matrix
from sklearn.svm import LinearSVC
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import BertModel, BertTokenizer
import numpy as np
import torch
import scipy

# Load the tokenizer and the encoder weights from the same
# 'stsb-bert-base' checkpoint so their vocabularies stay in sync.
checkpoint_name = 'sentence-transformers/stsb-bert-base'
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
bert_model = BertModel.from_pretrained(checkpoint_name)

def encode_texts(texts):
    """Embed a batch of texts with the module-level tokenizer and BERT model.

    The sentence embedding is the attention-masked mean of the last-layer
    token embeddings. Per the model card of the
    'sentence-transformers/stsb-bert-base' checkpoint, that is the pooling the
    checkpoint is meant to be used with; ``pooler_output`` (the projected
    [CLS] vector) is not the sentence embedding for this model.

    Args:
        texts: List of strings to embed. Inputs longer than 512 tokens are
            truncated; shorter ones are padded to the longest in the batch.

    Returns:
        A 2-D NumPy array with one row per input text.
    """
    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=512)
    with torch.no_grad():
        model_output = bert_model(**encoded_input)
        token_embeddings = model_output.last_hidden_state
        # Broadcast the mask over the hidden dimension so that padding
        # positions contribute nothing to the sum.
        mask = encoded_input['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)
        summed = (token_embeddings * mask).sum(dim=1)
        # Guard against division by zero for a (theoretical) all-padding row.
        token_counts = mask.sum(dim=1).clamp(min=1e-9)
        sentence_embeddings = summed / token_counts
    return sentence_embeddings.numpy()

def extract_features(narratives, domain_terms=None):
    """Compute two hand-crafted features for every narrative.

    Args:
        narratives: Iterable of complaint texts (strings).
        domain_terms: Optional iterable of terms to count. When omitted, the
            built-in finance vocabulary below is used, so existing callers are
            unaffected.

    Returns:
        A list with one ``[avg_sentence_length, domain_term_count]`` pair per
        narrative. ``avg_sentence_length`` is the mean number of NLTK word
        tokens per NLTK sentence (0 when no sentence is found).
        ``domain_term_count`` is the total number of case-insensitive
        *substring* occurrences of the terms, so e.g. "act" is also counted
        inside "contact".
    """
    if domain_terms is None:
        domain_terms = ["credit", "debt", "loan", "score", "report", "financial", "consumer",
                        "account", "law", "act", "reporting", "services", "refund", "card"]
    # Normalise the vocabulary once; narratives are lower-cased before matching.
    terms = [term.lower() for term in domain_terms]

    features = []
    for narrative in narratives:
        sentences = nltk.sent_tokenize(narrative)
        if sentences:
            token_total = sum(len(nltk.word_tokenize(sentence)) for sentence in sentences)
            avg_sentence_length = token_total / len(sentences)
        else:
            avg_sentence_length = 0
        # Lower-case once per narrative instead of once per term.
        lowered = narrative.lower()
        domain_term_count = sum(lowered.count(term) for term in terms)
        features.append([avg_sentence_length, domain_term_count])
    return features

# Load and preprocess the dataset
# NOTE(review): this cell is titled "2 classes" but reads the 4-class
# workbook -- confirm that this is the intended file.
file_path = 'complaints-official-4-classes.xlsx'
df = pd.read_excel(file_path)
# Keep only text + label and drop incomplete rows. Chaining returns a fresh
# frame, so the column assignments below never touch a slice of the raw data.
df = df[['Consumer complaint narrative', 'Label']].dropna()

# Extract manual features: one [avg_sentence_length, domain_term_count] per row
manual_features = extract_features(df['Consumer complaint narrative'])

# Initialize TF-IDF Vectorizer and compute TF-IDF features
# NOTE(review): the vectorizer is fitted on every row before the split, so the
# vocabulary/IDF statistics also see the test and validation narratives.
tfidf_vectorizer = TfidfVectorizer(max_features=500)
tfidf_features = tfidf_vectorizer.fit_transform(df['Consumer complaint narrative'])

# Encode the data using BERT (one narrative per forward pass)
df['encoded_texts'] = df['Consumer complaint narrative'].apply(lambda x: encode_texts([x])[0])

# Keep per-row copies of the extra features on the frame. A DataFrame column
# must be 1-D, so each cell stores that row's list / sparse row; assigning the
# (n_rows, 2) array directly raises ValueError.
df['manual_features'] = manual_features
df['tfidf_features'] = list(tfidf_features)

# Combine BERT encodings, manual features, and TF-IDF features column-wise.
# All three operands must be 2-D with one row per narrative.
manual_feature_matrix = np.asarray(manual_features, dtype=float)
combined_features = np.hstack([
    np.vstack(df['encoded_texts'].to_list()),
    manual_feature_matrix,
    tfidf_features.toarray(),
])

# Split the dataset into 20% training, 60% test, and 20% validation sets.
# random_state pins the shuffle so metrics are reproducible across runs.
train_features, remaining_features, train_labels, remaining_labels = train_test_split(
    combined_features, df['Label'], test_size=0.80, stratify=df['Label'], random_state=0)
test_features, val_features, test_labels, val_labels = train_test_split(
    remaining_features, remaining_labels, test_size=0.25, stratify=remaining_labels,
    random_state=0)  # 0.25 * 0.80 = 0.20

# Initialize and train the LinearSVC model
model = LinearSVC(random_state=0)
model.fit(train_features, train_labels)

# Make predictions on test and validation sets
test_predictions = model.predict(test_features)
val_predictions = model.predict(val_features)

# Calculate metrics for test set (F1 is weighted by class support)
test_accuracy = accuracy_score(test_labels, test_predictions)
test_f1 = f1_score(test_labels, test_predictions, average='weighted')

# Calculate metrics for validation set
val_accuracy = accuracy_score(val_labels, val_predictions)
val_f1 = f1_score(val_labels, val_predictions, average='weighted')

print(f'Test Accuracy: {test_accuracy}')
print(f'Test F1 Score: {test_f1}')
print(f'Validation Accuracy: {val_accuracy}')
print(f'Validation F1 Score: {val_f1}')

# Confusion Matrix for test set.
# Passing the trained classes fixes the row/column order and lets the heatmap
# show the actual label values instead of positional indices.
class_labels = model.classes_
test_conf_matrix = sk_confusion_matrix(test_labels, test_predictions, labels=class_labels)
plt.figure(figsize=(10, 7))
sns.heatmap(test_conf_matrix, annot=True, fmt='d',
            xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Test Set Confusion Matrix')
plt.show()

# Print incorrect predictions on test set.
# test_labels keeps the original row index, which is used to look the
# narratives up again in df.
test_texts = df.loc[test_labels.index, 'Consumer complaint narrative']
incorrect_test_predictions = [
    (text, pred, actual)
    for text, pred, actual in zip(test_texts, test_predictions, test_labels)
    if pred != actual
]

print("Incorrect Predictions on Test Set:")
for text, pred, label in incorrect_test_predictions:
    print(f"Text: {text}, Predicted: {pred}, Actual: {label}")

In [None]:
### Sentence Transformer with LinearSVC for 4 classes
import torch
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix as sk_confusion_matrix
from sklearn.svm import LinearSVC
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import BertModel, BertTokenizer
import numpy as np

def extract_features(narratives, domain_terms=None):
    """Compute two hand-crafted features for every narrative.

    Args:
        narratives: Iterable of complaint texts (strings).
        domain_terms: Optional iterable of terms to count. When omitted, the
            built-in vocabulary below is used, so existing callers are
            unaffected.

    Returns:
        A list with one ``[avg_sentence_length, domain_term_count]`` pair per
        narrative. ``avg_sentence_length`` is the mean number of NLTK word
        tokens per NLTK sentence (0 when no sentence is found).
        ``domain_term_count`` is the total number of case-insensitive
        *substring* occurrences of the terms, so e.g. "act" is also counted
        inside "contact".
    """
    if domain_terms is None:
        domain_terms = [
            "money", "help", "credit", "account", "paid", "debt", "card",
            "loan", "financial", "consumer", "payment", "service",
            "reporting", "act", "law", "unauthorized", "section", "usc", "rights"
        ]
    # Normalise the vocabulary once; narratives are lower-cased before matching.
    terms = [term.lower() for term in domain_terms]

    features = []
    for narrative in narratives:
        sentences = nltk.sent_tokenize(narrative)
        if sentences:
            token_total = sum(len(nltk.word_tokenize(sentence)) for sentence in sentences)
            avg_sentence_length = token_total / len(sentences)
        else:
            avg_sentence_length = 0
        # Lower-case once per narrative instead of once per term.
        lowered = narrative.lower()
        domain_term_count = sum(lowered.count(term) for term in terms)
        features.append([avg_sentence_length, domain_term_count])
    return features

# Read the complaints workbook, keep the narrative text and its label,
# and discard rows where either one is missing.
file_path = 'complaints-official-4-classes.xlsx'
df = pd.read_excel(file_path)[['Consumer complaint narrative', 'Label']].dropna()

# Hand-crafted features: one [avg_sentence_length, domain_term_count] per row
narratives = df['Consumer complaint narrative']
manual_features = extract_features(narratives)

# Bag-of-words features: TF-IDF restricted to the 500 highest-frequency terms
tfidf_vectorizer = TfidfVectorizer(max_features=500)
tfidf_features = tfidf_vectorizer.fit_transform(narratives)

# Load the tokenizer and the encoder weights from the same
# 'stsb-bert-base' checkpoint so their vocabularies stay in sync.
checkpoint_name = 'sentence-transformers/stsb-bert-base'
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
bert_model = BertModel.from_pretrained(checkpoint_name)

def encode_texts(texts):
    """Embed a batch of texts with the module-level tokenizer and BERT model.

    The sentence embedding is the attention-masked mean of the last-layer
    token embeddings. Per the model card of the
    'sentence-transformers/stsb-bert-base' checkpoint, that is the pooling the
    checkpoint is meant to be used with; ``pooler_output`` (the projected
    [CLS] vector) is not the sentence embedding for this model.

    Args:
        texts: List of strings to embed. Inputs longer than 512 tokens are
            truncated; shorter ones are padded to the longest in the batch.

    Returns:
        A 2-D NumPy array with one row per input text.
    """
    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=512)
    with torch.no_grad():
        model_output = bert_model(**encoded_input)
        token_embeddings = model_output.last_hidden_state
        # Broadcast the mask over the hidden dimension so that padding
        # positions contribute nothing to the sum.
        mask = encoded_input['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)
        summed = (token_embeddings * mask).sum(dim=1)
        # Guard against division by zero for a (theoretical) all-padding row.
        token_counts = mask.sum(dim=1).clamp(min=1e-9)
        sentence_embeddings = summed / token_counts
    return sentence_embeddings.numpy()

# Encode the data and extract features.
# Each new column stores one object per row: the BERT vector, the
# [avg_sentence_length, domain_term_count] list, and the 1 x n_terms sparse
# TF-IDF row.
df['encoded_texts'] = df['Consumer complaint narrative'].apply(lambda x: encode_texts([x])[0])
df['manual_features'] = manual_features
df['tfidf_features'] = list(tfidf_features)

# Combine BERT encodings, manual features, and TF-IDF features into one flat
# vector per row (the sparse TF-IDF row is densified before concatenation).
# NOTE(review): tfidf_features was fitted on every row before the split below,
# so vocabulary/IDF statistics also see the test and validation narratives.
df['combined_features'] = df.apply(
    lambda row: np.concatenate((
        row['encoded_texts'],
        row['manual_features'],
        row['tfidf_features'].toarray()[0],
    )),
    axis=1,
)

# Split the dataset into 20% training, 60% test, and 20% validation sets.
# random_state pins the shuffle so metrics are reproducible across runs.
train_data, remaining_data = train_test_split(
    df, test_size=0.80, stratify=df['Label'], random_state=0)
test_data, val_data = train_test_split(
    remaining_data, test_size=0.25, stratify=remaining_data['Label'],
    random_state=0)  # 0.25 * 0.80 = 0.20

# Prepare the data for training, testing, and validation sets:
# stack the per-row vectors into 2-D matrices and pull labels out as arrays.
X_train = np.vstack(train_data['combined_features'])
y_train = train_data['Label'].values
X_test = np.vstack(test_data['combined_features'])
y_test = test_data['Label'].values
X_val = np.vstack(val_data['combined_features'])
y_val = val_data['Label'].values

# Initialize and train the LinearSVC model.
# random_state makes the solver's internal shuffling repeatable, matching the
# seeded model used in the 2-class cell.
model = LinearSVC(random_state=0)
model.fit(X_train, y_train)

# Define the parameter grid
# NOTE(review): this grid is not consumed anywhere in this cell -- the model
# above is trained with default hyperparameters. Kept in case a later cell
# runs a search with it.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],  # Expanded range
    'tol': [1e-5, 1e-4, 1e-3, 1e-2],
    'max_iter': [500, 1000, 2000, 5000]
}

# Make predictions on the test and validation sets
test_predictions = model.predict(X_test)
val_predictions = model.predict(X_val)

# Test and validation metrics (F1 is weighted by class support)
test_accuracy = accuracy_score(y_test, test_predictions)
test_f1 = f1_score(y_test, test_predictions, average='weighted')
val_accuracy = accuracy_score(y_val, val_predictions)
val_f1 = f1_score(y_val, val_predictions, average='weighted')

print(f'Test Accuracy: {test_accuracy}')
print(f'Test F1 Score: {test_f1}')
print(f'Validation Accuracy: {val_accuracy}')
print(f'Validation F1 Score: {val_f1}')

# Confusion Matrix for validation set.
# Passing the trained classes fixes the row/column order and lets the heatmap
# show the actual label values instead of positional indices.
class_labels = model.classes_
val_conf_matrix = sk_confusion_matrix(y_val, val_predictions, labels=class_labels)
plt.figure(figsize=(10, 7))
sns.heatmap(val_conf_matrix, annot=True, fmt='d',
            xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Validation Set Confusion Matrix')
plt.show()

# Print incorrect predictions on validation set
incorrect_val_predictions = [
    (text, pred, actual)
    for text, pred, actual in zip(val_data['Consumer complaint narrative'], val_predictions, y_val)
    if pred != actual
]

print("Incorrect Predictions on Validation Set:")
for text, pred, label in incorrect_val_predictions:
    print(f"Text: {text}, Predicted: {pred}, Actual: {label}")