In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
import warnings

warnings.filterwarnings('ignore')

# --- Make sure the NLTK corpora used later are available locally ---
# Each entry pairs a probe with the notice to print and the NLTK package to
# download; a LookupError raised by the probe is what triggers the download.
_NLTK_CORPORA = (
    (lambda: stopwords.words('english'), "Downloading NLTK stopwords...", 'stopwords'),
    # WordNet is needed by the lemmatizer.
    (lambda: nltk.data.find('corpora/wordnet.zip'), "Downloading NLTK wordnet...", 'wordnet'),
)
for _probe, _notice, _package in _NLTK_CORPORA:
    try:
        _probe()
    except LookupError:
        print(_notice)
        nltk.download(_package)


# --- 1. Exploratory Data Analysis and Feature Engineering ---
print("--- Step 1: Loading and Preparing Data ---")

# Load the dataset from a local CSV file
DATA_PATH = "complaints.csv"
try:
    df = pd.read_csv(DATA_PATH, on_bad_lines='skip')
    print(f"Dataset loaded successfully with {len(df)} rows.")
except Exception as e:
    print(f"Failed to load dataset. Error: {e}")
    # Stop with a non-zero status: a bare exit() reports success (status 0)
    # and depends on the helper injected by the `site` module.
    raise SystemExit(1)

# Target categories: short display name -> exact value found in the 'Product' column
TARGET_PRODUCTS = {
    "Credit reporting, repair, or other": "Credit reporting, credit repair services, or other personal consumer reports",
    "Debt collection": "Debt collection",
    "Consumer Loan": "Consumer Loan",
    "Mortgage": "Mortgage"
}

# Keep only the rows whose product is one of the targets
df_filtered = df[df['Product'].isin(TARGET_PRODUCTS.values())].copy()
print(f"Filtered to {len(df_filtered)} rows with target products.")

# Drop rows where the complaint narrative is missing, as it's our feature
df_filtered.dropna(subset=['Consumer complaint narrative'], inplace=True)
print(f"Kept {len(df_filtered)} rows with non-empty complaint narratives.")


# Numerical labels (0, 1, 2, 3) follow the order of TARGET_PRODUCTS, so the
# label ids always line up with the display names used in later reports.
product_to_label = {
    product: label_id for label_id, product in enumerate(TARGET_PRODUCTS.values())
}
df_filtered['label'] = df_filtered['Product'].map(product_to_label)
label_to_product = {v: k for k, v in product_to_label.items()}

# Display the class distribution
print("\nClass Distribution:")
print(df_filtered['Product'].value_counts())
print("\n")


# --- 2. Text Pre-Processing ---
print("--- Step 2: Text Pre-Processing ---")

# Shared lemmatizer instance and English stopword set read by preprocess_text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize one complaint narrative into a space-separated token string.

    The text is lowercased, every character other than a-z or whitespace is
    removed, the result is split on whitespace, tokens present in
    ``stop_words`` are dropped, and the remaining tokens are lemmatized.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    kept_tokens = []
    for token in letters_only.split():
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_tokens)

# Clean every complaint narrative with preprocess_text.
# On a large dataset this step can take a while.
print("Applying preprocessing to complaint narratives...")
raw_narratives = df_filtered['Consumer complaint narrative']
df_filtered['cleaned_narrative'] = raw_narratives.apply(preprocess_text)
print("Preprocessing complete.")
print("\n")


# Features (X) are the cleaned narratives; the target (y) is the numeric label
X, y = df_filtered['cleaned_narrative'], df_filtered['label']

# Hold out 20% of the rows for testing, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Data split into {len(X_train)} training samples and {len(X_test)} testing samples.")
print("\n")

# --- 3. Selection of Multi Classification Models ---
# We will compare three popular models for text classification:
# - Multinomial Naive Bayes
# - Logistic Regression
# - Linear Support Vector Machine (SVM)

models = {
    "Multinomial Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Linear SVM": LinearSVC(random_state=42)
}

# --- 4 & 5. Comparison of Model Performance & Evaluation ---
print("--- Steps 4 & 5: Training, Comparing, and Evaluating Models ---")

# Explicit label ids and their display names, both ordered by label id, so
# the report rows cannot be mislabeled and a class that is absent from a
# split does not make classification_report raise.
report_labels = sorted(product_to_label.values())
report_names = [
    short_name
    for short_name, _product in sorted(
        TARGET_PRODUCTS.items(), key=lambda pair: product_to_label[pair[1]]
    )
]

best_model = None
best_model_name = None
# Start below any attainable score so the first evaluated model always
# becomes the initial best (previously best_model_name could stay undefined
# and best_model could stay None).
best_f1_score = float('-inf')

for model_name, model_instance in models.items():
    print(f"--- Training and Evaluating {model_name} ---")

    # Create a pipeline that first vectorizes the text and then applies the classifier
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
        ('classifier', model_instance)
    ])

    # Train the model
    pipeline.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = pipeline.predict(X_test)

    # Evaluate performance: dict form for model selection, text form for display
    report = classification_report(
        y_test, y_pred, labels=report_labels, target_names=report_names, output_dict=True
    )
    print("Classification Report:")
    print(classification_report(y_test, y_pred, labels=report_labels, target_names=report_names))

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred, labels=report_labels))
    print("\n")

    # Check if this model is the best one so far based on weighted F1-score
    weighted_f1 = report['weighted avg']['f1-score']
    if weighted_f1 > best_f1_score:
        best_f1_score = weighted_f1
        best_model = pipeline
        best_model_name = model_name

print(f"Best performing model is: {best_model_name} with a weighted F1-score of {best_f1_score:.4f}")
print("\n")

# --- 6. Prediction ---
print("--- Step 6: Prediction on a New Complaint ---")

# A new, unseen complaint
new_complaint = """
I am writing to dispute a charge on my mortgage account.
My bank, Acme Bank, has incorrectly charged me a late fee,
but I sent the payment well before the due date. I have bank
records to prove it. This has negatively affected my credit score.
"""

# The pipelines were fitted on text cleaned by preprocess_text, so the new
# complaint goes through the same cleaning before it is vectorized.
cleaned_complaint = preprocess_text(new_complaint)

# Use the best model to predict the category
predicted_label = best_model.predict([cleaned_complaint])[0]
predicted_product = label_to_product[predicted_label]

print(f"New Complaint Text:\n'{new_complaint.strip()}'")
print("-" * 30)
print(f"Predicted Category ID: {predicted_label}")
print(f"Predicted Product: '{predicted_product}'")


--- Step 1: Loading and Preparing Data ---
Dataset loaded successfully with 11522175 rows.
Filtered to 3521497 rows with target products.
Kept 1323203 rows with non-empty complaint narratives.

Class Distribution:
Product
Credit reporting, credit repair services, or other personal consumer reports    807276
Debt collection                                                                 371629
Mortgage                                                                        134837
Consumer Loan                                                                     9461
Name: count, dtype: int64


--- Step 2: Text Pre-Processing ---
Applying preprocessing to complaint narratives...
Preprocessing complete.


Data split into 1058562 training samples and 264641 testing samples.


--- Steps 4 & 5: Training, Comparing, and Evaluating Models ---
--- Training and Evaluating Multinomial Naive Bayes ---
Classification Report:
                                    precision    recall  f1-score   support


In [None]:
# --- Import necessary libraries ---
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import warnings

warnings.filterwarnings('ignore')


def _download_if_missing(probe, notice, package):
    """Call *probe*; on LookupError print *notice* and download *package* with NLTK."""
    try:
        probe()
    except LookupError:
        print(notice)
        nltk.download(package)


# --- NLTK data needed by this cell (a download happens only when a probe fails) ---
_download_if_missing(lambda: stopwords.words('english'), "Downloading NLTK stopwords...", 'stopwords')
# WordNet is needed by the lemmatizer.
_download_if_missing(lambda: nltk.data.find('corpora/wordnet.zip'), "Downloading NLTK wordnet...", 'wordnet')


# --- 1. Exploratory Data Analysis and Feature Engineering ---
print("--- Step 1: Loading and Preparing Data ---")

# Load the dataset from a local CSV file
DATA_PATH = "complaints.csv"
try:
    df = pd.read_csv(DATA_PATH, on_bad_lines='skip')
    print(f"Dataset loaded successfully with {len(df)} rows.")
except Exception as e:
    print(f"Failed to load dataset. Error: {e}")
    # A bare exit() would report success (status 0) and needs the `site`
    # helper; raising SystemExit(1) signals the failure explicitly.
    raise SystemExit(1)

# Target categories: short display name -> exact value found in the 'Product' column
TARGET_PRODUCTS = {
    "Credit reporting, repair, or other": "Credit reporting, credit repair services, or other personal consumer reports",
    "Debt collection": "Debt collection",
    "Consumer Loan": "Consumer Loan",
    "Mortgage": "Mortgage"
}

# Filter the dataframe to only include the products we are interested in
target_rows = df['Product'].isin(TARGET_PRODUCTS.values())
df_filtered = df[target_rows].copy()
print(f"Filtered to {len(df_filtered)} rows with target products.")

# The narrative is the only feature, so rows without one are dropped
df_filtered.dropna(subset=['Consumer complaint narrative'], inplace=True)
print(f"Kept {len(df_filtered)} rows with non-empty complaint narratives.")


# Numerical labels (0, 1, 2, 3) are assigned in TARGET_PRODUCTS order, which
# keeps label ids aligned with the display names used in later reports.
product_to_label = {
    product: label_id for label_id, product in enumerate(TARGET_PRODUCTS.values())
}
df_filtered['label'] = df_filtered['Product'].map(product_to_label)
label_to_product = {v: k for k, v in product_to_label.items()}

# Display the class distribution
print("\nClass Distribution:")
print(df_filtered['Product'].value_counts())
print("\n")


# --- 2. Text Pre-Processing ---
print("--- Step 2: Text Pre-Processing ---")

# Lemmatizer and stopword set are built once and reused for every narrative
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Return *text* lowercased, reduced to letters, without stopwords, lemmatized.

    Characters other than a-z and whitespace are deleted after lowercasing;
    the surviving whitespace-separated tokens that are not in ``stop_words``
    are lemmatized and joined back with single spaces.
    """
    lowered = text.lower()
    stripped = re.sub(r'[^a-z\s]', '', lowered)
    content_tokens = filter(lambda token: token not in stop_words, stripped.split())
    return ' '.join(map(lemmatizer.lemmatize, content_tokens))

# Run preprocess_text over every complaint narrative.
# Expect this to take some time on a large dataset.
print("Applying preprocessing to complaint narratives...")
df_filtered['cleaned_narrative'] = df_filtered['Consumer complaint narrative'].map(preprocess_text)
print("Preprocessing complete.")
print("\n")


# Cleaned narratives are the features (X); numeric labels are the target (y)
X = df_filtered['cleaned_narrative']
y = df_filtered['label']

# 80/20 train/test split, stratified on the label, with a fixed seed
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Data split into {len(X_train)} training samples and {len(X_test)} testing samples.")
print("\n")

# --- 3. Selection of Multi Classification Models ---
# We will compare three popular models for text classification:
# - Multinomial Naive Bayes
# - Linear Support Vector Machine (SVM)
# - Random Forest
models = {
    "Multinomial Naive Bayes": MultinomialNB(),
    "Linear SVM": LinearSVC(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
}

# --- 4 & 5. Comparison of Model Performance & Evaluation ---
print("--- Steps 4 & 5: Training, Comparing, and Evaluating Models ---")

# Label ids and their display names, both ordered by label id. Passing them
# explicitly keeps report rows correctly named and avoids an error when a
# class does not occur in y_test / y_pred.
report_labels = sorted(product_to_label.values())
report_names = [
    short_name
    for short_name, _product in sorted(
        TARGET_PRODUCTS.items(), key=lambda pair: product_to_label[pair[1]]
    )
]

best_model = None
best_model_name = ""
# Below any attainable score, so a model is always selected even when every
# weighted F1 is 0.0 (otherwise best_model would stay None).
best_f1_score = float('-inf')

for model_name, model_instance in models.items():
    print(f"--- Training and Evaluating {model_name} ---")

    # Create a pipeline that first vectorizes the text and then applies the classifier
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
        ('classifier', model_instance)
    ])

    # Train the model
    pipeline.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = pipeline.predict(X_test)

    # Evaluate performance: dict form for model selection, text form for display
    report = classification_report(
        y_test, y_pred, labels=report_labels, target_names=report_names, output_dict=True
    )
    print("Classification Report:")
    print(classification_report(y_test, y_pred, labels=report_labels, target_names=report_names))

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred, labels=report_labels))
    print("\n")

    # Check if this model is the best one so far based on weighted F1-score
    weighted_f1 = report['weighted avg']['f1-score']
    if weighted_f1 > best_f1_score:
        best_f1_score = weighted_f1
        best_model = pipeline
        best_model_name = model_name

print(f"Best performing model is: {best_model_name} with a weighted F1-score of {best_f1_score:.4f}")
print("\n")

# --- 6. Prediction ---
print("--- Step 6: Prediction on a New Complaint ---")

# A new, unseen complaint
new_complaint = """
I am writing to dispute a charge on my mortgage account.
My bank, Acme Bank, has incorrectly charged me a late fee,
but I sent the payment well before the due date. I have bank
records to prove it. This has negatively affected my credit score.
"""

# Training text was cleaned by preprocess_text; apply the same cleaning here
# so the vectorizer sees the new complaint in the form it was fitted on.
cleaned_complaint = preprocess_text(new_complaint)

# Use the best model to predict the category
predicted_label = best_model.predict([cleaned_complaint])[0]
predicted_product = label_to_product[predicted_label]

print(f"New Complaint Text:\n'{new_complaint.strip()}'")
print("-" * 30)
print(f"Predicted Category ID: {predicted_label}")
print(f"Predicted Product: '{predicted_product}'")



--- Step 1: Loading and Preparing Data ---
Dataset loaded successfully with 11522175 rows.
Filtered to 3521497 rows with target products.
Kept 1323203 rows with non-empty complaint narratives.

Class Distribution:
Product
Credit reporting, credit repair services, or other personal consumer reports    807276
Debt collection                                                                 371629
Mortgage                                                                        134837
Consumer Loan                                                                     9461
Name: count, dtype: int64


--- Step 2: Text Pre-Processing ---
Applying preprocessing to complaint narratives...
Preprocessing complete.


Data split into 1058562 training samples and 264641 testing samples.


--- Steps 4 & 5: Training, Comparing, and Evaluating Models ---
--- Training and Evaluating Multinomial Naive Bayes ---
Classification Report:
                                    precision    recall  f1-score   support


In [1]:
# Kaiburr Assessment - Task 5: Data Science Example with BERT
# Name: Vijay
# Date: 2024-10-18

# This script performs text classification using a pre-trained BERT model.
# It leverages the Hugging Face transformers and PyTorch libraries.
# NOTE: To run this, you need to install torch and transformers:
# pip install torch transformers pandas scikit-learn

# --- Import necessary libraries ---
import pandas as pd
import numpy as np
import warnings
import torch
from torch.utils.data import DataLoader, Dataset
# --- CORRECTED IMPORT ---
# AdamW is now imported from torch.optim, not transformers
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

warnings.filterwarnings('ignore')

# --- Configuration ---
MODEL_NAME = 'bert-base-uncased'  # name of the pre-trained checkpoint to load
MAX_LEN = 256          # length each tokenized text is padded/truncated to
BATCH_SIZE = 8         # samples per batch (lower this if memory runs out)
EPOCHS = 2             # number of passes over the training data
LEARNING_RATE = 2e-5   # learning rate handed to the optimizer

# --- Pick the compute device: CUDA when available, otherwise CPU ---
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}\n")


# --- 1. Exploratory Data Analysis and Feature Engineering ---
print("--- Step 1: Loading and Preparing Data ---")

# IMPORTANT: point LOCAL_DATA_PATH at the unzipped CSV file if it is named
# differently or lives in another folder.
LOCAL_DATA_PATH = "complaints.csv"
try:
    # The two text columns used below are read with pandas' 'string' dtype
    df = pd.read_csv(LOCAL_DATA_PATH, on_bad_lines='skip', dtype={'Consumer complaint narrative': 'string', 'Product': 'string'})
    print(f"Dataset loaded successfully with {len(df)} rows from '{LOCAL_DATA_PATH}'.")
except FileNotFoundError:
    print(f"Error: The file was not found at '{LOCAL_DATA_PATH}'.")
    print("Please make sure this script is in the same directory as your CSV file, or provide the full file path.")
    # Non-zero status: a bare exit() would report success and needs the `site` helper
    raise SystemExit(1)
except Exception as e:
    print(f"Failed to load dataset. Error: {e}")
    raise SystemExit(1)

# Target categories: 'Product' value -> numerical label
TARGET_PRODUCTS = {
    "Credit reporting, credit repair services, or other personal consumer reports": 0,
    "Debt collection": 1,
    "Consumer Loan": 2,
    "Mortgage": 3
}
LABEL_NAMES = list(TARGET_PRODUCTS.keys())

# Keep only target products that have a complaint narrative
df_filtered = df[df['Product'].isin(TARGET_PRODUCTS.keys())].copy()
df_filtered.dropna(subset=['Consumer complaint narrative'], inplace=True)

# Map product names to numerical labels
df_filtered['label'] = df_filtered['Product'].map(TARGET_PRODUCTS)
print(f"Filtered to {len(df_filtered)} rows with target products and narratives.")

# --- DATA SAMPLING FOR QUICK DEMONSTRATION ---
# Fine-tuning BERT on the full dataset can take a long time on a CPU, so only
# a sample is used. The sample size is capped at the number of available rows
# because DataFrame.sample raises when n exceeds the frame length.
SAMPLE_SIZE = 4000
df_sample = df_filtered.sample(n=min(SAMPLE_SIZE, len(df_filtered)), random_state=42)
print(f"Using a smaller sample of {len(df_sample)} rows for faster training.\n")


# Split data into training and validation sets (80/20, stratified on the label)
X_train, X_val, y_train, y_val = train_test_split(
    df_sample['Consumer complaint narrative'],
    df_sample['label'],
    test_size=0.2,
    random_state=42,
    stratify=df_sample['label']
)


# --- 2. Text Pre-Processing (BERT Tokenization) & Dataset Creation ---
print("--- Step 2: Setting up BERT Tokenizer and PyTorch Datasets ---")

# Load the BERT tokenizer for the checkpoint named by MODEL_NAME.
# NOTE(review): from_pretrained presumably needs the checkpoint files from the
# network or a local cache on first use — confirm for offline runs.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

class ComplaintDataset(Dataset):
    """Dataset that tokenizes one complaint text per requested index.

    ``texts`` and ``labels`` are read positionally through ``.iloc``; the
    tokenizer is invoked via ``encode_plus`` with padding and truncation to
    ``max_len`` tokens.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Return the raw text, its flattened token tensors and its label tensor."""
        raw_text = str(self.texts.iloc[item])
        target = self.labels.iloc[item]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(raw_text, **tokenizer_options)

        sample = {'text': raw_text}
        # The tokenizer returns batched tensors; flatten them for a single sample
        for field in ('input_ids', 'attention_mask'):
            sample[field] = encoded[field].flatten()
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

# Wrap each split in a ComplaintDataset, then batch it with a DataLoader
train_dataset = ComplaintDataset(texts=X_train, labels=y_train, tokenizer=tokenizer, max_len=MAX_LEN)
val_dataset = ComplaintDataset(texts=X_val, labels=y_val, tokenizer=tokenizer, max_len=MAX_LEN)

# Only the training loader shuffles its samples
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE)
print("DataLoaders created.\n")

# --- 3. Selection of Model (BERT) ---
print("--- Step 3: Loading Pre-trained BERT Model ---")
# The classifier gets one output label per target product
n_classes = len(TARGET_PRODUCTS)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=n_classes)
model = model.to(device)
print("BERT model loaded and moved to device.\n")


# --- 4. Model Fine-Tuning (Training) ---
print("--- Step 4: Fine-Tuning the Model ---")

# Optimizer plus a warmup-free schedule spanning every optimizer step of the run.
# torch.optim.AdamW is used here, so no 'correct_bias' argument is passed.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
steps_per_epoch = len(train_loader)
total_steps = steps_per_epoch * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

# Training loop
for epoch in range(EPOCHS):
    print(f'======== Epoch {epoch + 1} / {EPOCHS} ========')
    model.train()
    total_loss = 0

    for batch in train_loader:
        # Move the tensors the model consumes onto the training device
        model_inputs = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        }

        model.zero_grad()

        # Labels are passed in, so the model output carries the loss
        loss = model(**model_inputs).loss
        total_loss += loss.item()

        loss.backward()
        # Clip the gradient norm to 1.0 before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / steps_per_epoch
    print(f'  Average training loss: {avg_train_loss:.2f}')

print("\nTraining complete.\n")


# --- 5. Model Evaluation ---
print("--- Step 5: Evaluating the Model ---")

model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Highest-scoring class per sample
        preds = torch.argmax(outputs.logits, dim=1)

        predictions.extend(preds.cpu().numpy())
        # Labels are only compared on the host, so they are not moved to the device
        true_labels.extend(batch['labels'].cpu().numpy())

# Pair every label id with its shortened product name, ordered by label id.
# Passing `labels` explicitly keeps names aligned with ids and prevents
# classification_report from raising when a class is missing from this
# (small) validation sample.
report_labels = sorted(TARGET_PRODUCTS.values())
name_by_label = {label: name.split(',')[0] for name, label in TARGET_PRODUCTS.items()}
target_names_for_report = [name_by_label[label] for label in report_labels]
print("Classification Report:")
print(classification_report(
    true_labels,
    predictions,
    labels=report_labels,
    target_names=target_names_for_report,
))
print("\n")

# --- 6. Prediction ---
print("--- Step 6: Prediction on a New Complaint ---")

# A new, unseen complaint
new_complaint = """
I am writing to dispute a charge on my mortgage account.
My bank, Acme Bank, has incorrectly charged me a late fee,
but I sent the payment well before the due date. I have bank
records to prove it. This has negatively affected my credit score.
"""

# Tokenize the new complaint with the same settings used for the datasets
encoded_complaint = tokenizer.encode_plus(
    new_complaint,
    add_special_tokens=True,
    max_length=MAX_LEN,
    return_token_type_ids=False,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

# Make prediction
input_ids = encoded_complaint['input_ids'].to(device)
attention_mask = encoded_complaint['attention_mask'].to(device)

# Put the model in evaluation mode here as well, so this step does not depend
# on an earlier section having done it (e.g. when re-run right after training).
model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    prediction = torch.argmax(outputs.logits, dim=1).item()

# Map label index to product name
predicted_product = LABEL_NAMES[prediction]

print(f"New Complaint Text:\n'{new_complaint.strip()}'")
print("-" * 30)
print(f"Predicted Category ID: {prediction}")
print(f"Predicted Product: '{predicted_product}'")



  from .autonotebook import tqdm as notebook_tqdm


Using device: cpu

--- Step 1: Loading and Preparing Data ---
Dataset loaded successfully with 11522175 rows from 'complaints.csv'.
Filtered to 1323203 rows with target products and narratives.
Using a smaller sample of 4000 rows for faster training.

--- Step 2: Setting up BERT Tokenizer and PyTorch Datasets ---
DataLoaders created.

--- Step 3: Loading Pre-trained BERT Model ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERT model loaded and moved to device.

--- Step 4: Fine-Tuning the Model ---


KeyboardInterrupt: 