In [1]:
# Kaiburr Assessment - Task 5: Data Science Example
'''**Candidate:** Vijay  
**Task:** Perform Text Classification on Consumer Complaint Dataset  
**Steps:**  
1. Explanatory Data Analysis and Feature Engineering  
2. Text Pre-Processing  
3. Model Selection  
4. Comparison of Model Performance  
5. Model Evaluation  
6. Prediction on New Complaint'''


'**Candidate:** Vijay  \n**Task:** Perform Text Classification on Consumer Complaint Dataset  \n**Steps:**  \n1. Explanatory Data Analysis and Feature Engineering  \n2. Text Pre-Processing  \n3. Model Selection  \n4. Comparison of Model Performance  \n5. Model Evaluation  \n6. Prediction on New Complaint'

In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import warnings

warnings.filterwarnings('ignore')


In [3]:
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vijay\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vijay\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
DATA_PATH = "complaints.csv"

df = pd.read_csv(DATA_PATH, on_bad_lines='skip')
print("Dataset Shape:", df.shape)
df.head()


Dataset Shape: (11522175, 18)


Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,2020-07-06,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,FL,346XX,,Other,Web,2020-07-06,Closed with explanation,Yes,,3730948
1,2025-10-14,Credit reporting or other personal consumer re...,Credit reporting,Incorrect information on your report,Information is missing that should be on the r...,,,"EQUIFAX, INC.",TX,75062,,,Web,2025-10-14,In progress,Yes,,16558024
2,2025-10-10,Credit reporting or other personal consumer re...,Credit reporting,Incorrect information on your report,Information belongs to someone else,,,"EQUIFAX, INC.",GA,30341,,,Web,2025-10-10,In progress,Yes,,16507707
3,2025-10-15,Credit reporting or other personal consumer re...,Credit reporting,Incorrect information on your report,Information belongs to someone else,,,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",TX,75287,,,Web,2025-10-15,In progress,Yes,,16593757
4,2025-10-16,Credit reporting or other personal consumer re...,Credit reporting,Incorrect information on your report,Information belongs to someone else,,,Experian Information Solutions Inc.,NC,28379,,,Web,2025-10-16,In progress,Yes,,16623506


In [5]:
TARGET_PRODUCTS = {
    "Credit reporting, repair, or other": "Credit reporting, credit repair services, or other personal consumer reports",
    "Debt collection": "Debt collection",
    "Consumer Loan": "Consumer Loan",
    "Mortgage": "Mortgage"
}

df_filtered = df[df['Product'].isin(TARGET_PRODUCTS.values())].copy()
df_filtered.dropna(subset=['Consumer complaint narrative'], inplace=True)

product_to_label = {
    "Credit reporting, credit repair services, or other personal consumer reports": 0,
    "Debt collection": 1,
    "Consumer Loan": 2,
    "Mortgage": 3
}
df_filtered['label'] = df_filtered['Product'].map(product_to_label)
df_filtered['Product'].value_counts()


Product
Credit reporting, credit repair services, or other personal consumer reports    807276
Debt collection                                                                 371629
Mortgage                                                                        134837
Consumer Loan                                                                     9461
Name: count, dtype: int64

In [6]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    words = text.split()
    cleaned_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(cleaned_words)

df_filtered['cleaned_narrative'] = df_filtered['Consumer complaint narrative'].apply(preprocess_text)
df_filtered.head()


Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID,label,cleaned_narrative
67,2020-05-08,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,These are not my accounts.,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,NV,89030,,Consent provided,Web,2020-05-08,Closed with explanation,Yes,,3642453,0,account
636,2020-03-19,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,"I wrote three requests, the unverified account...",,"EQUIFAX, INC.",NC,28562,,Consent provided,Web,2020-03-19,Closed with explanation,Yes,,3573294,0,wrote three request unverified account listed ...
836,2020-09-15,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Account status incorrect,I paid off my Synchrony Lowes account. After t...,Company has responded to the consumer and the ...,SYNCHRONY FINANCIAL,CA,93307,,Consent provided,Web,2020-09-16,Closed with non-monetary relief,Yes,,3850587,0,paid synchrony lowes account account paid sync...
888,2023-07-11,"Credit reporting, credit repair services, or o...",Credit reporting,Incorrect information on your report,Information belongs to someone else,I've found that my credit report has erroneous...,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,UT,84065,,Consent provided,Web,2023-07-11,Closed with explanation,Yes,,7233356,0,ive found credit report erroneous information ...
1019,2021-12-03,Debt collection,Other debt,Attempts to collect debt not owed,Debt was result of identity theft,RE : Attentively review my formal writ compose...,,"ACIMA CREDIT, LLC",PA,19124,,Consent provided,Web,2021-12-03,Closed with explanation,Yes,,4971676,1,attentively review formal writ composed declar...


In [7]:
X = df_filtered['cleaned_narrative']
y = df_filtered['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))


Training samples: 1058562
Testing samples: 264641


In [8]:
models = {
    "Multinomial Naive Bayes": MultinomialNB(),
    "Linear SVM": LinearSVC(random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42)
}


best_model, best_model_name, best_f1_score = None, "", 0

for name, model in models.items():
    print(f"--- {name} ---")
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1,2))),
        ('classifier', model)
    ])
    
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    
    score = classification_report(y_test, y_pred, output_dict=True)['weighted avg']['f1-score']
    if score > best_f1_score:
        best_f1_score, best_model, best_model_name = score, pipeline, name

print(f"\nBest Model: {best_model_name} (Weighted F1: {best_f1_score:.4f})")


--- Multinomial Naive Bayes ---
              precision    recall  f1-score   support

           0       0.92      0.88      0.90    161456
           1       0.81      0.80      0.80     74326
           2       0.36      0.24      0.29      1892
           3       0.75      0.96      0.84     26967

    accuracy                           0.86    264641
   macro avg       0.71      0.72      0.71    264641
weighted avg       0.87      0.86      0.86    264641

[[142859  13128    596   4873]
 [ 11765  59112    225   3224]
 [   281    416    463    732]
 [   616    536      5  25810]]
--- Linear SVM ---
              precision    recall  f1-score   support

           0       0.92      0.95      0.93    161456
           1       0.88      0.84      0.86     74326
           2       0.79      0.28      0.42      1892
           3       0.92      0.93      0.92     26967

    accuracy                           0.91    264641
   macro avg       0.88      0.75      0.78    264641
weighted 

In [9]:
new_complaint = """
I am writing to dispute a charge on my mortgage account.
My bank has incorrectly charged me a late fee,
but I sent the payment before the due date.
"""

label_to_product = {v: k for k, v in product_to_label.items()}
pred = best_model.predict([new_complaint])[0]
print("Predicted Product:", label_to_product[pred])


Predicted Product: Mortgage


In [13]:
new_complaint1 = """
A company named First Financial Adjusters will not stop calling me and my family members.
They are trying to collect on a medical bill from three years ago that my insurance
was supposed to cover. I have sent them the documentation multiple times but the harassment
continues. They are threatening to report this to the credit bureaus.
"""

pred = best_model.predict([new_complaint1])[0]
print("Predicted Product:", label_to_product[pred])

Predicted Product: Debt collection
