In [1]:
###REQUIRED LIBRARIES

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import warnings
warnings.filterwarnings("ignore")


In [2]:
### LOAD AND EXPLORE DATASET

df = pd.read_csv("complaints.csv")
df = df[['Consumer complaint narrative', 'Product']]
df = df.dropna()
df = df[df['Consumer complaint narrative'].notnull()]
df = df.reset_index(drop=True)
df.head()


Unnamed: 0,Consumer complaint narrative,Product
0,My sister initiated an internal transfer of {$...,"Money transfer, virtual currency, or money ser..."
1,I received a title loan on a a temporary card....,Prepaid card
2,I've complained about procollect and this fals...,Debt collection
3,This is for a card with Sheels/FNBO XXXX XXXXX...,Credit card
4,"XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX, ...",Credit reporting or other personal consumer re...


In [3]:
### ENCODE TARGET LABELS


target_map = {
    'Credit reporting, credit repair services, or other personal consumer reports': 0,
    'Debt collection': 1,
    'Consumer Loan': 2,
    'Mortgage': 3
}

df = df[df['Product'].isin(target_map.keys())]
df['Category'] = df['Product'].map(target_map)
df = df[['Consumer complaint narrative', 'Category']]




In [4]:
### TEXT PREPROCESSING

nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)



stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

df['CleanText'] = df['Consumer complaint narrative'].apply(preprocess)


In [6]:
### VECTORIZATION

tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['CleanText'])
y = df['Category']


In [7]:
#### TRAIN-TEST SPLIT

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [9]:
###TRAIN MODELS


In [10]:
### NAIVE BAYES

nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)


In [11]:
### LOGISTIC REGRESSION

lr = LogisticRegression(max_iter=200)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)


In [14]:
### SUPPORT  VECTOR CLASSIFIER

svc = LinearSVC()
svc.fit(X_train, y_train)
y_pred_svc = svc.predict(X_test)


In [15]:
### EVALUATION

def evaluate_model(name, y_true, y_pred):
    print(f"--- {name} ---")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Classification Report:\n", classification_report(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("\n")

evaluate_model("Naive Bayes", y_test, y_pred_nb)
evaluate_model("Logistic Regression", y_test, y_pred_lr)
evaluate_model("SVC", y_test, y_pred_svc)


--- Naive Bayes ---
Accuracy: 0.8731653270797527
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.92      0.91    161272
           1       0.83      0.74      0.78     66293
           2       0.46      0.27      0.34      1882
           3       0.82      0.94      0.88     25980

    accuracy                           0.87    255427
   macro avg       0.75      0.72      0.73    255427
weighted avg       0.87      0.87      0.87    255427

Confusion Matrix:
 [[148853   8845    423   3151]
 [ 15227  49125    165   1776]
 [   366    488    505    523]
 [   855    576      2  24547]]


--- Logistic Regression ---
Accuracy: 0.9057656394977821
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.95      0.93    161272
           1       0.86      0.82      0.84     66293
           2       0.69      0.39      0.50      1882
           3       0.92      0.93      0.92     

In [16]:
### MAKE A PREDICTION

sample = ["I am unable to get any resolution from my debt collector"]
sample_cleaned = preprocess(sample[0])
sample_vec = tfidf.transform([sample_cleaned])
print("Predicted Category:", nb.predict(sample_vec)[0])


Predicted Category: 1
