<a href="https://colab.research.google.com/github/venkatasnehith/consumer-complaints-classification/blob/main/consumer_complaints_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
!pip install kagglehub scikit-learn pandas matplotlib seaborn --quiet


In [9]:
import kagglehub

# Download the dataset
path = kagglehub.dataset_download("shashwatwork/consume-complaints-dataset-fo-nlp")
print("✅ Dataset path:", path)


✅ Dataset path: /kaggle/input/consume-complaints-dataset-fo-nlp


In [10]:
import os
import pandas as pd

# Check all files
print("📁 Files in dataset folder:")
for file in os.listdir(path):
    print(" -", file)


📁 Files in dataset folder:
 - complaints_processed.csv


In [13]:
for file in os.listdir(path):
    if file.endswith(".csv"):
        try:
            df = pd.read_csv(os.path.join(path, file))
            print("✅ Successfully loaded:", file)
            break
        except Exception as e:
            print("❌ Could not load:", file, "\n", e)


✅ Successfully loaded: complaints_processed.csv


In [14]:
# Let's see column names
df.columns


Index(['Unnamed: 0', 'product', 'narrative'], dtype='object')

In [17]:
df = df[['product', 'narrative']].dropna()
df = df[df['narrative'].str.strip() != '']


In [18]:
df.rename(columns={'product': 'label', 'narrative': 'text'}, inplace=True)


In [19]:
import re

def clean_text(text):
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # Keep letters only
    text = text.lower()
    return text

df['clean_text'] = df['text'].apply(clean_text)


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])

X = df['clean_text']
y = df['label_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

y_pred = model.predict(X_test_vec)

print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=le.classes_))


✅ Accuracy: 0.8721485084505741
                     precision    recall  f1-score   support

        credit_card       0.79      0.77      0.78      3132
   credit_reporting       0.90      0.94      0.92     18283
    debt_collection       0.81      0.72      0.76      4615
mortgages_and_loans       0.86      0.82      0.84      3770
     retail_banking       0.87      0.89      0.88      2683

           accuracy                           0.87     32483
          macro avg       0.85      0.83      0.84     32483
       weighted avg       0.87      0.87      0.87     32483



In [23]:
import joblib

joblib.dump(model, "complaint_classifier.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")
joblib.dump(le, "label_encoder.pkl")


['label_encoder.pkl']

In [24]:
# Load your saved model and components
import joblib

model = joblib.load("complaint_classifier.pkl")
vectorizer = joblib.load("vectorizer.pkl")
le = joblib.load("label_encoder.pkl")


In [25]:
def predict_complaint_category(complaint_text):
    # Clean the text just like before
    import re
    def clean_text(text):
        text = re.sub(r'[^a-zA-Z]', ' ', text)
        text = text.lower()
        return text

    cleaned = clean_text(complaint_text)
    vect_text = vectorizer.transform([cleaned])
    pred = model.predict(vect_text)[0]
    label = le.inverse_transform([pred])[0]

    return label


In [26]:
example = "I was charged late fees despite paying on time."
print("Predicted Category:", predict_complaint_category(example))


Predicted Category: credit_card


In [28]:
df['text'].sample(5).values


array(['inquiry dont recognize lender giving authorization perform hard inquiry',
       'reported credit bureau mortgage company event control definitely accident brief timeline mortgage automatically debited account one four mortgage withdrawn one company auto lease automatically debited account one many mortgage recurring bill withdrawn one choose bank dealer set lease thought would dealing done work house year prior changed name date previously deposited check attempted deposit caught right away contacted fraud department immediately put account hold lock access almost still account attempted try resolve issue phone said must come branch nearest branch area closed public due corona virus choice fly least mile branch open order identify person due many bill automatically paid account opened another account fl directed work make ach payment new account funded extra well proceeded enter new account information dozen payment go every month tried charge account showed invalid could sett

In [29]:
print(predict_complaint_category("I was charged a late fee even though I paid my loan on time."))


mortgages_and_loans


In [31]:
print(le.classes_)


['credit_card' 'credit_reporting' 'debt_collection' 'mortgages_and_loans'
 'retail_banking']


In [40]:
def predict_complaint_category(complaint_text):
    import re

    # Clean the text
    def clean_text(text):
        text = re.sub(r'[^a-zA-Z]', ' ', text)
        text = text.lower()
        return text

    cleaned = clean_text(complaint_text)
    vect_text = vectorizer.transform([cleaned])

    # Predict class and probability
    probs = model.predict_proba(vect_text)[0]  # Get probability for each class
    max_prob = max(probs)                     # Maximum probability
    pred_class_index = probs.argmax()         # Class with highest probability

    # Set confidence threshold
    threshold = 0.5

    if max_prob < threshold:
        return "❓ Uncertain / Unrelated"
    else:
        return le.inverse_transform([pred_class_index])[0]


# 🔵 Interactive input
user_input = input("Enter your complaint: ")
predicted_label = predict_complaint_category(user_input)
print("Predicted Category:", predicted_label)


Enter your complaint: I was charged a late fee even though I paid my loan on time
Predicted Category: mortgages_and_loans
