In [1]:
#Load the Dataset
import pandas as pd

# Load the dataset
df = pd.read_csv("consumer_complaints.csv")

# Keep only necessary columns
df = df[["Consumer complaint narrative", "Product"]].dropna()

# Define category mapping
category_mapping = {
    "Credit reporting, repair, or other": 0,
    "Debt collection": 1,
    "Consumer Loan": 2,
    "Payday loan, title loan, personal loan, or advance": 2,
    "Mortgage": 3
}

# Filter dataset to keep only relevant categories
df = df[df["Product"].isin(category_mapping.keys())]

# Apply category mapping
df["category"] = df["Product"].map(category_mapping)

# Drop the original "Product" column
df = df.drop(columns=["Product"])

# Check if data is loaded properly
print(df.head())


  df = pd.read_csv("consumer_complaints.csv")


                          Consumer complaint narrative  category
299  I have dealt with XXXX XXXX all my life and ev...         1
321  I am writing to address a concerning matter re...         1
377  This is so annoying & frustrating. Ive sent Ex...         1
378  Delete those late dates and update the statuse...         1
380  This is so annoying & frustrating. Ive sent Eq...         1


In [2]:
#Text Preprocessing (Cleaning the Text)
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download required NLTK resources
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Define text cleaning function
def clean_text(text):
    text = text.lower()  
    text = re.sub(r'\d+', '', text)  
    text = re.sub(r'\W+', ' ', text)  
    words = word_tokenize(text)  
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  
    return " ".join(words)

# Apply text cleaning
df["cleaned_text"] = df["Consumer complaint narrative"].astype(str).apply(clean_text)

# Drop the original text column
df = df.drop(columns=["Consumer complaint narrative"])

# Check cleaned text
print(df.head())


[nltk_data] Downloading package stopwords to /Users/surya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/surya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/surya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


     category                                       cleaned_text
299         1  dealt xxxx xxxx life even though put fraud ale...
321         1  writing address concerning matter regarding cr...
377         1  annoying frustrating ive sent experian multipl...
378         1  delete late date update status account mention...
380         1  annoying frustrating ive sent equifax multiple...


In [3]:
##Convert Text into Numerical Features (TF-IDF)
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # Use top 5000 words

# Convert text into numerical form
X = vectorizer.fit_transform(df["cleaned_text"])

# Store the target labels (categories)
y = df["category"]

print("TF-IDF Matrix Shape:", X.shape)


TF-IDF Matrix Shape: (439589, 5000)


In [4]:
#Train the Machine Learning Model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Split the dataset (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Logistic Regression classifier
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9667644850883778
Classification Report:
               precision    recall  f1-score   support

           1       0.97      0.98      0.98     61160
           2       0.76      0.50      0.61      1821
           3       0.96      0.96      0.96     24937

    accuracy                           0.97     87918
   macro avg       0.90      0.82      0.85     87918
weighted avg       0.97      0.97      0.97     87918



In [6]:
# Making Predictions on New Complaints
# Define a new complaint
new_complaint = ["I am unable to pay my mortgage due to a system error."]

# Clean the complaint
new_complaint_cleaned = [clean_text(text) for text in new_complaint]

# Convert it to TF-IDF format
new_complaint_vectorized = vectorizer.transform(new_complaint_cleaned)

# Predict category
predicted_category = model.predict(new_complaint_vectorized)
print("Predicted Category:", predicted_category[0])


Predicted Category: 3
