In [1]:
import pandas as pd

df = pd.read_csv("../data/rows.csv", low_memory=False)
print("✅ Loaded rows:", len(df))
print("🔍 Sample complaint:\n", df["Consumer complaint narrative"].iloc[0])


✅ Loaded rows: 1282355
🔍 Sample complaint:
 nan


In [2]:
df = df[df["Consumer complaint narrative"].astype(str).str.len() > 100].reset_index(drop=True)
print("✅ Rows with enough content:", len(df))


✅ Rows with enough content: 371561


In [3]:
import re

def clean_complaint(text):
    if not isinstance(text, str):
        return ""
    try:
        # Remove placeholders and normalize spacing
        text = re.sub(r"\bXXXX\b", "", text)
        text = re.sub(r"\bXX/XX/?\d{0,4}\b", "", text)
        text = re.sub(r"\{\$[\d,.]+\}", "", text)
        text = re.sub(r"\s+", " ", text).strip()
        return text
    except Exception as e:
        print(f"Error cleaning text: {e}")
        return ""

df["cleaned_narrative"] = df["Consumer complaint narrative"].apply(clean_complaint)


In [4]:
df = df[df["cleaned_narrative"].str.strip() != ""].reset_index(drop=True)
print("✅ Final cleaned complaints:", len(df))


✅ Final cleaned complaints: 371561


In [5]:
for i in range(3):
    print(f"\nComplaint {i+1}:\n{df['cleaned_narrative'].iloc[i][:300]}...\n")



Complaint 1:
The Summer of I was denied a mortgage loan due to a charge off from credit card. I both mailed an account validation letter and disputed this debt with the credit bureaus and the credit card company several times. Only the credit bureaus responded that this debt was verified but showed no proof as I...


Complaint 2:
There is an account reporting on my credit report from which is saying the balance is coming from from . I never signed a joint lease with anyone, I have asked several times for a copy of this lease I supposed I signed. The credit bureau is allowing this allow to report on my credit report when it d...


Complaint 3:
The reason for my writing is to inform you that this company is being unreasonable and not adhering to the law. I am a victim of identity theft. My personal information was compromised and as a result fraudulent charges were included on my accounts without my consent or authorization. I disputed thi...



In [6]:
from langchain_community.embeddings import HuggingFaceEmbeddings

emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
print("✅ Embedding model loaded.")


  emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from tqdm.autonotebook import tqdm, trange


✅ Embedding model loaded.


In [8]:
from langchain.schema import Document

docs_subset = [
    Document(
        page_content=str(row["cleaned_narrative"]),
        metadata={"product": str(row.get("Product", ""))}
    )
    for _, row in df.head(1000).iterrows()
]

print("✅ Documents prepared:", len(docs_subset))



✅ Documents prepared: 1000


In [9]:
vectorstore = FAISS.from_documents(docs_subset, emb)
print("✅ FAISS index built.")


✅ FAISS index built.


In [10]:
question = "What is the complaint about credit report errors?"
hits = vectorstore.similarity_search(question, k=3)

for i, doc in enumerate(hits, 1):
    print(f"\nResult {i}:\n{doc.page_content[:300]}...\n")



Result 1:
I started a Dispute in about how was reporting payment and balance info. The reporting was corrected. The Dispute was to be removed from my report by . It is still listed as an open Dispute and blocks me from any other Disputes. Only one Dispute at a time. The company at fault is Credit Karma. They ...


Result 2:
I saw something on the news about Congresswoman about how she is trying to help change the way that the credit bureaus report inaccurate information. She stated " In this broken system, credit reports are routinely filled with errors that are difficult for consumers to correct. Negative information ...


Result 3:
; Experian and are reporting me as 30 to 60 days late on the account number and . This account reflects a balance and past due. This inaccurate reporting is harming my credit score. I have disputed this directly with the three bureaus but the error has not been corrected. It is impossible to be late...



In [11]:
from transformers import pipeline

# Load summarization pipeline
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

# Combine retrieved complaints into one text block
combined_text = " ".join([doc.page_content for doc in hits])

# Summarize
summary = summarizer(combined_text, max_length=150, min_length=60, do_sample=False)[0]["summary_text"]
print("📝 Summary:\n", summary)


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Device set to use mps:0


model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

📝 Summary:
  The company at fault is Credit Karma. They refuse to acknowledge there is anything wrong . I started a Dispute in about how was reporting payment and balance info. The reporting was corrected. The Dispute was to be removed from my report by . It is still listed as an open Dispute and blocks me from any other disputes .


In [13]:
def summarize_text(text, summarizer, min_ratio=0.4, max_ratio=0.8):
    input_len = len(text.split())
    max_len = max(int(input_len * max_ratio), 30)
    min_len = max(int(input_len * min_ratio), 20)
    
    summary = summarizer(text, max_length=max_len, min_length=min_len, do_sample=False)[0]["summary_text"]
    return summary


In [14]:
sample_text = hits[0].page_content
print("📝 Summary:\n", summarize_text(sample_text, summarizer))


📝 Summary:
  Credit Karma is still listed as an open Dispute and blocks me from any other Disputes . The company at fault is Credit Karma . They refuse to acknowledge there is anything wrong .


In [15]:
summaries = []

for doc in hits:
    text = doc.page_content
    summary = summarize_text(text, summarizer)
    summaries.append(summary)

for i, s in enumerate(summaries, 1):
    print(f"\n🔹 Summary {i}:\n{s}\n")



🔹 Summary 1:
 Credit Karma is still listed as an open Dispute and blocks me from any other Disputes . The company at fault is Credit Karma . They refuse to acknowledge there is anything wrong .


🔹 Summary 2:
 I noticed that I was reported 30 days late in of this year . I have never made a late payment on this account to my knowledge . This must be some kind of error because since 2013 when I opened this account, I have always maintained a low balance and paid on time . This is going to ruin my perfect credit history when in fact, I was never late . I really need help with clearing this up because I am sure there is some kind kind of discrepancy. This is .


🔹 Summary 3:
 Experian and Experian are reporting me as 30 to 60 days late on the account number and . This account reflects a balance and past due . This inaccurate reporting is harming my credit score . I have disputed this directly with the three bureaus but the error



In [16]:
combined_input = " ".join(summaries)
refined_summary = summarize_text(combined_input, summarizer)
print("🧾 Refined Summary:\n", refined_summary)


🧾 Refined Summary:
  Credit Karma is still listed as an open Dispute and blocks me from any other Disputes . The company at fault is Credit Karma . They refuse to acknowledge there is anything wrong . Experian and Experian are reporting me as 30 to 60 days late on the account number and . This account reflects a balance and past due . This inaccurate reporting is harming my credit score .


In [17]:
import pickle

with open("faiss_index.pkl", "wb") as f:
    pickle.dump(vectorstore, f)

print("✅ FAISS index saved to faiss_index.pkl")


✅ FAISS index saved to faiss_index.pkl


In [18]:
import os
print("✅ Exists:", os.path.exists("faiss_index.pkl"))


✅ Exists: True


In [20]:
df["Complaint ID"].head(10).tolist()



[3189109,
 3186792,
 3187373,
 3184692,
 3184195,
 3183544,
 3183547,
 3183589,
 3183581,
 3182609]

In [21]:
def get_complaint_by_id(df, complaint_id):
    match = df[df["Complaint ID"] == complaint_id]
    if not match.empty:
        return match[[
            "Date received", "Product", "Issue", "Company",
            "State", "Submitted via", "Company response to consumer",
            "Consumer complaint narrative"
        ]]
    else:
        return f"❌ No complaint found with ID {complaint_id}"


In [22]:
get_complaint_by_id(df, 3189109)


Unnamed: 0,Date received,Product,Issue,Company,State,Submitted via,Company response to consumer,Consumer complaint narrative
0,03/23/2019,"Credit reporting, credit repair services, or o...",Incorrect information on your report,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",IL,Web,Closed with explanation,The Summer of XX/XX/2018 I was denied a mortga...


In [23]:
df.to_csv("cleaned_complaints.csv", index=False)
print("✅ Saved cleaned dataset to cleaned_complaints.csv")


✅ Saved cleaned dataset to cleaned_complaints.csv
