In [44]:
import pandas as pd

# Load the raw complaints CSV (path is relative to the notebook's working directory)
complaints_csv = "../data/complaints.csv"
raw_df = pd.read_csv(complaints_csv)

# Preview the first rows as a sanity check on the load
raw_df.head()

Unnamed: 0,Date received,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Tags,Consumer consent provided?,Submitted via,Date sent to company,Company response to consumer,Timely response?,Consumer disputed?,Complaint ID
0,01/20/25,Credit reporting or other personal consumer re...,Credit reporting,Improper use of your report,Reporting company used your report improperly,Iam filling this complaint because XXXX has re...,,"EQUIFAX, INC.",NC,28269,,Consent provided,Web,01/20/25,Closed with non-monetary relief,Yes,,11611687
1,01/11/25,Credit reporting or other personal consumer re...,Credit reporting,Problem with a company's investigation into an...,Their investigation did not fix an error on yo...,LexisNexis sent me an email today at approxima...,,LEXISNEXIS,NJ,088XX,,Consent provided,Web,01/11/25,Closed with explanation,Yes,,11479904
2,01/09/25,Credit reporting or other personal consumer re...,Credit reporting,Incorrect information on your report,Account status incorrect,My account with XXXX XXXX TRANSUNION was never...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",GA,310XX,,Consent provided,Web,01/09/25,Closed with non-monetary relief,Yes,,11446168
3,02/11/25,Credit reporting or other personal consumer re...,Credit reporting,Incorrect information on your report,Information belongs to someone else,I am a victim of XXXX. The information listed ...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",WI,53590,,Consent provided,Web,02/11/25,Closed with non-monetary relief,Yes,,12038221
4,01/23/25,Credit reporting or other personal consumer re...,Credit reporting,Incorrect information on your report,Information belongs to someone else,PLEASE BLOCKED AND REMOVED I respectfully requ...,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,TN,38128,,Consent provided,Web,01/23/25,Closed with explanation,Yes,,11688896


In [45]:
# Extract only the columns that are relevant to the project objective and
# give them snake_case names. The old -> new mapping is explicit, so a column
# cannot be mislabeled if the selection order is ever changed.
column_names = {
    "Date received": "date_received",
    "Consumer complaint narrative": "narrative",
    "Company": "company",
    "Issue": "issue",
}
# Column selection followed by rename() yields a new frame, so raw_df is untouched.
df = raw_df[list(column_names)].rename(columns=column_names)

# Only a cell's last expression is displayed, so the summary is the single
# output here (a bare `df.head()` placed before it would be silently discarded).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116945 entries, 0 to 116944
Data columns (total 4 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   date_received  116945 non-null  object
 1   narrative      116945 non-null  object
 2   company        116945 non-null  object
 3   issue          116945 non-null  object
dtypes: object(4)
memory usage: 3.6+ MB


In [46]:
# official issue categories (9 labels), most frequent first
issue_counts = df["issue"].value_counts()
issue_counts

issue
Incorrect information on your report                               55062
Problem with a company's investigation into an existing problem    33508
Improper use of your report                                        26393
Unable to get your credit report or credit score                     642
Credit monitoring or identity theft protection services              609
Problem with fraud alerts or security freezes                        500
Problem with a company's investigation into an existing issue        194
Identity theft protection or other monitoring services                36
Problem with a purchase shown on your statement                        1
Name: count, dtype: int64

In [47]:
# the 5 companies that received the most complaints
company_counts = df["company"].value_counts()
company_counts.head(5)

company
TRANSUNION INTERMEDIATE HOLDINGS, INC.    38826
Experian Information Solutions Inc.       36299
EQUIFAX, INC.                             31272
CAPITAL ONE FINANCIAL CORPORATION           567
CITIBANK, N.A.                              438
Name: count, dtype: int64

In [48]:
# Parse the MM/DD/YY strings into proper datetimes
received_dates = pd.to_datetime(df["date_received"], format="%m/%d/%y")
df["date_received"] = received_dates

In [49]:
# Print the frame summary (dtypes and non-null counts) to verify column types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116945 entries, 0 to 116944
Data columns (total 4 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   date_received  116945 non-null  datetime64[ns]
 1   narrative      116945 non-null  object        
 2   company        116945 non-null  object        
 3   issue          116945 non-null  object        
dtypes: datetime64[ns](1), object(3)
memory usage: 3.6+ MB


In [50]:
# Preview the first rows of the working frame
df.head()

Unnamed: 0,date_received,narrative,company,issue
0,2025-01-20,Iam filling this complaint because XXXX has re...,"EQUIFAX, INC.",Improper use of your report
1,2025-01-11,LexisNexis sent me an email today at approxima...,LEXISNEXIS,Problem with a company's investigation into an...
2,2025-01-09,My account with XXXX XXXX TRANSUNION was never...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",Incorrect information on your report
3,2025-02-11,I am a victim of XXXX. The information listed ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",Incorrect information on your report
4,2025-01-23,PLEASE BLOCKED AND REMOVED I respectfully requ...,Experian Information Solutions Inc.,Incorrect information on your report


# Text Preprocessing 

- lowercase
- remove punctuation, numbers, and short filler words
- drop stopwords (negations such as "not" are kept) and lemmatize the remaining tokens

In [51]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Load the small English pipeline with the "ner" and "parser" components disabled
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

# Words that must survive stopword removal:
# negations, plus a small whitelist of other words to keep
negations = {"no", "not", "n't", "never", "none"}
whitelist = {"against"}

# spaCy's default stopwords minus everything we want to preserve
custom_stopwords = STOP_WORDS - (negations | whitelist)

In [52]:
# Negation tokens that are emitted verbatim (never lemmatized, never dropped
# by the alphabetic / length filters). Keep in sync with the negations that
# are excluded from `custom_stopwords`.
_NEGATION_TOKENS = frozenset({"not", "no", "n't", "never", "none"})


def clean_text_spacy(text):
    """Normalize one complaint narrative into a space-joined string of tokens.

    Steps:
      1. Lowercase the text and convert typographic apostrophes to "'" so
         that contractions such as "don’t" are handled exactly like "don't".
      2. Tokenize with the module-level spaCy pipeline ``nlp``.
      3. Keep negation tokens as-is; for every other token keep its lemma
         only if the token is alphabetic, is not in ``custom_stopwords``
         and is longer than 2 characters.

    Parameters
    ----------
    text : str
        Raw narrative. Any non-string value (e.g. NaN / None for a missing
        narrative) is treated as empty.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ("" if nothing survives).
    """
    # Missing narratives show up as float NaN / None in pandas; calling
    # .lower() on them would raise, so treat them as empty text.
    if not isinstance(text, str):
        return ""

    # Lowercase and normalize curly apostrophes so the "n't" negation check
    # below also catches contractions typed with typographic quotes.
    normalized = text.lower().replace("\u2019", "'").replace("\u2018", "'")
    doc = nlp(normalized)

    tokens = []
    for token in doc:
        # Keep negations as-is
        if token.text in _NEGATION_TOKENS:
            tokens.append(token.text)
        # Then apply the rest of the filters; the stopword test is done on the
        # surface form, while the lemma is what gets emitted.
        elif token.is_alpha and token.text not in custom_stopwords and len(token) > 2:
            tokens.append(token.lemma_)

    # merge the cleaned tokens into one string as the processed narrative
    return " ".join(tokens)

In [53]:
# Dry-run the cleaner on a small reproducible sample before processing everything
df_sample = (
    df.sample(n=10, random_state=42)
    .assign(clean_narrative=lambda frame: frame["narrative"].apply(clean_text_spacy))
)

df_sample.head()

Unnamed: 0,date_received,narrative,company,issue,clean_narrative
59704,2025-02-05,Im filing a complaint against Equifax for fall...,Experian Information Solutions Inc.,Problem with a company's investigation into an...,file complaint against equifax fall properly i...
2744,2025-02-06,"XXXX XXXX XXXX XXXX XXXX XXXX. \nXXXX XXXX, NJ...",NAVY FEDERAL CREDIT UNION,Incorrect information on your report,xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx x...
85458,2025-01-19,"Hello, to whom it may concern, I could write o...",Experian Information Solutions Inc.,Improper use of your report,hello concern write bunch fair credit reportin...
63838,2025-01-31,"I don't remember dates, but I have tried to di...","EQUIFAX, INC.",Problem with a company's investigation into an...,n't remember date try dispute update item xxxx...
79736,2025-01-17,This CFPB complaint has been filed to request ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",Incorrect information on your report,cfpb complaint file request pursuant fcra tran...


In [54]:
# Clean every narrative; clean_text_spacy is called once per row
cleaned_narratives = df["narrative"].apply(clean_text_spacy)
df["clean_narrative"] = cleaned_narratives

In [55]:
# Persist the cleaned frame; index=False leaves the row index out of the file.
# NOTE(review): a narrative that cleans down to "" is written as an empty field,
# which pandas.read_csv parses as NaN by default — confirm downstream loaders handle that.
df.to_csv("../data/cleaned_complaints.csv", index=False)