In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os

In [2]:
os.chdir('C:/Users/Skyline/Intelligent-Complaint-Analysis-for-Financial-Services')
print(f'Current working directory: {os.getcwd()}')

try:
    df = pd.read_csv('data/complaints.csv')
    print('Dataset loaded successfully.')
except FileNotFoundError:
    print('Error: complaints.csv not found.')
    raise

print('Dataset columns:', df.columns.tolist())

df['word_count'] = df['Consumer complaint narrative'].apply(lambda x: len(str(x).split()) if pd.notnull(x) else 0)

plt.figure(figsize=(10, 6))
sns.histplot(df['word_count'], bins=50)
plt.title('Distribution of Complaint Narrative Word Counts')
plt.xlabel('Word Count')
plt.ylabel('Frequency')
plt.savefig('data/word_count_distribution.png')
plt.close()

print('Word Count Summary Statistics:')
print(df['word_count'].describe())

short_narratives = df[df['word_count'] < 10]
long_narratives = df[df['word_count'] > 500]
print(f'Number of very short narratives (<10 words): {len(short_narratives)}')
print(f'Number of very long narratives (>500 words): {len(long_narratives)}')

complaints_with_narratives = df[df['Consumer complaint narrative'].notnull()]
complaints_without_narratives = df[df['Consumer complaint narrative'].isnull()]
print(f'Complaints with narratives: {len(complaints_with_narratives)}')
print(f'Complaints without narratives: {len(complaints_without_narratives)}')

relevant_products = ['Credit card', 'Consumer Loan', 'Payday loan', 'Checking or savings account', 'Money transfer']
df_filtered = df[df['Product'].isin(relevant_products)]
print(f'Number of complaints after filtering: {len(df_filtered)}')

df_filtered = df_filtered[df_filtered['Consumer complaint narrative'].notnull()]
print(f'Number of complaints after removing empty narratives: {len(df_filtered)}')

def clean_text(text):
    if pd.notnull(text):
        text = text.lower()
        text = re.sub(r'[^a-z0-9\s]', '', text)
        boilerplate = r'\bi\s+am\s+writing\s+to\s+file\s+a\s+complaint\b'
        text = re.sub(boilerplate, '', text, flags=re.IGNORECASE)
        text = ' '.join(text.split())
        return text
    return text

df_filtered['Consumer complaint narrative'] = df_filtered['Consumer complaint narrative'].apply(clean_text)

df_filtered = df_filtered.sample(n=500, random_state=42)
print(f'Number of complaints after sampling (500 rows): {len(df_filtered)}')

df_filtered.to_csv('data/complaints_cleaned.csv', index=False)
print('Reduced cleaned dataset saved to data/complaints_cleaned.csv')

Current working directory: C:\Users\Skyline\Intelligent-Complaint-Analysis-for-Financial-Services


  df = pd.read_csv('data/complaints.csv')


Dataset loaded successfully.
Dataset columns: ['Date received', 'Product', 'Sub-product', 'Issue', 'Sub-issue', 'Consumer complaint narrative', 'Company public response', 'Company', 'State', 'ZIP code', 'Tags', 'Consumer consent provided?', 'Submitted via', 'Date sent to company', 'Company response to consumer', 'Timely response?', 'Consumer disputed?', 'Complaint ID']
Word Count Summary Statistics:
count    9.609797e+06
mean     5.446667e+01
std      1.497672e+02
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      5.000000e+01
max      6.469000e+03
Name: word_count, dtype: float64
Number of very short narratives (<10 words): 6650979
Number of very long narratives (>500 words): 160989
Complaints with narratives: 2980756
Complaints without narratives: 6629041
Number of complaints after filtering: 554979
Number of complaints after removing empty narratives: 232193
Number of complaints after sampling (500 rows): 500
Reduced cleaned dataset saved to data/complaints_c