# **Cleaning Credit Card Complaint Dataset**

Importing necessary libraries

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt

Loading the dataset

In [None]:
# Read the raw complaints CSV (path is relative to the working directory)
# into the DataFrame that every later cell works on.
df = pd.read_csv("credit_card_complaints.csv")

Analysing dataset

In [None]:
# Print a summary of the frame: columns, non-null counts and dtypes.
df.info()

In [None]:
# Show the column labels of the freshly loaded frame.
df.columns

In [None]:
# Remember the starting dimensions of the raw frame; df.shape is
# (row count, column count).
initial_rows, initial_cols = df.shape
print("Columns: ", initial_cols)
print(f"Rows: {initial_rows:,}")

In [None]:
# Number of cells in the raw frame: column count times row count.
initial_total_values = initial_cols * initial_rows
print(f"Total values: {initial_total_values:,}")

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Boolean mask marking every row that fully repeats an earlier row
# (the first occurrence is not flagged), then the number of flagged rows.
duplicates = df.duplicated(keep='first')
d_count = duplicates.sum()

print(f"Number of duplicates: {d_count}")

In [None]:
# Sanity check before the 'product' column is dropped: count the rows whose
# product is anything other than 'Credit card'. Rows with a missing product
# are counted too, because NaN != 'Credit card' evaluates to True.
other_products = df[df['product'] != 'Credit card']
o_count = len(other_products)
print("Count of products other than credit card: ",o_count)

In [None]:
# Remove the 'product' column from the frame, then preview the result.
df = df.drop(columns=['product'])
df.head()

In [None]:
# Baseline count of missing cells: per-column NaN counts summed over all
# columns. Kept for the before/after comparisons printed later.
initial_null_values = df.isna().sum().sum()
print(f"Total NaN values: {initial_null_values:,}")

In [None]:
# Missing-value counts for the four columns under inspection.
si_null = df['sub_issue'].isna().sum()
cpr_null = df['company_public_response'].isna().sum()
ccn_null = df['consumer_complaint_narrative'].isna().sum()
ccp_null = df['consumer_consent_provided'].isna().sum()

# Report each count as a percentage of the current number of rows.
null_report = (
    ('sub_issue', si_null),
    ('consumer_complaint_narrative', ccn_null),
    ('company_public_response', cpr_null),
    ('consumer_consent_provided', ccp_null),
)
for column_label, missing in null_report:
    print(f"Null value % in {column_label}: {missing/len(df)*100:.2f}%")

In [None]:
# Drop three of the columns whose null share was just reported, then
# preview the slimmer frame.
df = df.drop(columns=['sub_issue', 'consumer_complaint_narrative', 'company_public_response'])
df.head()

In [None]:
# Keep only the rows that have a value in 'consumer_consent_provided'.
df = df.dropna(subset=['consumer_consent_provided'])

In [None]:
# Compare the baseline NaN count with what remains in the frame now.
null_values = df.isna().sum().sum()
print(f"Initial NaN values: {initial_null_values:,}")
print(f"Current NaN values: {null_values:,}")

In [None]:
# Preview the frame after the rows without a consent value were removed.
df.head()

In [None]:
# Missing-value counts for 'sub_product' and 'tags', the two columns
# considered for removal next.
sp_null = df['sub_product'].isnull().sum()
tags_null = df['tags'].isnull().sum()

# Each label names the column whose nulls were counted; the share is taken
# against the current number of rows.
print(f"Null value % in sub_product: {sp_null/len(df)*100:.2f}%")
print(f"Null value % in tags: {tags_null/len(df)*100:.2f}%")

In [None]:
# Remove the 'sub_product' and 'tags' columns, then preview the frame.
df = df.drop(columns=['sub_product', 'tags'])
df.head()

In [None]:
# Re-count the missing cells and print them next to the baseline figure.
null_values = df.isna().sum().sum()
print(f"Initial NaN values: {initial_null_values:,}")
print(f"Current NaN values: {null_values:,}")

In [None]:
# Print the dtypes of 'zip_code' and 'date_sent_to_company', in that order.
for inspected_column in ('zip_code', 'date_sent_to_company'):
    print(df[inspected_column].dtype)

In [None]:
# Remove the 'zip_code' column from the frame.
df = df.drop(columns=['zip_code'])

In [None]:
# Remove the 'complaint_id' column from the frame.
df = df.drop(columns=['complaint_id'])

In [None]:
# Summarise the current frame: total cell count, missing cell count, and
# the missing share as a percentage of all cells.
rows, cols = df.shape
null_values = df.isna().sum().sum()
total_values = cols * rows

print(f"Total Values: {total_values:,}")
print(f"Total NaN values: {null_values:,}")
print(f"% of NaN values: {null_values/total_values*100:.2f}")

In [None]:
# Parse both date columns into pandas datetime values so they can be
# subtracted from each other.
for date_column in ('date_received', 'date_sent_to_company'):
    df[date_column] = pd.to_datetime(df[date_column])

# Confirm the conversion by printing the resulting dtypes.
print(df['date_received'].dtype)
print(df['date_sent_to_company'].dtype)

In [None]:
# Whole days between receiving a complaint and sending it to the company;
# the two source date columns are removed once the difference is stored.
elapsed = df['date_sent_to_company'] - df['date_received']
df['time_difference'] = elapsed.dt.days
df = df.drop(columns=['date_sent_to_company', 'date_received'])

In [None]:
# Preview the frame with the new 'time_difference' column.
df.head()

In [None]:
# Discard rows with a negative day difference, i.e. rows where the sent
# date is earlier than the received date.
non_negative = df['time_difference'].ge(0)
df = df.loc[non_negative]
df.head()

In [None]:
# List the columns that remain before the encoding step.
df.columns

Categorical Data Encoding and Labeling

In [None]:
# One-hot encode the listed categorical columns into indicator columns.
one_hot_columns = ['company_response_to_consumer', 'submitted_via', 'company', 'issue', 'state']
df_encoded = pd.get_dummies(df, columns=one_hot_columns)

# Integer-encode the two remaining label columns. The single encoder is
# re-fitted for each column, so afterwards it only holds the classes of
# the last one ('timely_response').
label_encoder = LabelEncoder()
for label_column in ('consumer_disputed', 'timely_response'):
    df_encoded[label_column] = label_encoder.fit_transform(df_encoded[label_column])

df_encoded.head()

In [None]:
# Binary flag: 1 where the value is exactly 'Consent provided', 0 for every
# other value. The comparison is vectorized rather than applied row by row;
# the explicit int64 cast keeps the integer width the same on every platform.
consent_given = df_encoded['consumer_consent_provided'].eq('Consent provided')
df_encoded['consumer_consent_provided'] = consent_given.astype('int64')
df_encoded.head()

In [None]:
# Save the encoded frame as Downloads/cleaned_data.csv, relative to the
# working directory, without the index column. The path is built with
# pathlib so the separator is right on every OS (a literal backslash is
# only a separator on Windows), and the folder is created if it is missing
# so the write does not fail on a fresh checkout.
from pathlib import Path

output_path = Path('Downloads') / 'cleaned_data.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
df_encoded.to_csv(output_path, index=False)