In [1]:
import pandas as pd
import numpy as np

!pip install fuzzywuzzy python-Levenshtein
from fuzzywuzzy import process



In [2]:
# Load the account table from a CSV located relative to the working directory.
df = pd.read_csv('account_dim.csv')

In [3]:
# Summarize columns: names, non-null counts, dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6998 entries, 0 to 6997
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   account_id        6998 non-null   int64  
 1   customer_id       6998 non-null   int64  
 2   branch_id         6998 non-null   int64  
 3   account_type      6998 non-null   object 
 4   opening_date      6998 non-null   object 
 5   balance           6998 non-null   float64
 6   status            6998 non-null   object 
 7   account_age_days  6998 non-null   int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 437.5+ KB


In [5]:
# Preview the first rows of the raw table.
df.head()

Unnamed: 0,account_id,customer_id,branch_id,account_type,opening_date,balance,status,account_age_days
0,1,1024,89,Credit Card,2013-06-19,25.31,Suspended,4523
1,2,3002,124,Loan,2017-01-11,331.88,Suspended,3221
2,3,3628,130,Current,2014-02-06,148.28,Suspended,4291
3,4,612,37,Savings,2024-01-02,17.12,Active,674
4,5,1472,33,Current,2025-05-27,180.89,Closed,163


In [6]:
# Count missing values per column.
df.isnull().sum()

account_id          0
customer_id         0
branch_id           0
account_type        0
opening_date        0
balance             0
status              0
account_age_days    0
dtype: int64

In [24]:
# Count rows that are exact duplicates of an earlier row (all columns compared).
df.duplicated().sum()

np.int64(0)

In [7]:
# List the distinct account_type labels to spot misspelled variants.
df['account_type'].unique()

array(['Credit Card', 'Loan', 'Current', 'Savings', 'Cerdit Card',
       'Savigns', 'Crurent', 'Crdeit Card', 'Credit Crad', 'Curretn',
       'Credit Cadr', 'Credit aCrd'], dtype=object)

In [8]:
# Define the clean, valid account types.
# 1. valid account types (lower-case, so they compare against normalized text)
valid_accounts = ['credit card', 'loan', 'current', 'savings']

In [9]:
# Normalize the raw account types: coerce to text, trim surrounding
# whitespace, and lower-case.
df['account_type'] = (
    df['account_type']
    .astype(str)
    .str.strip()
    .str.lower()
)

In [10]:
# 2. fuzzy correction function
def correct_account_type(x, choices=None, threshold=60):
    """Map a possibly misspelled account type onto its canonical label.

    The value is trimmed and lower-cased, then looked up in ``choices``.
    An exact hit is returned directly; otherwise the best candidate from
    ``process.extractOne`` is accepted when its score reaches ``threshold``.

    Parameters
    ----------
    x : object
        Raw cell value. Anything that is not a missing value is coerced
        with ``str``.
    choices : list of str, optional
        Canonical lower-case labels to match against. Defaults to the
        notebook-level ``valid_accounts`` list.
    threshold : int, optional
        Minimum fuzzy score required to accept a correction. Defaults to 60.

    Returns
    -------
    object
        The title-cased canonical label, or the title-cased input when no
        candidate scores high enough, or ``x`` unchanged when it is missing.
    """
    # Keep missing values missing: str(None) / str(nan) would otherwise become
    # the text 'none' / 'nan' and come back as a made-up account type.
    # `is True` keeps non-scalar inputs (where isna returns an array) on the
    # normal string path.
    if pd.isna(x) is True:
        return x

    if choices is None:
        choices = valid_accounts

    text = str(x).strip().lower()           # normalize here

    # Values that are already canonical need no fuzzy scoring.
    if text in choices:
        return text.title()

    best = process.extractOne(text, choices)
    # extractOne returns None when there are no choices to compare against.
    if best is None:
        return text.title()                 # fallback

    match, score = best
    if score >= threshold:
        return match.title()                # 'credit card' -> 'Credit Card'
    return text.title()                     # fallback

In [11]:
# 3. apply directly to the column: every value is passed through
# correct_account_type and the column is overwritten with the results.
df['account_type'] = df['account_type'].apply(correct_account_type)

In [12]:
# Verify that only the canonical account types remain after correction.
df['account_type'].unique()


array(['Credit Card', 'Loan', 'Current', 'Savings'], dtype=object)

In [13]:
# Inspect the first 30 rows after the account_type correction.
df.head(30)

Unnamed: 0,account_id,customer_id,branch_id,account_type,opening_date,balance,status,account_age_days
0,1,1024,89,Credit Card,2013-06-19,25.31,Suspended,4523
1,2,3002,124,Loan,2017-01-11,331.88,Suspended,3221
2,3,3628,130,Current,2014-02-06,148.28,Suspended,4291
3,4,612,37,Savings,2024-01-02,17.12,Active,674
4,5,1472,33,Current,2025-05-27,180.89,Closed,163
5,6,2507,60,Credit Card,2021-04-28,594.74,Suspended,1653
6,7,2198,195,Loan,2013-10-18,442.08,Suspended,4402
7,8,2885,51,Current,2025-03-15,110.71,Active,236
8,9,183,111,Loan,2015-04-04,129.88,Closed,3869
9,10,1283,22,Current,2024-04-15,44.93,Suspended,570


In [14]:
# Convert opening_date to datetime; errors='coerce' turns values that cannot
# be parsed into NaT instead of raising.
df['opening_date'] = pd.to_datetime(df['opening_date'], errors='coerce')
# Display the column dtypes to confirm the conversion.
df.dtypes

account_id                   int64
customer_id                  int64
branch_id                    int64
account_type                object
opening_date        datetime64[ns]
balance                    float64
status                      object
account_age_days             int64
dtype: object

In [15]:
# List the distinct status labels to check for inconsistent spellings.
df['status'].unique()

array(['Suspended', 'Active', 'Closed'], dtype=object)

In [16]:
# Re-count missing values per column; dates coerced to NaT would show up here.
df.isnull().sum()

account_id          0
customer_id         0
branch_id           0
account_type        0
opening_date        0
balance             0
status              0
account_age_days    0
dtype: int64

In [17]:
# Re-count exact duplicate rows after the cleaning steps above.
df.duplicated().sum()

np.int64(0)

In [18]:
# Make every balance non-negative by replacing it with its absolute value.
# NOTE(review): abs() silently flips the sign of negative balances instead of
# flagging them — confirm negatives are data-entry errors and not real debits.
df['balance'] = df['balance'].abs()


In [20]:
# Sanity check: select rows whose balance is still negative (expected: none).
df[df['balance'] < 0]

Unnamed: 0,account_id,customer_id,branch_id,account_type,opening_date,balance,status,account_age_days


In [21]:
# Re-check the column dtypes before export.
df.dtypes

account_id                   int64
customer_id                  int64
branch_id                    int64
account_type                object
opening_date        datetime64[ns]
balance                    float64
status                      object
account_age_days             int64
dtype: object

In [22]:
# Preview the first 103 rows of the cleaned table.
df.head(103)

Unnamed: 0,account_id,customer_id,branch_id,account_type,opening_date,balance,status,account_age_days
0,1,1024,89,Credit Card,2013-06-19,25.31,Suspended,4523
1,2,3002,124,Loan,2017-01-11,331.88,Suspended,3221
2,3,3628,130,Current,2014-02-06,148.28,Suspended,4291
3,4,612,37,Savings,2024-01-02,17.12,Active,674
4,5,1472,33,Current,2025-05-27,180.89,Closed,163
...,...,...,...,...,...,...,...,...
98,99,1299,123,Current,2015-03-17,326.36,Suspended,3887
99,100,697,118,Credit Card,2024-08-13,214.73,Suspended,450
100,101,2161,24,Credit Card,2023-05-12,135.44,Closed,909
101,102,1762,6,Current,2025-06-09,41.87,Active,150


In [23]:
# Row and column counts of the cleaned table.
df.shape

(6998, 8)

In [24]:
# Write the cleaned table to the current user's Downloads folder. Building the
# path from the home directory avoids hard-coding one machine's user profile,
# so the cell runs unchanged for other users.
from pathlib import Path

output_path = Path.home() / "Downloads" / "account.csv"
# The folder may not exist on every machine; create it rather than fail.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)