In [3]:
import pandas as pd
import numpy as np

!pip install fuzzywuzzy python-Levenshtein
from fuzzywuzzy import process



In [4]:
df = pd.read_csv('fact_credit_card.csv')

In [5]:
df.shape

(5025, 6)

In [6]:
df.isnull().sum()

card_id         0
customer_id     0
card_type       0
limit_amount    0
issue_date      0
is_active       0
dtype: int64

In [7]:
df.head()

Unnamed: 0,card_id,customer_id,card_type,limit_amount,issue_date,is_active
0,1,4001,RPuay,19593.39,03/02/2017,1
1,2,544,RuPay,-42708.22,2022-12-03,1
2,3,4020,MasterCard,134380.06,2020-06-04,0
3,4,4009,Amex,111110.06,"Jul 07, 2019",1
4,5,4029,RuPay,112894.0,2017/01/29,1


In [8]:
df.describe

<bound method NDFrame.describe of       card_id  customer_id   card_type limit_amount    issue_date is_active
0           1         4001       RPuay     19593.39    03/02/2017         1
1           2          544       RuPay    -42708.22    2022-12-03         1
2           3         4020  MasterCard    134380.06    2020-06-04         0
3           4         4009        Amex    111110.06  Jul 07, 2019         1
4           5         4029       RuPay     112894.0    2017/01/29         1
...       ...          ...         ...          ...           ...       ...
5020     2593         4005  MasterCard    204267.97    13-01-2014         1
5021     1841         4008  MasterCard   -112088.68    2019-11-05         0
5022     3075         4003  MasterCard    135449.27    2019-07-27         1
5023     4297         4039        Visa     78413.26    2015-10-15         1
5024     2052         4012  MasterCard     83520.21    2023-06-29         0

[5025 rows x 6 columns]>

In [9]:
df.describe()

Unnamed: 0,card_id,customer_id
count,5025.0,5025.0
mean,2499.419303,3995.563383
std,1443.043476,251.944288
min,1.0,126.0
25%,1250.0,4010.0
50%,2502.0,4020.0
75%,3748.0,4030.0
max,5000.0,4040.0


In [10]:
df.duplicated().sum()

np.int64(25)

In [11]:
df.drop_duplicates()

Unnamed: 0,card_id,customer_id,card_type,limit_amount,issue_date,is_active
0,1,4001,RPuay,19593.39,03/02/2017,1
1,2,544,RuPay,-42708.22,2022-12-03,1
2,3,4020,MasterCard,134380.06,2020-06-04,0
3,4,4009,Amex,111110.06,"Jul 07, 2019",1
4,5,4029,RuPay,112894.0,2017/01/29,1
...,...,...,...,...,...,...
4995,4996,4009,Amex,138439.95,2021-06-27,1
4996,4997,4025,MasterCard,197593.27,2021-05-26,1
4997,4998,4032,RuPay,110628.31,2024-10-30,1
4998,4999,4005,Amex,193948.15,2025-10-02,1


In [12]:
# 1. Fix issue_date - convert to datetime
df['issue_date'] = pd.to_datetime(df['issue_date'], format='mixed', dayfirst=True)

In [13]:
df['issue_date'] = pd.to_datetime(df['issue_date'], errors='coerce')
df.dtypes

card_id                  int64
customer_id              int64
card_type               object
limit_amount            object
issue_date      datetime64[ns]
is_active               object
dtype: object

In [14]:
df.isnull().sum()

card_id         0
customer_id     0
card_type       0
limit_amount    0
issue_date      0
is_active       0
dtype: int64

In [15]:
df.head(10)

Unnamed: 0,card_id,customer_id,card_type,limit_amount,issue_date,is_active
0,1,4001,RPuay,19593.39,2017-02-03,1
1,2,544,RuPay,-42708.22,2022-12-03,1
2,3,4020,MasterCard,134380.06,2020-06-04,0
3,4,4009,Amex,111110.06,2019-07-07,1
4,5,4029,RuPay,112894.0,2017-01-29,1
5,6,4005,Amex,83192.61,2025-10-12,1
6,7,4021,Amex,206911.34,2014-05-04,1
7,8,4032,Amex,253629.22,2017-10-12,1
8,9,4040,Amex,46289.85,2020-07-09,1
9,10,4034,Visa,246651.17,2021-08-05,1


In [16]:
# 1. Basic cleaning (strip + lowercase)
df['card_type'] = (df['card_type'].astype(str).str.strip().str.lower())
    

In [17]:
# 2. Valid categories
valid_cards = [
    'visa',
    'mastercard',
    'amex',
    'rupay'
]

In [18]:
# 3. Fuzzy match function
def clean_card_type(x):
    match, score = process.extractOne(x, valid_cards)
    return match if score >= 60 else x

In [19]:
# 4. Apply correction
df['card_type'] = df['card_type'].apply(clean_card_type)

In [20]:
# 5. Proper formatting
df['card_type'] = df['card_type'].replace({
    'visa': 'Visa',
    'mastercard': 'MasterCard',
    'amex': 'Amex',
    'rupay': 'RuPay'
})

In [21]:
print(df['card_type'].value_counts())
print(df['card_type'].unique())


card_type
Visa          1288
MasterCard    1271
RuPay         1237
Amex          1229
Name: count, dtype: int64
['RuPay' 'MasterCard' 'Amex' 'Visa']


In [22]:
df.isnull().sum()

card_id         0
customer_id     0
card_type       0
limit_amount    0
issue_date      0
is_active       0
dtype: int64

In [23]:
# 5. Fix amount - remove currency symbols and commas
df['limit_amount'] = df['limit_amount'].astype(str).str.replace('₹', '').str.replace(',', '')
df['limit_amount'] = df['limit_amount'].astype(float)
df.dtypes

card_id                  int64
customer_id              int64
card_type               object
limit_amount           float64
issue_date      datetime64[ns]
is_active               object
dtype: object

In [24]:
df['is_active'] = df['is_active'].astype(str).str.strip().str.lower()


In [25]:
df['is_active'] = df['is_active'].replace({
    '1': 1,
    'true': 1,
    'yes': 1,
    '0': 0,
    'false': 0,
    'no': 0
})


  df['is_active'] = df['is_active'].replace({


In [26]:
df['is_active'] = df['is_active'].astype(int)

In [27]:
df.dtypes

card_id                  int64
customer_id              int64
card_type               object
limit_amount           float64
issue_date      datetime64[ns]
is_active                int64
dtype: object

In [28]:
df.isnull().sum()

card_id         0
customer_id     0
card_type       0
limit_amount    0
issue_date      0
is_active       0
dtype: int64

In [29]:
df.isnull().sum()

card_id         0
customer_id     0
card_type       0
limit_amount    0
issue_date      0
is_active       0
dtype: int64

In [30]:
df.shape

(5025, 6)

In [31]:
df.duplicated().sum()


np.int64(25)

In [32]:
df.duplicated('card_id').sum()


np.int64(25)

In [33]:
df[df.duplicated('card_id', keep=False)].sort_values('card_id')


Unnamed: 0,card_id,customer_id,card_type,limit_amount,issue_date,is_active
48,49,4019,RuPay,175681.55,2024-03-04,1
5001,49,4019,RuPay,175681.55,2024-03-04,1
230,231,4037,MasterCard,201351.84,2022-08-31,1
5013,231,4037,MasterCard,201351.84,2022-08-31,1
5002,268,4011,RuPay,100640.6,2018-12-17,1
267,268,4011,RuPay,100640.6,2018-12-17,1
5017,278,4026,Visa,98400.8,2024-01-12,1
277,278,4026,Visa,98400.8,2024-01-12,1
897,898,4038,MasterCard,94039.85,2019-01-06,1
5005,898,4038,MasterCard,94039.85,2019-01-06,1


In [34]:
df = df.drop_duplicates(subset='card_id', keep='first')

In [35]:

df[df.duplicated('card_id', keep=False)]


Unnamed: 0,card_id,customer_id,card_type,limit_amount,issue_date,is_active


In [36]:
df.shape

(5000, 6)

In [37]:
#credit_card.csv → fix negative limit_amount
df['limit_amount'] = df['limit_amount'].abs()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['limit_amount'] = df['limit_amount'].abs()


In [38]:
df[df['limit_amount'] < 0]

Unnamed: 0,card_id,customer_id,card_type,limit_amount,issue_date,is_active


In [39]:
df.dtypes

card_id                  int64
customer_id              int64
card_type               object
limit_amount           float64
issue_date      datetime64[ns]
is_active                int64
dtype: object

In [40]:
df.shape

(5000, 6)

In [42]:
df.to_csv(r"C:\Users\swath\Downloads\credit_card.csv" , index=False)