In [1]:
import pandas as pd
import numpy as np

!pip install fuzzywuzzy python-Levenshtein
from fuzzywuzzy import process




In [3]:
df = pd.read_csv('fact_transaction.csv')

In [4]:
df.shape

(12040, 7)

In [5]:
df.head()

Unnamed: 0,transaction_id,account_id,transaction_date,transaction_type,amount,channel,is_fraud
0,1,5857,2025/05/19,POS,6085.95,UPI,0
1,2,2564,2023/01/30 11:07,ATM Withdrawal,5785.92,UPI,0
2,3,2275,2024-01-12 14:28:10,Debit,522.88,UPI,0
3,4,6471,10/07/2018 18:37:52,Credit,1386.3,UPI,0
4,5,6290,2018/04/13,UPI,2879.44,Online,0


In [6]:
df.dtypes 

transaction_id       int64
account_id           int64
transaction_date    object
transaction_type    object
amount              object
channel             object
is_fraud            object
dtype: object

In [7]:
df.duplicated().sum()

np.int64(40)

In [8]:
df = df.drop_duplicates()

In [9]:
df.shape

(12000, 7)

In [10]:
df['transaction_date'] = pd.to_datetime(df['transaction_date'], format='mixed', dayfirst=True)
df['transaction_date'] = pd.to_datetime(df['transaction_date']).dt.strftime('%Y-%m-%d %H:%M:%S')

In [11]:
df.head(25)

Unnamed: 0,transaction_id,account_id,transaction_date,transaction_type,amount,channel,is_fraud
0,1,5857,2025-05-19 00:00:00,POS,6085.95,UPI,0
1,2,2564,2023-01-30 11:07:00,ATM Withdrawal,5785.92,UPI,0
2,3,2275,2024-01-12 14:28:10,Debit,522.88,UPI,0
3,4,6471,2018-07-10 18:37:52,Credit,1386.3,UPI,0
4,5,6290,2018-04-13 00:00:00,UPI,2879.44,Online,0
5,6,6516,2019-11-07 00:00:00,ATM Withdrawal,441.92,ATM,0
6,7,1802,2025-08-31 05:38:00,Debit,2148.77,ATM,0
7,8,2096,2020-01-06 15:14:51,UPI,3843.81,POS,0
8,9,3269,2021-02-11 08:55:37,UPI,7571.14,POS,1
9,10,6750,2022-11-04 17:24:01,Credit,4316.21,Branch,0


In [12]:
df.dtypes 

transaction_id       int64
account_id           int64
transaction_date    object
transaction_type    object
amount              object
channel             object
is_fraud            object
dtype: object

In [13]:
df['transaction_date'] = pd.to_datetime(df['transaction_date'], errors='coerce')
df.dtypes

transaction_id               int64
account_id                   int64
transaction_date    datetime64[ns]
transaction_type            object
amount                      object
channel                     object
is_fraud                    object
dtype: object

In [14]:
df.isnull().sum()

transaction_id      0
account_id          0
transaction_date    0
transaction_type    0
amount              0
channel             0
is_fraud            0
dtype: int64

In [15]:
df['transaction_type'] = (df['transaction_type'].astype(str).str.lower().str.strip())
    

In [16]:
valid_types = [
    'pos',
    'debit',
    'credit',
    'upi',
    'atm withdrawal',
    'online transfer'
]

In [17]:
def correct_type(x):
    match, score = process.extractOne(x, valid_types)
    return match if score >= 60 else x   

In [18]:
df['transaction_type'] = df['transaction_type'].apply(correct_type)


In [19]:
df['transaction_type'] = df['transaction_type'].replace({
    'pos': 'POS',
    'debit': 'Debit',
    'credit': 'Credit',
    'upi': 'UPI',
    'atm withdrawal': 'ATM Withdrawal',
    'online transfer': 'Online Transfer'
})

In [20]:
print(df['transaction_type'].value_counts())
print(df['transaction_type'].unique())
df.dtypes 


transaction_type
POS                2095
Debit              2025
ATM Withdrawal     2007
Credit             1968
Online Transfer    1963
UPI                1942
Name: count, dtype: int64
['POS' 'ATM Withdrawal' 'Debit' 'Credit' 'UPI' 'Online Transfer']


transaction_id               int64
account_id                   int64
transaction_date    datetime64[ns]
transaction_type            object
amount                      object
channel                     object
is_fraud                    object
dtype: object

In [21]:
df.isnull().sum()

transaction_id      0
account_id          0
transaction_date    0
transaction_type    0
amount              0
channel             0
is_fraud            0
dtype: int64

In [22]:
df['channel'] = (df['channel'].astype(str).str.strip().str.lower())

In [23]:
# 2. Define correct target categories
valid_channels = [
    'atm',
    'branch',
    'pos',
    'upi',
    'online'
]

In [24]:
# 3. Fuzzy function
def correct_channel(x):
    match, score = process.extractOne(x, valid_channels)
    return match if score >= 60 else x

In [25]:
# 4. Apply correction
df['channel'] = df['channel'].apply(correct_channel)


In [26]:
# 5. Proper casing
df['channel'] = df['channel'].replace({
    'atm': 'ATM',
    'branch': 'Branch',
    'pos': 'POS',
    'upi': 'UPI',
    'online': 'Online'
})

In [27]:
# 6. Check result
print(df['channel'].value_counts())
print(df['channel'].unique())

channel
ATM       2487
Branch    2426
POS       2397
Online    2351
UPI       2339
Name: count, dtype: int64
['UPI' 'Online' 'ATM' 'POS' 'Branch']


In [28]:
df.isnull().sum()

transaction_id      0
account_id          0
transaction_date    0
transaction_type    0
amount              0
channel             0
is_fraud            0
dtype: int64

In [29]:
df['is_fraud'] = df['is_fraud'].astype(str).str.strip().str.lower()

In [30]:
#(Handles values: “yes/no”, “true/false”, “1/0”, mixed spellings)
df['is_fraud'] = df['is_fraud'].replace({
    'yes': 1,
    'y': 1,
    'true': 1,
    '1': 1,
    'fraud': 1,
    'no': 0,
    'n': 0,
    'false': 0,
    '0': 0,
    'not fraud': 0
})


  df['is_fraud'] = df['is_fraud'].replace({


In [31]:
df['is_fraud'] = df['is_fraud'].astype(int)
df.dtypes

transaction_id               int64
account_id                   int64
transaction_date    datetime64[ns]
transaction_type            object
amount                      object
channel                     object
is_fraud                     int64
dtype: object

In [32]:
df.isnull().sum()


transaction_id      0
account_id          0
transaction_date    0
transaction_type    0
amount              0
channel             0
is_fraud            0
dtype: int64

In [33]:
# 5. Fix amount - remove currency symbols and commas
df['amount'] = df['amount'].astype(str).str.replace('₹', '').str.replace(',', '')
df['amount'] = df['amount'].astype(float)
df.dtypes

transaction_id               int64
account_id                   int64
transaction_date    datetime64[ns]
transaction_type            object
amount                     float64
channel                     object
is_fraud                     int64
dtype: object

In [35]:
#transaction.csv → fix negative amount
df['amount'] = df['amount'].abs()


In [36]:
df[df['amount'] < 0]

Unnamed: 0,transaction_id,account_id,transaction_date,transaction_type,amount,channel,is_fraud


In [37]:
df.dtypes

transaction_id               int64
account_id                   int64
transaction_date    datetime64[ns]
transaction_type            object
amount                     float64
channel                     object
is_fraud                     int64
dtype: object

In [40]:
df.head(93)

Unnamed: 0,transaction_id,account_id,transaction_date,transaction_type,amount,channel,is_fraud
0,1,5857,2025-05-19 00:00:00,POS,6085.95,UPI,0
1,2,2564,2023-01-30 11:07:00,ATM Withdrawal,5785.92,UPI,0
2,3,2275,2024-01-12 14:28:10,Debit,522.88,UPI,0
3,4,6471,2018-07-10 18:37:52,Credit,1386.30,UPI,0
4,5,6290,2018-04-13 00:00:00,UPI,2879.44,Online,0
...,...,...,...,...,...,...,...
88,89,2928,2019-03-25 10:49:00,ATM Withdrawal,500.19,ATM,0
89,90,3611,2018-09-05 09:32:30,UPI,3736.57,Online,0
90,91,708,2024-01-11 04:57:32,Online Transfer,7871.65,Online,0
91,92,471,2021-02-11 06:14:59,Credit,2007.22,ATM,0


In [41]:
df.to_csv(r"C:\Users\swath\Downloads\transaction.csv", index=False)