In [1]:
import pandas as pd
import numpy as np

!pip install fuzzywuzzy python-Levenshtein
from fuzzywuzzy import process



In [3]:
# Load the raw fraud-case extract (path is relative to the working directory).
df = pd.read_csv('fact_fraud_case.csv')

In [4]:
# Row and column counts of the freshly loaded frame.
df.shape

(2515, 7)

In [5]:
# Peek at the first rows to see the raw value formats.
df.head()

Unnamed: 0,fraud_id,transaction_id,fraud_type,detection_method,report_date,resolution_status,loss_amount
0,1,4,Vishing,AI Alert,17/02/2024,Closed,2911.34
1,2,8197,Smishing,Customer Complaint,25/06/2019,Investigating,5331.17
2,3,8199,Chargeback Fraud,AI Alert,2020-09-14,Resolved,28211.42
3,4,7,Account Takeover,AI Alert,2024-06-29,Resolved,10280.88
4,5,9,Vishing,Customer Complaint,2020/12/31,Closed,5322.88


In [6]:
# Count missing values per column.
df.isnull().sum()

fraud_id             0
transaction_id       0
fraud_type           0
detection_method     0
report_date          0
resolution_status    0
loss_amount          0
dtype: int64

In [7]:
# Count rows that are exact copies of an earlier row.
df.duplicated().sum()

np.int64(15)

In [8]:
# Preview only: the de-duplicated frame is displayed but not assigned, so `df`
# still contains the duplicate rows after this cell. They are actually removed
# further down with drop_duplicates(subset='fraud_id', keep='first').
df.drop_duplicates()

Unnamed: 0,fraud_id,transaction_id,fraud_type,detection_method,report_date,resolution_status,loss_amount
0,1,4,Vishing,AI Alert,17/02/2024,Closed,2911.34
1,2,8197,Smishing,Customer Complaint,25/06/2019,Investigating,5331.17
2,3,8199,Chargeback Fraud,AI Alert,2020-09-14,Resolved,28211.42
3,4,7,Account Takeover,AI Alert,2024-06-29,Resolved,10280.88
4,5,9,Vishing,Customer Complaint,2020/12/31,Closed,5322.88
...,...,...,...,...,...,...,...
2495,2496,8156,Account aTkeover,AI Alert,2021-06-15,Investigating,3505.91
2496,2497,8162,Identity Theft,Customer Complaint,11-09-2024,Closed,-10902.92
2497,2498,8179,Vishing,AI Alert,13/07/2025,Investigating,21524.74
2498,2499,8183,Phishing,Manual Review,30/06/2020,Closed,5839.43


In [9]:
# 1. Fix report_date - convert to datetime
# The column mixes several date layouts (e.g. 17/02/2024, 2020-09-14,
# 2020/12/31), so format='mixed' parses each value on its own and
# dayfirst=True reads ambiguous values as day/month rather than month/day.
df['report_date'] = pd.to_datetime(df['report_date'], format='mixed', dayfirst=True)

In [10]:
# report_date was already converted to datetime64 by the previous cell (which
# raises on any unparseable value), so a second pd.to_datetime pass with
# errors='coerce' could never change anything. Just confirm the column dtypes.
df.dtypes

fraud_id                      int64
transaction_id                int64
fraud_type                   object
detection_method             object
report_date          datetime64[ns]
resolution_status            object
loss_amount                  object
dtype: object

In [11]:
# Re-check missing values per column after the date conversion.
df.isnull().sum()

fraud_id             0
transaction_id       0
fraud_type           0
detection_method     0
report_date          0
resolution_status    0
loss_amount          0
dtype: int64

In [12]:
# Step 1: Normalise the text - cast to str, trim surrounding whitespace, lower-case
df['fraud_type'] = (
    df['fraud_type']
    .astype(str)
    .str.strip()
    .str.lower()
)


In [13]:
# Step 2: Canonical (lower-case) fraud types that misspelt values are matched against
valid_fraud_types = [
    "phishing", "vishing", "identity theft", "chargeback fraud",
    "card cloning", "account takeover", "smishing",
]

In [14]:
# Step 3: Fuzzy match function
def clean_fraud_type(x, threshold=60):
    """Snap a fraud-type label to its closest entry in ``valid_fraud_types``.

    Parameters
    ----------
    x : str
        Label to correct. It is expected to be stripped and lower-cased
        already (done in Step 1), matching the casing of the valid list.
    threshold : int, optional
        Minimum fuzzywuzzy similarity score (0-100) required to accept the
        closest entry. Defaults to 60, the cut-off this notebook has used so far.

    Returns
    -------
    str
        The closest valid fraud type when its score reaches ``threshold``;
        otherwise ``x`` unchanged.
    """
    best = process.extractOne(x, valid_fraud_types)
    # extractOne returns None when there is nothing to compare against;
    # keep the value as-is instead of failing on tuple unpacking.
    if best is None:
        return x
    match, score = best
    return match if score >= threshold else x

In [15]:
# Step 4: Run every fraud_type value through the fuzzy corrector
df['fraud_type'] = df['fraud_type'].map(clean_fraud_type)


In [16]:
# Step 5: Restore display casing (Title Case) for each canonical label
df['fraud_type'] = df['fraud_type'].replace(
    to_replace={
        "phishing": "Phishing",
        "vishing": "Vishing",
        "identity theft": "Identity Theft",
        "chargeback fraud": "Chargeback Fraud",
        "card cloning": "Card Cloning",
        "account takeover": "Account Takeover",
        "smishing": "Smishing",
    }
)

In [17]:
# Check: frequency of each fraud type, followed by the distinct labels left
print(
    df['fraud_type'].value_counts(),
    df['fraud_type'].unique(),
    sep='\n',
)


fraud_type
Phishing            385
Chargeback Fraud    381
Account Takeover    362
Identity Theft      351
Vishing             347
Smishing            345
Card Cloning        344
Name: count, dtype: int64
['Vishing' 'Smishing' 'Chargeback Fraud' 'Account Takeover' 'Card Cloning'
 'Phishing' 'Identity Theft']


In [18]:
# 1. Normalise the text - cast to str, trim surrounding whitespace, lower-case
df['detection_method'] = (
    df['detection_method']
    .astype(str)
    .str.strip()
    .str.lower()
)
    

In [19]:
# 2. Canonical (lower-case) detection methods that misspelt values are matched against
valid_detection = ["ai alert", "customer complaint", "manual review"]

In [20]:
# 3. Fuzzy correction
def clean_detection(x, threshold=60):
    """Snap a detection-method label to its closest entry in ``valid_detection``.

    Parameters
    ----------
    x : str
        Label to correct. It is expected to be stripped and lower-cased
        already (done in step 1), matching the casing of the valid list.
    threshold : int, optional
        Minimum fuzzywuzzy similarity score (0-100) required to accept the
        closest entry. Defaults to 60, the cut-off this notebook has used so far.

    Returns
    -------
    str
        The closest valid detection method when its score reaches
        ``threshold``; otherwise ``x`` unchanged.
    """
    best = process.extractOne(x, valid_detection)
    # extractOne returns None when there is nothing to compare against;
    # keep the value as-is instead of failing on tuple unpacking.
    if best is None:
        return x
    match, score = best
    return match if score >= threshold else x

In [21]:
# Run every detection_method value through the fuzzy corrector
df['detection_method'] = df['detection_method'].map(clean_detection)


In [22]:
# 4. Restore display casing for each canonical detection method
df['detection_method'] = df['detection_method'].replace(
    to_replace={
        "ai alert": "AI Alert",
        "customer complaint": "Customer Complaint",
        "manual review": "Manual Review",
    }
)

In [23]:
# 5. Check: frequency of each detection method, followed by the distinct labels left
print(
    df['detection_method'].value_counts(),
    df['detection_method'].unique(),
    sep='\n',
)

detection_method
Customer Complaint    851
AI Alert              840
Manual Review         824
Name: count, dtype: int64
['AI Alert' 'Customer Complaint' 'Manual Review']


In [24]:
# 1. Normalise the text - cast to str, trim surrounding whitespace, lower-case
df['resolution_status'] = (
    df['resolution_status']
    .astype(str)
    .str.strip()
    .str.lower()
)
    

In [25]:
# Restore the capitalised final name of each status
df['resolution_status'] = df['resolution_status'].replace(
    to_replace={
        "investigating": "Investigating",
        "closed": "Closed",
        "resolved": "Resolved",
    }
)

In [26]:
# Re-check missing values per column after the categorical clean-up.
df.isnull().sum()

fraud_id             0
transaction_id       0
fraud_type           0
detection_method     0
report_date          0
resolution_status    0
loss_amount          0
dtype: int64

In [27]:
# Check: print both results. A bare expression that is not the last line of a
# cell is evaluated but never displayed, so value_counts() was silently lost.
print(df['resolution_status'].value_counts())
print(df['resolution_status'].unique())

array(['Closed', 'Investigating', 'Resolved'], dtype=object)

In [28]:
# Inspect the first 130 rows after cleaning the date and categorical columns.
df.head(130)

Unnamed: 0,fraud_id,transaction_id,fraud_type,detection_method,report_date,resolution_status,loss_amount
0,1,4,Vishing,AI Alert,2024-02-17,Closed,2911.34
1,2,8197,Smishing,Customer Complaint,2019-06-25,Investigating,5331.17
2,3,8199,Chargeback Fraud,AI Alert,2020-09-14,Resolved,28211.42
3,4,7,Account Takeover,AI Alert,2024-06-29,Resolved,10280.88
4,5,9,Vishing,Customer Complaint,2020-12-31,Closed,5322.88
...,...,...,...,...,...,...,...
125,126,8472,Smishing,Customer Complaint,2023-10-06,Closed,12600.42
126,127,283,Vishing,Manual Review,2023-06-06,Resolved,9367.22
127,128,286,Phishing,Customer Complaint,2024-06-20,Resolved,12005.76
128,129,287,Chargeback Fraud,Manual Review,2020-12-14,Investigating,24112.56


In [29]:
# 5. Fix amount: strip the rupee sign and commas from the text, then cast to float
df['loss_amount'] = (
    df['loss_amount']
    .astype(str)
    .str.replace('₹', '')
    .str.replace(',', '')
)
df['loss_amount'] = df['loss_amount'].astype('float64')
df.dtypes


fraud_id                      int64
transaction_id                int64
fraud_type                   object
detection_method             object
report_date          datetime64[ns]
resolution_status            object
loss_amount                 float64
dtype: object

In [30]:
# Re-check missing values per column after converting loss_amount to float.
df.isnull().sum()

fraud_id             0
transaction_id       0
fraud_type           0
detection_method     0
report_date          0
resolution_status    0
loss_amount          0
dtype: int64

In [31]:
# Row and column counts before de-duplication.
df.shape

(2515, 7)

In [32]:
# Count rows that are exact copies of an earlier row (all columns equal).
df.duplicated().sum()


np.int64(15)

In [33]:
# Count rows whose fraud_id repeats that of an earlier row.
df.duplicated('fraud_id').sum()


np.int64(15)

In [34]:
# Show every row that shares its fraud_id with another row (keep=False marks
# all occurrences), sorted so the copies sit next to each other.
df[df.duplicated('fraud_id', keep=False)].sort_values('fraud_id')


Unnamed: 0,fraud_id,transaction_id,fraud_type,detection_method,report_date,resolution_status,loss_amount
85,86,170,Phishing,Customer Complaint,2023-04-19,Investigating,4598.48
2505,86,170,Phishing,Customer Complaint,2023-04-19,Investigating,4598.48
2503,137,8498,Identity Theft,AI Alert,2021-03-02,Closed,24897.02
136,137,8498,Identity Theft,AI Alert,2021-03-02,Closed,24897.02
195,196,476,Account Takeover,Customer Complaint,2020-06-22,Resolved,21650.18
2511,196,476,Account Takeover,Customer Complaint,2020-06-22,Resolved,21650.18
2500,259,627,Smishing,Manual Review,2021-03-08,Closed,15343.29
258,259,627,Smishing,Manual Review,2021-03-08,Closed,15343.29
2504,462,1147,Chargeback Fraud,Customer Complaint,2022-07-07,Investigating,32938.68
461,462,1147,Chargeback Fraud,Customer Complaint,2022-07-07,Investigating,32938.68


In [35]:
# Keep only the first row for each fraud_id and drop the later repeats.
df = df.drop_duplicates(subset='fraud_id', keep='first')


In [36]:
# Sanity check: expect an empty frame now that each fraud_id occurs once.
df[df.duplicated('fraud_id', keep=False)]


Unnamed: 0,fraud_id,transaction_id,fraud_type,detection_method,report_date,resolution_status,loss_amount


In [37]:
# The duplicate count is printed explicitly: a bare expression that is not the
# last line of a cell is evaluated but never displayed.
print(df.duplicated().sum())  # fully duplicated rows still present
df.shape

(2500, 7)

In [38]:
# fraud_case.csv → fix negative loss_amount by replacing each value with its
# absolute value.
# NOTE(review): this assumes negative amounts are sign-entry errors rather
# than reversals/refunds — confirm with the data owner.
df['loss_amount'] = df['loss_amount'].abs()

In [39]:
# Sanity check: expect an empty frame, i.e. no negative loss_amount remains.
df[df['loss_amount'] < 0]

Unnamed: 0,fraud_id,transaction_id,fraud_type,detection_method,report_date,resolution_status,loss_amount


In [40]:
# Preview the first 25 rows of the cleaned frame.
df.head(25)

Unnamed: 0,fraud_id,transaction_id,fraud_type,detection_method,report_date,resolution_status,loss_amount
0,1,4,Vishing,AI Alert,2024-02-17,Closed,2911.34
1,2,8197,Smishing,Customer Complaint,2019-06-25,Investigating,5331.17
2,3,8199,Chargeback Fraud,AI Alert,2020-09-14,Resolved,28211.42
3,4,7,Account Takeover,AI Alert,2024-06-29,Resolved,10280.88
4,5,9,Vishing,Customer Complaint,2020-12-31,Closed,5322.88
5,6,8202,Card Cloning,AI Alert,2023-07-25,Resolved,23506.53
6,7,12,Phishing,AI Alert,2024-03-18,Closed,12936.15
7,8,8205,Identity Theft,Manual Review,2025-02-16,Resolved,19689.27
8,9,14,Phishing,Customer Complaint,2023-04-14,Investigating,14998.51
9,10,15,Identity Theft,Customer Complaint,2024-07-26,Closed,82.03


In [41]:
# Column dtypes of the cleaned frame.
df.dtypes

fraud_id                      int64
transaction_id                int64
fraud_type                   object
detection_method             object
report_date          datetime64[ns]
resolution_status            object
loss_amount                 float64
dtype: object

In [42]:
# Final missing-value check before export.
df.isnull().sum()

fraud_id             0
transaction_id       0
fraud_type           0
detection_method     0
report_date          0
resolution_status    0
loss_amount          0
dtype: int64

In [48]:
# Export the cleaned table into the current working directory. A relative path
# (the input CSV is read the same way) keeps the notebook runnable on other
# machines and keeps a personal user directory out of the shared file.
# index=False leaves the DataFrame index out of the CSV.
df.to_csv("fraud_case.csv", index=False)

Unnamed: 0,fraud_id,transaction_id,fraud_type,detection_method,report_date,resolution_status,loss_amount
