In [615]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from scipy.stats import chi2_contingency
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from collections import Counter
import xgboost as xgb
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

In [617]:
# Read the three source tables; the first CSV column becomes the row index of each.
df_account_holder, df_account, df_mule_flag = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('account holder data.csv', 'account data.csv', 'mule flag.csv')
)

In [619]:
# Parse DateOfBirth from day/month/year text; values that cannot be parsed become NaT.
df_account_holder['DateOfBirth'] = pd.to_datetime(
    df_account_holder['DateOfBirth'],
    format='%d/%m/%Y',
    errors='coerce',
)

In [621]:
# Print, one per line, how many index labels are repeated in each table.
print(
    *(
        table.index.duplicated().sum()
        for table in (df_account_holder, df_account, df_mule_flag)
    ),
    sep='\n',
)

45
0
0


In [623]:
# Keep only the first row for each repeated index label.
# The duplicate check counts repeated index labels, and aligning tables on the
# index afterwards needs those labels to be unique. drop_duplicates() compares
# column values only: it does not guarantee a unique index, and it can merge
# distinct accounts that happen to share identical values.
# .copy() gives an independent frame so later column assignments are unambiguous.
df_account_holder = df_account_holder[
    ~df_account_holder.index.duplicated(keep='first')
].copy()

In [625]:
# Calculate age in completed calendar years as of the day the notebook runs.
# NOTE(review): 'today' moves with the run date, so ages (and anything derived
# from them) differ between runs; consider pinning a reference date.
today = pd.Timestamp('today')
dob = df_account_holder['DateOfBirth']

# Dividing elapsed days by 365.25 is off by one on/near birthdays (e.g. 365
# elapsed days across a non-leap year floor-divides to 0), so compare calendar
# fields instead: subtract one year when this year's birthday is still ahead.
birthday_pending = (dob.dt.month > today.month) | (
    (dob.dt.month == today.month) & (dob.dt.day > today.day)
)

# Rows whose DateOfBirth is NaT end up with a NaN age; the column stays float.
df_account_holder['Age'] = (
    today.year - dob.dt.year - birthday_pending.astype(int)
).astype(float)

In [627]:
# DateOfBirth is no longer needed once Age exists; remove the column.
df_account_holder = df_account_holder.drop('DateOfBirth', axis='columns')

In [631]:
# Place holder attributes and mule flags side by side, aligned on the row index.
# Labels present in only one table are kept, with NaN in the other table's columns.
df_combined = pd.concat(
    [df_account_holder, df_mule_flag],
    axis='columns',
    join='outer',
)

In [633]:
# Keep only the rows that have a MuleAccount value.
df = df_combined.dropna(subset=['MuleAccount'])

In [637]:
# Remove the MuleAccount column so only the account-holder attributes remain.
df = df.drop('MuleAccount', axis='columns')

In [639]:
# Show the attribute table for rows that have a mule flag (truncated preview).
df

Unnamed: 0_level_0,Gender,Income,CreditScore,LoanAmount,EmploymentStatus,MaritalStatus,OccupancyStatus,NumDependents,SocialMediaUsageHours,ShoppingFrequencyPerMonth,HealthInsuranceStatus,Age
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ID_0001,Female,96534.0,,44116.0,Unemployed,Single,Owner,3.0,21.0,9.0,Yes,61.0
ID_0002,Male,75106.0,390.0,9782.0,Unemployed,,Owner,2.0,9.0,10.0,No,20.0
ID_0005,Male,83230.0,750.0,28557.0,Employed,Single,Owner,1.0,11.0,5.0,No,49.0
ID_0006,Male,94055.0,664.0,5798.0,Employed,Married,Renter,4.0,19.0,4.0,No,35.0
ID_0007,,76373.0,650.0,9627.0,Unemployed,Divorced,Renter,2.0,11.0,6.0,Yes,64.0
...,...,...,...,...,...,...,...,...,...,...,...,...
ID_0996,,55262.0,,1299.0,Unemployed,Widowed,Renter,4.0,9.0,10.0,Yes,45.0
ID_0997,Female,98213.0,725.0,39469.0,Unemployed,Married,Renter,3.0,12.0,5.0,Yes,57.0
ID_0998,Male,38268.0,324.0,4994.0,Retired,Divorced,,0.0,10.0,15.0,No,63.0
ID_0999,Male,97866.0,802.0,29878.0,Employed,Married,Renter,,2.0,22.0,No,39.0


In [641]:
# Pick out the text (object-dtype) columns; these are the ones to encode.
cat_cols = df.select_dtypes(include='object').columns

# Ordinal encoder for those columns. The handle_unknown/unknown_value settings
# govern categories not seen during fit; they only come into play if the fitted
# encoder is later applied to other data.
# NOTE(review): missing values are expected to stay NaN after encoding (the
# encoder's documented default for missing input) — confirm for the installed
# scikit-learn version.
encoder = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=np.nan,
)

# Overwrite the text columns with their numeric codes.
df[cat_cols] = encoder.fit_transform(df[cat_cols])

In [643]:
# Apply KNN imputation to the ordinal-encoded, fully numeric frame.
# NOTE(review): no scaling is applied before imputation, so large-magnitude
# columns may dominate the neighbour distances — confirm this is intended.
imputer = KNNImputer(n_neighbors=15)
imputed_data = imputer.fit_transform(df)

# Convert back to DataFrame, keeping the original column names and row labels.
imputed_df = pd.DataFrame(imputed_data, columns=df.columns, index=df.index)

# Decode back to original categories using the fitted OrdinalEncoder.
# The earlier loop iterated over `label_encoders`, which is not defined in any
# cell visible here and fails with NameError on a fresh kernel.
# Imputed codes are neighbour averages and can be fractional; round them to the
# nearest code before decoding instead of truncating.
if len(cat_cols) > 0:
    rounded_codes = np.rint(imputed_df[cat_cols].to_numpy(dtype=float))
    imputed_df[cat_cols] = encoder.inverse_transform(rounded_codes)
    

In [645]:
# Independent copy of the imputed table, labelled with the same row index as df.
df_1 = imputed_df.copy().set_axis(df.index, axis='index')

In [653]:
# Show the imputed table with categories decoded (truncated preview).
df_1

Unnamed: 0_level_0,Gender,Income,CreditScore,LoanAmount,EmploymentStatus,MaritalStatus,OccupancyStatus,NumDependents,SocialMediaUsageHours,ShoppingFrequencyPerMonth,HealthInsuranceStatus,Age
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ID_0001,Female,96534.0,545.2,44116.0,Unemployed,Single,Owner,3.000000,21.0,9.0,Yes,61.0
ID_0002,Male,75106.0,390.0,9782.0,Unemployed,Married,Owner,2.000000,9.0,10.0,No,20.0
ID_0005,Male,83230.0,750.0,28557.0,Employed,Single,Owner,1.000000,11.0,5.0,No,49.0
ID_0006,Male,94055.0,664.0,5798.0,Employed,Married,Renter,4.000000,19.0,4.0,No,35.0
ID_0007,Female,76373.0,650.0,9627.0,Unemployed,Divorced,Renter,2.000000,11.0,6.0,Yes,64.0
...,...,...,...,...,...,...,...,...,...,...,...,...
ID_0996,Female,55262.0,686.6,1299.0,Unemployed,Widowed,Renter,4.000000,9.0,10.0,Yes,45.0
ID_0997,Female,98213.0,725.0,39469.0,Unemployed,Married,Renter,3.000000,12.0,5.0,Yes,57.0
ID_0998,Male,38268.0,324.0,4994.0,Retired,Divorced,Owner,0.000000,10.0,15.0,No,63.0
ID_0999,Male,97866.0,802.0,29878.0,Employed,Married,Renter,2.066667,2.0,22.0,No,39.0
