In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer
from collections import Counter
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import fbeta_score

In [2]:
df=pd.read_csv('credit_card_transactions.csv')

In [3]:
pd.set_option('display.max_columns', None)
df.head(2)

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,merch_zipcode
0,2019/1/1 0:00,2703190000000000.0,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,28654,36.0788,-81.1781,3495,"Psychologist, counselling",1988/3/9,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0,28705.0
1,2019/1/1 0:00,630423000000.0,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,99160,48.8878,-118.2105,149,Special educational needs teacher,1978/6/21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0,


# Feature Extraction

In [4]:
#Remove irrelevant columns, duplicate rows, and null values
df=df.drop(['cc_num', 'first','last','trans_num','unix_time','street'], axis=1)
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)

In [5]:
df.shape

(890052, 17)

In [6]:
df['is_fraud'].value_counts()

is_fraud
0    884987
1      5065
Name: count, dtype: int64

## Time, age, distance

In [7]:
#Process time data
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'], format='%Y/%m/%d %H:%M')

df['year'] = df['trans_date_trans_time'].dt.year
df['month'] = df['trans_date_trans_time'].dt.month
df['day'] = df['trans_date_trans_time'].dt.day
df['weekday'] = df['trans_date_trans_time'].dt.weekday  # (0=Monday, 6=Sunday)
df['hour'] = df['trans_date_trans_time'].dt.hour

In [8]:
#The cardholder's age at the time of the transaction
df['dob'] = pd.to_datetime(df['dob'], format=r'%Y/%m/%d')
df['cardholder_age'] = df.apply(
    lambda row: 2019 - row['dob'].year if row['year'] == 2019 else 2020 - row['dob'].year, 
    axis=1
)

#Process merchant name
df['merchant']=df['merchant'].apply(lambda x:x[6:]) 

In [9]:
# Calculate the distance between the merchant and the cardholder based on longitude and latitude

def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # The radius of the Earth in kilometers
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)
    a = np.sin(delta_phi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2) ** 2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

df['distance'] = df.apply(lambda row: haversine(row['lat'], row['long'], row['merch_lat'], row['merch_long']), axis=1)

In [10]:
df.head(2)

Unnamed: 0,trans_date_trans_time,merchant,category,amt,gender,city,state,zip,lat,long,city_pop,job,dob,merch_lat,merch_long,is_fraud,merch_zipcode,year,month,day,weekday,hour,cardholder_age,distance
0,2019-01-01,"Rippin, Kub and Mann",misc_net,4.97,F,Moravian Falls,NC,28654,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,36.011293,-82.048315,0,28705.0,2019,1,1,1,0,31,78.597568
2,2019-01-01,Lind-Buckridge,entertainment,220.11,M,Malad City,ID,83252,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,43.150704,-112.154481,0,83236.0,2019,1,1,1,0,57,108.206083


## Zip codes, state, city, population, and population density

In [11]:
# Read a csv of zip codes for US states and cities
zip_data = pd.read_csv('uszips.csv')

# Process zip codes in the dataset
df['merch_zipcode'] = df['merch_zipcode'].astype(str).str.zfill(5)
df['merch_zipcode']=df['merch_zipcode'].apply(lambda x:str(int(float(x))))
zip_data['city_zip'] = zip_data['city_zip'].astype(str).str.zfill(5)
df['zip'] = df['zip'].astype(str).str.zfill(5)

zip_data = zip_data[['city_zip', 'state_name', 'city_name', 'population', 'density']].drop_duplicates()

In [12]:
zip_data.shape

(33784, 5)

In [13]:
# Extract merchants' state and city
df = df.merge(zip_data, 
              left_on='merch_zipcode', 
              right_on='city_zip', 
              how='left').drop(columns=['city_zip'])

df.rename(columns={'state_name': 'merch_state', 'city_name': 'merch_city',
                   'population': 'merch_pop','density': 'merch_dens'}, inplace=True)

In [14]:
# Extract cardholders' state and city
df = df.merge(zip_data, 
              left_on='zip', 
              right_on='city_zip', 
              how='left').drop(columns=['city_zip'])

df.rename(columns={'state_name': 'holder_state', 'city_name': 'holder_city',
    'population': 'holder_pop','density': 'holder_dens'}, inplace=True)

In [15]:
df.head(2)

Unnamed: 0,trans_date_trans_time,merchant,category,amt,gender,city,state,zip,lat,long,city_pop,job,dob,merch_lat,merch_long,is_fraud,merch_zipcode,year,month,day,weekday,hour,cardholder_age,distance,merch_state,merch_city,merch_pop,merch_dens,holder_state,holder_city,holder_pop,holder_dens
0,2019-01-01,"Rippin, Kub and Mann",misc_net,4.97,F,Moravian Falls,NC,28654,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,36.011293,-82.048315,0,28705,2019,1,1,1,0,31,78.597568,North Carolina,Bakersville,6708.0,20.9,North Carolina,Moravian Falls,3142.0,19.8
1,2019-01-01,Lind-Buckridge,entertainment,220.11,M,Malad City,ID,83252,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,43.150704,-112.154481,0,83236,2019,1,1,1,0,57,108.206083,Idaho,Firth,2018.0,3.6,Idaho,Malad City,4546.0,2.4


# Feature Engineering

In [16]:
df_new=df[['year','month','day','hour','weekday',
           'amt','distance',
           'gender','cardholder_age','job','holder_state','holder_city','holder_pop','holder_dens',
           'category','merchant','merch_state','merch_city','merch_pop','merch_dens',
           'is_fraud']]

In [17]:
df_new.head(2)

Unnamed: 0,year,month,day,hour,weekday,amt,distance,gender,cardholder_age,job,holder_state,holder_city,holder_pop,holder_dens,category,merchant,merch_state,merch_city,merch_pop,merch_dens,is_fraud
0,2019,1,1,0,1,4.97,78.597568,F,31,"Psychologist, counselling",North Carolina,Moravian Falls,3142.0,19.8,misc_net,"Rippin, Kub and Mann",North Carolina,Bakersville,6708.0,20.9,0
1,2019,1,1,0,1,220.11,108.206083,M,57,Nature conservation officer,Idaho,Malad City,4546.0,2.4,entertainment,Lind-Buckridge,Idaho,Firth,2018.0,3.6,0


## Periodic Encoding

In [18]:
#Periodic Encoding

df_new['weekday_sin'] = np.sin(2 * np.pi * df_new['weekday'] / 7)
df_new['weekday_cos'] = np.cos(2 * np.pi * df_new['weekday'] / 7)

df_new['month_sin'] = np.sin(2 * np.pi * df_new['month'] / 12)
df_new['month_cos'] = np.cos(2 * np.pi * df_new['month'] / 12)

df_new['day_sin'] = np.sin(2 * np.pi * df_new['day'] / 31)
df_new['day_cos'] = np.cos(2 * np.pi * df_new['day'] / 31)

df_new['hour_sin'] = np.sin(2 * np.pi * df_new['hour'] / 24)
df_new['hour_cos'] = np.cos(2 * np.pi * df_new['hour'] / 24)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['weekday_sin'] = np.sin(2 * np.pi * df_new['weekday'] / 7)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['weekday_cos'] = np.cos(2 * np.pi * df_new['weekday'] / 7)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['month_sin'] = np.sin(2 * np.pi * df_new['month'] / 12)
A value is

## one-hot

In [19]:
#one-hot
df_new = pd.get_dummies(df_new, columns=['year','category','gender'])

## Scaling

In [20]:
#Scaling: Min-Max Scaling
min_max_scaler = MinMaxScaler()
df_new[['amt_scaler', 'distance_scaler','cardholder_age_scaler','holder_pop_scaler','holder_dens_scaler','merch_pop_scaler','merch_dens_scaler'
    ]] = min_max_scaler.fit_transform(df_new[['amt', 'distance','cardholder_age','holder_pop','holder_dens','merch_pop','merch_dens']])


In [21]:
df_new.drop_duplicates(inplace=True)
df_new.dropna(inplace=True)

## Target Encoding

In [22]:
#Target Encoding

# Split training set and test set
train_df, test_df = train_test_split(df_new, test_size=0.2, stratify=df_new['is_fraud'], random_state=42)

def target_encode(train, test, column, target):
    encoding_map = train.groupby(column)[target].mean()
    
    train_encoded = train[column].map(encoding_map)
    test_encoded = test[column].map(encoding_map)
    
    global_mean = train[target].mean()
    train_encoded.fillna(global_mean, inplace=True)
    test_encoded.fillna(global_mean, inplace=True)
    
    return train_encoded, test_encoded

# Columns that require target encoding
columns_to_encode = ['job','holder_state','holder_city','merchant','merch_state', 'merch_city']

for col in columns_to_encode:
    train_df[f'{col}_encoded'], test_df[f'{col}_encoded'] = target_encode(train_df, test_df, col, 'is_fraud')

In [23]:
train_df.shape

(656677, 57)

In [24]:
train_df.head(2)

Unnamed: 0,month,day,hour,weekday,amt,distance,cardholder_age,job,holder_state,holder_city,holder_pop,holder_dens,merchant,merch_state,merch_city,merch_pop,merch_dens,is_fraud,weekday_sin,weekday_cos,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos,year_2019,year_2020,category_entertainment,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_F,gender_M,amt_scaler,distance_scaler,cardholder_age_scaler,holder_pop_scaler,holder_dens_scaler,merch_pop_scaler,merch_dens_scaler,job_encoded,holder_state_encoded,holder_city_encoded,merchant_encoded,merch_state_encoded,merch_city_encoded
695042,12,8,15,6,118.63,36.346766,25,Biomedical engineer,Indiana,Leo,5935.0,181.2,"McCullough, Hudson and Schuster",Indiana,Churubusco,7550.0,50.0,0,-0.781831,0.62349,-2.449294e-16,1.0,0.998717,-0.050649,-0.707107,-0.707107,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,0.004064,0.242516,0.134146,0.058196,0.003944,0.057669,0.000821,0.0,0.004708,0.0,0.00107,0.00521,0.0
350821,7,6,9,5,83.7,116.760892,33,"Geologist, engineering",Virginia,Atlantic,463.0,31.6,Heathcote LLC,Virginia,Virginia Beach,50665.0,1289.0,0,-0.974928,-0.222521,-0.5,-0.866025,0.937752,0.347305,0.707107,-0.707107,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,0.002857,0.779389,0.231707,0.00454,0.000688,0.386992,0.021173,0.010493,0.00615,0.011952,0.019666,0.006241,0.0


# Resampling

In [25]:
df_resample=train_df.drop(columns=['month','day','hour','weekday','amt','distance','cardholder_age','job','holder_state',
                         'holder_city','holder_pop','holder_dens','merchant','merch_state','merch_city','merch_pop','merch_dens'])
df_resample.head(2)

Unnamed: 0,is_fraud,weekday_sin,weekday_cos,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos,year_2019,year_2020,category_entertainment,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_F,gender_M,amt_scaler,distance_scaler,cardholder_age_scaler,holder_pop_scaler,holder_dens_scaler,merch_pop_scaler,merch_dens_scaler,job_encoded,holder_state_encoded,holder_city_encoded,merchant_encoded,merch_state_encoded,merch_city_encoded
695042,0,-0.781831,0.62349,-2.449294e-16,1.0,0.998717,-0.050649,-0.707107,-0.707107,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,0.004064,0.242516,0.134146,0.058196,0.003944,0.057669,0.000821,0.0,0.004708,0.0,0.00107,0.00521,0.0
350821,0,-0.974928,-0.222521,-0.5,-0.866025,0.937752,0.347305,0.707107,-0.707107,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,0.002857,0.779389,0.231707,0.00454,0.000688,0.386992,0.021173,0.010493,0.00615,0.011952,0.019666,0.006241,0.0


In [26]:
df_resample=df_resample.drop(columns=['year_2019','year_2020','distance_scaler','holder_pop_scaler','merch_pop_scaler',
                         'holder_state_encoded','holder_city_encoded','merch_state_encoded','merch_city_encoded',
                         'holder_dens_scaler','merch_dens_scaler'])
df_resample.head(2)

Unnamed: 0,is_fraud,weekday_sin,weekday_cos,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos,category_entertainment,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_F,gender_M,amt_scaler,cardholder_age_scaler,job_encoded,merchant_encoded
695042,0,-0.781831,0.62349,-2.449294e-16,1.0,0.998717,-0.050649,-0.707107,-0.707107,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,0.004064,0.134146,0.0,0.00107
350821,0,-0.974928,-0.222521,-0.5,-0.866025,0.937752,0.347305,0.707107,-0.707107,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,0.002857,0.231707,0.010493,0.019666


In [27]:
df_resample.shape

(656677, 29)

In [31]:
df_resample.to_csv("data-train-noresample.csv", index=False) 

In [32]:
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split
from collections import Counter

X = df_resample.drop(columns=['is_fraud'])
y = df_resample['is_fraud']

def process_in_chunks(X, y, chunk_size=300000): 
    smote_enn = SMOTEENN(sampling_strategy=0.5, random_state=42, n_jobs=-1)
    chunks = []
    for i in range(0, len(X), chunk_size):
        X_chunk = X.iloc[i:i+chunk_size]
        y_chunk = y.iloc[i:i+chunk_size]
        X_res, y_res = smote_enn.fit_resample(X_chunk, y_chunk)
        chunks.append((X_res, y_res))
        print(f"Chunk {i // chunk_size + 1} label distribution: {Counter(y_res)}")  

    X_resampled = pd.concat([chunk[0] for chunk in chunks], axis=0)
    y_resampled = pd.concat([chunk[1] for chunk in chunks], axis=0)
    return X_resampled, y_resampled

X_resampled, y_resampled = process_in_chunks(X, y)



Chunk 1 label distribution: Counter({0: 293107, 1: 148707})




Chunk 2 label distribution: Counter({0: 292995, 1: 148744})




Chunk 3 label distribution: Counter({0: 55007, 1: 28169})


In [33]:
print("Original class distribution:", y.value_counts())
print("Resampled class distribution:", y_resampled.value_counts())

Original class distribution: is_fraud
0    652957
1      3720
Name: count, dtype: int64
Resampled class distribution: is_fraud
0    641109
1    325620
Name: count, dtype: int64


In [35]:
y_resampled.name = 'is_fraud'  
df_resampled_05 = pd.concat([X_resampled, y_resampled], axis=1)  # Combine

print(f"Resampled DataFrame shape: {df_resampled_05.shape}")

Resampled DataFrame shape: (966729, 29)


In [36]:
df_resampled_05 = df_resampled_05.reset_index(drop=True)
df_resampled_05.to_csv("data-training-0.5.csv", index=False) 

In [25]:
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split
from collections import Counter

X = df_resample.drop(columns=['is_fraud'])
y = df_resample['is_fraud']

def process_in_chunks(X, y, chunk_size=300000): 
    smote_enn = SMOTEENN(sampling_strategy=1, random_state=42, n_jobs=-1)
    chunks = []
    for i in range(0, len(X), chunk_size):
        X_chunk = X.iloc[i:i+chunk_size]
        y_chunk = y.iloc[i:i+chunk_size]
        X_res, y_res = smote_enn.fit_resample(X_chunk, y_chunk)
        chunks.append((X_res, y_res))
        print(f"Chunk {i // chunk_size + 1} label distribution: {Counter(y_res)}")  

    X_resampled = pd.concat([chunk[0] for chunk in chunks], axis=0)
    y_resampled = pd.concat([chunk[1] for chunk in chunks], axis=0)
    return X_resampled, y_resampled

X_resampled, y_resampled = process_in_chunks(X, y)



Chunk 1 label distribution: Counter({1: 298116, 0: 293070})




Chunk 2 label distribution: Counter({1: 298188, 0: 292948})




Chunk 3 label distribution: Counter({1: 56362, 0: 54997})


In [26]:
print("Original class distribution:", y.value_counts())
print("Resampled class distribution:", y_resampled.value_counts())

Original class distribution: is_fraud
0    652957
1      3720
Name: count, dtype: int64
Resampled class distribution: is_fraud
1    652666
0    641015
Name: count, dtype: int64


In [27]:
y_resampled.name = 'is_fraud'  
df_resampled_1 = pd.concat([X_resampled, y_resampled], axis=1)  # Combine

print(f"Resampled DataFrame shape: {df_resampled_1.shape}")

Resampled DataFrame shape: (1293681, 29)


In [28]:
df_resampled_1 = df_resampled_1.reset_index(drop=True)
df_resampled_1.to_csv("data-training-1.csv", index=False) 

In [36]:
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split
from collections import Counter

X = df_resample.drop(columns=['is_fraud'])
y = df_resample['is_fraud']

def process_in_chunks(X, y, chunk_size=300000): 
    smote_enn = SMOTEENN(sampling_strategy=0.1, random_state=42, n_jobs=-1)
    chunks = []
    for i in range(0, len(X), chunk_size):
        X_chunk = X.iloc[i:i+chunk_size]
        y_chunk = y.iloc[i:i+chunk_size]
        X_res, y_res = smote_enn.fit_resample(X_chunk, y_chunk)
        chunks.append((X_res, y_res))
        print(f"Chunk {i // chunk_size + 1} label distribution: {Counter(y_res)}")  

    X_resampled = pd.concat([chunk[0] for chunk in chunks], axis=0)
    y_resampled = pd.concat([chunk[1] for chunk in chunks], axis=0)
    return X_resampled, y_resampled

X_resampled, y_resampled = process_in_chunks(X, y)



Chunk 1 label distribution: Counter({0: 293478, 1: 27235})




Chunk 2 label distribution: Counter({0: 293389, 1: 27096})




Chunk 3 label distribution: Counter({0: 55113, 1: 5407})


In [37]:
print("Original class distribution:", y.value_counts())
print("Resampled class distribution:", y_resampled.value_counts())

Original class distribution: is_fraud
0    652957
1      3720
Name: count, dtype: int64
Resampled class distribution: is_fraud
0    641980
1     59738
Name: count, dtype: int64


In [38]:
y_resampled.name = 'is_fraud'  
df_resampled = pd.concat([X_resampled, y_resampled], axis=1)  # Combine

print(f"Resampled DataFrame shape: {df_resampled.shape}")
df_resampled.head(2)

Resampled DataFrame shape: (701718, 31)


Unnamed: 0,weekday_sin,weekday_cos,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos,category_entertainment,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_F,gender_M,amt_scaler,cardholder_age_scaler,holder_dens_scaler,merch_dens_scaler,job_encoded,merchant_encoded,is_fraud
0,-0.781831,0.62349,-2.449294e-16,1.0,0.998717,-0.050649,-0.707107,-0.707107,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,0.004064,0.134146,0.003944,0.000821,0.0,0.00107,0
1,-0.974928,-0.222521,-0.5,-0.866025,0.937752,0.347305,0.707107,-0.707107,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,0.002857,0.231707,0.000688,0.021173,0.010493,0.019666,0


In [39]:
df_resampled = df_resampled.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [40]:
df_resampled.head(2)

Unnamed: 0,weekday_sin,weekday_cos,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos,category_entertainment,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_F,gender_M,amt_scaler,cardholder_age_scaler,holder_dens_scaler,merch_dens_scaler,job_encoded,merchant_encoded,is_fraud
0,-0.781831,0.62349,-2.449294e-16,1.0,0.998717,-0.050649,-0.707107,-0.707107,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,0.004064,0.134146,0.003944,0.000821,0.0,0.00107,0
1,-0.974928,-0.222521,-0.5,-0.866025,0.937752,0.347305,0.707107,-0.707107,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,0.002857,0.231707,0.000688,0.021173,0.010493,0.019666,0


## sampling strategy = 0.2

In [71]:
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split
from collections import Counter

X = df_resample.drop(columns=['is_fraud'])
y = df_resample['is_fraud']

def process_in_chunks(X, y, chunk_size=300000): 
    smote_enn = SMOTEENN(sampling_strategy=0.2, random_state=42, n_jobs=-1)
    chunks = []
    for i in range(0, len(X), chunk_size):
        X_chunk = X.iloc[i:i+chunk_size]
        y_chunk = y.iloc[i:i+chunk_size]
        X_res, y_res = smote_enn.fit_resample(X_chunk, y_chunk)
        chunks.append((X_res, y_res))
        print(f"Chunk {i // chunk_size + 1} label distribution: {Counter(y_res)}")  

    X_resampled = pd.concat([chunk[0] for chunk in chunks], axis=0)
    y_resampled = pd.concat([chunk[1] for chunk in chunks], axis=0)
    return X_resampled, y_resampled

X_resampled, y_resampled = process_in_chunks(X, y)



Chunk 1 label distribution: Counter({0: 293270, 1: 58112})




Chunk 2 label distribution: Counter({0: 293143, 1: 58141})




Chunk 3 label distribution: Counter({0: 55043, 1: 11181})


In [72]:
print("Original class distribution:", y.value_counts())
print("Resampled class distribution:", y_resampled.value_counts())

Original class distribution: is_fraud
0    652957
1      3720
Name: count, dtype: int64
Resampled class distribution: is_fraud
0    641456
1    127434
Name: count, dtype: int64


In [73]:
y_resampled.name = 'is_fraud'  
df_resampled = pd.concat([X_resampled, y_resampled], axis=1)  # Combine

print(f"Resampled DataFrame shape: {df_resampled.shape}")
df_resampled.head(2)

Resampled DataFrame shape: (768890, 29)


Unnamed: 0,weekday_sin,weekday_cos,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos,category_entertainment,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_F,gender_M,amt_scaler,cardholder_age_scaler,job_encoded,merchant_encoded,is_fraud
0,-0.781831,0.62349,-2.449294e-16,1.0,0.998717,-0.050649,-0.707107,-0.707107,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,0.004064,0.134146,0.0,0.00107,0
1,-0.974928,-0.222521,-0.5,-0.866025,0.937752,0.347305,0.707107,-0.707107,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,0.002857,0.231707,0.010493,0.019666,0


In [74]:
df_resampled = df_resampled.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [77]:
test_df=test_df.drop(columns=['month','day','hour','weekday','amt','distance','cardholder_age','job','holder_state',
                         'holder_city','holder_pop','holder_dens','merchant','merch_state','merch_city','merch_pop','merch_dens'])

Unnamed: 0,is_fraud,weekday_sin,weekday_cos,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos,year_2019,year_2020,category_entertainment,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_F,gender_M,amt_scaler,distance_scaler,cardholder_age_scaler,holder_pop_scaler,holder_dens_scaler,merch_pop_scaler,merch_dens_scaler,job_encoded,holder_state_encoded,holder_city_encoded,merchant_encoded,merch_state_encoded,merch_city_encoded
0,0,0.781831,0.62349,-0.5,-0.866025,0.394356,0.918958,1.224647e-16,-1.0,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,0.004469,0.702163,0.536585,0.002422,0.000109,0.002475,6.9e-05,0.0,0.006659,0.0,0.003198,0.006409,0.0
1,0,0.974928,-0.222521,0.866025,0.5,0.651372,-0.758758,-0.258819,0.965926,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,0.007933,0.691644,0.426829,0.713884,0.022427,0.0076,1.3e-05,0.0,0.002614,0.005012,0.003161,0.004615,0.012232


In [78]:
test_df=test_df.drop(columns=['year_2019','year_2020','distance_scaler','holder_pop_scaler','merch_pop_scaler',
                         'holder_state_encoded','holder_city_encoded','merch_state_encoded','merch_city_encoded'])

Unnamed: 0,is_fraud,weekday_sin,weekday_cos,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos,category_entertainment,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_F,gender_M,amt_scaler,cardholder_age_scaler,holder_dens_scaler,merch_dens_scaler,job_encoded,merchant_encoded
0,0,0.781831,0.62349,-0.5,-0.866025,0.394356,0.918958,1.224647e-16,-1.0,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,0.004469,0.536585,0.000109,6.9e-05,0.0,0.003198
1,0,0.974928,-0.222521,0.866025,0.5,0.651372,-0.758758,-0.258819,0.965926,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,0.007933,0.426829,0.022427,1.3e-05,0.0,0.003161


In [80]:
test_df=test_df.drop(columns=['holder_dens_scaler','merch_dens_scaler'])

In [75]:
df_resampled.shape

(768890, 29)

In [81]:
test_df.shape

(164170, 29)

In [82]:
df_resampled.to_csv("data-training-02.csv", index=False) 

In [84]:
test_df.to_csv("data-testing-02.csv", index=False) 

In [47]:
#df_resampled.to_csv("data-training-new.csv", index=False) 

In [48]:
#test_df.to_csv("data-testing-new.csv", index=False) 

## test best rate

In [28]:
df_resample.shape

(656677, 29)

In [60]:
df_testrate=df_resample.sample(frac = 0.2, random_state=42)
df_testrate.shape

(131335, 29)

In [61]:
df_testrate['is_fraud'].value_counts()

is_fraud
0    130542
1       793
Name: count, dtype: int64

#### RandomForestClassifier

In [51]:
#RandomForestClassifier

X = df_testrate.drop(columns=['is_fraud'])
y = df_testrate['is_fraud']

def process_in_chunks(X, y, sampling_strategy, chunk_size=30000): 
    smote_enn = SMOTEENN(sampling_strategy=sampling_strategy, random_state=42, n_jobs=-1)
    chunks = []
    for i in range(0, len(X), chunk_size):
        X_chunk = X.iloc[i:i+chunk_size]
        y_chunk = y.iloc[i:i+chunk_size]
        X_res, y_res = smote_enn.fit_resample(X_chunk, y_chunk)
        chunks.append((X_res, y_res))
        print(f"Chunk {i // chunk_size + 1} label distribution: {Counter(y_res)}")  

    X_resampled = pd.concat([chunk[0] for chunk in chunks], axis=0)
    y_resampled = pd.concat([chunk[1] for chunk in chunks], axis=0)
    return X_resampled, y_resampled

def pipeline_with_sampling(sampling_strategy):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_resampled, y_resampled = process_in_chunks(X_train, y_train, sampling_strategy=sampling_strategy)

    model = RandomForestClassifier(random_state=42, n_jobs=-1)
    model.fit(X_resampled, y_resampled)
    y_pred = model.predict(X_test)

    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f2_score = fbeta_score(y_test, y_pred, beta=2)

    return precision, recall, f2_score

def random_search_sampling():
    param_distributions = {
        'sampling_strategy': np.linspace(0.1, 1.0, 10)  # 不同采样率
    }
    scorer = make_scorer(fbeta_score, beta=2)

    results = []
    for sampling_rate in param_distributions['sampling_strategy']:
        print(f"Testing sampling_strategy={sampling_rate:.2f}")
        precision, recall, f2 = pipeline_with_sampling(sampling_rate)
        results.append((sampling_rate, precision, recall, f2))
        print(f"Precision for sampling_strategy={sampling_rate:.2f}: {precision:.4f}")
        print(f"Recall for sampling_strategy={sampling_rate:.2f}: {recall:.4f}")
        print(f"F2 Score for sampling_strategy={sampling_rate:.2f}: {f2:.4f}")

    best_sampling_strategy, best_precision, best_recall, best_f2 = max(results, key=lambda x: x[3])
    print(f"Best Sampling Strategy: {best_sampling_strategy:.2f} with Precision: {best_precision:.4f}, Recall: {best_recall:.4f}, F2 Score: {best_f2:.4f}")
    return best_sampling_strategy, best_precision, best_recall, best_f2

best_sampling_strategy, best_precision, best_recall, best_f2 = random_search_sampling()

Testing sampling_strategy=0.10




Chunk 1 label distribution: Counter({0: 29026, 1: 2850})




Chunk 2 label distribution: Counter({0: 29037, 1: 2850})




Chunk 3 label distribution: Counter({0: 29190, 1: 2896})




Chunk 4 label distribution: Counter({0: 14575, 1: 1469})
Precision for sampling_strategy=0.10: 0.8623
Recall for sampling_strategy=0.10: 0.7041
F2 Score for sampling_strategy=0.10: 0.7310
Testing sampling_strategy=0.20




Chunk 1 label distribution: Counter({0: 28980, 1: 5918})




Chunk 2 label distribution: Counter({0: 28983, 1: 5913})




Chunk 3 label distribution: Counter({0: 29151, 1: 5936})




Chunk 4 label distribution: Counter({0: 14540, 1: 2979})
Precision for sampling_strategy=0.20: 0.8299
Recall for sampling_strategy=0.20: 0.7219
F2 Score for sampling_strategy=0.20: 0.7412
Testing sampling_strategy=0.30




Chunk 1 label distribution: Counter({0: 28944, 1: 8920})




Chunk 2 label distribution: Counter({0: 28967, 1: 8924})




Chunk 3 label distribution: Counter({0: 29138, 1: 8944})




Chunk 4 label distribution: Counter({0: 14531, 1: 4487})
Precision for sampling_strategy=0.30: 0.8188
Recall for sampling_strategy=0.30: 0.7219
F2 Score for sampling_strategy=0.30: 0.7394
Testing sampling_strategy=0.40




Chunk 1 label distribution: Counter({0: 28932, 1: 11906})




Chunk 2 label distribution: Counter({0: 28954, 1: 11910})




Chunk 3 label distribution: Counter({0: 29135, 1: 11929})




Chunk 4 label distribution: Counter({0: 14527, 1: 5985})
Precision for sampling_strategy=0.40: 0.7919
Recall for sampling_strategy=0.40: 0.6982
F2 Score for sampling_strategy=0.40: 0.7152
Testing sampling_strategy=0.50




Chunk 1 label distribution: Counter({0: 28926, 1: 14894})




Chunk 2 label distribution: Counter({0: 28957, 1: 14901})




Chunk 3 label distribution: Counter({0: 29127, 1: 14916})




Chunk 4 label distribution: Counter({0: 14521, 1: 7486})
Precision for sampling_strategy=0.50: 0.7742
Recall for sampling_strategy=0.50: 0.7101
F2 Score for sampling_strategy=0.50: 0.7220
Testing sampling_strategy=0.60




Chunk 1 label distribution: Counter({0: 28930, 1: 17878})




Chunk 2 label distribution: Counter({0: 28950, 1: 17878})




Chunk 3 label distribution: Counter({0: 29123, 1: 17901})




Chunk 4 label distribution: Counter({0: 14525, 1: 8985})
Precision for sampling_strategy=0.60: 0.7838
Recall for sampling_strategy=0.60: 0.6864
F2 Score for sampling_strategy=0.60: 0.7039
Testing sampling_strategy=0.70




Chunk 1 label distribution: Counter({0: 28921, 1: 20856})




Chunk 2 label distribution: Counter({0: 28946, 1: 20861})




Chunk 3 label distribution: Counter({0: 29125, 1: 20891})




Chunk 4 label distribution: Counter({0: 14521, 1: 10482})
Precision for sampling_strategy=0.70: 0.7815
Recall for sampling_strategy=0.70: 0.6982
F2 Score for sampling_strategy=0.70: 0.7134
Testing sampling_strategy=0.80




Chunk 1 label distribution: Counter({0: 28925, 1: 23847})




Chunk 2 label distribution: Counter({0: 28948, 1: 23843})




Chunk 3 label distribution: Counter({0: 29125, 1: 23876})




Chunk 4 label distribution: Counter({0: 14518, 1: 11982})
Precision for sampling_strategy=0.80: 0.7616
Recall for sampling_strategy=0.80: 0.6805
F2 Score for sampling_strategy=0.80: 0.6953
Testing sampling_strategy=0.90




Chunk 1 label distribution: Counter({0: 28921, 1: 26825})




Chunk 2 label distribution: Counter({0: 28947, 1: 26827})




Chunk 3 label distribution: Counter({0: 29124, 1: 26860})




Chunk 4 label distribution: Counter({0: 14519, 1: 13479})
Precision for sampling_strategy=0.90: 0.7655
Recall for sampling_strategy=0.90: 0.6568
F2 Score for sampling_strategy=0.90: 0.6760
Testing sampling_strategy=1.00




Chunk 1 label distribution: Counter({1: 29808, 0: 28921})




Chunk 2 label distribution: Counter({1: 29811, 0: 28943})




Chunk 3 label distribution: Counter({1: 29844, 0: 29125})




Chunk 4 label distribution: Counter({1: 14978, 0: 14522})
Precision for sampling_strategy=1.00: 0.7972
Recall for sampling_strategy=1.00: 0.6746
F2 Score for sampling_strategy=1.00: 0.6960
Best Sampling Strategy: 0.20 with Precision: 0.8299, Recall: 0.7219, F2 Score: 0.7412


#### XGBClassifier

In [69]:
#XGBClassifier

X = df_testrate.drop(columns=['is_fraud'])
y = df_testrate['is_fraud']

def pipeline_with_sampling(sampling_strategy):

    X_resampled, X_test, y_resampled, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = XGBClassifier(random_state = 42)
    model.fit(X_resampled, y_resampled)
    y_pred = model.predict(X_test)

    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f2_score = fbeta_score(y_test, y_pred, beta=2)

    return precision, recall, f2_score

def random_search_sampling():
    param_distributions = {
        'sampling_strategy': [0.0]  
    }

    scorer = make_scorer(fbeta_score, beta=2)

    results = []
    for sampling_rate in param_distributions['sampling_strategy']:
        print(f"Testing sampling_strategy={sampling_rate:.2f}")
        precision, recall, f2 = pipeline_with_sampling(sampling_rate)
        results.append((sampling_rate, precision, recall, f2))
        print(f"Precision for sampling_strategy={sampling_rate:.2f}: {precision:.4f}")
        print(f"Recall for sampling_strategy={sampling_rate:.2f}: {recall:.4f}")
        print(f"F2 Score for sampling_strategy={sampling_rate:.2f}: {f2:.4f}")

    best_sampling_strategy, best_precision, best_recall, best_f2 = max(results, key=lambda x: x[3])
    print(f"Best Sampling Strategy: {best_sampling_strategy:.2f} with Precision: {best_precision:.4f}, Recall: {best_recall:.4f}, F2 Score: {best_f2:.4f}")
    return best_sampling_strategy, best_precision, best_recall, best_f2

best_sampling_strategy, best_precision, best_recall, best_f2 = random_search_sampling()

Testing sampling_strategy=0.00
Precision for sampling_strategy=0.00: 0.9565
Recall for sampling_strategy=0.00: 0.7811
F2 Score for sampling_strategy=0.00: 0.8108
Best Sampling Strategy: 0.00 with Precision: 0.9565, Recall: 0.7811, F2 Score: 0.8108


#### LogisticRegression

In [52]:
#LogisticRegression

from sklearn.linear_model import LogisticRegression

X = df_testrate.drop(columns=['is_fraud'])
y = df_testrate['is_fraud']

def process_in_chunks(X, y, sampling_strategy, chunk_size=30000): 
    smote_enn = SMOTEENN(sampling_strategy=sampling_strategy, random_state=42, n_jobs=-1)
    chunks = []
    for i in range(0, len(X), chunk_size):
        X_chunk = X.iloc[i:i+chunk_size]
        y_chunk = y.iloc[i:i+chunk_size]
        X_res, y_res = smote_enn.fit_resample(X_chunk, y_chunk)
        chunks.append((X_res, y_res))
        print(f"Chunk {i // chunk_size + 1} label distribution: {Counter(y_res)}")  

    X_resampled = pd.concat([chunk[0] for chunk in chunks], axis=0)
    y_resampled = pd.concat([chunk[1] for chunk in chunks], axis=0)
    return X_resampled, y_resampled

def pipeline_with_sampling(sampling_strategy):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    X_resampled, y_resampled = process_in_chunks(X_train, y_train, sampling_strategy=sampling_strategy)

    model = LogisticRegression(random_state=42, n_jobs=-1)
    model.fit(X_resampled, y_resampled)
    y_pred = model.predict(X_test)

    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f2_score = fbeta_score(y_test, y_pred, beta=2)

    return precision, recall, f2_score

def random_search_sampling():
    param_distributions = {
        'sampling_strategy': np.linspace(0.1, 1.0, 10)  
    }
    scorer = make_scorer(fbeta_score, beta=2)

    results = []
    for sampling_rate in param_distributions['sampling_strategy']:
        print(f"Testing sampling_strategy={sampling_rate:.2f}")
        precision, recall, f2 = pipeline_with_sampling(sampling_rate)
        results.append((sampling_rate, precision, recall, f2))
        print(f"Precision for sampling_strategy={sampling_rate:.2f}: {precision:.4f}")
        print(f"Recall for sampling_strategy={sampling_rate:.2f}: {recall:.4f}")
        print(f"F2 Score for sampling_strategy={sampling_rate:.2f}: {f2:.4f}")

    best_sampling_strategy, best_precision, best_recall, best_f2 = max(results, key=lambda x: x[3])
    print(f"Best Sampling Strategy: {best_sampling_strategy:.2f} with Precision: {best_precision:.4f}, Recall: {best_recall:.4f}, F2 Score: {best_f2:.4f}")
    return best_sampling_strategy, best_precision, best_recall, best_f2

best_sampling_strategy, best_precision, best_recall, best_f2 = random_search_sampling()

Testing sampling_strategy=0.10




Chunk 1 label distribution: Counter({0: 29026, 1: 2850})




Chunk 2 label distribution: Counter({0: 29037, 1: 2850})




Chunk 3 label distribution: Counter({0: 29190, 1: 2896})




Chunk 4 label distribution: Counter({0: 14575, 1: 1469})
Precision for sampling_strategy=0.10: 0.5618
Recall for sampling_strategy=0.10: 0.2959
F2 Score for sampling_strategy=0.10: 0.3268
Testing sampling_strategy=0.20




Chunk 1 label distribution: Counter({0: 28980, 1: 5918})




Chunk 2 label distribution: Counter({0: 28983, 1: 5913})




Chunk 3 label distribution: Counter({0: 29151, 1: 5936})




Chunk 4 label distribution: Counter({0: 14540, 1: 2979})
Precision for sampling_strategy=0.20: 0.3238
Recall for sampling_strategy=0.20: 0.5385
F2 Score for sampling_strategy=0.20: 0.4754
Testing sampling_strategy=0.30




Chunk 1 label distribution: Counter({0: 28944, 1: 8920})




Chunk 2 label distribution: Counter({0: 28967, 1: 8924})




Chunk 3 label distribution: Counter({0: 29138, 1: 8944})




Chunk 4 label distribution: Counter({0: 14531, 1: 4487})
Precision for sampling_strategy=0.30: 0.1996
Recall for sampling_strategy=0.30: 0.6036
F2 Score for sampling_strategy=0.30: 0.4297
Testing sampling_strategy=0.40




Chunk 1 label distribution: Counter({0: 28932, 1: 11906})




Chunk 2 label distribution: Counter({0: 28954, 1: 11910})




Chunk 3 label distribution: Counter({0: 29135, 1: 11929})




Chunk 4 label distribution: Counter({0: 14527, 1: 5985})
Precision for sampling_strategy=0.40: 0.1531
Recall for sampling_strategy=0.40: 0.6331
F2 Score for sampling_strategy=0.40: 0.3891
Testing sampling_strategy=0.50




Chunk 1 label distribution: Counter({0: 28926, 1: 14894})




Chunk 2 label distribution: Counter({0: 28957, 1: 14901})




Chunk 3 label distribution: Counter({0: 29127, 1: 14916})




Chunk 4 label distribution: Counter({0: 14521, 1: 7486})
Precision for sampling_strategy=0.50: 0.1356
Recall for sampling_strategy=0.50: 0.6509
F2 Score for sampling_strategy=0.50: 0.3699
Testing sampling_strategy=0.60




Chunk 1 label distribution: Counter({0: 28930, 1: 17878})




Chunk 2 label distribution: Counter({0: 28950, 1: 17878})




Chunk 3 label distribution: Counter({0: 29123, 1: 17901})




Chunk 4 label distribution: Counter({0: 14525, 1: 8985})
Precision for sampling_strategy=0.60: 0.1228
Recall for sampling_strategy=0.60: 0.6627
F2 Score for sampling_strategy=0.60: 0.3526
Testing sampling_strategy=0.70




Chunk 1 label distribution: Counter({0: 28921, 1: 20856})




Chunk 2 label distribution: Counter({0: 28946, 1: 20861})




Chunk 3 label distribution: Counter({0: 29125, 1: 20891})




Chunk 4 label distribution: Counter({0: 14521, 1: 10482})
Precision for sampling_strategy=0.70: 0.1132
Recall for sampling_strategy=0.70: 0.6686
F2 Score for sampling_strategy=0.70: 0.3375
Testing sampling_strategy=0.80




Chunk 1 label distribution: Counter({0: 28925, 1: 23847})




Chunk 2 label distribution: Counter({0: 28948, 1: 23843})




Chunk 3 label distribution: Counter({0: 29125, 1: 23876})




Chunk 4 label distribution: Counter({0: 14518, 1: 11982})
Precision for sampling_strategy=0.80: 0.1074
Recall for sampling_strategy=0.80: 0.6805
F2 Score for sampling_strategy=0.80: 0.3291
Testing sampling_strategy=0.90




Chunk 1 label distribution: Counter({0: 28921, 1: 26825})




Chunk 2 label distribution: Counter({0: 28947, 1: 26827})




Chunk 3 label distribution: Counter({0: 29124, 1: 26860})




Chunk 4 label distribution: Counter({0: 14519, 1: 13479})
Precision for sampling_strategy=0.90: 0.0912
Recall for sampling_strategy=0.90: 0.6864
F2 Score for sampling_strategy=0.90: 0.2977
Testing sampling_strategy=1.00




Chunk 1 label distribution: Counter({1: 29808, 0: 28921})




Chunk 2 label distribution: Counter({1: 29811, 0: 28943})




Chunk 3 label distribution: Counter({1: 29844, 0: 29125})




Chunk 4 label distribution: Counter({1: 14978, 0: 14522})
Precision for sampling_strategy=1.00: 0.0875
Recall for sampling_strategy=1.00: 0.6864
F2 Score for sampling_strategy=1.00: 0.2897
Best Sampling Strategy: 0.20 with Precision: 0.3238, Recall: 0.5385, F2 Score: 0.4754


#### SVM

In [53]:
#SVM

from sklearn.svm import LinearSVC, SVC

X = df_testrate.drop(columns=['is_fraud'])
y = df_testrate['is_fraud']

def process_in_chunks(X, y, sampling_strategy, chunk_size=30000): 
    smote_enn = SMOTEENN(sampling_strategy=sampling_strategy, random_state=42, n_jobs=-1)
    chunks = []
    for i in range(0, len(X), chunk_size):
        X_chunk = X.iloc[i:i+chunk_size]
        y_chunk = y.iloc[i:i+chunk_size]
        X_res, y_res = smote_enn.fit_resample(X_chunk, y_chunk)
        chunks.append((X_res, y_res))
        print(f"Chunk {i // chunk_size + 1} label distribution: {Counter(y_res)}")  

    X_resampled = pd.concat([chunk[0] for chunk in chunks], axis=0)
    y_resampled = pd.concat([chunk[1] for chunk in chunks], axis=0)
    return X_resampled, y_resampled

def pipeline_with_sampling(sampling_strategy):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_resampled, y_resampled = process_in_chunks(X_train, y_train, sampling_strategy=sampling_strategy)

    model = SVC(random_state = 42)
    model.fit(X_resampled, y_resampled)
    y_pred = model.predict(X_test)

    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f2_score = fbeta_score(y_test, y_pred, beta=2)

    return precision, recall, f2_score

def random_search_sampling():
    param_distributions = {
        'sampling_strategy': np.linspace(0.1, 1.0, 10) 
    }

    scorer = make_scorer(fbeta_score, beta=2)

    results = []
    for sampling_rate in param_distributions['sampling_strategy']:
        print(f"Testing sampling_strategy={sampling_rate:.2f}")
        precision, recall, f2 = pipeline_with_sampling(sampling_rate)
        results.append((sampling_rate, precision, recall, f2))
        print(f"Precision for sampling_strategy={sampling_rate:.2f}: {precision:.4f}")
        print(f"Recall for sampling_strategy={sampling_rate:.2f}: {recall:.4f}")
        print(f"F2 Score for sampling_strategy={sampling_rate:.2f}: {f2:.4f}")

    best_sampling_strategy, best_precision, best_recall, best_f2 = max(results, key=lambda x: x[3])
    print(f"Best Sampling Strategy: {best_sampling_strategy:.2f} with Precision: {best_precision:.4f}, Recall: {best_recall:.4f}, F2 Score: {best_f2:.4f}")
    return best_sampling_strategy, best_precision, best_recall, best_f2

best_sampling_strategy, best_precision, best_recall, best_f2 = random_search_sampling()

Testing sampling_strategy=0.10




Chunk 1 label distribution: Counter({0: 29026, 1: 2850})




Chunk 2 label distribution: Counter({0: 29037, 1: 2850})




Chunk 3 label distribution: Counter({0: 29190, 1: 2896})




Chunk 4 label distribution: Counter({0: 14575, 1: 1469})
Precision for sampling_strategy=0.10: 0.3148
Recall for sampling_strategy=0.10: 0.1006
F2 Score for sampling_strategy=0.10: 0.1164
Testing sampling_strategy=0.20




Chunk 1 label distribution: Counter({0: 28980, 1: 5918})




Chunk 2 label distribution: Counter({0: 28983, 1: 5913})




Chunk 3 label distribution: Counter({0: 29151, 1: 5936})




Chunk 4 label distribution: Counter({0: 14540, 1: 2979})
Precision for sampling_strategy=0.20: 0.2710
Recall for sampling_strategy=0.20: 0.1716
F2 Score for sampling_strategy=0.20: 0.1852
Testing sampling_strategy=0.30




Chunk 1 label distribution: Counter({0: 28944, 1: 8920})




Chunk 2 label distribution: Counter({0: 28967, 1: 8924})




Chunk 3 label distribution: Counter({0: 29138, 1: 8944})




Chunk 4 label distribution: Counter({0: 14531, 1: 4487})
Precision for sampling_strategy=0.30: 0.2303
Recall for sampling_strategy=0.30: 0.2071
F2 Score for sampling_strategy=0.30: 0.2114
Testing sampling_strategy=0.40




Chunk 1 label distribution: Counter({0: 28932, 1: 11906})




Chunk 2 label distribution: Counter({0: 28954, 1: 11910})




Chunk 3 label distribution: Counter({0: 29135, 1: 11929})




Chunk 4 label distribution: Counter({0: 14527, 1: 5985})
Precision for sampling_strategy=0.40: 0.2113
Recall for sampling_strategy=0.40: 0.2426
F2 Score for sampling_strategy=0.40: 0.2356
Testing sampling_strategy=0.50




Chunk 1 label distribution: Counter({0: 28926, 1: 14894})




Chunk 2 label distribution: Counter({0: 28957, 1: 14901})




Chunk 3 label distribution: Counter({0: 29127, 1: 14916})




Chunk 4 label distribution: Counter({0: 14521, 1: 7486})
Precision for sampling_strategy=0.50: 0.1694
Recall for sampling_strategy=0.50: 0.2426
F2 Score for sampling_strategy=0.50: 0.2233
Testing sampling_strategy=0.60




Chunk 1 label distribution: Counter({0: 28930, 1: 17878})




Chunk 2 label distribution: Counter({0: 28950, 1: 17878})




Chunk 3 label distribution: Counter({0: 29123, 1: 17901})




Chunk 4 label distribution: Counter({0: 14525, 1: 8985})
Precision for sampling_strategy=0.60: 0.1619
Recall for sampling_strategy=0.60: 0.2663
F2 Score for sampling_strategy=0.60: 0.2358
Testing sampling_strategy=0.70




Chunk 1 label distribution: Counter({0: 28921, 1: 20856})




Chunk 2 label distribution: Counter({0: 28946, 1: 20861})




Chunk 3 label distribution: Counter({0: 29125, 1: 20891})




Chunk 4 label distribution: Counter({0: 14521, 1: 10482})
Precision for sampling_strategy=0.70: 0.1607
Recall for sampling_strategy=0.70: 0.2899
F2 Score for sampling_strategy=0.70: 0.2497
Testing sampling_strategy=0.80




Chunk 1 label distribution: Counter({0: 28925, 1: 23847})




Chunk 2 label distribution: Counter({0: 28948, 1: 23843})




Chunk 3 label distribution: Counter({0: 29125, 1: 23876})




Chunk 4 label distribution: Counter({0: 14518, 1: 11982})
Precision for sampling_strategy=0.80: 0.1545
Recall for sampling_strategy=0.80: 0.3018
F2 Score for sampling_strategy=0.80: 0.2535
Testing sampling_strategy=0.90




Chunk 1 label distribution: Counter({0: 28921, 1: 26825})




Chunk 2 label distribution: Counter({0: 28947, 1: 26827})




Chunk 3 label distribution: Counter({0: 29124, 1: 26860})




Chunk 4 label distribution: Counter({0: 14519, 1: 13479})
Precision for sampling_strategy=0.90: 0.1441
Recall for sampling_strategy=0.90: 0.3018
F2 Score for sampling_strategy=0.90: 0.2476
Testing sampling_strategy=1.00




Chunk 1 label distribution: Counter({1: 29808, 0: 28921})




Chunk 2 label distribution: Counter({1: 29811, 0: 28943})




Chunk 3 label distribution: Counter({1: 29844, 0: 29125})




Chunk 4 label distribution: Counter({1: 14978, 0: 14522})
Precision for sampling_strategy=1.00: 0.1432
Recall for sampling_strategy=1.00: 0.3254
F2 Score for sampling_strategy=1.00: 0.2594
Best Sampling Strategy: 1.00 with Precision: 0.1432, Recall: 0.3254, F2 Score: 0.2594


#### MLPClassifier

In [54]:
#MLPClassifier

from sklearn.neural_network import MLPClassifier

X = df_testrate.drop(columns=['is_fraud'])
y = df_testrate['is_fraud']

def process_in_chunks(X, y, sampling_strategy, chunk_size=30000): 
    smote_enn = SMOTEENN(sampling_strategy=sampling_strategy, random_state=42, n_jobs=-1)
    chunks = []
    for i in range(0, len(X), chunk_size):
        X_chunk = X.iloc[i:i+chunk_size]
        y_chunk = y.iloc[i:i+chunk_size]
        X_res, y_res = smote_enn.fit_resample(X_chunk, y_chunk)
        chunks.append((X_res, y_res))
        print(f"Chunk {i // chunk_size + 1} label distribution: {Counter(y_res)}")  

    X_resampled = pd.concat([chunk[0] for chunk in chunks], axis=0)
    y_resampled = pd.concat([chunk[1] for chunk in chunks], axis=0)
    return X_resampled, y_resampled

def pipeline_with_sampling(sampling_strategy):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_resampled, y_resampled = process_in_chunks(X_train, y_train, sampling_strategy=sampling_strategy)

    model = MLPClassifier(random_state = 42)
    model.fit(X_resampled, y_resampled)
    y_pred = model.predict(X_test)

    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f2_score = fbeta_score(y_test, y_pred, beta=2)

    return precision, recall, f2_score

def random_search_sampling():
    param_distributions = {
        'sampling_strategy': np.linspace(0.1, 1.0, 10)  
    }

    scorer = make_scorer(fbeta_score, beta=2)

    results = []
    for sampling_rate in param_distributions['sampling_strategy']:
        print(f"Testing sampling_strategy={sampling_rate:.2f}")
        precision, recall, f2 = pipeline_with_sampling(sampling_rate)
        results.append((sampling_rate, precision, recall, f2))
        print(f"Precision for sampling_strategy={sampling_rate:.2f}: {precision:.4f}")
        print(f"Recall for sampling_strategy={sampling_rate:.2f}: {recall:.4f}")
        print(f"F2 Score for sampling_strategy={sampling_rate:.2f}: {f2:.4f}")

    best_sampling_strategy, best_precision, best_recall, best_f2 = max(results, key=lambda x: x[3])
    print(f"Best Sampling Strategy: {best_sampling_strategy:.2f} with Precision: {best_precision:.4f}, Recall: {best_recall:.4f}, F2 Score: {best_f2:.4f}")
    return best_sampling_strategy, best_precision, best_recall, best_f2

best_sampling_strategy, best_precision, best_recall, best_f2 = random_search_sampling()

Testing sampling_strategy=0.10




Chunk 1 label distribution: Counter({0: 29026, 1: 2850})




Chunk 2 label distribution: Counter({0: 29037, 1: 2850})




Chunk 3 label distribution: Counter({0: 29190, 1: 2896})




Chunk 4 label distribution: Counter({0: 14575, 1: 1469})
Precision for sampling_strategy=0.10: 0.6623
Recall for sampling_strategy=0.10: 0.5917
F2 Score for sampling_strategy=0.10: 0.6046
Testing sampling_strategy=0.20




Chunk 1 label distribution: Counter({0: 28980, 1: 5918})




Chunk 2 label distribution: Counter({0: 28983, 1: 5913})




Chunk 3 label distribution: Counter({0: 29151, 1: 5936})




Chunk 4 label distribution: Counter({0: 14540, 1: 2979})
Precision for sampling_strategy=0.20: 0.5632
Recall for sampling_strategy=0.20: 0.5799
F2 Score for sampling_strategy=0.20: 0.5765
Testing sampling_strategy=0.30




Chunk 1 label distribution: Counter({0: 28944, 1: 8920})




Chunk 2 label distribution: Counter({0: 28967, 1: 8924})




Chunk 3 label distribution: Counter({0: 29138, 1: 8944})




Chunk 4 label distribution: Counter({0: 14531, 1: 4487})
Precision for sampling_strategy=0.30: 0.6291
Recall for sampling_strategy=0.30: 0.5621
F2 Score for sampling_strategy=0.30: 0.5744
Testing sampling_strategy=0.40




Chunk 1 label distribution: Counter({0: 28932, 1: 11906})




Chunk 2 label distribution: Counter({0: 28954, 1: 11910})




Chunk 3 label distribution: Counter({0: 29135, 1: 11929})




Chunk 4 label distribution: Counter({0: 14527, 1: 5985})
Precision for sampling_strategy=0.40: 0.6478
Recall for sampling_strategy=0.40: 0.6095
F2 Score for sampling_strategy=0.40: 0.6168
Testing sampling_strategy=0.50




Chunk 1 label distribution: Counter({0: 28926, 1: 14894})




Chunk 2 label distribution: Counter({0: 28957, 1: 14901})




Chunk 3 label distribution: Counter({0: 29127, 1: 14916})




Chunk 4 label distribution: Counter({0: 14521, 1: 7486})
Precision for sampling_strategy=0.50: 0.6591
Recall for sampling_strategy=0.50: 0.5148
F2 Score for sampling_strategy=0.50: 0.5384
Testing sampling_strategy=0.60




Chunk 1 label distribution: Counter({0: 28930, 1: 17878})




Chunk 2 label distribution: Counter({0: 28950, 1: 17878})




Chunk 3 label distribution: Counter({0: 29123, 1: 17901})




Chunk 4 label distribution: Counter({0: 14525, 1: 8985})
Precision for sampling_strategy=0.60: 0.5943
Recall for sampling_strategy=0.60: 0.6154
F2 Score for sampling_strategy=0.60: 0.6110
Testing sampling_strategy=0.70




Chunk 1 label distribution: Counter({0: 28921, 1: 20856})




Chunk 2 label distribution: Counter({0: 28946, 1: 20861})




Chunk 3 label distribution: Counter({0: 29125, 1: 20891})




Chunk 4 label distribution: Counter({0: 14521, 1: 10482})
Precision for sampling_strategy=0.70: 0.5578
Recall for sampling_strategy=0.70: 0.4852
F2 Score for sampling_strategy=0.70: 0.4982
Testing sampling_strategy=0.80




Chunk 1 label distribution: Counter({0: 28925, 1: 23847})




Chunk 2 label distribution: Counter({0: 28948, 1: 23843})




Chunk 3 label distribution: Counter({0: 29125, 1: 23876})




Chunk 4 label distribution: Counter({0: 14518, 1: 11982})
Precision for sampling_strategy=0.80: 0.6934
Recall for sampling_strategy=0.80: 0.5621
F2 Score for sampling_strategy=0.80: 0.5843
Testing sampling_strategy=0.90




Chunk 1 label distribution: Counter({0: 28921, 1: 26825})




Chunk 2 label distribution: Counter({0: 28947, 1: 26827})




Chunk 3 label distribution: Counter({0: 29124, 1: 26860})




Chunk 4 label distribution: Counter({0: 14519, 1: 13479})
Precision for sampling_strategy=0.90: 0.6622
Recall for sampling_strategy=0.90: 0.5799
F2 Score for sampling_strategy=0.90: 0.5947
Testing sampling_strategy=1.00




Chunk 1 label distribution: Counter({1: 29808, 0: 28921})




Chunk 2 label distribution: Counter({1: 29811, 0: 28943})




Chunk 3 label distribution: Counter({1: 29844, 0: 29125})




Chunk 4 label distribution: Counter({1: 14978, 0: 14522})
Precision for sampling_strategy=1.00: 0.5333
Recall for sampling_strategy=1.00: 0.5680
F2 Score for sampling_strategy=1.00: 0.5607
Best Sampling Strategy: 0.40 with Precision: 0.6478, Recall: 0.6095, F2 Score: 0.6168


#### XGBClassifier

In [57]:
#XGBClassifier

from xgboost import XGBClassifier

X = df_testrate.drop(columns=['is_fraud'])
y = df_testrate['is_fraud']

def process_in_chunks(X, y, sampling_strategy, chunk_size=30000): 
    smote_enn = SMOTEENN(sampling_strategy=sampling_strategy, random_state=42, n_jobs=-1)
    chunks = []
    for i in range(0, len(X), chunk_size):
        X_chunk = X.iloc[i:i+chunk_size]
        y_chunk = y.iloc[i:i+chunk_size]
        X_res, y_res = smote_enn.fit_resample(X_chunk, y_chunk)
        chunks.append((X_res, y_res))
        print(f"Chunk {i // chunk_size + 1} label distribution: {Counter(y_res)}")  

    X_resampled = pd.concat([chunk[0] for chunk in chunks], axis=0)
    y_resampled = pd.concat([chunk[1] for chunk in chunks], axis=0)
    return X_resampled, y_resampled

def pipeline_with_sampling(sampling_strategy):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_resampled, y_resampled = process_in_chunks(X_train, y_train, sampling_strategy=sampling_strategy)

    model = XGBClassifier(random_state = 42)
    model.fit(X_resampled, y_resampled)
    y_pred = model.predict(X_test)

    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f2_score = fbeta_score(y_test, y_pred, beta=2)

    return precision, recall, f2_score

def random_search_sampling():
    param_distributions = {
        'sampling_strategy': np.linspace(0.1, 1.0, 10)  # 不同采样率
    }

    scorer = make_scorer(fbeta_score, beta=2)

    results = []
    for sampling_rate in param_distributions['sampling_strategy']:
        print(f"Testing sampling_strategy={sampling_rate:.2f}")
        precision, recall, f2 = pipeline_with_sampling(sampling_rate)
        results.append((sampling_rate, precision, recall, f2))
        print(f"Precision for sampling_strategy={sampling_rate:.2f}: {precision:.4f}")
        print(f"Recall for sampling_strategy={sampling_rate:.2f}: {recall:.4f}")
        print(f"F2 Score for sampling_strategy={sampling_rate:.2f}: {f2:.4f}")

    best_sampling_strategy, best_precision, best_recall, best_f2 = max(results, key=lambda x: x[3])
    print(f"Best Sampling Strategy: {best_sampling_strategy:.2f} with Precision: {best_precision:.4f}, Recall: {best_recall:.4f}, F2 Score: {best_f2:.4f}")
    return best_sampling_strategy, best_precision, best_recall, best_f2

best_sampling_strategy, best_precision, best_recall, best_f2 = random_search_sampling()

Testing sampling_strategy=0.10




Chunk 1 label distribution: Counter({0: 29026, 1: 2850})




Chunk 2 label distribution: Counter({0: 29037, 1: 2850})




Chunk 3 label distribution: Counter({0: 29190, 1: 2896})




Chunk 4 label distribution: Counter({0: 14575, 1: 1469})
Precision for sampling_strategy=0.10: 0.9286
Recall for sampling_strategy=0.10: 0.7692
F2 Score for sampling_strategy=0.10: 0.7966
Testing sampling_strategy=0.20




Chunk 1 label distribution: Counter({0: 28980, 1: 5918})




Chunk 2 label distribution: Counter({0: 28983, 1: 5913})




Chunk 3 label distribution: Counter({0: 29151, 1: 5936})




Chunk 4 label distribution: Counter({0: 14540, 1: 2979})
Precision for sampling_strategy=0.20: 0.9310
Recall for sampling_strategy=0.20: 0.7988
F2 Score for sampling_strategy=0.20: 0.8222
Testing sampling_strategy=0.30




Chunk 1 label distribution: Counter({0: 28944, 1: 8920})




Chunk 2 label distribution: Counter({0: 28967, 1: 8924})




Chunk 3 label distribution: Counter({0: 29138, 1: 8944})




Chunk 4 label distribution: Counter({0: 14531, 1: 4487})
Precision for sampling_strategy=0.30: 0.9184
Recall for sampling_strategy=0.30: 0.7988
F2 Score for sampling_strategy=0.30: 0.8202
Testing sampling_strategy=0.40




Chunk 1 label distribution: Counter({0: 28932, 1: 11906})




Chunk 2 label distribution: Counter({0: 28954, 1: 11910})




Chunk 3 label distribution: Counter({0: 29135, 1: 11929})




Chunk 4 label distribution: Counter({0: 14527, 1: 5985})
Precision for sampling_strategy=0.40: 0.9375
Recall for sampling_strategy=0.40: 0.7988
F2 Score for sampling_strategy=0.40: 0.8232
Testing sampling_strategy=0.50




Chunk 1 label distribution: Counter({0: 28926, 1: 14894})




Chunk 2 label distribution: Counter({0: 28957, 1: 14901})




Chunk 3 label distribution: Counter({0: 29127, 1: 14916})




Chunk 4 label distribution: Counter({0: 14521, 1: 7486})
Precision for sampling_strategy=0.50: 0.9060
Recall for sampling_strategy=0.50: 0.7988
F2 Score for sampling_strategy=0.50: 0.8182
Testing sampling_strategy=0.60




Chunk 1 label distribution: Counter({0: 28930, 1: 17878})




Chunk 2 label distribution: Counter({0: 28950, 1: 17878})




Chunk 3 label distribution: Counter({0: 29123, 1: 17901})




Chunk 4 label distribution: Counter({0: 14525, 1: 8985})
Precision for sampling_strategy=0.60: 0.9007
Recall for sampling_strategy=0.60: 0.8047
F2 Score for sampling_strategy=0.60: 0.8222
Testing sampling_strategy=0.70




Chunk 1 label distribution: Counter({0: 28921, 1: 20856})




Chunk 2 label distribution: Counter({0: 28946, 1: 20861})




Chunk 3 label distribution: Counter({0: 29125, 1: 20891})




Chunk 4 label distribution: Counter({0: 14521, 1: 10482})
Precision for sampling_strategy=0.70: 0.9067
Recall for sampling_strategy=0.70: 0.8047
F2 Score for sampling_strategy=0.70: 0.8232
Testing sampling_strategy=0.80




Chunk 1 label distribution: Counter({0: 28925, 1: 23847})




Chunk 2 label distribution: Counter({0: 28948, 1: 23843})




Chunk 3 label distribution: Counter({0: 29125, 1: 23876})




Chunk 4 label distribution: Counter({0: 14518, 1: 11982})
Precision for sampling_strategy=0.80: 0.9067
Recall for sampling_strategy=0.80: 0.8047
F2 Score for sampling_strategy=0.80: 0.8232
Testing sampling_strategy=0.90




Chunk 1 label distribution: Counter({0: 28921, 1: 26825})




Chunk 2 label distribution: Counter({0: 28947, 1: 26827})




Chunk 3 label distribution: Counter({0: 29124, 1: 26860})




Chunk 4 label distribution: Counter({0: 14519, 1: 13479})
Precision for sampling_strategy=0.90: 0.8882
Recall for sampling_strategy=0.90: 0.7988
F2 Score for sampling_strategy=0.90: 0.8152
Testing sampling_strategy=1.00




Chunk 1 label distribution: Counter({1: 29808, 0: 28921})




Chunk 2 label distribution: Counter({1: 29811, 0: 28943})




Chunk 3 label distribution: Counter({1: 29844, 0: 29125})




Chunk 4 label distribution: Counter({1: 14978, 0: 14522})
Precision for sampling_strategy=1.00: 0.9038
Recall for sampling_strategy=1.00: 0.8343
F2 Score for sampling_strategy=1.00: 0.8474
Best Sampling Strategy: 1.00 with Precision: 0.9038, Recall: 0.8343, F2 Score: 0.8474
