In [1]:
import pandas as pd
from mh_sys_gen import MHSysGen
from sklearn.model_selection import train_test_split


In [25]:
import pandas as pd
import numpy as np

# --- 1. Define the 133 Required Columns (Copy-pasted from your main.py) ---
ALL_REQUIRED_COLUMNS = [
    'TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 
    'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'dist1', 'dist2',
    'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6',
    'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4',
    'D5', 'D8', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'M2', 'M3', 'M4',
    'M5', 'M6', 'V6', 'V12', 'V13', 'V19', 'V20', 'V35', 'V38', 'V45', 'V48',
    'V53', 'V54', 'V55', 'V56', 'V61', 'V62', 'V66', 'V67', 'V70', 'V73',
    'V75', 'V76', 'V78', 'V82', 'V83', 'V87', 'V91', 'V94', 'V96', 'V99',
    'V115', 'V126', 'V128', 'V131', 'V133', 'V136', 'V139', 'V143', 'V149',
    'V156', 'V165', 'V169', 'V171', 'V187', 'V189', 'V197', 'V206', 'V220',
    'V256', 'V258', 'V261', 'V262', 'V281', 'V283', 'V285', 'V291', 'V294',
    'V295', 'V296', 'V307', 'V308', 'V309', 'V310', 'V311', 'V312', 'V313',
    'V314', 'V315', 'V317', 'V320', 'V326', 'id_01', 'id_02', 'id_03', 'id_04',
    'id_05', 'id_06', 'id_09', 'id_13', 'id_14', 'id_17', 'id_19', 'id_20',
    'id_30', 'id_31', 'id_32', 'id_33', 'DeviceInfo'
]

# The model input features (excluding 'isFraud')
MODEL_FEATURES = [col for col in ALL_REQUIRED_COLUMNS if col not in ['isFraud']]

NUM_ROWS = 150

# --- 2. Generate Dummy Data ---
data = {}

# TransactionID: Unique integers starting from a high number
data['TransactionID'] = np.arange(1000000, 1000000 + NUM_ROWS)

# TransactionAmt: Random floats
data['TransactionAmt'] = np.round(np.random.uniform(10.0, 500.0, NUM_ROWS), 2)

# ProductCD, card4, card6 (Categorical/Object features)
data['ProductCD'] = np.random.choice(['W', 'C', 'R', 'H', 'S'], NUM_ROWS)
data['card4'] = np.random.choice(['visa', 'mastercard', 'amex', 'discover'], NUM_ROWS)
data['card6'] = np.random.choice(['debit', 'credit'], NUM_ROWS)

# C1-C14 (Count features - typical fraud variables)
c_cols = [f'C{i}' for i in range(1, 7)] + [f'C{i}' for i in range(8, 15)]
for col in c_cols:
    data[col] = np.random.randint(1, 100, NUM_ROWS)

# V-features and other numerical columns (Fill with random floats/NaNs to test imputation)
numerical_cols = [col for col in MODEL_FEATURES if col not in data]
for col in numerical_cols:
    # Introduce some NaNs (10% chance) to test the imputation logic in your preprocessor
    values = np.random.uniform(0.0, 500.0, NUM_ROWS)
    values[np.random.choice(NUM_ROWS, size=int(NUM_ROWS * 0.10), replace=False)] = np.nan
    data[col] = values

# --- 3. Create DataFrame and Output CSV ---

# Use only the features the model expects, and ensure the order is correct
df_test = pd.DataFrame(data, columns=MODEL_FEATURES)

# OPTIONAL: Drop a critical column to intentionally test the 'missing column' logic in FastAPI
# df_test = df_test.drop(columns=['D15']) 

output_filename = 'test_input1.csv'
df_test.to_csv(output_filename, index=False)

print(f"Successfully generated {NUM_ROWS} rows of test data in: {output_filename}")
print(f"File has {len(df_test.columns)} columns (should be 132 features + TransactionID).")

Successfully generated 150 rows of test data in: test_input1.csv
File has 132 columns (should be 132 features + TransactionID).


In [2]:
df=pd.read_csv("../data/train_cleaned.csv")

In [3]:
df.drop(columns=["TransactionID"],inplace=True)

In [7]:
df.info(verbose=True,show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Data columns (total 132 columns):
 #    Column          Non-Null Count   Dtype  
---   ------          --------------   -----  
 0    isFraud         590540 non-null  int64  
 1    TransactionDT   590540 non-null  int64  
 2    TransactionAmt  590540 non-null  float64
 3    ProductCD       590540 non-null  object 
 4    card1           590540 non-null  int64  
 5    card2           590540 non-null  float64
 6    card3           590540 non-null  float64
 7    card4           590540 non-null  object 
 8    card5           590540 non-null  float64
 9    card6           590540 non-null  object 
 10   addr1           590540 non-null  float64
 11   dist1           590540 non-null  float64
 12   dist2           590540 non-null  float64
 13   P_emaildomain   590540 non-null  object 
 14   R_emaildomain   590540 non-null  object 
 15   C1              590540 non-null  float64
 16   C2              590540 non-null  flo

In [19]:
df['addr1']

0         315.0
1         325.0
2         330.0
3         476.0
4         420.0
          ...  
590535    272.0
590536    204.0
590537    231.0
590538    387.0
590539    299.0
Name: addr1, Length: 590540, dtype: float64

In [4]:
categorical_feature=[]
for col in df.columns:
    if df[col].dtype=="object":
        categorical_feature.append(col)

In [23]:
df['card6']

0         credit
1         credit
2          debit
3          debit
4         credit
           ...  
590535     debit
590536     debit
590537     debit
590538     debit
590539    credit
Name: card6, Length: 590540, dtype: object

In [94]:
df['isFraud'].value_counts()

isFraud
0    569877
1     20663
Name: count, dtype: int64

In [5]:
nominal_cols=['ProductCD','card4','card6']#low cardinality categorical features
nominal_cols_high=['P_emaildomain','R_emaildomain',"id_30","id_31","DeviceInfo","id_33"]#high cardinality categorical features
ordinal_cols=['M4','M2','M3','M5','M6']

In [8]:
numeric_cols=[]
for col in df.columns:
    if col not in nominal_cols_high and col not in nominal_cols and col not in ordinal_cols and df[col].dtype!="object":
        if col!="isFraud":
         numeric_cols.append(col)

In [7]:
numeric_cols

['TransactionDT',
 'TransactionAmt',
 'card1',
 'card2',
 'card3',
 'card5',
 'addr1',
 'dist1',
 'dist2',
 'C1',
 'C2',
 'C3',
 'C4',
 'C5',
 'C6',
 'C8',
 'C9',
 'C10',
 'C11',
 'C12',
 'C13',
 'C14',
 'D1',
 'D2',
 'D3',
 'D4',
 'D5',
 'D8',
 'D10',
 'D11',
 'D12',
 'D13',
 'D14',
 'D15',
 'V6',
 'V12',
 'V13',
 'V19',
 'V20',
 'V35',
 'V38',
 'V45',
 'V48',
 'V53',
 'V54',
 'V55',
 'V56',
 'V61',
 'V62',
 'V66',
 'V67',
 'V70',
 'V73',
 'V75',
 'V76',
 'V78',
 'V82',
 'V83',
 'V87',
 'V91',
 'V94',
 'V96',
 'V99',
 'V115',
 'V126',
 'V128',
 'V131',
 'V133',
 'V136',
 'V139',
 'V143',
 'V149',
 'V156',
 'V165',
 'V169',
 'V171',
 'V187',
 'V189',
 'V197',
 'V206',
 'V220',
 'V256',
 'V258',
 'V261',
 'V262',
 'V281',
 'V283',
 'V285',
 'V291',
 'V294',
 'V295',
 'V296',
 'V307',
 'V308',
 'V309',
 'V310',
 'V311',
 'V312',
 'V313',
 'V314',
 'V315',
 'V317',
 'V320',
 'V326',
 'id_01',
 'id_02',
 'id_03',
 'id_04',
 'id_05',
 'id_06',
 'id_09',
 'id_13',
 'id_14',
 'id_17',
 'id_19

In [9]:
df=df.sort_values(by="TransactionDT")
split_index=int(0.8*len(df))
train_df = df.iloc[:split_index]
test_df = df.iloc[split_index:]

In [10]:
#frequency encoding 
from sklearn.base import BaseEstimator, TransformerMixin
class FrequencyEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.freq_maps={}
    
    def fit(self,X,y=None):
        X=pd.DataFrame(X)
        for col in X.columns:
            freq_map=X[col].value_counts(normalize=True)
            self.freq_maps[col]=freq_map
        return self
    
    def transform(self,X):
        X=pd.DataFrame(X)
        X_transformed=X.copy()
        for col in X.columns:
            X_transformed[col]=X_transformed[col].map(self.freq_maps[col]).fillna(0)
        return X_transformed

In [11]:
from sklearn.preprocessing import OneHotEncoder ,OrdinalEncoder,StandardScaler
ohe=OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore')
oe=OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1)
scaler=StandardScaler()
df[nominal_cols_high].fillna('missing')

Unnamed: 0,P_emaildomain,R_emaildomain,id_30,id_31,DeviceInfo,id_33
0,unknown,unknown,unknown,unknown,unknown,unknown
1,gmail.com,unknown,unknown,unknown,unknown,unknown
2,outlook.com,unknown,unknown,unknown,unknown,unknown
3,yahoo.com,unknown,unknown,unknown,unknown,unknown
4,gmail.com,unknown,Android 7.0,samsung browser 6.2,SAMSUNG SM-G892A Build/NRD90M,2220x1080
...,...,...,...,...,...,...
590535,unknown,unknown,unknown,unknown,unknown,unknown
590536,gmail.com,unknown,unknown,unknown,unknown,unknown
590537,gmail.com,unknown,unknown,unknown,unknown,unknown
590538,aol.com,unknown,unknown,unknown,unknown,unknown


In [None]:
df.head(5)

Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D8,D10,D11,D12,D13,D14,D15,M2,...,V256,V258,V261,V262,V281,V283,V285,V291,V294,V295,V296,V307,V308,V309,V310,V311,V312,V313,V314,V315,V317,V320,V326,id_01,id_02,id_03,id_04,id_05,id_06,id_09,id_13,id_14,id_17,id_19,id_20,id_30,id_31,id_32,id_33,DeviceInfo
40809,1008491,100.0,R,6177,399.0,150.0,american express,150.0,credit,264.0,8.0,1.0,anonymous.com,anonymous.com,1.0,1.0,0.0,2.0,0.0,1.0,2.0,0.0,2.0,1.0,0.0,2.0,1.0,609.0,609.0,8.0,26.0,10.0,609.666687,15.0,43.0,0.0,0.0,0.0,52.0,unknown,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.0,58410.0,0.0,0.0,0.0,0.0,0.0,52.0,-360.0,166.0,300.0,214.0,Windows 7,ie 11.0 for desktop,24.0,1920x1080,Trident/7.0
285886,7008212,29.99,W,7900,345.0,150.0,mastercard,224.0,debit,143.0,4.0,37.0,gmail.com,unknown,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,97.0,8.0,0.0,10.0,37.875,0.0,0.0,0.0,0.0,0.0,0.0,T,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.0,125800.5,0.0,0.0,0.0,0.0,0.0,52.0,-300.0,166.0,341.0,472.0,unknown,unknown,24.0,unknown,unknown
104256,2071522,107.95,W,11690,111.0,150.0,visa,226.0,credit,191.0,8.0,37.0,comcast.net,unknown,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,15.0,1.0,501.0,501.0,18.0,502.0,18.0,37.875,502.0,43.0,0.0,0.0,0.0,502.0,unknown,...,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,200.0,0.0,0.0,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.0,125800.5,0.0,0.0,0.0,0.0,0.0,52.0,-300.0,166.0,341.0,472.0,unknown,unknown,24.0,unknown,unknown
507860,13299752,241.95,W,2616,327.0,150.0,discover,102.0,credit,330.0,3.0,37.0,unknown,unknown,1.0,2.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,4.0,1.0,177.0,177.0,86.0,26.0,10.0,37.875,177.0,177.0,0.0,0.0,0.0,177.0,T,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.0,125800.5,0.0,0.0,0.0,0.0,0.0,52.0,-300.0,166.0,341.0,472.0,unknown,unknown,24.0,unknown,unknown
196382,4412283,117.0,W,13780,298.0,150.0,visa,226.0,debit,441.0,5.0,37.0,unknown,unknown,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,1.0,0.0,0.0,0.0,26.0,10.0,37.875,0.0,0.0,0.0,0.0,0.0,0.0,T,...,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,0.0,0.0,0.0,-5.0,125800.5,0.0,0.0,0.0,0.0,0.0,52.0,-300.0,166.0,341.0,472.0,unknown,unknown,24.0,unknown,unknown


In [43]:
X_test.head(5)

Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D8,D10,D11,D12,D13,D14,D15,M2,...,V256,V258,V261,V262,V281,V283,V285,V291,V294,V295,V296,V307,V308,V309,V310,V311,V312,V313,V314,V315,V317,V320,V326,id_01,id_02,id_03,id_04,id_05,id_06,id_09,id_13,id_14,id_17,id_19,id_20,id_30,id_31,id_32,id_33,DeviceInfo
316078,7864752,107.95,W,7585,553.0,150.0,visa,226.0,credit,264.0,19.0,37.0,aol.com,unknown,2.0,2.0,0.0,0.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,9.0,2.0,122.0,122.0,33.0,216.0,33.0,37.875,33.0,216.0,0.0,0.0,0.0,216.0,T,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.0,125800.5,0.0,0.0,0.0,0.0,0.0,52.0,-300.0,166.0,341.0,472.0,unknown,unknown,24.0,unknown,unknown
116888,2260318,37.021,C,3154,408.0,185.0,mastercard,224.0,debit,299.0,8.0,37.0,hotmail.com,hotmail.com,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,97.0,8.0,0.0,10.0,37.875,0.0,43.0,0.0,0.0,0.0,0.0,unknown,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.0,120692.0,0.0,0.0,0.0,0.0,0.0,52.0,-300.0,225.0,254.0,507.0,unknown,chrome 63.0,24.0,unknown,Windows
410211,10361225,59.0,W,10057,225.0,150.0,mastercard,224.0,debit,181.0,0.0,37.0,unknown,unknown,1.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,1.0,313.0,19.0,11.0,26.0,10.0,37.875,313.0,313.0,0.0,0.0,0.0,313.0,T,...,1.0,1.0,1.0,1.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,394.0,0.0,0.0,394.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.0,125800.5,0.0,0.0,0.0,0.0,0.0,52.0,-300.0,166.0,341.0,472.0,unknown,unknown,24.0,unknown,unknown
251027,5974846,57.95,W,6207,355.0,150.0,visa,166.0,debit,143.0,8.0,37.0,gmail.com,unknown,3.0,2.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,3.0,3.0,0.0,97.0,8.0,211.0,36.0,37.875,0.0,43.0,0.0,0.0,0.0,538.0,unknown,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.0,125800.5,0.0,0.0,0.0,0.0,0.0,52.0,-300.0,166.0,341.0,472.0,unknown,unknown,24.0,unknown,unknown
191265,4296816,107.95,W,11207,361.0,150.0,visa,226.0,debit,325.0,0.0,37.0,aol.com,unknown,5.0,5.0,0.0,0.0,5.0,4.0,0.0,4.0,0.0,3.0,0.0,26.0,5.0,205.0,205.0,57.0,288.0,57.0,37.875,288.0,275.0,0.0,0.0,0.0,288.0,T,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.0,125800.5,0.0,0.0,0.0,0.0,0.0,52.0,-300.0,166.0,341.0,472.0,unknown,unknown,24.0,unknown,unknown


In [12]:
from sklearn.compose import ColumnTransformer
preprocessor=ColumnTransformer(transformers=[
    ('nominal',ohe,nominal_cols),
    ('ordinal',oe,ordinal_cols),
    ('freq',FrequencyEncoder(),nominal_cols_high),
    ('numeric',scaler,numeric_cols)
],remainder='passthrough')

In [13]:
X_train=train_df.drop(columns=['isFraud'])
y_train=train_df['isFraud']
X_test=test_df.drop(columns=['isFraud'])
y_test=test_df['isFraud']

In [14]:
X_train_preprocessed=preprocessor.fit_transform(X_train)
X_test_preprocessed=preprocessor.transform(X_test)

In [15]:
X_test_preprocessed.shape

(118108, 140)

In [16]:
def get_feature_names(preprocessor):
    feature_names=[]
    for name, transformer, columns in preprocessor.transformers_:
        if name != 'remainder':
            if hasattr(transformer, 'get_feature_names_out'):
                names = transformer.get_feature_names_out(columns)
            else:
                names = columns
            feature_names.extend(names)
    return feature_names

In [17]:
x_train_trans=pd.DataFrame(X_train_preprocessed,columns=get_feature_names(preprocessor))
x_test_trans=pd.DataFrame(X_test_preprocessed,columns=get_feature_names(preprocessor))

In [18]:
x_train_trans.head(5)

Unnamed: 0,ProductCD_H,ProductCD_R,ProductCD_S,ProductCD_W,card4_discover,card4_mastercard,card4_unknown,card4_visa,card6_credit,card6_debit,...,id_04,id_05,id_06,id_09,id_13,id_14,id_17,id_19,id_20,id_32
0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.028753,-0.154705,0.186745,-0.036239,0.179111,0.167209,-0.312222,-0.037241,0.214857,-0.217027
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.028753,-0.154705,0.186745,-0.036239,0.179111,0.167209,-0.312222,-0.037241,0.214857,-0.217027
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.028753,-0.154705,0.186745,-0.036239,0.179111,0.167209,-0.312222,-0.037241,0.214857,-0.217027
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.028753,-0.154705,0.186745,-0.036239,0.179111,0.167209,-0.312222,-0.037241,0.214857,-0.217027
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.028753,-0.154705,0.186745,-0.036239,0.179111,-4.397336,-0.312222,2.848053,-3.836656,4.562278


In [36]:
from sklearn.decomposition import PCA
pca=PCA(n_components=0.90,random_state=42)
x_train_pca=pca.fit_transform(x_train_trans)
x_test_pca=pca.transform(x_test_trans)

In [89]:
import numpy as np
np.isinf(X_train_preprocessed).sum()

np.int64(0)

In [110]:
from lightgbm import LGBMClassifier
model=LGBMClassifier()
model.fit(x_train_pca,y_train)
y_pred=model.predict(x_test_pca)
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

[LightGBM] [Info] Number of positive: 16530, number of negative: 455902
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.129946 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17595
[LightGBM] [Info] Number of data points in the train set: 472432, number of used features: 69
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.034989 -> initscore=-3.317101
[LightGBM] [Info] Start training from score -3.317101




              precision    recall  f1-score   support

           0       0.98      1.00      0.99    113975
           1       0.82      0.30      0.44      4133

    accuracy                           0.97    118108
   macro avg       0.90      0.65      0.71    118108
weighted avg       0.97      0.97      0.97    118108

[[113710    265]
 [  2891   1242]]


In [38]:
pca_cols=[f"[ca_{i}]" for i in range(x_train_pca.shape[1])]
df_aug=pd.DataFrame(x_train_pca, columns=pca_cols)
df_aug['isFraud']=y_train.values

In [None]:
mh_aug=MHSysGen(method="parallel",ratio=15,minority_class=1)
X_aug,y_aug=mh_aug.fit_resample(df_aug,target="isFraud")
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=600,
                               max_depth=18,
                               min_samples_split=5,
                               min_samples_leaf=2,
                               max_features="sqrt",
                               class_weight='balanced',
                               random_state=42,
                               )
model.fit(X_aug,y_aug)
y_pred=model.predict(x_test_pca)
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      0.92      0.95    114044
           1       0.20      0.58      0.30      4064

    accuracy                           0.91    118108
   macro avg       0.59      0.75      0.63    118108
weighted avg       0.96      0.91      0.93    118108

[[104800   9244]
 [  1717   2347]]


In [19]:
from imblearn.over_sampling import SMOTE
smote=SMOTE(sampling_strategy=0.2,random_state=42)
X_smote,y_smote=smote.fit_resample(x_train_pca,y_train)
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_smote,y_smote)
y_pred=model.predict(x_test_pca)
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99    113975
           1       0.85      0.51      0.63      4133

    accuracy                           0.98    118108
   macro avg       0.91      0.75      0.81    118108
weighted avg       0.98      0.98      0.98    118108

[[113594    381]
 [  2040   2093]]


In [None]:
import numpy as np
from sklearn.neighbors import KDTree

class PWMv5Fast:
    def __init__(self, ratio=20, random_state=42):
        self.ratio = ratio
        np.random.seed(random_state)

    def fit_resample(self, X, y):
        X = np.array(X)
        y = np.array(y)

        classes, counts = np.unique(y, return_counts=True)
        min_class = classes[np.argmin(counts)]
        X_min = X[y == min_class]
        X_maj = X[y != min_class]

        # Precompute covariance noise
        cov = np.cov(X_min.T)
        L = np.linalg.cholesky(cov + np.eye(cov.shape[0])*1e-6)

        # Build KDTree only once
        tree = KDTree(X_maj)

        n_new = len(X_min) * self.ratio

        # 1) Compute majority outward direction ONCE for each minority point
        dists, idxs = tree.query(X_min, k=10)
        maj_centers = np.array([X_maj[idx].mean(axis=0) for idx in idxs])
        outward = X_min - maj_centers
        outward /= np.linalg.norm(outward, axis=1, keepdims=True) + 1e-9

        # 2) Sample synthetic points in a vectorized form
        choices = np.random.randint(0, len(X_min), size=n_new)
        base = X_min[choices]
        out = outward[choices]

        # scaling
        scale = np.random.uniform(1.0, 2.0, size=n_new).reshape(-1, 1)

        # random walk vector, normalized
        rand = np.random.randn(n_new, X.shape[1])
        rand /= np.linalg.norm(rand, axis=1, keepdims=True) + 1e-9

        # covariance noise in vectorized manner
        noise = np.dot(np.random.randn(n_new, X.shape[1]), L.T) * 0.03

        # final synthetic points
        synth = base + out * scale + rand * 0.5 + noise

        X_new = np.vstack([X, synth])
        y_new = np.concatenate([y, np.full(n_new, min_class)])

        return X_new, y_new



p5 = PWMv5Fast(ratio=20)
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=600,
                               max_depth=18,
                               min_samples_split=5,
                               min_samples_leaf=2,
                               max_features="sqrt",
                               class_weight='balanced',
                               random_state=42,
                               
                               )
X5, y5 = p5.fit_resample(x_train_pca, y_train)
model.fit(X5,y5)
y_pred=model.predict(x_test_pca)
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.98      0.92      0.95    114044
           1       0.20      0.58      0.30      4064

    accuracy                           0.91    118108
   macro avg       0.59      0.75      0.63    118108
weighted avg       0.96      0.91      0.93    118108

[[104800   9244]
 [  1717   2347]]


In [111]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(x_train_pca,y_train)
y_pred=model.predict(x_test_pca)
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99    113975
           1       0.94      0.37      0.53      4133

    accuracy                           0.98    118108
   macro avg       0.96      0.68      0.76    118108
weighted avg       0.98      0.98      0.97    118108

[[113875    100]
 [  2606   1527]]


In [None]:
import pandas as pd
from mh_sys_gen import MHSysGen
from sklearn.model_selection import train_test_split
df=pd.read_csv("../data/train_cleaned.csv")
df.drop(columns=["TransactionID"],inplace=True)
categorical_feature=[]
for col in df.columns:
    if df[col].dtype=="object":
        categorical_feature.append(col)
nominal_cols=['ProductCD','card4','card6']#low cardinality categorical features
nominal_cols_high=['P_emaildomain','R_emaildomain',"id_30","id_31","DeviceInfo","id_33"]#high cardinality categorical features 
ordinal_cols=['M4','M2','M3','M5','M6']
numeric_cols=[]
for col in df.columns:
    if col not in nominal_cols_high and col not in nominal_cols and col not in ordinal_cols and df[col].dtype!="object":
        if col!="isFraud":
         numeric_cols.append(col)
df=df.sort_values(by="TransactionDT")
split_index=int(0.8*len(df))
train_df = df.iloc[:split_index]
test_df = df.iloc[split_index:]
#frequency encoding 
from sklearn.base import BaseEstimator, TransformerMixin
class FrequencyEncoder(BaseEstimator, TransformerMixin):
    def init(self,freq_maps=None):
        self.freq_maps=freq_maps
    
    def fit(self,X,y=None):
        X=pd.DataFrame(X)
        self.freq_maps={}
        for col in X.columns:
            freq_map=X[col].value_counts(normalize=True)
            self.freq_maps[col]=freq_map
        return self
    
    def transform(self,X):
        X=pd.DataFrame(X)
        X_transformed=X.copy()
        for col in X.columns:
            X_transformed[col]=X_transformed[col].map(self.freq_maps[col]).fillna(0)
        return X_transformed
from sklearn.preprocessing import OneHotEncoder ,OrdinalEncoder,StandardScaler
ohe=OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore')
oe=OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1)
scaler=StandardScaler()
df[nominal_cols_high].fillna('missing')
from sklearn.compose import ColumnTransformer
preprocessor=ColumnTransformer(transformers=[
    ('nominal',ohe,nominal_cols),
    ('ordinal',oe,ordinal_cols),
    ('freq',FrequencyEncoder(),nominal_cols_high),
    ('numeric',scaler,numeric_cols)
],remainder='passthrough')
X_train=train_df.drop(columns=['isFraud'])
y_train=train_df['isFraud']
X_test=test_df.drop(columns=['isFraud'])
y_test=test_df['isFraud']
X_train_preprocessed=preprocessor.fit_transform(X_train)
X_test_preprocessed=preprocessor.transform(X_test)
def get_feature_names(preprocessor):
    feature_names=[]
    for name, transformer, columns in preprocessor.transformers_:
        if name != 'remainder':
            if hasattr(transformer, 'get_feature_names_out'):
                names = transformer.get_feature_names_out(columns)
            else:
                names = columns
            feature_names.extend(names)
    return feature_names
x_train_trans=pd.DataFrame(X_train_preprocessed,columns=get_feature_names(preprocessor))
x_test_trans=pd.DataFrame(X_test_preprocessed,columns=get_feature_names(preprocessor))
print("started")

from sklearn.decomposition import PCA
pca=PCA(n_components=0.90,random_state=42)
x_train_pca=pca.fit_transform(x_train_trans)
x_test_pca=pca.transform(x_test_trans)


print("started dividing")

df_raw=x_train_trans.copy()
df_raw['isFraud']=y_train.values
mh_aug=MHSysGen(method="parallel",ratio=15,minority_class=1)
print("started aug")
X_aug,y_aug=mh_aug.fit_resample(df_raw,target="isFraud")
x_aug_pca=pca.transform(X_aug)
y_aug_pca=y_aug
x_aug_pca=pca.transform(X_smote)
y_aug_pca=y_smote
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
model_mhsysgen=XGBClassifier(n_estimators=600,
                               max_depth=18,
                               min_child_weight=5,
                               learning_rate=0.1,
                               subsample=0.8,
                               colsample_bytree=0.8,
                               scale_pos_weight=1,
                               random_state=42,
                               use_label_encoder=False,
                               eval_metric='logloss'
                               )
model_mhsysgen.fit(x_aug_pca,y_aug_pca)
y_pred=model_mhsysgen.predict(x_test_pca)
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))


started
started dividing


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           0       0.98      1.00      0.99    114044
           1       0.72      0.32      0.44      4064

    accuracy                           0.97    118108
   macro avg       0.85      0.66      0.71    118108
weighted avg       0.97      0.97      0.97    118108

[[113534    510]
 [  2774   1290]]


In [None]:
import pandas as pd
import numpy as np
from mh_sys_gen import MHSysGen
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix


# =========================================================
# 1. LOAD DATA
# =========================================================
df = pd.read_csv("../data/train_cleaned.csv")
df.drop(columns=["TransactionID"], inplace=True)

# Sort by time — IMPORTANT
df = df.sort_values(by="TransactionDT").reset_index(drop=True)


# =========================================================
# 2. FEATURE GROUPS
# =========================================================
nominal_cols = ['ProductCD', 'card4', 'card6']
nominal_high = ['P_emaildomain', 'R_emaildomain', "id_30", "id_31", "DeviceInfo", "id_33"]
ordinal_cols = ['M4', 'M2', 'M3', 'M5', 'M6']

numeric_cols = []
for col in df.columns:
    if col not in nominal_cols and \
       col not in nominal_high and \
       col not in ordinal_cols and \
       col != "isFraud" and \
       df[col].dtype != "object":
        numeric_cols.append(col)


# =========================================================
# 3. TIME-BASED SPLIT
# =========================================================
split_index = int(0.8 * len(df))
train_df = df.iloc[:split_index].copy()
test_df = df.iloc[split_index:].copy()

X_train = train_df.drop(columns=["isFraud"])
y_train = train_df["isFraud"]
X_test = test_df.drop(columns=["isFraud"])
y_test = test_df["isFraud"]


# =========================================================
# 4. Frequency Encoder (FIXED VERSION)
# =========================================================
class FrequencyEncoder(BaseEstimator, TransformerMixin):
    def _init_(self):
        self.freq_maps = {}

    def fit(self, X, y=None):
        X = pd.DataFrame(X)
        self.freq_maps = {}
        for col in X.columns:
            self.freq_maps[col] = X[col].value_counts(normalize=True)
        return self

    def transform(self, X):
        X = pd.DataFrame(X)
        X2 = X.copy()
        for col in X.columns:
            X2[col] = X2[col].map(self.freq_maps[col]).fillna(0)
        return X2


# =========================================================
# 5. COLUMN TRANSFORMER
# =========================================================
ohe = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
freq = FrequencyEncoder()
scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("nominal", ohe, nominal_cols),
        ("ordinal", oe, ordinal_cols),
        ("freq", freq, nominal_high),
        ("num", scaler, numeric_cols),
    ],
    remainder='drop'
)


# =========================================================
# 6. APPLY PREPROCESSOR
# =========================================================
X_train_pre = preprocessor.fit_transform(X_train)
X_test_pre = preprocessor.transform(X_test)

# Get output feature names
def get_feature_names(ct):
    names = []
    for name, trans, cols in ct.transformers_:
        if name != 'remainder':
            if hasattr(trans, "get_feature_names_out"):
                names.extend(trans.get_feature_names_out(cols))
            else:
                names.extend(cols)
    return names

train_cols = get_feature_names(preprocessor)

x_train_trans = pd.DataFrame(X_train_pre, columns=train_cols)
x_test_trans = pd.DataFrame(X_test_pre, columns=train_cols)

print("Preprocessing completed. Shape:", x_train_trans.shape)


# =========================================================
# 7. PCA (FIT ONLY ON TRAIN — NO LEAKAGE)
# =========================================================
pca = PCA(n_components=0.90, random_state=42)
x_train_pca = pca.fit_transform(x_train_trans)
x_test_pca = pca.transform(x_test_trans)

print("PCA completed. Train PCA shape:", x_train_pca.shape)


# =========================================================
# 8. MHSysGen AUGMENTATION
# =========================================================
df_aug = pd.DataFrame(x_train_trans.copy())
df_aug["isFraud"] = y_train.values

mh = MHSysGen(method="parallel", ratio=15, minority_class=1)

print("\nStarting MH-SysGen augmentation...")
X_aug, y_aug = mh.fit_resample(df_aug, target="isFraud")
print("Augmentation complete. New shape:", X_aug.shape)

# PCA transform on augmented data (NO FIT AGAIN!)
X_aug_pca = pca.transform(X_aug)


# =========================================================
# 9. XGBOOST MODEL (Your hyperparameters)
# =========================================================
model = XGBClassifier(
    n_estimators=600,
    max_depth=18,
    min_child_weight=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    random_state=42,
    use_label_encoder=False,
    eval_metric="logloss"
)

print("\nTraining XGBoost...")
model.fit(X_aug_pca, y_aug)

y_pred = model.predict(x_test_pca)


# =========================================================
# 10. RESULTS
# =========================================================
print("\n===== FINAL RESULTS (MH-SysGen + XGBoost) =====")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

SyntaxError: invalid non-printable character U+00A0 (2429730967.py, line 172)

In [None]:
import pandas as pd
import numpy as np
from mh_sys_gen import MHSysGen
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix


# =========================================================
# 1. LOAD DATA
# =========================================================
df = pd.read_csv("../data/train_cleaned.csv")
df.drop(columns=["TransactionID"], inplace=True)

# Sort by time — IMPORTANT
df = df.sort_values(by="TransactionDT").reset_index(drop=True)


# =========================================================
# 2. FEATURE GROUPS
# =========================================================
nominal_cols = ['ProductCD', 'card4', 'card6']
nominal_high = ['P_emaildomain', 'R_emaildomain', "id_30", "id_31", "DeviceInfo", "id_33"]
ordinal_cols = ['M4', 'M2', 'M3', 'M5', 'M6']

numeric_cols = []
for col in df.columns:
    if col not in nominal_cols and \
       col not in nominal_high and \
       col not in ordinal_cols and \
       col != "isFraud" and \
       df[col].dtype != "object":
        numeric_cols.append(col)


# =========================================================
# 3. TIME-BASED SPLIT
# =========================================================
split_index = int(0.8 * len(df))
train_df = df.iloc[:split_index].copy()
test_df = df.iloc[split_index:].copy()

X_train = train_df.drop(columns=["isFraud"])
y_train = train_df["isFraud"]
X_test = test_df.drop(columns=["isFraud"])
y_test = test_df["isFraud"]


# =========================================================
# 4. Frequency Encoder (FIXED VERSION)
# =========================================================
class FrequencyEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.freq_maps = {}

    def fit(self, X, y=None):
        X = pd.DataFrame(X)
        self.freq_maps = {}
        for col in X.columns:
            self.freq_maps[col] = X[col].value_counts(normalize=True)
        return self

    def transform(self, X):
        X = pd.DataFrame(X)
        X2 = X.copy()
        for col in X.columns:
            X2[col] = X2[col].map(self.freq_maps[col]).fillna(0)
        return X2


# =========================================================
# 5. COLUMN TRANSFORMER
# =========================================================
ohe = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
freq = FrequencyEncoder()
scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("nominal", ohe, nominal_cols),
        ("ordinal", oe, ordinal_cols),
        ("freq", freq, nominal_high),
        ("num", scaler, numeric_cols),
    ],
    remainder='drop'
)


# =========================================================
# 6. APPLY PREPROCESSOR
# =========================================================
X_train_pre = preprocessor.fit_transform(X_train)
X_test_pre = preprocessor.transform(X_test)

# Get output feature names
def get_feature_names(ct):
    names = []
    for name, trans, cols in ct.transformers_:
        if name != 'remainder':
            if hasattr(trans, "get_feature_names_out"):
                names.extend(trans.get_feature_names_out(cols))
            else:
                names.extend(cols)
    return names

train_cols = get_feature_names(preprocessor)

x_train_trans = pd.DataFrame(X_train_pre, columns=train_cols)
x_test_trans = pd.DataFrame(X_test_pre, columns=train_cols)

print("Preprocessing completed. Shape:", x_train_trans.shape)


# =========================================================
# 7. PCA (FIT ONLY ON TRAIN — NO LEAKAGE)
# =========================================================
pca = PCA(n_components=0.90, random_state=42)
x_train_pca = pca.fit_transform(x_train_trans)
x_test_pca = pca.transform(x_test_trans)

print("PCA completed. Train PCA shape:", x_train_pca.shape)


# =========================================================
# 8. MHSysGen AUGMENTATION
# =========================================================
df_aug = pd.DataFrame(x_train_trans.copy())
df_aug["isFraud"] = y_train.values

mh = MHSysGen(method="parallel", ratio=15, minority_class=1)

print("\nStarting MH-SysGen augmentation...")
X_aug, y_aug = mh.fit_resample(df_aug, target="isFraud")
print("Augmentation complete. New shape:", X_aug.shape)

# PCA transform on augmented data (NO FIT AGAIN!)
X_aug_pca = pca.transform(X_aug)


# =========================================================
# 9. XGBOOST MODEL (Your hyperparameters)
# =========================================================
model = XGBClassifier(
    n_estimators=600,
    max_depth=18,
    min_child_weight=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    random_state=42,
    use_label_encoder=False,
    eval_metric="logloss"
)

print("\nTraining XGBoost...")
model.fit(X_aug_pca, y_aug)

y_pred = model.predict(x_test_pca)


# =========================================================
# 10. RESULTS
# =========================================================
print("\n===== FINAL RESULTS (MH-SysGen + XGBoost) =====")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test,y_pred))

Preprocessing completed. Shape: (472432, 140)
PCA completed. Train PCA shape: (472432, 58)

Starting MH-SysGen augmentation...
Augmentation complete. New shape: (970402, 140)

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



===== FINAL RESULTS (MH-SysGen + XGBoost) =====
              precision    recall  f1-score   support

           0       0.98      0.98      0.98    114044
           1       0.45      0.47      0.46      4064

    accuracy                           0.96    118108
   macro avg       0.72      0.72      0.72    118108
weighted avg       0.96      0.96      0.96    118108

[[111723   2321]
 [  2166   1898]]


In [11]:
import pandas as pd
import numpy as np
# Assuming mh_sys_gen is available in your environment
from mh_sys_gen import MHSysGen 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

# =========================================================
# 0. CUSTOM FEATURE ENGINEER CLASS (NEW)
# =========================================================
class CustomFeatureEngineer(BaseEstimator, TransformerMixin):
    """
    A transformer to create time-based and aggregation features 
    specific to the IEEE Fraud Detection dataset.
    """
    def __init__(self, time_col='TransactionDT', amt_col='TransactionAmt'):
        self.time_col = time_col
        self.amt_col = amt_col
        self.agg_features = {} # Stores aggregation maps during fit

    def fit(self, X, y=None):
        X_copy = X.copy()
        X_copy['isFraud'] = y # Temporarily add target for aggregation/target encoding
        
        # 1. Base Aggregation Keys
        self.base_agg_keys = ['card1', 'addr1']
        
        # 2. Prepare Aggregation Maps
        for col in self.base_agg_keys:
            # Frequency Map (for velocity/cardinality)
            self.agg_features[f'{col}_Count_Map'] = X_copy[col].value_counts()
            
            # Amount Mean Map
            self.agg_features[f'{col}_Amt_Mean_Map'] = X_copy.groupby(col)[self.amt_col].mean()
            
            # Amount Std Map
            self.agg_features[f'{col}_Amt_Std_Map'] = X_copy.groupby(col)[self.amt_col].std().fillna(1.0)
            
        return self

    def transform(self, X):
        X_copy = X.copy()
        
        # --- Time-Based Features (Requires TransactionDT) ---
        if self.time_col in X_copy.columns:
            # Hour of Day (0-23)
            X_copy['Transaction_Hour'] = (X_copy[self.time_col] // 3600) % 24
            # Day of Week (0-6)
            X_copy['Transaction_DayOfWeek'] = (X_copy[self.time_col] // (3600 * 24)) % 7
            X_copy['Transaction_Day'] = X_copy[self.time_col] // (3600 * 24)

        # --- Aggregation and Ratio Features ---
        for col in self.base_agg_keys:
            # Frequency Feature
            count_map = self.agg_features[f'{col}_Count_Map']
            X_copy[f'{col}_Count'] = X_copy[col].map(count_map).fillna(0)
            
            # Amount Mean Feature
            mean_map = self.agg_features[f'{col}_Amt_Mean_Map']
            X_copy[f'{col}_Amt_Mean'] = X_copy[col].map(mean_map).fillna(X_copy[self.amt_col].mean())
            
            # Amount Std Feature
            std_map = self.agg_features[f'{col}_Amt_Std_Map']
            X_copy[f'{col}_Amt_Std'] = X_copy[col].map(std_map).fillna(1.0)
        
        # Amount-to-Mean Ratio (highly predictive)
        X_copy['Amt_Div_Mean_card1'] = X_copy[self.amt_col] / X_copy['card1_Amt_Mean']
        
        # Log Transform
        X_copy['TransactionAmt_Log'] = np.log1p(X_copy[self.amt_col])
        
        return X_copy


# =========================================================
# 1. LOAD DATA
# =========================================================
df = pd.read_csv("../data/train_cleaned.csv")
df.drop(columns=["TransactionID"], inplace=True)

# Sort by time — IMPORTANT
df = df.sort_values(by="TransactionDT").reset_index(drop=True)


# =========================================================
# 2. FEATURE GROUPS
# =========================================================
# NOTE: The new engineered features will automatically be picked up 
# by the numeric_cols list after the CustomFeatureEngineer step.

nominal_cols = ['ProductCD', 'card4', 'card6']
nominal_high = ['P_emaildomain', 'R_emaildomain', "id_30", "id_31", "DeviceInfo", "id_33"]
ordinal_cols = ['M4', 'M2', 'M3', 'M5', 'M6']

# Create a temporary list of numeric columns based on original data
numeric_cols_original = []
for col in df.columns:
    if col not in nominal_cols and \
        col not in nominal_high and \
        col not in ordinal_cols and \
        col != "isFraud" and \
        df[col].dtype != "object":
        numeric_cols_original.append(col)
# We will use this list to initialize the `numeric_cols` after FE

# =========================================================
# 3. TIME-BASED SPLIT
# =========================================================
split_index = int(0.8 * len(df))
train_df = df.iloc[:split_index].copy()
test_df = df.iloc[split_index:].copy()

X_train = train_df.drop(columns=["isFraud"])
y_train = train_df["isFraud"]
X_test = test_df.drop(columns=["isFraud"])
y_test = test_df["isFraud"]

# =========================================================
# 3a. APPLY CUSTOM FEATURE ENGINEER (NEW STEP)
# =========================================================
fe = CustomFeatureEngineer()
X_train = fe.fit_transform(X_train, y_train)
X_test = fe.transform(X_test)

# Update numeric_cols to include the new engineered features
engineered_features = [
    'Transaction_Hour', 'Transaction_DayOfWeek', 'Transaction_Day',
    'card1_Count', 'card1_Amt_Mean', 'card1_Amt_Std',
    'addr1_Count', 'addr1_Amt_Mean', 'addr1_Amt_Std',
    'Amt_Div_Mean_card1', 'TransactionAmt_Log'
]
numeric_cols = numeric_cols_original + engineered_features

# Drop the original TransactionDT after extracting time features
if 'TransactionDT' in X_train.columns:
    X_train.drop(columns=['TransactionDT'], inplace=True)
    X_test.drop(columns=['TransactionDT'], inplace=True)
    numeric_cols.remove('TransactionDT')

# =========================================================
# 4. Frequency Encoder (FIXED VERSION)
# =========================================================
class FrequencyEncoder(BaseEstimator, TransformerMixin):
    def __init__(self): # Fix: use double underscore
        self.freq_maps = {}

    def fit(self, X, y=None):
        X = pd.DataFrame(X)
        self.freq_maps = {}
        for col in X.columns:
            # Handle string conversion for mapping consistency
            self.freq_maps[col] = X[col].astype(str).value_counts(normalize=True)
        return self

    def transform(self, X):
        X = pd.DataFrame(X)
        X2 = X.copy()
        for col in X.columns:
            # Map based on string values and fill NaNs (new categories in test) with 0
            X2[col] = X2[col].astype(str).map(self.freq_maps[col]).fillna(0)
        return X2


# =========================================================
# 5. COLUMN TRANSFORMER
# =========================================================
ohe = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
freq = FrequencyEncoder()
scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("nominal", ohe, nominal_cols),
        ("ordinal", oe, ordinal_cols),
        # NOTE: nominal_high columns (emails, device info) are now Frequency Encoded
        ("freq", freq, nominal_high), 
        # NOTE: numeric_cols now includes the engineered features
        ("num", scaler, [col for col in numeric_cols if col in X_train.columns]), 
    ],
    remainder='drop'
)


# =========================================================
# 6. APPLY PREPROCESSOR
# =========================================================
X_train_pre = preprocessor.fit_transform(X_train)
X_test_pre = preprocessor.transform(X_test)

# Get output feature names
def get_feature_names(ct):
    names = []
    for name, trans, cols in ct.transformers_:
        if name != 'remainder':
            # Check if the transformer is a OneHotEncoder to use get_feature_names_out
            if hasattr(trans, "get_feature_names_out"):
                names.extend(trans.get_feature_names_out(cols))
            else:
                names.extend(cols)
    return names

train_cols = get_feature_names(preprocessor)

x_train_trans = pd.DataFrame(X_train_pre, columns=train_cols)
x_test_trans = pd.DataFrame(X_test_pre, columns=train_cols)

print("Preprocessing completed. Shape:", x_train_trans.shape)


# =========================================================
# 7. PCA (FIT ONLY ON TRAIN — NO LEAKAGE)
# =========================================================
pca = PCA(n_components=0.95, random_state=42)
x_train_pca = pca.fit_transform(x_train_trans)
x_test_pca = pca.transform(x_test_trans)

print("PCA completed. Train PCA shape:", x_train_pca.shape)


# =========================================================
# 8. MHSysGen AUGMENTATION
# =========================================================
df_aug = pd.DataFrame(x_train_trans.copy())
df_aug["isFraud"] = y_train.values

mh = MHSysGen(method="parallel", ratio=15, minority_class=1)

print("\nStarting MH-SysGen augmentation...")
X_aug, y_aug = mh.fit_resample(df_aug, target="isFraud")
print("Augmentation complete. New shape:", X_aug.shape)

# PCA transform on augmented data (NO FIT AGAIN!)
X_aug_pca = pca.transform(X_aug)


# =========================================================
# 9. XGBOOST MODEL (Your hyperparameters)
# =========================================================
model = XGBClassifier(
    n_estimators=600,
    max_depth=18,
    min_child_weight=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    random_state=42,
    # use_label_encoder=False is deprecated and removed in recent XGBoost versions.
    eval_metric="logloss" 
)

print("\nTraining XGBoost...")
model.fit(X_aug_pca, y_aug)

y_pred = model.predict(x_test_pca)


# =========================================================
# 10. RESULTS
# =========================================================
print("\n===== FINAL RESULTS (FE + MH-SysGen + XGBoost) =====")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test,y_pred))

Preprocessing completed. Shape: (472432, 150)
PCA completed. Train PCA shape: (472432, 79)

Starting MH-SysGen augmentation...


KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
# Assuming mh_sys_gen is available in your environment
from mh_sys_gen import MHSysGen 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

# =========================================================
# 0. CUSTOM FEATURE ENGINEER CLASS (NEW)
# =========================================================

class FullFeatureEngineer(BaseEstimator, TransformerMixin):
    """
    FULL FEATURE ENGINEERING for IEEE-CIS Fraud Dataset.
    Includes:
    - Time features
    - Email grouping
    - Device parsing
    - id_33 resolution split
    - Amount log
    - Card + Addr aggregations
    - Ratio features
    - Cyclic features
    """

    def _init_(self, 
                 time_col='TransactionDT', 
                 amt_col='TransactionAmt'):
        
        self.time_col = time_col
        self.amt_col = amt_col
        
        # saved mappings
        self.email_map = {}
        self.device_brand_map = {}
        self.agg_features = {}
        self.base_agg_keys = ['card1', 'addr1']

    def fit(self, X, y=None):
        X = X.copy()
        X['isFraud'] = y

        # -----------------------------
        # 1. EMAIL GROUP MAPPING
        # -----------------------------
        def map_email(x):
            x = str(x).lower()
            if "gmail" in x: return "gmail"
            if "outlook" in x or "hotmail" in x or "live" in x: return "microsoft"
            if "yahoo" in x: return "yahoo"
            if "icloud" in x or "mac" in x or "apple" in x: return "apple"
            if x == "nan": return "unknown"
            return "other"

        email_cols = ["P_emaildomain", "R_emaildomain"]
        for col in email_cols:
            self.email_map[col] = X[col].astype(str).apply(map_email)

        # -----------------------------
        # 2. DEVICE BRAND MAPPING
        # -----------------------------
        def parse_device(x):
            x = str(x).lower()
            if "sm" in x: return "samsung"
            if "moto" in x: return "motorola"
            if "iphone" in x: return "iphone"
            if "mac" in x: return "apple"
            if "huawei" in x: return "huawei"
            if "lg" in x: return "lg"
            if x == "nan": return "unknown"
            return "other"

        self.device_brand_map = X["DeviceInfo"].astype(str).apply(parse_device)

        # -----------------------------
        # 3. AGGREGATION FEATURES
        # -----------------------------
        for col in self.base_agg_keys:
            # count / frequency
            self.agg_features[f"{col}_Count_Map"] = X[col].value_counts()

            # amount mean
            self.agg_features[f"{col}_Amt_Mean_Map"] = X.groupby(col)[self.amt_col].mean()

            # amount std
            self.agg_features[f"{col}_Amt_Std_Map"] = (
                X.groupby(col)[self.amt_col].std().fillna(1.0)
            )

        return self


    def transform(self, X):
        X = X.copy()

        # -----------------------------
        # EMAIL GROUPING
        # -----------------------------
        def map_email(x):
            x = str(x).lower()
            if "gmail" in x: return "gmail"
            if "outlook" in x or "hotmail" in x or "live" in x: return "microsoft"
            if "yahoo" in x: return "yahoo"
            if "icloud" in x or "mac" in x or "apple" in x: return "apple"
            if x == "nan": return "unknown"
            return "other"

        X["P_email_group"] = X["P_emaildomain"].astype(str).apply(map_email)
        X["R_email_group"] = X["R_emaildomain"].astype(str).apply(map_email)

        # -----------------------------
        # DEVICE BRAND
        # -----------------------------
        def parse_device(x):
            x = str(x).lower()
            if "sm" in x: return "samsung"
            if "moto" in x: return "motorola"
            if "iphone" in x: return "iphone"
            if "mac" in x: return "apple"
            if "huawei" in x: return "huawei"
            if "lg" in x: return "lg"
            return "other"

        X["Device_brand"] = X["DeviceInfo"].astype(str).apply(parse_device)

        # -----------------------------
        # ID_33 RESOLUTION SPLIT
        # -----------------------------
        X["id_33_h"] = X["id_33"].astype(str).str.split("x").str[0].astype(float)
        X["id_33_w"] = X["id_33"].astype(str).str.split("x").str[1].astype(float)

        # -----------------------------
        # TIME FEATURES
        # -----------------------------
        if self.time_col in X.columns:
            X["hour"] = (X[self.time_col] // 3600) % 24
            X["day"] = (X[self.time_col] // (3600 * 24))
            X["weekday"] = X["day"] % 7
            X["is_weekend"] = (X["weekday"] >= 5).astype(int)

            # cyclic features
            X["hour_sin"] = np.sin(2 * np.pi * X["hour"] / 24)
            X["hour_cos"] = np.cos(2 * np.pi * X["hour"] / 24)

        # -----------------------------
        # AMOUNT FEATURES
        # -----------------------------
        X["amt_log"] = np.log1p(X[self.amt_col])

        # -----------------------------
        # AGGREGATE FEATURES
        # -----------------------------
        for col in self.base_agg_keys:
            X[f"{col}_Count"] = X[col].map(self.agg_features[f"{col}_Count_Map"]).fillna(0)
            X[f"{col}_Amt_Mean"] = X[col].map(self.agg_features[f"{col}_Amt_Mean_Map"]).fillna(X[self.amt_col].mean())
            X[f"{col}_Amt_Std"] = X[col].map(self.agg_features[f"{col}_Amt_Std_Map"]).fillna(1.0)

        # ratio feature
        X["Amt_over_card1_mean"] = X[self.amt_col] / (X["card1_Amt_Mean"] + 1)

        return X

# =========================================================
# 1. LOAD DATA
# =========================================================
df = pd.read_csv("../data/train_cleaned.csv")
df.drop(columns=["TransactionID"], inplace=True)

# Sort by time — IMPORTANT
df = df.sort_values(by="TransactionDT").reset_index(drop=True)


# =========================================================
# 2. FEATURE GROUPS
# =========================================================
# NOTE: The new engineered features will automatically be picked up 
# by the numeric_cols list after the CustomFeatureEngineer step.

nominal_cols = ['ProductCD', 'card4', 'card6']
nominal_high = ['P_emaildomain', 'R_emaildomain', "id_30", "id_31", "DeviceInfo", "id_33"]
ordinal_cols = ['M4', 'M2', 'M3', 'M5', 'M6']

# Create a temporary list of numeric columns based on original data
numeric_cols_original = []
for col in df.columns:
    if col not in nominal_cols and \
        col not in nominal_high and \
        col not in ordinal_cols and \
        col != "isFraud" and \
        df[col].dtype != "object":
        numeric_cols_original.append(col)
# We will use this list to initialize the `numeric_cols` after FE

# =========================================================
# 3. TIME-BASED SPLIT
# =========================================================
split_index = int(0.8 * len(df))
train_df = df.iloc[:split_index].copy()
test_df = df.iloc[split_index:].copy()

X_train = train_df.drop(columns=["isFraud"])
y_train = train_df["isFraud"]
X_test = test_df.drop(columns=["isFraud"])
y_test = test_df["isFraud"]

# =========================================================
# 3a. APPLY CUSTOM FEATURE ENGINEER (NEW STEP)
# =========================================================
fe = CustomFeatureEngineer()
X_train = fe.fit_transform(X_train, y_train)
X_test = fe.transform(X_test)

# Update numeric_cols to include the new engineered features
engineered_features = [
    'Transaction_Hour', 'Transaction_DayOfWeek', 'Transaction_Day',
    'card1_Count', 'card1_Amt_Mean', 'card1_Amt_Std',
    'addr1_Count', 'addr1_Amt_Mean', 'addr1_Amt_Std',
    'Amt_Div_Mean_card1', 'TransactionAmt_Log'
]
numeric_cols = numeric_cols_original + engineered_features

# Drop the original TransactionDT after extracting time features
if 'TransactionDT' in X_train.columns:
    X_train.drop(columns=['TransactionDT'], inplace=True)
    X_test.drop(columns=['TransactionDT'], inplace=True)
    numeric_cols.remove('TransactionDT')

# =========================================================
# 4. Frequency Encoder (FIXED VERSION)
# =========================================================
class FrequencyEncoder(BaseEstimator, TransformerMixin):
    def __init__(self): # Fix: use double underscore
        self.freq_maps = {}

    def fit(self, X, y=None):
        X = pd.DataFrame(X)
        self.freq_maps = {}
        for col in X.columns:
            # Handle string conversion for mapping consistency
            self.freq_maps[col] = X[col].astype(str).value_counts(normalize=True)
        return self

    def transform(self, X):
        X = pd.DataFrame(X)
        X2 = X.copy()
        for col in X.columns:
            # Map based on string values and fill NaNs (new categories in test) with 0
            X2[col] = X2[col].astype(str).map(self.freq_maps[col]).fillna(0)
        return X2


# =========================================================
# 5. COLUMN TRANSFORMER
# =========================================================
ohe = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
freq = FrequencyEncoder()
scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("nominal", ohe, nominal_cols),
        ("ordinal", oe, ordinal_cols),
        # NOTE: nominal_high columns (emails, device info) are now Frequency Encoded
        ("freq", freq, nominal_high), 
        # NOTE: numeric_cols now includes the engineered features
        ("num", scaler, [col for col in numeric_cols if col in X_train.columns]), 
    ],
    remainder='drop'
)


# =========================================================
# 6. APPLY PREPROCESSOR
# =========================================================
X_train_pre = preprocessor.fit_transform(X_train)
X_test_pre = preprocessor.transform(X_test)

# Get output feature names
def get_feature_names(ct):
    names = []
    for name, trans, cols in ct.transformers_:
        if name != 'remainder':
            # Check if the transformer is a OneHotEncoder to use get_feature_names_out
            if hasattr(trans, "get_feature_names_out"):
                names.extend(trans.get_feature_names_out(cols))
            else:
                names.extend(cols)
    return names

train_cols = get_feature_names(preprocessor)

x_train_trans = pd.DataFrame(X_train_pre, columns=train_cols)
x_test_trans = pd.DataFrame(X_test_pre, columns=train_cols)

print("Preprocessing completed. Shape:", x_train_trans.shape)


# =========================================================
# 7. PCA (FIT ONLY ON TRAIN — NO LEAKAGE)
# =========================================================
pca = PCA(n_components=0.95, random_state=42)
x_train_pca = pca.fit_transform(x_train_trans)
x_test_pca = pca.transform(x_test_trans)

print("PCA completed. Train PCA shape:", x_train_pca.shape)


# =========================================================
# 8. MHSysGen AUGMENTATION
# =========================================================
df_aug = pd.DataFrame(x_train_trans.copy())
df_aug["isFraud"] = y_train.values

mh = MHSysGen(method="parallel", ratio=15, minority_class=1)

print("\nStarting MH-SysGen augmentation...")
X_aug, y_aug = mh.fit_resample(df_aug, target="isFraud")
print("Augmentation complete. New shape:", X_aug.shape)

# PCA transform on augmented data (NO FIT AGAIN!)
X_aug_pca = pca.transform(X_aug)


# =========================================================
# 9. XGBOOST MODEL (Your hyperparameters)
# =========================================================
model = XGBClassifier(
    n_estimators=600,
    max_depth=18,
    min_child_weight=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    random_state=42,
    # use_label_encoder=False is deprecated and removed in recent XGBoost versions.
    eval_metric="logloss" 
)

print("\nTraining XGBoost...")
model.fit(X_aug_pca, y_aug)

y_pred = model.predict(x_test_pca)


# =========================================================
# 10. RESULTS
# =========================================================
print("\n===== FINAL RESULTS (FE + MH-SysGen + XGBoost) =====")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test,y_pred))

Preprocessing completed. Shape: (472432, 150)
PCA completed. Train PCA shape: (472432, 79)

Starting MH-SysGen augmentation...
Augmentation complete. New shape: (970402, 150)

Training XGBoost...

===== FINAL RESULTS (FE + MH-SysGen + XGBoost) =====
              precision    recall  f1-score   support

           0       0.98      0.98      0.98    114044
           1       0.42      0.50      0.46      4064

    accuracy                           0.96    118108
   macro avg       0.70      0.74      0.72    118108
weighted avg       0.96      0.96      0.96    118108

[[111258   2786]
 [  2043   2021]]


In [12]:
import pandas as pd
import numpy as np
# Assuming mh_sys_gen is available in your environment
from mh_sys_gen import MHSysGen 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

# =========================================================
# 0. CUSTOM FEATURE ENGINEER CLASS (NEW)
# =========================================================
class CustomFeatureEngineer(BaseEstimator, TransformerMixin):
    """
    A transformer to create time-based and aggregation features 
    specific to the IEEE Fraud Detection dataset.
    """
    def __init__(self, time_col='TransactionDT', amt_col='TransactionAmt'):
        self.time_col = time_col
        self.amt_col = amt_col
        self.agg_features = {} # Stores aggregation maps during fit

    def fit(self, X, y=None):
        X_copy = X.copy()
        X_copy['isFraud'] = y # Temporarily add target for aggregation/target encoding
        
        # 1. Base Aggregation Keys
        self.base_agg_keys = ['card1', 'addr1']
        
        # 2. Prepare Aggregation Maps
        for col in self.base_agg_keys:
            # Frequency Map (for velocity/cardinality)
            self.agg_features[f'{col}_Count_Map'] = X_copy[col].value_counts()
            
            # Amount Mean Map
            self.agg_features[f'{col}_Amt_Mean_Map'] = X_copy.groupby(col)[self.amt_col].mean()
            
            # Amount Std Map
            self.agg_features[f'{col}_Amt_Std_Map'] = X_copy.groupby(col)[self.amt_col].std().fillna(1.0)
            
        return self

    def transform(self, X):
        X_copy = X.copy()
        
        # --- Time-Based Features (Requires TransactionDT) ---
        if self.time_col in X_copy.columns:
            # Hour of Day (0-23)
            X_copy['Transaction_Hour'] = (X_copy[self.time_col] // 3600) % 24
            # Day of Week (0-6)
            X_copy['Transaction_DayOfWeek'] = (X_copy[self.time_col] // (3600 * 24)) % 7
            X_copy['Transaction_Day'] = X_copy[self.time_col] // (3600 * 24)

        # --- Aggregation and Ratio Features ---
        for col in self.base_agg_keys:
            # Frequency Feature
            count_map = self.agg_features[f'{col}_Count_Map']
            X_copy[f'{col}_Count'] = X_copy[col].map(count_map).fillna(0)
            
            # Amount Mean Feature
            mean_map = self.agg_features[f'{col}_Amt_Mean_Map']
            X_copy[f'{col}_Amt_Mean'] = X_copy[col].map(mean_map).fillna(X_copy[self.amt_col].mean())
            
            # Amount Std Feature
            std_map = self.agg_features[f'{col}_Amt_Std_Map']
            X_copy[f'{col}_Amt_Std'] = X_copy[col].map(std_map).fillna(1.0)
        
        # Amount-to-Mean Ratio (highly predictive)
        X_copy['Amt_Div_Mean_card1'] = X_copy[self.amt_col] / X_copy['card1_Amt_Mean']
        
        # Log Transform
        X_copy['TransactionAmt_Log'] = np.log1p(X_copy[self.amt_col])
        
        return X_copy


# =========================================================
# 1. LOAD DATA
# =========================================================
df = pd.read_csv("../data/train_cleaned.csv")
df.drop(columns=["TransactionID"], inplace=True)

# Sort by time — IMPORTANT
df = df.sort_values(by="TransactionDT").reset_index(drop=True)


# =========================================================
# 2. FEATURE GROUPS
# =========================================================
# NOTE: The new engineered features will automatically be picked up 
# by the numeric_cols list after the CustomFeatureEngineer step.

nominal_cols = ['ProductCD', 'card4', 'card6']
nominal_high = ['P_emaildomain', 'R_emaildomain', "id_30", "id_31", "DeviceInfo", "id_33"]
ordinal_cols = ['M4', 'M2', 'M3', 'M5', 'M6']

# Create a temporary list of numeric columns based on original data
numeric_cols_original = []
for col in df.columns:
    if col not in nominal_cols and \
        col not in nominal_high and \
        col not in ordinal_cols and \
        col != "isFraud" and \
        df[col].dtype != "object":
        numeric_cols_original.append(col)
# We will use this list to initialize the `numeric_cols` after FE

# =========================================================
# 3. TIME-BASED SPLIT
# =========================================================
split_index = int(0.8 * len(df))
train_df = df.iloc[:split_index].copy()
test_df = df.iloc[split_index:].copy()

X_train = train_df.drop(columns=["isFraud"])
y_train = train_df["isFraud"]
X_test = test_df.drop(columns=["isFraud"])
y_test = test_df["isFraud"]

# =========================================================
# 3a. APPLY CUSTOM FEATURE ENGINEER (NEW STEP)
# =========================================================
fe = CustomFeatureEngineer()
X_train = fe.fit_transform(X_train, y_train)
X_test = fe.transform(X_test)

# Update numeric_cols to include the new engineered features
engineered_features = [
    'Transaction_Hour', 'Transaction_DayOfWeek', 'Transaction_Day',
    'card1_Count', 'card1_Amt_Mean', 'card1_Amt_Std',
    'addr1_Count', 'addr1_Amt_Mean', 'addr1_Amt_Std',
    'Amt_Div_Mean_card1', 'TransactionAmt_Log'
]
numeric_cols = numeric_cols_original + engineered_features

# Drop the original TransactionDT after extracting time features
if 'TransactionDT' in X_train.columns:
    X_train.drop(columns=['TransactionDT'], inplace=True)
    X_test.drop(columns=['TransactionDT'], inplace=True)
    numeric_cols.remove('TransactionDT')

# =========================================================
# 4. Frequency Encoder (FIXED VERSION)
# =========================================================
class FrequencyEncoder(BaseEstimator, TransformerMixin):
    def __init__(self): # Fix: use double underscore
        self.freq_maps = {}

    def fit(self, X, y=None):
        X = pd.DataFrame(X)
        self.freq_maps = {}
        for col in X.columns:
            # Handle string conversion for mapping consistency
            self.freq_maps[col] = X[col].astype(str).value_counts(normalize=True)
        return self

    def transform(self, X):
        X = pd.DataFrame(X)
        X2 = X.copy()
        for col in X.columns:
            # Map based on string values and fill NaNs (new categories in test) with 0
            X2[col] = X2[col].astype(str).map(self.freq_maps[col]).fillna(0)
        return X2


# =========================================================
# 5. COLUMN TRANSFORMER
# =========================================================
ohe = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
freq = FrequencyEncoder()
scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("nominal", ohe, nominal_cols),
        ("ordinal", oe, ordinal_cols),
        # NOTE: nominal_high columns (emails, device info) are now Frequency Encoded
        ("freq", freq, nominal_high), 
        # NOTE: numeric_cols now includes the engineered features
        ("num", scaler, [col for col in numeric_cols if col in X_train.columns]), 
    ],
    remainder='drop'
)


# =========================================================
# 6. APPLY PREPROCESSOR
# =========================================================
X_train_pre = preprocessor.fit_transform(X_train)
X_test_pre = preprocessor.transform(X_test)

# Get output feature names
def get_feature_names(ct):
    names = []
    for name, trans, cols in ct.transformers_:
        if name != 'remainder':
            # Check if the transformer is a OneHotEncoder to use get_feature_names_out
            if hasattr(trans, "get_feature_names_out"):
                names.extend(trans.get_feature_names_out(cols))
            else:
                names.extend(cols)
    return names

train_cols = get_feature_names(preprocessor)

x_train_trans = pd.DataFrame(X_train_pre, columns=train_cols)
x_test_trans = pd.DataFrame(X_test_pre, columns=train_cols)

print("Preprocessing completed. Shape:", x_train_trans.shape)


# =========================================================
# 7. PCA (FIT ONLY ON TRAIN — NO LEAKAGE)
# =========================================================
pca = PCA(n_components=0.95, random_state=42)
x_train_pca = pca.fit_transform(x_train_trans)
x_test_pca = pca.transform(x_test_trans)

print("PCA completed. Train PCA shape:", x_train_pca.shape)


# =========================================================
# 8. MHSysGen AUGMENTATION
# =========================================================
df_aug = pd.DataFrame(x_train_trans.copy())
df_aug["isFraud"] = y_train.values

mh = MHSysGen(method="parallel", ratio=15, minority_class=1)

print("\nStarting MH-SysGen augmentation...")
X_aug, y_aug = mh.fit_resample(df_aug, target="isFraud")
print("Augmentation complete. New shape:", X_aug.shape)

# PCA transform on augmented data (NO FIT AGAIN!)
X_aug_pca = pca.transform(X_aug)


# =========================================================
# 9. XGBOOST MODEL (Your hyperparameters)
# =========================================================
from catboost import CatBoostClassifier
model = CatBoostClassifier(iterations=1500,
                           depth=10,
                           learning_rate=0.05,
                           loss_function='Logloss',
                           eval_metric='F1',
                           scale_pos_weight=15,
                           random_seed=42,
                           verbose=200
    
)

print("\nTraining XGBoost...")
model.fit(X_aug_pca, y_aug)

y_pred = model.predict(x_test_pca)


# =========================================================
# 10. RESULTS
# =========================================================
print("\n===== FINAL RESULTS (FE + MH-SysGen + XGBoost) =====")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test,y_pred))

Preprocessing completed. Shape: (472432, 150)
PCA completed. Train PCA shape: (472432, 79)

Starting MH-SysGen augmentation...
Augmentation complete. New shape: (970402, 150)

Training XGBoost...
0:	learn: 0.9713755	total: 818ms	remaining: 20m 25s
200:	learn: 0.9795176	total: 2m 21s	remaining: 15m 11s
400:	learn: 0.9842915	total: 4m 41s	remaining: 12m 51s
600:	learn: 0.9873482	total: 6m 49s	remaining: 10m 12s
800:	learn: 0.9895507	total: 8m 32s	remaining: 7m 27s
1000:	learn: 0.9913690	total: 10m 19s	remaining: 5m 8s
1200:	learn: 0.9928230	total: 12m 5s	remaining: 3m
1400:	learn: 0.9940061	total: 13m 49s	remaining: 58.6s
1499:	learn: 0.9944945	total: 14m 42s	remaining: 0us

===== FINAL RESULTS (FE + MH-SysGen + XGBoost) =====
              precision    recall  f1-score   support

           0       0.99      0.78      0.87    114044
           1       0.11      0.79      0.20      4064

    accuracy                           0.78    118108
   macro avg       0.55      0.79      0.54    

In [15]:
import pandas as pd
import numpy as np
# Assuming mh_sys_gen is available in your environment
from mh_sys_gen import MHSysGen 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

# =========================================================
# 0. CUSTOM FEATURE ENGINEER CLASS (NEW)
# =========================================================
class CustomFeatureEngineer(BaseEstimator, TransformerMixin):
    """
    A transformer to create time-based and aggregation features 
    specific to the IEEE Fraud Detection dataset.
    """
    def __init__(self, time_col='TransactionDT', amt_col='TransactionAmt'):
        self.time_col = time_col
        self.amt_col = amt_col
        self.agg_features = {} # Stores aggregation maps during fit

    def fit(self, X, y=None):
        X_copy = X.copy()
        X_copy['isFraud'] = y # Temporarily add target for aggregation/target encoding
        
        # 1. Base Aggregation Keys
        self.base_agg_keys = ['card1', 'addr1']
        
        # 2. Prepare Aggregation Maps
        for col in self.base_agg_keys:
            # Frequency Map (for velocity/cardinality)
            self.agg_features[f'{col}_Count_Map'] = X_copy[col].value_counts()
            
            # Amount Mean Map
            self.agg_features[f'{col}_Amt_Mean_Map'] = X_copy.groupby(col)[self.amt_col].mean()
            
            # Amount Std Map
            self.agg_features[f'{col}_Amt_Std_Map'] = X_copy.groupby(col)[self.amt_col].std().fillna(1.0)
            
        return self

    def transform(self, X):
        X_copy = X.copy()
        
        # --- Time-Based Features (Requires TransactionDT) ---
        if self.time_col in X_copy.columns:
            # Hour of Day (0-23)
            X_copy['Transaction_Hour'] = (X_copy[self.time_col] // 3600) % 24
            # Day of Week (0-6)
            X_copy['Transaction_DayOfWeek'] = (X_copy[self.time_col] // (3600 * 24)) % 7
            X_copy['Transaction_Day'] = X_copy[self.time_col] // (3600 * 24)

        # --- Aggregation and Ratio Features ---
        for col in self.base_agg_keys:
            # Frequency Feature
            count_map = self.agg_features[f'{col}_Count_Map']
            X_copy[f'{col}_Count'] = X_copy[col].map(count_map).fillna(0)
            
            # Amount Mean Feature
            mean_map = self.agg_features[f'{col}_Amt_Mean_Map']
            X_copy[f'{col}_Amt_Mean'] = X_copy[col].map(mean_map).fillna(X_copy[self.amt_col].mean())
            
            # Amount Std Feature
            std_map = self.agg_features[f'{col}_Amt_Std_Map']
            X_copy[f'{col}_Amt_Std'] = X_copy[col].map(std_map).fillna(1.0)
        
        # Amount-to-Mean Ratio (highly predictive)
        X_copy['Amt_Div_Mean_card1'] = X_copy[self.amt_col] / X_copy['card1_Amt_Mean']
        
        # Log Transform
        X_copy['TransactionAmt_Log'] = np.log1p(X_copy[self.amt_col])
        
        return X_copy


# =========================================================
# 1. LOAD DATA
# =========================================================
df = pd.read_csv("../data/train_cleaned.csv")
df.drop(columns=["TransactionID"], inplace=True)

# Sort by time — IMPORTANT
df = df.sort_values(by="TransactionDT").reset_index(drop=True)


# =========================================================
# 2. FEATURE GROUPS
# =========================================================
# NOTE: The new engineered features will automatically be picked up 
# by the numeric_cols list after the CustomFeatureEngineer step.

nominal_cols = ['ProductCD', 'card4', 'card6']
nominal_high = ['P_emaildomain', 'R_emaildomain', "id_30", "id_31", "DeviceInfo", "id_33"]
ordinal_cols = ['M4', 'M2', 'M3', 'M5', 'M6']

# Create a temporary list of numeric columns based on original data
numeric_cols_original = []
for col in df.columns:
    if col not in nominal_cols and \
        col not in nominal_high and \
        col not in ordinal_cols and \
        col != "isFraud" and \
        df[col].dtype != "object":
        numeric_cols_original.append(col)
# We will use this list to initialize the `numeric_cols` after FE

# =========================================================
# 3. TIME-BASED SPLIT
# =========================================================
split_index = int(0.8 * len(df))
train_df = df.iloc[:split_index].copy()
test_df = df.iloc[split_index:].copy()

X_train = train_df.drop(columns=["isFraud"])
y_train = train_df["isFraud"]
X_test = test_df.drop(columns=["isFraud"])
y_test = test_df["isFraud"]

# =========================================================
# 3a. APPLY CUSTOM FEATURE ENGINEER (NEW STEP)
# =========================================================
fe = CustomFeatureEngineer()
X_train = fe.fit_transform(X_train, y_train)
X_test = fe.transform(X_test)

# Update numeric_cols to include the new engineered features
engineered_features = [
    'Transaction_Hour', 'Transaction_DayOfWeek', 'Transaction_Day',
    'card1_Count', 'card1_Amt_Mean', 'card1_Amt_Std',
    'addr1_Count', 'addr1_Amt_Mean', 'addr1_Amt_Std',
    'Amt_Div_Mean_card1', 'TransactionAmt_Log'
]
numeric_cols = numeric_cols_original + engineered_features

# Drop the original TransactionDT after extracting time features
if 'TransactionDT' in X_train.columns:
    X_train.drop(columns=['TransactionDT'], inplace=True)
    X_test.drop(columns=['TransactionDT'], inplace=True)
    numeric_cols.remove('TransactionDT')

# =========================================================
# 4. Frequency Encoder (FIXED VERSION)
# =========================================================
class FrequencyEncoder(BaseEstimator, TransformerMixin):
    def __init__(self): # Fix: use double underscore
        self.freq_maps = {}

    def fit(self, X, y=None):
        X = pd.DataFrame(X)
        self.freq_maps = {}
        for col in X.columns:
            # Handle string conversion for mapping consistency
            self.freq_maps[col] = X[col].astype(str).value_counts(normalize=True)
        return self

    def transform(self, X):
        X = pd.DataFrame(X)
        X2 = X.copy()
        for col in X.columns:
            # Map based on string values and fill NaNs (new categories in test) with 0
            X2[col] = X2[col].astype(str).map(self.freq_maps[col]).fillna(0)
        return X2


# =========================================================
# 5. COLUMN TRANSFORMER
# =========================================================
ohe = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
freq = FrequencyEncoder()
scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("nominal", ohe, nominal_cols),
        ("ordinal", oe, ordinal_cols),
        # NOTE: nominal_high columns (emails, device info) are now Frequency Encoded
        ("freq", freq, nominal_high), 
        # NOTE: numeric_cols now includes the engineered features
        ("num", scaler, [col for col in numeric_cols if col in X_train.columns]), 
    ],
    remainder='drop'
)


# =========================================================
# 6. APPLY PREPROCESSOR
# =========================================================
X_train_pre = preprocessor.fit_transform(X_train)
X_test_pre = preprocessor.transform(X_test)

# Get output feature names
def get_feature_names(ct):
    names = []
    for name, trans, cols in ct.transformers_:
        if name != 'remainder':
            # Check if the transformer is a OneHotEncoder to use get_feature_names_out
            if hasattr(trans, "get_feature_names_out"):
                names.extend(trans.get_feature_names_out(cols))
            else:
                names.extend(cols)
    return names

train_cols = get_feature_names(preprocessor)

x_train_trans = pd.DataFrame(X_train_pre, columns=train_cols)
x_test_trans = pd.DataFrame(X_test_pre, columns=train_cols)

print("Preprocessing completed. Shape:", x_train_trans.shape)


# =========================================================
# 7. PCA (FIT ONLY ON TRAIN — NO LEAKAGE)
# =========================================================
pca = PCA(n_components=0.95, random_state=42)
x_train_pca = pca.fit_transform(x_train_trans)
x_test_pca = pca.transform(x_test_trans)

print("PCA completed. Train PCA shape:", x_train_pca.shape)





import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

class FastSequentialShadowMirror:
    """
    FAST version of Sequential Shadow-Mirror oversampling.
    Same ideology:
      1. Shadow scaling
      2. Mirror using nearest neighbor reflection
      3. Vectorized & optimized (10-40x faster)
    """

    def __init__(self, target_col, minority_class=1, ratio=1.0):
        self.target_col = target_col
        self.minority_class = minority_class
        self.ratio = ratio

    def fit_resample(self, df):
        """
        df : DataFrame containing both features and target_col  
        returns: X_resampled, y_resampled
        """

        # ----------------------------
        # PREPARE DATA
        # ----------------------------
        features = [c for c in df.columns if c != self.target_col]
        minority_df = df[df[self.target_col] == self.minority_class]

        if minority_df.empty:
            raise ValueError(f"Minority class {self.minority_class} not found.")

        total_synth = int(len(minority_df) * self.ratio)
        if total_synth < 1:
            raise ValueError("Ratio too small; produced 0 samples.")

        X_min = minority_df[features].values
        n_features = X_min.shape[1]

        # ----------------------------
        # PHASE 1 — SHADOW (VECTORIZED)
        # ----------------------------
        idxs = np.random.randint(0, len(X_min), size=total_synth)
        originals = X_min[idxs]

        scales = np.random.uniform(0.6, 1.4, size=(total_synth, n_features))
        shadows = originals * scales

        # ----------------------------
        # PHASE 2 — MIRROR (VECTORIZED KNN)
        # ----------------------------
        nbrs = NearestNeighbors(n_neighbors=5).fit(shadows)
        _, idx_arrays = nbrs.kneighbors(shadows)

        neighbors = shadows[idx_arrays[:, 1]]

        reflect_scale = np.random.uniform(0.5, 1.3, size=(total_synth, 1))
        mirrored = shadows + (neighbors - shadows) * reflect_scale

        # ----------------------------
        # BUILD SYNTHETIC DF
        # ----------------------------
        synthetic_df = pd.DataFrame(mirrored, columns=features)
        synthetic_df[self.target_col] = self.minority_class

        combined = pd.concat([df, synthetic_df], ignore_index=True)

        X_out = combined.drop(columns=[self.target_col])
        y_out = combined[self.target_col]

        return X_out, y_out


# =========================================================
# 8. MHSysGen AUGMENTATION
# =========================================================
df_aug = pd.DataFrame(x_train_trans.copy())
df_aug["isFraud"] = y_train.values
sm=FastSequentialShadowMirror(target_col="isFraud",minority_class=1,ratio=20)
print("\nStarting MH-SysGen augmentation...")
X_aug, y_aug = sm.fit_resample(df_aug)
print("Augmentation complete. New shape:", X_aug.shape)

# PCA transform on augmented data (NO FIT AGAIN!)
X_aug_pca = pca.transform(X_aug)


# =========================================================
# 9. XGBOOST MODEL (Your hyperparameters)
# =========================================================
from catboost import CatBoostClassifier
model = XGBClassifier(
    n_estimators=600,
    max_depth=18,
    min_child_weight=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    random_state=42,
    # use_label_encoder=False is deprecated and removed in recent XGBoost versions.
    eval_metric="logloss" 
)

print("\nTraining XGBoost...")
model.fit(X_aug_pca, y_aug)

y_pred = model.predict(x_test_pca)


# =========================================================
# 10. RESULTS
# =========================================================
print("\n===== FINAL RESULTS (FE + MH-SysGen + XGBoost) =====")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test,y_pred))

Preprocessing completed. Shape: (472432, 150)
PCA completed. Train PCA shape: (472432, 79)

Starting MH-SysGen augmentation...
Augmentation complete. New shape: (804412, 150)

Training XGBoost...

===== FINAL RESULTS (FE + MH-SysGen + XGBoost) =====
              precision    recall  f1-score   support

           0       0.98      0.98      0.98    114044
           1       0.43      0.50      0.46      4064

    accuracy                           0.96    118108
   macro avg       0.71      0.74      0.72    118108
weighted avg       0.96      0.96      0.96    118108

[[111396   2648]
 [  2031   2033]]


In [25]:
import pandas as pd
import numpy as np
# Assuming mh_sys_gen is available in your environment
from mh_sys_gen import MHSysGen 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

# =========================================================
# 0. CUSTOM FEATURE ENGINEER CLASS (NEW)
# =========================================================
class CustomFeatureEngineer(BaseEstimator, TransformerMixin):
    """
    A transformer to create time-based and aggregation features 
    specific to the IEEE Fraud Detection dataset.
    """
    def __init__(self, time_col='TransactionDT', amt_col='TransactionAmt'):
        self.time_col = time_col
        self.amt_col = amt_col
        self.agg_features = {} # Stores aggregation maps during fit

    def fit(self, X, y=None):
        X_copy = X.copy()
        X_copy['isFraud'] = y # Temporarily add target for aggregation/target encoding
        
        # 1. Base Aggregation Keys
        self.base_agg_keys = ['card1', 'addr1']
        
        # 2. Prepare Aggregation Maps
        for col in self.base_agg_keys:
            # Frequency Map (for velocity/cardinality)
            self.agg_features[f'{col}_Count_Map'] = X_copy[col].value_counts()
            
            # Amount Mean Map
            self.agg_features[f'{col}_Amt_Mean_Map'] = X_copy.groupby(col)[self.amt_col].mean()
            
            # Amount Std Map
            self.agg_features[f'{col}_Amt_Std_Map'] = X_copy.groupby(col)[self.amt_col].std().fillna(1.0)
            
        return self

    def transform(self, X):
        X_copy = X.copy()
        
        # --- Time-Based Features (Requires TransactionDT) ---
        if self.time_col in X_copy.columns:
            # Hour of Day (0-23)
            X_copy['Transaction_Hour'] = (X_copy[self.time_col] // 3600) % 24
            # Day of Week (0-6)
            X_copy['Transaction_DayOfWeek'] = (X_copy[self.time_col] // (3600 * 24)) % 7
            X_copy['Transaction_Day'] = X_copy[self.time_col] // (3600 * 24)

        # --- Aggregation and Ratio Features ---
        for col in self.base_agg_keys:
            # Frequency Feature
            count_map = self.agg_features[f'{col}_Count_Map']
            X_copy[f'{col}_Count'] = X_copy[col].map(count_map).fillna(0)
            
            # Amount Mean Feature
            mean_map = self.agg_features[f'{col}_Amt_Mean_Map']
            X_copy[f'{col}_Amt_Mean'] = X_copy[col].map(mean_map).fillna(X_copy[self.amt_col].mean())
            
            # Amount Std Feature
            std_map = self.agg_features[f'{col}_Amt_Std_Map']
            X_copy[f'{col}_Amt_Std'] = X_copy[col].map(std_map).fillna(1.0)
        
        # Amount-to-Mean Ratio (highly predictive)
        X_copy['Amt_Div_Mean_card1'] = X_copy[self.amt_col] / X_copy['card1_Amt_Mean']
        
        # Log Transform
        X_copy['TransactionAmt_Log'] = np.log1p(X_copy[self.amt_col])
        
        return X_copy


# =========================================================
# 1. LOAD DATA
# =========================================================
df = pd.read_csv("../data/train_cleaned.csv")
df.drop(columns=["TransactionID"], inplace=True)

# Sort by time — IMPORTANT
df = df.sort_values(by="TransactionDT").reset_index(drop=True)


# =========================================================
# 2. FEATURE GROUPS
# =========================================================
# NOTE: The new engineered features will automatically be picked up 
# by the numeric_cols list after the CustomFeatureEngineer step.

nominal_cols = ['ProductCD', 'card4', 'card6']
nominal_high = ['P_emaildomain', 'R_emaildomain', "id_30", "id_31", "DeviceInfo", "id_33"]
ordinal_cols = ['M4', 'M2', 'M3', 'M5', 'M6']

# Create a temporary list of numeric columns based on original data
numeric_cols_original = []
for col in df.columns:
    if col not in nominal_cols and \
        col not in nominal_high and \
        col not in ordinal_cols and \
        col != "isFraud" and \
        df[col].dtype != "object":
        numeric_cols_original.append(col)
# We will use this list to initialize the `numeric_cols` after FE

# =========================================================
# 3. TIME-BASED SPLIT
# =========================================================
split_index = int(0.8 * len(df))
train_df = df.iloc[:split_index].copy()
test_df = df.iloc[split_index:].copy()

X_train = train_df.drop(columns=["isFraud"])
y_train = train_df["isFraud"]
X_test = test_df.drop(columns=["isFraud"])
y_test = test_df["isFraud"]

# =========================================================
# 3a. APPLY CUSTOM FEATURE ENGINEER (NEW STEP)
# =========================================================
fe = CustomFeatureEngineer()
X_train = fe.fit_transform(X_train, y_train)
X_test = fe.transform(X_test)

# Update numeric_cols to include the new engineered features
engineered_features = [
    'Transaction_Hour', 'Transaction_DayOfWeek', 'Transaction_Day',
    'card1_Count', 'card1_Amt_Mean', 'card1_Amt_Std',
    'addr1_Count', 'addr1_Amt_Mean', 'addr1_Amt_Std',
    'Amt_Div_Mean_card1', 'TransactionAmt_Log'
]
numeric_cols = numeric_cols_original + engineered_features

# Drop the original TransactionDT after extracting time features
if 'TransactionDT' in X_train.columns:
    X_train.drop(columns=['TransactionDT'], inplace=True)
    X_test.drop(columns=['TransactionDT'], inplace=True)
    numeric_cols.remove('TransactionDT')

# =========================================================
# 4. Frequency Encoder (FIXED VERSION)
# =========================================================
class FrequencyEncoder(BaseEstimator, TransformerMixin):
    def __init__(self): # Fix: use double underscore
        self.freq_maps = {}

    def fit(self, X, y=None):
        X = pd.DataFrame(X)
        self.freq_maps = {}
        for col in X.columns:
            # Handle string conversion for mapping consistency
            self.freq_maps[col] = X[col].astype(str).value_counts(normalize=True)
        return self

    def transform(self, X):
        X = pd.DataFrame(X)
        X2 = X.copy()
        for col in X.columns:
            # Map based on string values and fill NaNs (new categories in test) with 0
            X2[col] = X2[col].astype(str).map(self.freq_maps[col]).fillna(0)
        return X2


# =========================================================
# 5. COLUMN TRANSFORMER
# =========================================================
ohe = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
freq = FrequencyEncoder()
scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("nominal", ohe, nominal_cols),
        ("ordinal", oe, ordinal_cols),
        # NOTE: nominal_high columns (emails, device info) are now Frequency Encoded
        ("freq", freq, nominal_high), 
        # NOTE: numeric_cols now includes the engineered features
        ("num", scaler, [col for col in numeric_cols if col in X_train.columns]), 
    ],
    remainder='drop'
)


# =========================================================
# 6. APPLY PREPROCESSOR
# =========================================================
X_train_pre = preprocessor.fit_transform(X_train)
X_test_pre = preprocessor.transform(X_test)

# Get output feature names
def get_feature_names(ct):
    names = []
    for name, trans, cols in ct.transformers_:
        if name != 'remainder':
            # Check if the transformer is a OneHotEncoder to use get_feature_names_out
            if hasattr(trans, "get_feature_names_out"):
                names.extend(trans.get_feature_names_out(cols))
            else:
                names.extend(cols)
    return names

train_cols = get_feature_names(preprocessor)

x_train_trans = pd.DataFrame(X_train_pre, columns=train_cols)
x_test_trans = pd.DataFrame(X_test_pre, columns=train_cols)

print("Preprocessing completed. Shape:", x_train_trans.shape)


# =========================================================
# 7. PCA (FIT ONLY ON TRAIN — NO LEAKAGE)
# =========================================================
pca = PCA(n_components=0.90, random_state=42)
x_train_pca = pca.fit_transform(x_train_trans)
x_test_pca = pca.transform(x_test_trans)

print("PCA completed. Train PCA shape:", x_train_pca.shape)


# =========================================================
# 8. MHSysGen AUGMENTATION
# =========================================================
df_aug = pd.DataFrame(x_train_trans.copy())
df_aug["isFraud"] = y_train.values

mh = MHSysGen(method="parallel", ratio=15, minority_class=1)

print("\nStarting MH-SysGen augmentation...")
X_aug, y_aug = mh.fit_resample(df_aug, target="isFraud")
print("Augmentation complete. New shape:", X_aug.shape)

# PCA transform on augmented data (NO FIT AGAIN!)
X_aug_pca = pca.transform(X_aug)


# =========================================================
# 9. XGBOOST MODEL (Your hyperparameters)
# =========================================================
from catboost import CatBoostClassifier
model_cat = CatBoostClassifier(iterations=1500,
                           depth=10,
                           learning_rate=0.05,
                           loss_function='Logloss',
                           eval_metric='F1',
                           scale_pos_weight=15,
                           random_seed=42,
                           verbose=200
    
)
print("Training catboost")
model_cat.fit(X_aug_pca, y_aug)

model_xgb = XGBClassifier(
    n_estimators=600,
    max_depth=18,
    min_child_weight=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    random_state=42,
    # use_label_encoder=False is deprecated and removed in recent XGBoost versions.
    eval_metric="logloss" 
)
print("training xgboost")
model_xgb.fit(X_aug_pca, y_aug)

print("finished training")

Preprocessing completed. Shape: (472432, 150)
PCA completed. Train PCA shape: (472432, 63)

Starting MH-SysGen augmentation...
Augmentation complete. New shape: (970402, 150)
Training catboost
0:	learn: 0.9714687	total: 400ms	remaining: 9m 59s
200:	learn: 0.9792732	total: 1m 27s	remaining: 9m 25s
400:	learn: 0.9838024	total: 2m 56s	remaining: 8m 3s
600:	learn: 0.9868530	total: 4m 25s	remaining: 6m 37s
800:	learn: 0.9890632	total: 5m 53s	remaining: 5m 8s
1000:	learn: 0.9908265	total: 7m 22s	remaining: 3m 40s
1200:	learn: 0.9922523	total: 8m 51s	remaining: 2m 12s
1400:	learn: 0.9934665	total: 10m 21s	remaining: 43.9s
1499:	learn: 0.9939615	total: 11m 5s	remaining: 0us
training xgboost
finished training


In [35]:
p_xgb_train=model_xgb.predict_proba(x_train_pca)[:,1]
p_cat_train=model_cat.predict_proba(x_train_pca)[:,1]
stack_train=np.column_stack([p_xgb_train,p_cat_train])

from sklearn.linear_model import LogisticRegression
meta_model=LogisticRegression(max_iter=1000)
meta_model.fit(stack_train,y_train)

p_xgb_test=model_xgb.predict_proba(x_test_pca)[:,1]
p_cat_test=model_cat.predict_proba(x_test_pca)[:,1]
stack_test=np.column_stack([p_xgb_test,p_cat_test])

proba=meta_model.predict_proba(stack_test)[:,1]
final_pred=(proba >=0.27).astype(int)


# =========================================================
# 10. RESULTS
# =========================================================
print("\n===== FINAL RESULTS (FE + MH-SysGen + XGBoost) =====")
print(classification_report(y_test, final_pred))
print(confusion_matrix(y_test,final_pred))
from sklearn.metrics import roc_auc_score
rc_score=roc_auc_score(y_test, final_pred)
print("roc-auc score",rc_score)


===== FINAL RESULTS (FE + MH-SysGen + XGBoost) =====
              precision    recall  f1-score   support

           0       0.98      0.98      0.98    114044
           1       0.48      0.47      0.48      4064

    accuracy                           0.96    118108
   macro avg       0.73      0.73      0.73    118108
weighted avg       0.96      0.96      0.96    118108

[[111993   2051]
 [  2152   1912]]
roc-auc score 0.7262440770891854


In [48]:
import pandas as pd
import numpy as np
# Assuming mh_sys_gen is available in your environment
from mh_sys_gen import MHSysGen 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

# =========================================================
# 0. CUSTOM FEATURE ENGINEER CLASS (NEW)
# =========================================================
class CustomFeatureEngineer(BaseEstimator, TransformerMixin):
    """
    A transformer to create time-based and aggregation features 
    specific to the IEEE Fraud Detection dataset.
    """
    def __init__(self, time_col='TransactionDT', amt_col='TransactionAmt'):
        self.time_col = time_col
        self.amt_col = amt_col
        self.agg_features = {} # Stores aggregation maps during fit

    def fit(self, X, y=None):
        X_copy = X.copy()
        X_copy['isFraud'] = y # Temporarily add target for aggregation/target encoding
        
        # 1. Base Aggregation Keys
        self.base_agg_keys = ['card1', 'addr1']
        
        # 2. Prepare Aggregation Maps
        for col in self.base_agg_keys:
            # Frequency Map (for velocity/cardinality)
            self.agg_features[f'{col}_Count_Map'] = X_copy[col].value_counts()
            
            # Amount Mean Map
            self.agg_features[f'{col}_Amt_Mean_Map'] = X_copy.groupby(col)[self.amt_col].mean()
            
            # Amount Std Map
            self.agg_features[f'{col}_Amt_Std_Map'] = X_copy.groupby(col)[self.amt_col].std().fillna(1.0)
            
        return self

    def transform(self, X):
        X_copy = X.copy()
        
        # --- Time-Based Features (Requires TransactionDT) ---
        if self.time_col in X_copy.columns:
            # Hour of Day (0-23)
            X_copy['Transaction_Hour'] = (X_copy[self.time_col] // 3600) % 24
            # Day of Week (0-6)
            X_copy['Transaction_DayOfWeek'] = (X_copy[self.time_col] // (3600 * 24)) % 7
            X_copy['Transaction_Day'] = X_copy[self.time_col] // (3600 * 24)

        # --- Aggregation and Ratio Features ---
        for col in self.base_agg_keys:
            # Frequency Feature
            count_map = self.agg_features[f'{col}_Count_Map']
            X_copy[f'{col}_Count'] = X_copy[col].map(count_map).fillna(0)
            
            # Amount Mean Feature
            mean_map = self.agg_features[f'{col}_Amt_Mean_Map']
            X_copy[f'{col}_Amt_Mean'] = X_copy[col].map(mean_map).fillna(X_copy[self.amt_col].mean())
            
            # Amount Std Feature
            std_map = self.agg_features[f'{col}_Amt_Std_Map']
            X_copy[f'{col}_Amt_Std'] = X_copy[col].map(std_map).fillna(1.0)
        
        # Amount-to-Mean Ratio (highly predictive)
        X_copy['Amt_Div_Mean_card1'] = X_copy[self.amt_col] / X_copy['card1_Amt_Mean']
        
        # Log Transform
        X_copy['TransactionAmt_Log'] = np.log1p(X_copy[self.amt_col])
        
        return X_copy


# =========================================================
# 1. LOAD DATA
# =========================================================
df = pd.read_csv("../data/train_cleaned.csv")
df.drop(columns=["TransactionID"], inplace=True)

# Sort by time — IMPORTANT
df = df.sort_values(by="TransactionDT").reset_index(drop=True)


# =========================================================
# 2. FEATURE GROUPS
# =========================================================
# NOTE: The new engineered features will automatically be picked up 
# by the numeric_cols list after the CustomFeatureEngineer step.

nominal_cols = ['ProductCD', 'card4', 'card6']
nominal_high = ['P_emaildomain', 'R_emaildomain', "id_30", "id_31", "DeviceInfo", "id_33"]
ordinal_cols = ['M4', 'M2', 'M3', 'M5', 'M6']

# Create a temporary list of numeric columns based on original data
numeric_cols_original = []
for col in df.columns:
    if col not in nominal_cols and \
        col not in nominal_high and \
        col not in ordinal_cols and \
        col != "isFraud" and \
        df[col].dtype != "object":
        numeric_cols_original.append(col)
# We will use this list to initialize the `numeric_cols` after FE

# =========================================================
# 3. TIME-BASED SPLIT
# =========================================================
split_index = int(0.8 * len(df))
train_df = df.iloc[:split_index].copy()
test_df = df.iloc[split_index:].copy()

X_train = train_df.drop(columns=["isFraud"])
y_train = train_df["isFraud"]
X_test = test_df.drop(columns=["isFraud"])
y_test = test_df["isFraud"]

# =========================================================
# 3a. APPLY CUSTOM FEATURE ENGINEER (NEW STEP)
# =========================================================
fe = CustomFeatureEngineer()
X_train = fe.fit_transform(X_train, y_train)
X_test = fe.transform(X_test)

# Update numeric_cols to include the new engineered features
engineered_features = [
    'Transaction_Hour', 'Transaction_DayOfWeek', 'Transaction_Day',
    'card1_Count', 'card1_Amt_Mean', 'card1_Amt_Std',
    'addr1_Count', 'addr1_Amt_Mean', 'addr1_Amt_Std',
    'Amt_Div_Mean_card1', 'TransactionAmt_Log'
]
numeric_cols = numeric_cols_original + engineered_features

# Drop the original TransactionDT after extracting time features
if 'TransactionDT' in X_train.columns:
    X_train.drop(columns=['TransactionDT'], inplace=True)
    X_test.drop(columns=['TransactionDT'], inplace=True)
    numeric_cols.remove('TransactionDT')

# =========================================================
# 4. Frequency Encoder (FIXED VERSION)
# =========================================================
class FrequencyEncoder(BaseEstimator, TransformerMixin):
    def __init__(self): # Fix: use double underscore
        self.freq_maps = {}

    def fit(self, X, y=None):
        X = pd.DataFrame(X)
        self.freq_maps = {}
        for col in X.columns:
            # Handle string conversion for mapping consistency
            self.freq_maps[col] = X[col].astype(str).value_counts(normalize=True)
        return self

    def transform(self, X):
        X = pd.DataFrame(X)
        X2 = X.copy()
        for col in X.columns:
            # Map based on string values and fill NaNs (new categories in test) with 0
            X2[col] = X2[col].astype(str).map(self.freq_maps[col]).fillna(0)
        return X2


# =========================================================
# 5. COLUMN TRANSFORMER
# =========================================================
ohe = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
freq = FrequencyEncoder()
scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("nominal", ohe, nominal_cols),
        ("ordinal", oe, ordinal_cols),
        # NOTE: nominal_high columns (emails, device info) are now Frequency Encoded
        ("freq", freq, nominal_high), 
        # NOTE: numeric_cols now includes the engineered features
        ("num", scaler, [col for col in numeric_cols if col in X_train.columns]), 
    ],
    remainder='drop'
)


# =========================================================
# 6. APPLY PREPROCESSOR
# =========================================================
X_train_pre = preprocessor.fit_transform(X_train)
X_test_pre = preprocessor.transform(X_test)

# Get output feature names
def get_feature_names(ct):
    names = []
    for name, trans, cols in ct.transformers_:
        if name != 'remainder':
            # Check if the transformer is a OneHotEncoder to use get_feature_names_out
            if hasattr(trans, "get_feature_names_out"):
                names.extend(trans.get_feature_names_out(cols))
            else:
                names.extend(cols)
    return names

train_cols = get_feature_names(preprocessor)

x_train_trans = pd.DataFrame(X_train_pre, columns=train_cols)
x_test_trans = pd.DataFrame(X_test_pre, columns=train_cols)

print("Preprocessing completed. Shape:", x_train_trans.shape)


# =========================================================
# 7. PCA (FIT ONLY ON TRAIN — NO LEAKAGE)
# =========================================================
pca = PCA(n_components=0.95, random_state=42)
x_train_pca = pca.fit_transform(x_train_trans)
x_test_pca = pca.transform(x_test_trans)

print("PCA completed. Train PCA shape:", x_train_pca.shape)


# =========================================================
# 8. MHSysGen AUGMENTATION
# =========================================================
df_aug = pd.DataFrame(x_train_trans.copy())
df_aug["isFraud"] = y_train.values

mh = MHSysGen(method="parallel", ratio=15, minority_class=1)

print("\nStarting MH-SysGen augmentation...")
X_aug, y_aug = mh.fit_resample(df_aug, target="isFraud")
print("Augmentation complete. New shape:", X_aug.shape)

# PCA transform on augmented data (NO FIT AGAIN!)
X_aug_pca = pca.transform(X_aug)




def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 300, 900),
        "max_depth": trial.suggest_int("max_depth", 4, 18),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 10.0, log=True),
        "scale_pos_weight": 1.0,  # you can change if needed
    }

    model = XGBClassifier(
        objective="binary:logistic",
        eval_metric="auc",
        tree_method="hist",
        n_jobs=-1,
        random_state=42,
        **params,
    )

    model.fit(X_aug_pca, y_aug)
    y_val_proba = model.predict_proba(x_test_pca)[:, 1]
    auc = roc_auc_score(y_test, y_val_proba)
    return auc
import optuna
print("\nStarting Optuna tuning...")
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20, show_progress_bar=False)

print("\nBest AUC from Optuna:", study.best_value)
print("Best params:", study.best_params)

best_params = study.best_params

Preprocessing completed. Shape: (472432, 150)
PCA completed. Train PCA shape: (472432, 79)

Starting MH-SysGen augmentation...
Augmentation complete. New shape: (970402, 150)


[I 2025-12-09 14:46:42,918] A new study created in memory with name: no-name-4c7a9bbb-a3d0-4e44-991f-17ec6f4579c8



Starting Optuna tuning...


[I 2025-12-09 14:48:43,470] Trial 0 finished with value: 0.885928767486689 and parameters: {'n_estimators': 372, 'max_depth': 15, 'min_child_weight': 9, 'learning_rate': 0.04260807738692856, 'subsample': 0.6322739922792978, 'colsample_bytree': 0.789378175020648, 'gamma': 0.5473581155773255, 'reg_lambda': 0.0019094698638261935, 'reg_alpha': 0.16454201700646068}. Best is trial 0 with value: 0.885928767486689.
[I 2025-12-09 14:49:59,773] Trial 1 finished with value: 0.8743644865053466 and parameters: {'n_estimators': 406, 'max_depth': 10, 'min_child_weight': 5, 'learning_rate': 0.035339559893218984, 'subsample': 0.9008341499754189, 'colsample_bytree': 0.7553248133039082, 'gamma': 1.8510517324939242, 'reg_lambda': 0.20117386869376658, 'reg_alpha': 1.002212647746794}. Best is trial 0 with value: 0.885928767486689.
[I 2025-12-09 14:50:34,608] Trial 2 finished with value: 0.8529619989104219 and parameters: {'n_estimators': 343, 'max_depth': 5, 'min_child_weight': 7, 'learning_rate': 0.0302476

XGBoostError: [14:55:06] C:\actions-runner\_work\xgboost\xgboost\src\common\io.h:362: bad_malloc: Failed to allocate 3438727168 bytes.

In [45]:
import pandas as pd
import numpy as np
# Assuming mh_sys_gen is available in your environment
from mh_sys_gen import MHSysGen 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

# =========================================================
# 0. CUSTOM FEATURE ENGINEER CLASS (NEW)
# =========================================================
class CustomFeatureEngineer(BaseEstimator, TransformerMixin):
    """
    A transformer to create time-based and aggregation features 
    specific to the IEEE Fraud Detection dataset.
    """
    def __init__(self, time_col='TransactionDT', amt_col='TransactionAmt'):
        self.time_col = time_col
        self.amt_col = amt_col
        self.agg_features = {} # Stores aggregation maps during fit

    def fit(self, X, y=None):
        X_copy = X.copy()
        X_copy['isFraud'] = y # Temporarily add target for aggregation/target encoding
        
        # 1. Base Aggregation Keys
        self.base_agg_keys = ['card1', 'addr1']
        
        # 2. Prepare Aggregation Maps
        for col in self.base_agg_keys:
            # Frequency Map (for velocity/cardinality)
            self.agg_features[f'{col}_Count_Map'] = X_copy[col].value_counts()
            
            # Amount Mean Map
            self.agg_features[f'{col}_Amt_Mean_Map'] = X_copy.groupby(col)[self.amt_col].mean()
            
            # Amount Std Map
            self.agg_features[f'{col}_Amt_Std_Map'] = X_copy.groupby(col)[self.amt_col].std().fillna(1.0)
            
        return self

    def transform(self, X):
        X_copy = X.copy()
        
        # --- Time-Based Features (Requires TransactionDT) ---
        if self.time_col in X_copy.columns:
            # Hour of Day (0-23)
            X_copy['Transaction_Hour'] = (X_copy[self.time_col] // 3600) % 24
            # Day of Week (0-6)
            X_copy['Transaction_DayOfWeek'] = (X_copy[self.time_col] // (3600 * 24)) % 7
            X_copy['Transaction_Day'] = X_copy[self.time_col] // (3600 * 24)

        # --- Aggregation and Ratio Features ---
        for col in self.base_agg_keys:
            # Frequency Feature
            count_map = self.agg_features[f'{col}_Count_Map']
            X_copy[f'{col}_Count'] = X_copy[col].map(count_map).fillna(0)
            
            # Amount Mean Feature
            mean_map = self.agg_features[f'{col}_Amt_Mean_Map']
            X_copy[f'{col}_Amt_Mean'] = X_copy[col].map(mean_map).fillna(X_copy[self.amt_col].mean())
            
            # Amount Std Feature
            std_map = self.agg_features[f'{col}_Amt_Std_Map']
            X_copy[f'{col}_Amt_Std'] = X_copy[col].map(std_map).fillna(1.0)
        
        # Amount-to-Mean Ratio (highly predictive)
        X_copy['Amt_Div_Mean_card1'] = X_copy[self.amt_col] / X_copy['card1_Amt_Mean']
        
        # Log Transform
        X_copy['TransactionAmt_Log'] = np.log1p(X_copy[self.amt_col])
        
        return X_copy


# =========================================================
# 1. LOAD DATA
# =========================================================
df = pd.read_csv("../data/train_cleaned.csv")
df.drop(columns=["TransactionID"], inplace=True)

# Sort by time — IMPORTANT
df = df.sort_values(by="TransactionDT").reset_index(drop=True)


# =========================================================
# 2. FEATURE GROUPS
# =========================================================
# NOTE: The new engineered features will automatically be picked up 
# by the numeric_cols list after the CustomFeatureEngineer step.

nominal_cols = ['ProductCD', 'card4', 'card6']
nominal_high = ['P_emaildomain', 'R_emaildomain', "id_30", "id_31", "DeviceInfo", "id_33"]
ordinal_cols = ['M4', 'M2', 'M3', 'M5', 'M6']

# Create a temporary list of numeric columns based on original data
numeric_cols_original = []
for col in df.columns:
    if col not in nominal_cols and \
        col not in nominal_high and \
        col not in ordinal_cols and \
        col != "isFraud" and \
        df[col].dtype != "object":
        numeric_cols_original.append(col)
# We will use this list to initialize the `numeric_cols` after FE

# =========================================================
# 3. TIME-BASED SPLIT
# =========================================================
split_index = int(0.8 * len(df))
train_df = df.iloc[:split_index].copy()
test_df = df.iloc[split_index:].copy()

X_train = train_df.drop(columns=["isFraud"])
y_train = train_df["isFraud"]
X_test = test_df.drop(columns=["isFraud"])
y_test = test_df["isFraud"]

# =========================================================
# 3a. APPLY CUSTOM FEATURE ENGINEER (NEW STEP)
# =========================================================
fe = CustomFeatureEngineer()
X_train = fe.fit_transform(X_train, y_train)
X_test = fe.transform(X_test)

# Update numeric_cols to include the new engineered features
engineered_features = [
    'Transaction_Hour', 'Transaction_DayOfWeek', 'Transaction_Day',
    'card1_Count', 'card1_Amt_Mean', 'card1_Amt_Std',
    'addr1_Count', 'addr1_Amt_Mean', 'addr1_Amt_Std',
    'Amt_Div_Mean_card1', 'TransactionAmt_Log'
]
numeric_cols = numeric_cols_original + engineered_features

# Drop the original TransactionDT after extracting time features
if 'TransactionDT' in X_train.columns:
    X_train.drop(columns=['TransactionDT'], inplace=True)
    X_test.drop(columns=['TransactionDT'], inplace=True)
    numeric_cols.remove('TransactionDT')

# =========================================================
# 4. Frequency Encoder (FIXED VERSION)
# =========================================================
class FrequencyEncoder(BaseEstimator, TransformerMixin):
    def __init__(self): # Fix: use double underscore
        self.freq_maps = {}

    def fit(self, X, y=None):
        X = pd.DataFrame(X)
        self.freq_maps = {}
        for col in X.columns:
            # Handle string conversion for mapping consistency
            self.freq_maps[col] = X[col].astype(str).value_counts(normalize=True)
        return self

    def transform(self, X):
        X = pd.DataFrame(X)
        X2 = X.copy()
        for col in X.columns:
            # Map based on string values and fill NaNs (new categories in test) with 0
            X2[col] = X2[col].astype(str).map(self.freq_maps[col]).fillna(0)
        return X2


# =========================================================
# 5. COLUMN TRANSFORMER
# =========================================================
ohe = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
freq = FrequencyEncoder()
scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ("nominal", ohe, nominal_cols),
        ("ordinal", oe, ordinal_cols),
        # NOTE: nominal_high columns (emails, device info) are now Frequency Encoded
        ("freq", freq, nominal_high), 
        # NOTE: numeric_cols now includes the engineered features
        ("num", scaler, [col for col in numeric_cols if col in X_train.columns]), 
    ],
    remainder='drop'
)


# =========================================================
# 6. APPLY PREPROCESSOR
# =========================================================
X_train_pre = preprocessor.fit_transform(X_train)
X_test_pre = preprocessor.transform(X_test)

# Get output feature names
def get_feature_names(ct):
    names = []
    for name, trans, cols in ct.transformers_:
        if name != 'remainder':
            # Check if the transformer is a OneHotEncoder to use get_feature_names_out
            if hasattr(trans, "get_feature_names_out"):
                names.extend(trans.get_feature_names_out(cols))
            else:
                names.extend(cols)
    return names

train_cols = get_feature_names(preprocessor)

x_train_trans = pd.DataFrame(X_train_pre, columns=train_cols)
x_test_trans = pd.DataFrame(X_test_pre, columns=train_cols)

print("Preprocessing completed. Shape:", x_train_trans.shape)


# =========================================================
# 7. PCA (FIT ONLY ON TRAIN — NO LEAKAGE)
# =========================================================
pca = PCA(n_components=0.90, random_state=42)
x_train_pca = pca.fit_transform(x_train_trans)
x_test_pca = pca.transform(x_test_trans)

print("PCA completed. Train PCA shape:", x_train_pca.shape)


# =========================================================
# 8. MHSysGen AUGMENTATION
# =========================================================
df_aug = pd.DataFrame(x_train_trans.copy())
df_aug["isFraud"] = y_train.values

mh = MHSysGen(method="parallel", ratio=15, minority_class=1)

print("\nStarting MH-SysGen augmentation...")
X_aug, y_aug = mh.fit_resample(df_aug, target="isFraud")
print("Augmentation complete. New shape:", X_aug.shape)

# PCA transform on augmented data (NO FIT AGAIN!)
X_aug_pca = pca.transform(X_aug)


# =========================================================
# 9. XGBOOST MODEL (Your hyperparameters)
# =========================================================
from catboost import CatBoostClassifier
model_cat = CatBoostClassifier(iterations=1500,
                           depth=15,
                           learning_rate=0.1,
                           loss_function='Logloss',
                           eval_metric='F1',
                           scale_pos_weight=15,
                           random_seed=42,
                           verbose=200, 
                           l2_leaf_reg= 0.007680412132464856, 
                           bagging_temperature= 0.2503415910965906,
                           random_strength= 1.0004685436237792, 
                           border_count=153,
    
)
print("Training catboost")
model_cat.fit(X_aug_pca, y_aug)

model_xgb = XGBClassifier(
    n_estimators= 600, max_depth= 20, 
    min_child_weight= 9, learning_rate= 0.1, 
    subsample= 0.6322739922792978, 
    colsample_bytree= 0.789378175020648, 
    gamma= 0.5473581155773255, 
    reg_lambda= 0.0019094698638261935, 
    reg_alpha= 0.16454201700646068,
    objective="binary:logistic",
    tree_method="hist",
    n_jobs=-1,
    random_state=42,
    scale_pos_weight=1,
    # use_label_encoder=False is deprecated and removed in recent XGBoost versions.
    eval_metric="logloss" 
)
print("training xgboost")
model_xgb.fit(X_aug_pca, y_aug)

print("finished training")

Preprocessing completed. Shape: (472432, 150)
PCA completed. Train PCA shape: (472432, 63)

Starting MH-SysGen augmentation...
Augmentation complete. New shape: (970402, 150)
Training catboost
0:	learn: 0.9721342	total: 5.29s	remaining: 2h 12m 2s
200:	learn: 0.9996422	total: 13m 1s	remaining: 1h 24m 8s
400:	learn: 0.9998761	total: 21m 49s	remaining: 59m 47s
600:	learn: 0.9999054	total: 30m 12s	remaining: 45m 10s
800:	learn: 0.9999201	total: 38m 18s	remaining: 33m 25s
1000:	learn: 0.9999411	total: 46m 46s	remaining: 23m 18s
1200:	learn: 0.9999585	total: 55m 15s	remaining: 13m 45s
1400:	learn: 0.9999627	total: 1h 3m 43s	remaining: 4m 30s
1499:	learn: 0.9999627	total: 1h 7m 52s	remaining: 0us
training xgboost
finished training


In [47]:
p_xgb_train=model_xgb.predict_proba(x_train_pca)[:,1]
p_cat_train=model_cat.predict_proba(x_train_pca)[:,1]
stack_train=np.column_stack([p_xgb_train,p_cat_train])

from sklearn.linear_model import LogisticRegression
meta_model=LogisticRegression(max_iter=1000)
meta_model.fit(stack_train,y_train)

p_xgb_test=model_xgb.predict_proba(x_test_pca)[:,1]
p_cat_test=model_cat.predict_proba(x_test_pca)[:,1]
stack_test=np.column_stack([p_xgb_test,p_cat_test])

proba=meta_model.predict_proba(stack_test)[:,1]
final_pred=(proba >=0.2).astype(int)


# =========================================================
# 10. RESULTS
# =========================================================
print("\n===== FINAL RESULTS (FE + MH-SysGen + XGBoost) =====")
print(classification_report(y_test, final_pred))
print(confusion_matrix(y_test,final_pred))
from sklearn.metrics import roc_auc_score
rc_score=roc_auc_score(y_test, final_pred)
print("roc-auc score",rc_score)


===== FINAL RESULTS (FE + MH-SysGen + XGBoost) =====
              precision    recall  f1-score   support

           0       0.98      0.98      0.98    114044
           1       0.48      0.45      0.46      4064

    accuracy                           0.96    118108
   macro avg       0.73      0.72      0.72    118108
weighted avg       0.96      0.96      0.96    118108

[[112060   1984]
 [  2233   1831]]
roc-auc score 0.7165722721814511


In [None]:
##faster sequential code mhsysgen

import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

class FastSequentialShadowMirror:
    """
    FAST version of Sequential Shadow-Mirror oversampling.
    Same ideology:
      1. Shadow scaling
      2. Mirror using nearest neighbor reflection
      3. Vectorized & optimized (10-40x faster)
    """

    def __init__(self, target_col, minority_class=1, ratio=1.0):
        self.target_col = target_col
        self.minority_class = minority_class
        self.ratio = ratio

    def fit_resample(self, df):
        """
        df : DataFrame containing both features and target_col  
        returns: X_resampled, y_resampled
        """

        # ----------------------------
        # PREPARE DATA
        # ----------------------------
        features = [c for c in df.columns if c != self.target_col]
        minority_df = df[df[self.target_col] == self.minority_class]

        if minority_df.empty:
            raise ValueError(f"Minority class {self.minority_class} not found.")

        total_synth = int(len(minority_df) * self.ratio)
        if total_synth < 1:
            raise ValueError("Ratio too small; produced 0 samples.")

        X_min = minority_df[features].values
        n_features = X_min.shape[1]

        # ----------------------------
        # PHASE 1 — SHADOW (VECTORIZED)
        # ----------------------------
        idxs = np.random.randint(0, len(X_min), size=total_synth)
        originals = X_min[idxs]

        scales = np.random.uniform(0.6, 1.4, size=(total_synth, n_features))
        shadows = originals * scales

        # ----------------------------
        # PHASE 2 — MIRROR (VECTORIZED KNN)
        # ----------------------------
        nbrs = NearestNeighbors(n_neighbors=2).fit(shadows)
        _, idx_arrays = nbrs.kneighbors(shadows)

        neighbors = shadows[idx_arrays[:, 1]]

        reflect_scale = np.random.uniform(0.5, 1.3, size=(total_synth, 1))
        mirrored = shadows + (neighbors - shadows) * reflect_scale

        # ----------------------------
        # BUILD SYNTHETIC DF
        # ----------------------------
        synthetic_df = pd.DataFrame(mirrored, columns=features)
        synthetic_df[self.target_col] = self.minority_class

        combined = pd.concat([df, synthetic_df], ignore_index=True)

        X_out = combined.drop(columns=[self.target_col])
        y_out = combined[self.target_col]

        return X_out, y_out