In [58]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV,RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder

In [2]:
def haversine_distance(lat1, lon1, lat2, lon2):
    # Convert to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    
    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    km = 6371 * c  # Radius of Earth in km
    return km

## test

In [84]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV

# ---------- Custom Risk Encoders ----------
class RiskEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, feature_name, n_bins=3, labels=['low', 'medium', 'high']):
        self.feature_name = feature_name
        self.n_bins = n_bins
        self.labels = labels

    def fit(self, X, y=None):
        if y is None:
            raise ValueError("y target is required for RiskEncoder")
        fraud_rate = pd.Series(y).groupby(X[self.feature_name]).mean()
        try:
            fraud_rate_binned = pd.cut(fraud_rate, bins=self.n_bins, 
                                       labels=self.labels, duplicates='drop')
        except ValueError:
            fraud_rate_binned = pd.Series([self.labels[0]] * len(fraud_rate), 
                                          index=fraud_rate.index)
        self.risk_map_ = fraud_rate_binned.to_dict()
        return self

    def transform(self, X):
        X_new = X.copy()
        X_new[f"{self.feature_name}_risk"] = X_new[self.feature_name].map(self.risk_map_).fillna(self.labels[0])
        return X_new

class TargetMeanEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, feature_names, n_bins=3, labels=['low', 'medium', 'high']):
        self.feature_names = feature_names if isinstance(feature_names, list) else [feature_names]
        self.n_bins = n_bins
        self.labels = labels

    def fit(self, X, y=None):
        if y is None:
            raise ValueError("y target is required for TargetMeanEncoder")
        self.encodings_ = {}
        for feature in self.feature_names:
            fraud_rate = pd.Series(y).groupby(X[feature]).mean()
            try:
                fraud_rate_binned = pd.cut(fraud_rate, bins=self.n_bins,
                                           labels=self.labels, duplicates='drop')
            except ValueError:
                fraud_rate_binned = pd.Series([self.labels[0]] * len(fraud_rate), 
                                              index=fraud_rate.index)
            self.encodings_[feature] = fraud_rate_binned.to_dict()
        return self

    def transform(self, X):
        X_new = X.copy()
        for feature in self.feature_names:
            X_new[f'{feature}_risk'] = X_new[feature].map(self.encodings_[feature]).fillna(self.labels[0])
        return X_new

# ---------- Distance function ----------
def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371
    phi1, phi2 = np.radians(lat1), np.radians(lat2)
    dphi = phi2 - phi1
    dlambda = np.radians(lon2 - lon1)
    a = np.sin(dphi/2)**2 + np.cos(phi1)*np.cos(phi2)*np.sin(dlambda/2)**2
    return 2 * R * np.arcsin(np.sqrt(a))

# ---------- Preprocessing Step ----------
def preprocess_data_safe(df):
    df = df.copy()
    df['distance'] = haversine_distance(df['merch_lat'], df['merch_long'], df['lat'], df['long'])
    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
    df = df.sort_values(['cc_num', 'trans_date_trans_time'])
    df['unique_time'] = df['trans_date_trans_time'] + pd.to_timedelta(
        df.groupby(['cc_num', 'trans_date_trans_time']).cumcount(), unit='us')

    df['trans_count_24h'] = df.groupby('cc_num').rolling('24h', on='unique_time')['amt'].count().values
    df['avg_amt_24h'] = df.groupby('cc_num').rolling('24h', on='unique_time')['amt'].mean().values
    df['sum_amt_24h'] = df.groupby('cc_num').rolling('24h', on='unique_time')['amt'].sum().values
    df['trans_count_7d'] = df.groupby('cc_num').rolling('7d', on='unique_time')['amt'].count().values
    df = df.drop('unique_time', axis=1)

    df['hour'] = df['trans_date_trans_time'].dt.hour
    df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    df['is_night'] = df['hour'].apply(lambda x: int((x >= 22) or (x <= 5)))
    df['amt_deviation'] = df['amt'] - df.groupby('cc_num')['amt'].transform('mean')

    df['dob'] = pd.to_datetime(df['dob'])
    df['age'] = df['trans_date_trans_time'].dt.year - df['dob'].dt.year
    df['amt_bin'] = pd.qcut(df['amt'], q=4, labels=['low', 'medium', 'high', 'very_high'])
    df['age_bin'] = pd.qcut(df['age'], 4, labels=['young','young_adult','adult','old'])
    df['city_pop_log'] = np.log(df['city_pop'] + 1)

    df['month_sin'] = np.sin(2 * np.pi * df['trans_date_trans_time'].dt.month / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['trans_date_trans_time'].dt.month / 12)
    df['day_sin'] = np.sin(2 * np.pi * df['trans_date_trans_time'].dt.day / 31)
    df['day_cos'] = np.cos(2 * np.pi * df['trans_date_trans_time'].dt.day / 31)
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    return df

# ---------- Create Pipeline ----------
def create_pipeline(classifier):
    numerical_columns = [
        'distance', 'trans_count_24h', 'avg_amt_24h', 'sum_amt_24h',
        'trans_count_7d', 'day_of_week', 'is_weekend', 'amt_deviation',
        'is_night', 'city_pop_log', 'month_sin', 'month_cos',
        'day_sin', 'day_cos', 'hour_sin', 'hour_cos'
    ]
    categorical_columns = ['amt_bin', 'age_bin', 'gender']
    risk_categorical_columns = [
        'category_risk', 'job_risk', 'merchant_risk',
        'city_risk', 'zip_risk', 'state_risk'
    ]

    preprocessor = ColumnTransformer([
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_columns),
        ('risk_cat', OneHotEncoder(drop='first', handle_unknown='ignore'), risk_categorical_columns)
    ], remainder='drop')

    pipeline = ImbPipeline([
        ('risk_category', TargetMeanEncoder(['category'])),
        ('risk_job', TargetMeanEncoder(['job'])),
        ('risk_merchant', RiskEncoder('merchant')),
        ('risk_city', RiskEncoder('city')),
        ('risk_zip', RiskEncoder('zip')),
        ('risk_state', TargetMeanEncoder(['state'])),
        ('preprocessing', preprocessor),
        ('smote', SMOTE(random_state=42)),
        ('classifier', classifier)
    ])
    return pipeline

# ---------- Usage ----------
drop_columns = [
    'lat', 'long', 'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long',
    'trans_date_trans_time', 'cc_num', 'day_of_trans', 'month_of_trans',
    'age', 'amt', 'hour', 'Unnamed: 0', 'first', 'last', 'street'
]

df = pd.read_csv('datasets/1/fraudTrain.csv')
df = preprocess_data_safe(df)
df = df.drop(columns=[c for c in drop_columns if c in df.columns])
y = df['is_fraud']
X = df.drop(columns=['is_fraud'])

clf = LGBMClassifier(random_state=42)
pipeline = create_pipeline(clf)

param_distributions = {
    'smote__sampling_strategy': [0.3, 0.5],
    'smote__k_neighbors': [3, 5],
    'classifier__n_estimators': [200, 300],
    'classifier__max_depth': [5, 7],
    'classifier__learning_rate': [0.05, 0.1]
}

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=5,
    scoring='f1',
    n_jobs=-1,
    cv=cv,
    verbose=2
)
search.fit(X, y)

print("Best params:", search.best_params_)
print("Best F1:", search.best_score_)


Fitting 3 folds for each of 5 candidates, totalling 15 fits




[LightGBM] [Info] Number of positive: 257833, number of negative: 859446
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.164152 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7669
[LightGBM] [Info] Number of data points in the train set: 1117279, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.230769 -> initscore=-1.203976
[LightGBM] [Info] Start training from score -1.203976
[LightGBM] [Info] Number of positive: 257833, number of negative: 859446
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.156404 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7686
[LightGBM] [Info] Number of data points in the train set: 1117279, number of used features: 31
[LightGBM]

In [None]:
search.best_estimator_.predict(X_test)

In [85]:
y_pred_4 = search.predict(X_test)

In [88]:
print(classification_report(y_test,search.best_estimator_.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.89      0.63      0.74      2145

    accuracy                           1.00    555719
   macro avg       0.95      0.81      0.87    555719
weighted avg       1.00      1.00      1.00    555719



In [3]:
# This calculates the actual risk of each value (frequency of fraud), bins it (e.g., low/medium/high), and encodes it in the dataframe without erroneous mappings.

def risk_encoding(data, feature, bins, labels):
    # Calculate fraud rates per unique feature value
    fraud_rate = data.groupby(feature)['is_fraud'].mean()
    # Bin the fraud rates
    fraud_rate_binned = pd.cut(fraud_rate, bins=bins, labels=labels)
    # Map binned fraud rate back to main data
    data[f'{feature}_risk'] = data[feature].map(fraud_rate_binned).fillna(labels[0])


In [4]:
drop_columns = ['lat','long','dob','trans_num','unix_time','merch_lat','merch_long','trans_date_trans_time',
                    'cc_num','merchant','category','city','state','zip','city_pop',
                    'day_of_trans','month_of_trans','age','amt','job','Unnamed: 0','first','last','street']

In [None]:
def preprocess_data(df : pd.DataFrame):

    df['distance'] = haversine_distance(
        df['merch_lat'],
        df['merch_long'],
        df['lat'],
        df['long']
    )

    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

    print("Start:", df.shape)
    
    # Sort by 'cc_num' and 'trans_date_trans_time'
    df = df.sort_values(['cc_num', 'trans_date_trans_time'])

    # Set 'trans_date_trans_time' as index for time-based rolling
    df = df.set_index('trans_date_trans_time')

    

    # Rolling transaction count per card in last 24 hours
    df['trans_count_24h'] = df.groupby('cc_num')['amt'].transform(
        lambda x: x.rolling('24h').count())
    
    print("After trans_count_24h:", df.shape)

    # Rolling average amount per card in last 24 hours
    df['avg_amt_24h'] = df.groupby('cc_num')['amt'].transform(
        lambda x: x.rolling('24h').mean())

    # Rolling sum of amounts per card in last 24 hours
    df['sum_amt_24h'] = df.groupby('cc_num')['amt'].transform(
        lambda x: x.rolling('24h').sum())

    # Rolling transaction count per card in last 7 days
    df['trans_count_7d'] = df.groupby('cc_num')['amt'].transform(
        lambda x: x.rolling('7d').count())

    # Reset index to make 'trans_date_trans_time' a column again
    df = df.reset_index()

    # Additional feature engineering
    df['hour'] = df['trans_date_trans_time'].dt.hour
    df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
        
    # Calculate deviation from mean amount per card
    df['amt_deviation'] = df['amt'] - df.groupby('cc_num')['amt'].transform('mean')

    # Bin amount
    df['amt_bin'] = pd.qcut(df['amt'], q=4, labels=['low', 'medium', 'high', 'very_high'])

    # Calculate risk scores
    df['category_risk'] = df.groupby('category')['is_fraud'].transform('mean')
    df['job_risk'] = df.groupby('job')['is_fraud'].transform('mean')

    # Calculate age from dob
    df['dob'] = pd.to_datetime(df['dob'])
    df['age'] = df['trans_date_trans_time'].dt.year - df['dob'].dt.year

    # Calculate unusual hour
    customer_hour_mean = df.groupby('cc_num')['hour'].mean()
    df['unusual_hour'] = df.apply(
        lambda x: 1 if abs(x['hour'] - customer_hour_mean.get(x['cc_num'], x['hour'])) > 6 else 0,
        axis=1
    )

    df['day_of_trans'] = pd.to_datetime(df['trans_date_trans_time']).dt.day

    df['month_of_trans'] = pd.to_datetime(df['trans_date_trans_time']).dt.month

    df['is_night'] = df['hour'].apply(lambda x : 1 if x >=22 and x<=5 else 0)


    df['age_bin'] = pd.qcut(df['age'], 4, labels=['young','young_adult','adult','old'])

    df['city_pop_log'] = df['city_pop'].transform(np.log)
    

    risk_encoding(df, 'merchant', 3, labels=['low','medium','high'])

    risk_encoding(df, 'city', 3, labels=['low','medium','high'])

    risk_encoding(df, 'zip', 3, labels=['low','medium','high'])


    fraud_rate_per_state = (df[df['is_fraud']==1]['state'].value_counts())/df['state'].value_counts().sort_values(ascending=False)

    df['fraud_rate_per_state'] = df['state'].map(fraud_rate_per_state)


    df['month_sin'] = np.sin(2 * np.pi * df['month_of_trans'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month_of_trans'] / 12)

    df['day_sin'] = np.sin(2 * np.pi * df['day_of_trans'] / 31)
    df['day_cos'] = np.cos(2 * np.pi * df['day_of_trans'] / 31)

    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

    return df



In [6]:
train_df = pd.read_csv('datasets/1/fraudTrain.csv')

In [7]:
train_df.shape

(1296675, 23)

In [8]:
preprocessed_train_df = preprocess_data(train_df)

Start: (1296675, 24)
After trans_count_24h: (1296675, 24)


In [9]:
preprocessed_train_df.drop(columns=drop_columns, inplace=True)

In [10]:
y_train = preprocessed_train_df['is_fraud']
X_train = preprocessed_train_df.drop(columns=['is_fraud'])

In [11]:
num_col = ['distance', 'trans_count_24h', 'avg_amt_24h', 'sum_amt_24h', 'trans_count_7d', 'amt_deviation',
           'is_weekend', 'is_night', 'job_risk', 'city_pop_log', 'category_risk', 'fraud_rate_per_state',
           'day_sin', 'day_cos', 'unusual_hour', 'month_sin', 'month_cos', 'day_sin', 'day_cos', 'hour',
           'hour_sin', 'hour_cos', 'day_of_week']


cat_col = ['gender', 'age_bin', 'merchant_risk', 'zip_risk', 'city_risk', 'amt_bin']

In [12]:
def create_preprocessing_pipeline():
    return ColumnTransformer([
        ('scaling', StandardScaler(), num_col),
        ('ohe', OneHotEncoder(drop='first'), cat_col)
    ])

def create_full_pipeline(classifier, smote_params=None):
    if smote_params is None:
        smote_params = {'sampling_strategy': 'auto', 'random_state': 42}
    pipeline = ImbPipeline([
        ('preprocessor', create_preprocessing_pipeline()),
        ('smote', SMOTE(**smote_params)),
        ('classifier', classifier)
    ])
    return pipeline

In [None]:
clf = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    
)

pipeline = create_full_pipeline(clf)


In [53]:
clf2 = LGBMClassifier(
    random_state=42
)

pipeline2 = create_full_pipeline(clf2)

In [42]:
# Define hyperparameter space for SMOTE + classifier
param_distributions = {
    # SMOTE params
    'smote__sampling_strategy': [0.1, 0.3, 0.5, 0.7, 1.0],  # ratio of minority/majority after sampling
    'smote__k_neighbors': [3, 5, 7],
    
    # Classifier params
    'classifier__n_estimators': [200, 500],
    'classifier__max_depth': [3, 5, 7],
    'classifier__learning_rate': [0.05, 0.1]
}

In [43]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [44]:
random_search = RandomizedSearchCV(
    estimator=pipeline2,
    param_distributions=param_distributions,   # rename param_grid -> param_distributions
    scoring='f1',
    n_jobs=-1,
    cv=cv,
    verbose=2,
    n_iter=20  # you should set n_iter explicitly for RandomizedSearchCV
)


In [45]:
random_search.fit(X_train,y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits




[LightGBM] [Info] Number of positive: 309400, number of negative: 1031335
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.148397 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8412
[LightGBM] [Info] Number of data points in the train set: 1340735, number of used features: 33
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.230769 -> initscore=-1.203974
[LightGBM] [Info] Start training from score -1.203974
[LightGBM] [Info] Number of positive: 309400, number of negative: 1031335
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.337037 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8411
[LightGBM] [Info] Number of data points in the train set: 1340735, number of used features: 33

[LightG







[LightGBM] [Info] Number of positive: 515667, number of negative: 1031335
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.144047 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8413
[LightGBM] [Info] Number of data points in the train set: 1547002, number of used features: 33
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.333333 -> initscore=-0.693148
[LightGBM] [Info] Start training from score -0.693148
[CV] END classifier__learning_rate=0.05, classifier__max_depth=7, classifier__n_estimators=500, smote__k_neighbors=7, smote__sampling_strategy=0.7; total time= 3.4min
[LightGBM] [Info] Number of positive: 515668, number of negative: 1031336
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.110290 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you c



[CV] END classifier__learning_rate=0.05, classifier__max_depth=7, classifier__n_estimators=500, smote__k_neighbors=7, smote__sampling_strategy=0.7; total time= 3.5min
[LightGBM] [Info] Number of positive: 103133, number of negative: 1031335
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.369624 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8393
[LightGBM] [Info] Number of data points in the train set: 1134468, number of used features: 33
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.090909 -> initscore=-2.302590
[LightGBM] [Info] Start training from score -2.302590
[CV] END classifier__learning_rate=0.05, classifier__max_depth=5, classifier__n_estimators=500, smote__k_neighbors=3, smote__sampling_strategy=0.5; total time= 2.7min
[CV] END classifier__learning_rate=0.05, classifier__max_depth=7, classifier__n_estimators=500, smote__



[LightGBM] [Info] Number of positive: 721934, number of negative: 1031335
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.196126 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8414
[LightGBM] [Info] Number of data points in the train set: 1753269, number of used features: 33
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.411765 -> initscore=-0.356676
[LightGBM] [Info] Start training from score -0.356676
[CV] END classifier__learning_rate=0.1, classifier__max_depth=3, classifier__n_estimators=500, smote__k_neighbors=3, smote__sampling_strategy=1.0; total time= 2.3min

[LightGBM] [Info] Number of positive: 721934, number of negative: 1031335
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.176640 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you c

In [46]:
print("Best parameters:", random_search.best_params_)
print("Best F1 score:", random_search.best_score_)

Best parameters: {'smote__sampling_strategy': 1.0, 'smote__k_neighbors': 5, 'classifier__n_estimators': 500, 'classifier__max_depth': 5, 'classifier__learning_rate': 0.1}
Best F1 score: 0.9272419168166058


In [47]:
best_model = random_search.best_estimator_

In [51]:
y_pred_3 = random_search.predict(X_test)

In [52]:
print(classification_report(y_test,y_pred_3))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00    553574
           1       0.27      0.83      0.40      2145

    accuracy                           0.99    555719
   macro avg       0.63      0.91      0.70    555719
weighted avg       1.00      0.99      0.99    555719



In [17]:
pipeline.fit(X_train,y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [54]:
pipeline2.fit(X_train,y_train)

[LightGBM] [Info] Number of positive: 1289169, number of negative: 1289169
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.087213 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8414
[LightGBM] [Info] Number of data points in the train set: 2578338, number of used features: 33
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [18]:
test_df = pd.read_csv('datasets/1/fraudTest.csv')

In [21]:
test_df = preprocess_data(test_df)

Start: (555719, 24)
After trans_count_24h: (555719, 24)


In [22]:
X_test = test_df.drop(columns=['is_fraud'])
y_test = test_df['is_fraud']

In [24]:
y_pred = pipeline.predict(X_test)

In [25]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00    553574
           1       0.33      0.84      0.47      2145

    accuracy                           0.99    555719
   macro avg       0.66      0.92      0.73    555719
weighted avg       1.00      0.99      0.99    555719



In [26]:
confusion_matrix(y_test,y_pred)

array([[549839,   3735],
       [   333,   1812]])

In [55]:
y_pred_2 = pipeline2.predict(X_test)

In [56]:
print(classification_report(y_test,y_pred_2))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00    553574
           1       0.38      0.88      0.53      2145

    accuracy                           0.99    555719
   macro avg       0.69      0.94      0.76    555719
weighted avg       1.00      0.99      1.00    555719



In [57]:
confusion_matrix(y_test,y_pred_2)

array([[550460,   3114],
       [   253,   1892]])