In [110]:
!pip install scikit-learn==1.5.1 --force-reinstall

Collecting scikit-learn==1.5.1
  Using cached scikit_learn-1.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting numpy>=1.19.5 (from scikit-learn==1.5.1)
  Using cached numpy-2.3.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting scipy>=1.6.0 (from scikit-learn==1.5.1)
  Using cached scipy-1.16.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (61 kB)
Collecting joblib>=1.2.0 (from scikit-learn==1.5.1)
  Using cached joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn==1.5.1)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
Using cached joblib-1.5.1-py3-none-any.whl (307 kB)
Using cached numpy-2.3.1-cp311-cp311-manylinux_2_28_x86_64.whl (16.9 MB)
Using cached scipy-1.16.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (35.3 MB)
Using 

In [111]:
import sklearn
print("✅ Đang dùng sklearn version:", sklearn.__version__)


✅ Đang dùng sklearn version: 1.5.1


In [148]:
path = '/content/transaction_dataset.csv'
import pandas as pd

df = pd.read_csv(path, index_col = 0)
df.columns = df.columns.str.strip()

In [179]:
len(df.columns)

50

In [136]:
import numpy as np
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# This is for validation process
X_train = train_df.drop('FLAG', axis=1)
y_train = train_df['FLAG']
X_test = test_df.drop('FLAG', axis=1)
y_test = test_df['FLAG']

In [162]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

class IntelligentImputer(BaseEstimator, TransformerMixin):
  def __init__(self, numeric_fill_value=0, categorical_fill_value='Unknown'):
    self.numeric_fill_value = numeric_fill_value
    self.categorical_fill_value = categorical_fill_value


  def fit(self, X, y=None):
    self.numeric_cols_ = X.select_dtypes(include=np.number).columns.tolist()
    self.categorical_cols_ = X.select_dtypes(exclude=np.number).columns.tolist()

    self.encoder_ = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

    # Create temp data for encoder to learn, fit mustn't be used to fillna
    temp_categorical_data = X[self.categorical_cols_].fillna(self.categorical_fill_value)
    self.encoder_.fit(temp_categorical_data)

    return self
  def transform(self, X):
    X_transformed = X.copy()

    if self.numeric_cols_:
      X_transformed[self.numeric_cols_] = X_transformed[self.numeric_cols_].fillna(self.numeric_fill_value)

    if self.categorical_cols_:
      X_transformed[self.categorical_cols_] = X_transformed[self.categorical_cols_].fillna(self.categorical_fill_value)

    # Encode using encoder_ fitted before
    X_transformed[self.categorical_cols_] = self.encoder_.transform(X_transformed[self.categorical_cols_])
    return X_transformed
  def get_feature_names_out(self, input_features=None):
    return self.numeric_cols_ + self.categorical_cols_

In [138]:
class ControlCharacterCleaner(BaseEstimator, TransformerMixin):
  def __init__(self):
    pass

  def _remove_control_char(self, s):
    cleaned = ''.join(c for c in str(s) if ord(c) >= 32 or c in '\t\n\r')
    return cleaned if cleaned.strip() != '' else np.nan

  def fit(self, X, y=None):
    self.object_cols_ = X.select_dtypes(include=['object', 'category']).columns.tolist()
    return self
  def transform(self, X):
      X_transformed = X.copy()
      for col in self.object_cols_:
          X_transformed[col] = X_transformed[col].apply(self._remove_control_char)
      return X_transformed


In [152]:
class ColumnDropper(BaseEstimator, TransformerMixin):
    """
    Một transformer tùy biến để loại bỏ các cột được chỉ định từ DataFrame.
    """
    def __init__(self, columns_to_drop):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_transformed = X.copy()
        return X_transformed.drop(columns=self.columns_to_drop, errors='ignore')

In [153]:
columns_to_remove = [
    'Index',
    'Address',
    'ERC20 most sent token type',
    'ERC20_most_rec_token_type'
]

In [184]:
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from ml_transformers import ColumnDropper, ControlCharacterCleaner, IntelligentImputer

full_pipeline = Pipeline(steps = [
    ('column_dropper', ColumnDropper(columns_to_drop=columns_to_remove)),
    ('control_char_cleaner', ControlCharacterCleaner()),
    ('intelligent_imputer', IntelligentImputer()),
    ('classifier', XGBClassifier())
])


In [185]:
full_pipeline.fit(X_train, y_train)
y_pred = full_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1542
           1       0.94      0.92      0.93       427

    accuracy                           0.97      1969
   macro avg       0.96      0.95      0.95      1969
weighted avg       0.97      0.97      0.97      1969



In [186]:
X_train_real = test_df.drop(columns=['FLAG'])
y_train_real = test_df['FLAG']
X_train_real, X_test_real, y_train_real, y_test_real = train_test_split(X_train_real, y_train_real, test_size=0.2, random_state=42)
full_pipeline.fit(X_train_real, y_train_real)
y_pred_real = full_pipeline.predict(X_test_real)
print(classification_report(y_test_real, y_pred_real))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98       313
           1       0.92      0.89      0.91        81

    accuracy                           0.96       394
   macro avg       0.95      0.93      0.94       394
weighted avg       0.96      0.96      0.96       394



In [188]:
joblib.dump(full_pipeline, 'full_pipeline.pkl')

['full_pipeline.pkl']

In [171]:
encoder = full_pipeline.named_steps['intelligent_imputer'].encoder_
joblib.dump(encoder, 'label_encoder.pkl')

['label_encoder.pkl']