In [223]:
import os
file_path = r'C:\Users\Жанна\Desktop\OMGTU\ML_RGR\ML_RGR\data\final_data_card_transdata.csv'
output_dir = r'C:\Users\Жанна\Desktop\OMGTU\ML_RGR\ML_RGR\rgr_models'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

In [224]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, BaggingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
import tensorflow as tf
import pickle


In [225]:
data = pd.read_csv(file_path)
y = data["fraud"]
X = data.drop(["fraud"], axis=1)

In [226]:
data.describe()

Unnamed: 0,distance_from_home,distance_from_last_transaction,repeat_retailer,used_chip,used_pin_number,online_order,fraud,transaction_speed,secure_online_transaction
count,657984.0,657984.0,657984.0,657984.0,657984.0,657984.0,657984.0,657984.0,657984.0
mean,14.270848,1.275361,0.914623,0.350671,0.100521,0.650481,0.012351,0.174999,0.022722
std,13.282287,1.567461,0.279443,0.477181,0.300693,0.476818,0.110448,0.233419,0.149017
min,0.053961,0.000118,0.0,0.0,0.0,0.0,0.0,7e-06,0.0
25%,4.342518,0.219989,1.0,0.0,0.0,0.0,0.0,0.02142,0.0
50%,9.49037,0.641951,1.0,0.0,0.0,1.0,0.0,0.074894,0.0
75%,20.127328,1.689908,1.0,1.0,0.0,1.0,0.0,0.227952,0.0
max,58.542735,7.944152,1.0,1.0,1.0,1.0,1.0,1.136048,1.0


In [227]:
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

In [228]:
y_balanced.value_counts(normalize=True)

fraud
0    0.5
1    0.5
Name: proportion, dtype: float64

In [229]:
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, stratify=y_balanced, random_state=42)

In [230]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [231]:
with open(os.path.join(output_dir, 'scaler.pkl'), 'wb') as f:
    pickle.dump(scaler, f)

In [232]:
def model_already_saved(model_name, method='pickle'):
    model_path = os.path.join(output_dir, f'{model_name}.{method}' if method != 'tensorflow' else f'{model_name}.keras')
    return os.path.exists(model_path)

def save_model(model, model_name, method='pickle'):
    model_path = os.path.join(output_dir, f'{model_name}.{method}' if method != 'tensorflow' else f'{model_name}.keras')
    
    if os.path.exists(model_path): return
    
    if method == 'pickle':
        with open(model_path, 'wb') as f:
            pickle.dump(model, f)
    elif method == 'catboost':
        model.save_model(model_path)
    elif method == 'tensorflow':
        model.save(model_path)

In [233]:
import tensorflow as tf
print(tf.__version__)

2.19.0


In [234]:
if not model_already_saved('knn_manhattan', method='pickle'):
    knn = KNeighborsClassifier(n_neighbors=7, metric='manhattan', weights='distance')
    knn.fit(X_train_scaled, y_train)
    save_model(knn, 'knn_manhattan', method='pickle')

In [235]:
if not model_already_saved('gradient_boosting', method='pickle'):
    gbc = GradientBoostingClassifier(n_estimators=50, learning_rate=0.2, min_samples_leaf=5, max_depth=7, random_state=42)
    gbc.fit(X_train_scaled, y_train)
    save_model(gbc, 'gradient_boosting', method='pickle')

In [236]:
if not model_already_saved('catboost', method='catboost'):
    catboost = CatBoostClassifier(iterations=200, depth=7, learning_rate=0.2, random_state=42)
    catboost.fit(X_train_scaled, y_train)
    save_model(catboost, 'catboost', method='catboost')

In [237]:
if not model_already_saved('bagging', method='pickle'):
    bagging = BaggingClassifier(
        estimator=DecisionTreeClassifier(max_depth=5, random_state=42),
        n_estimators=10, max_samples=0.5, max_features=0.8, random_state=42
    )
    bagging.fit(X_train_scaled, y_train)
    save_model(bagging, 'bagging', method='pickle')

In [238]:
if not model_already_saved('stacking', method='pickle'):
    base_learners = [
        ('dtc', DecisionTreeClassifier(max_depth=10, min_samples_split=20, random_state=42)),
        ('gbc', GradientBoostingClassifier(n_estimators=50, max_depth=7, learning_rate=0.2, min_samples_leaf=5, random_state=42))
    ]
    meta_model = LogisticRegression()
    stacking = StackingClassifier(estimators=base_learners, final_estimator=meta_model, cv=5, n_jobs=-1)
    stacking.fit(X_train_scaled, y_train)
    save_model(stacking, 'stacking', method='pickle')

In [239]:
if not model_already_saved('neural_network', method='tensorflow'):
    model_nn = tf.keras.Sequential([
        tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model_nn.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)
    save_model(model_nn, 'neural_network', method='tensorflow')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m25995/25995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 2ms/step - accuracy: 0.6979 - loss: 0.5336 - val_accuracy: 0.7020 - val_loss: 0.5274
Epoch 2/10
[1m25995/25995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 2ms/step - accuracy: 0.7014 - loss: 0.5282 - val_accuracy: 0.7020 - val_loss: 0.5274
Epoch 3/10
[1m25995/25995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 2ms/step - accuracy: 0.7025 - loss: 0.5272 - val_accuracy: 0.7021 - val_loss: 0.5274
Epoch 4/10
[1m25995/25995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 2ms/step - accuracy: 0.7015 - loss: 0.5276 - val_accuracy: 0.7016 - val_loss: 0.5274
Epoch 5/10
[1m25995/25995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 2ms/step - accuracy: 0.7013 - loss: 0.5282 - val_accuracy: 0.7023 - val_loss: 0.5271
Epoch 6/10
[1m25995/25995[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 2ms/step - accuracy: 0.7020 - loss: 0.5276 - val_accuracy: 0.7022 - val_loss: 0.527