In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import itertools
import warnings
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

Disclaimer: due to hardware limitations, I had to train the model on an undersampled subset of the dataset. Otherwise the complete dataset would have been used, together with further hyperparameter optimization, additional machine learning models (random forest, SVC), and a recurrent neural network.

Index:
    1 - Data preprocessing
    
    2 - Looking for patterns and relations between fraudulent transactions
    
    3 - Training the models
    
    4 - Conclusion

In [None]:
# Load the raw transactions from the CSV file in the working directory.
df = pd.read_csv('Fraud.csv')

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Display the column labels of the loaded frame.
df.columns

In [None]:
# Display pandas' summary statistics for the loaded frame.
df.describe()

In [None]:
# Select the transactions whose destination id (nameDest) begins with 'M'
# (merchants) and print their newbalanceDest column.
merchant_rows = df.loc[df['nameDest'].str.startswith('M')]

print(merchant_rows.loc[:, 'newbalanceDest'])

At first glance, the dataset has no missing values, so no null treatment was needed. However, since transactions to merchants don't carry meaningful destination-balance information, all rows whose destination is a merchant (nameDest starting with 'M') were removed.

In [None]:
# Keep only the rows whose destination id does not start with 'M',
# then show the number of nulls per column.
is_merchant_dest = df['nameDest'].str.startswith('M')
df = df.loc[~is_merchant_dest]
df.isna().sum()

In [None]:
# Time-ordered hold-out: sort by `step`, train on steps up to and including
# `train_split`, test on every later step.
df = df.sort_values('step')

train_split = 575

train = df.loc[df['step'].le(train_split)]
test = df.loc[df['step'].gt(train_split)]
train.isna().sum()

In [None]:
# Tag each partition with its origin (1 = train, 0 = test).
# `train` and `test` are boolean-mask selections of `df`; writing a column into
# them in place makes pandas emit SettingWithCopyWarning. `assign` returns a new
# frame with the extra column appended, and the cell stays safe to re-run.
train = train.assign(train_test=1)
test = test.assign(train_test=0)

In [None]:
# Class balance: the sum of the isFraud column versus the remaining rows.
true_count = df['isFraud'].sum()
false_count = df.shape[0] - true_count

print(f'True values (1): {true_count}')
print(f'False values (0): {false_count}')

In [None]:
# Columns that are standardized before modelling.
numeric_columns = ['step', 'amount',
                   'oldbalanceOrg', 'newbalanceOrig',
                   'oldbalanceDest', 'newbalanceDest']


def apply_transformations(df, scaler=None):
    """Encode the transaction type and standardize the numeric columns.

    The frame passed in is not modified; a transformed copy is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'type' column and every column listed in
        `numeric_columns`.
    scaler : object with a ``transform`` method, optional
        An already fitted scaler. When given it is applied without refitting,
        so several frames (e.g. train and test) can share the same statistics.
        When omitted, a new StandardScaler is fitted on `df` itself, which is
        what this function did before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        `df` without the 'type' column, with the numeric columns scaled and an
        integer 'type_encoded' column appended as the last column.
    """
    # Any type that is not listed (including missing values) gets the fallback
    # code. Previously such rows were renamed to 'PAYMENT', which had no entry
    # in the mapping and therefore produced NaN in 'type_encoded'.
    type_codes = {'CASH_IN': 1, 'CASH_OUT': 2, 'TRANSFER': 3, 'DEBIT': 4}
    other_type_code = 0

    # `drop` returns a new frame, so the caller's data is left untouched.
    df_encoded = df.drop(columns=['type'])
    df_encoded['type_encoded'] = (
        df['type'].map(type_codes).fillna(other_type_code).astype(int)
    )

    numeric_values = df_encoded[numeric_columns]
    if scaler is None:
        df_encoded[numeric_columns] = StandardScaler().fit_transform(numeric_values)
    else:
        df_encoded[numeric_columns] = scaler.transform(numeric_values)

    return df_encoded


# Fit the scaler on the training partition only and reuse it for the test
# partition, so both share one scale and no test statistics are used.
scaler = StandardScaler().fit(train[numeric_columns])

# Apply the transformations to both partitions.
training_final = apply_transformations(train, scaler=scaler)
test_final = apply_transformations(test, scaler=scaler)

# Training features and target. The numeric columns are already standardized
# by `apply_transformations`, so they are not scaled a second time here.
X_train_scaled = training_final.drop(columns=['isFraud', 'nameOrig', 'nameDest', 'isFlaggedFraud'])
y_train = training_final['isFraud']
X_train_scaled.columns = X_train_scaled.columns.astype(str)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline

# Mean imputation followed by standardization, fitted on the training features.
preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])
X_train_scaled_imputed = preprocessor.fit_transform(X_train_scaled)

# Oversample the training set with SMOTE (fixed seed for reproducibility).
smote = SMOTE(sampling_strategy=0.0075, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled_imputed, y_train)

# Wrap the resampled arrays in pandas objects, restoring the feature names.
X_train_resampled_df = pd.DataFrame(X_train_resampled, columns=X_train_scaled.columns)
y_train_resampled_df = pd.Series(y_train_resampled, name='isFraud')

# Test features and target.
# NOTE(review): unlike the training features, these are not passed through
# `preprocessor` - confirm this matches how the saved model was trained.
X_test_scaled = test_final.drop(columns=['isFraud', 'nameOrig', 'nameDest', 'isFlaggedFraud'])
X_test_scaled.columns = X_test_scaled.columns.astype(str)
y_test = test_final['isFraud']

# Report the size of both sets.
print("Train set transactions:", len(X_train_resampled_df))
print("Test set transactions:", len(X_test_scaled))

# Cut the resampled training set into consecutive chunks of at most
# `batch_size` rows (the last chunk may be shorter), then concatenate the
# chunks back into a single frame / series.
batch_size = 10000
num_batches = len(X_train_resampled_df) // batch_size

batch_starts = range(0, len(X_train_resampled_df), batch_size)
X_train_batches = [
    X_train_resampled_df.iloc[start:start + batch_size, :] for start in batch_starts
]
y_train_batches = [
    y_train_resampled_df.iloc[start:start + batch_size] for start in batch_starts
]

# Combine all the batches.
X_train_final = pd.concat(X_train_batches)
y_train_final = pd.concat(y_train_batches)

In [None]:
def load_model(filename):
    """Load a saved Keras model from `filename` and return it."""
    model = tf.keras.models.load_model(filename)
    return model


In [None]:
# Print the test-set target.
print(y_test)


In [None]:

from sklearn.metrics import mean_squared_error
import numpy as np

# Load the previously saved model from disk.
loaded_model = load_model('modelo_entrenado.h5')

def create_sequence_data(df, sequence_length, labels=None):
    """Build sliding windows of consecutive rows and one label per window.

    Window ``i`` holds rows ``i .. i + sequence_length - 1`` of `df`; its label
    is the label of the last row of that window.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows in the order in which they should be windowed. Every column is
        included in the windows.
    sequence_length : int
        Number of consecutive rows per window (must be >= 1).
    labels : array-like, optional
        One label per row of `df`, in the same order. When omitted, the last
        column of `df` is used as the label, which is only correct if that
        column really is the target.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` with shape ``(n, sequence_length, n_columns)`` and ``y`` with
        shape ``(n,)``, where ``n = max(len(df) - sequence_length, 0)``.

    Raises
    ------
    ValueError
        If `sequence_length` is smaller than 1 or `labels` does not have one
        entry per row of `df`.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be at least 1")

    # Convert once instead of calling `iloc` for every window.
    features = df.to_numpy()

    if labels is None:
        label_values = df.iloc[:, -1].to_numpy()
    else:
        label_values = np.asarray(labels)
        if len(label_values) != len(df):
            raise ValueError("labels must contain one entry per row of df")

    # NOTE(review): the window that ends on the final row is not produced
    # (the count is len(df) - sequence_length, not + 1). Kept unchanged so the
    # output stays compatible with existing callers - confirm this is intended.
    n_sequences = max(len(df) - sequence_length, 0)

    if n_sequences == 0:
        # Keep a 3-D shape so downstream shape handling does not break.
        empty_X = np.empty((0, sequence_length, features.shape[1]), dtype=features.dtype)
        return empty_X, label_values[:0].copy()

    X = np.stack([features[i:i + sequence_length] for i in range(n_sequences)])

    # Label of the last row of each window.
    first_label = sequence_length - 1
    y = label_values[first_label:first_label + n_sequences].copy()

    return X, y


def evaluate_model(model, X_test, y_test):
    """Predict on `X_test`, print the mean squared error against `y_test`.

    Parameters
    ----------
    model : object with a ``predict`` method
        Called as ``model.predict(X_test)``.
    X_test : array-like
        Inputs handed to the model unchanged.
    y_test : array-like
        True targets, one per prediction. A pandas Series, a NumPy array or a
        plain list are all accepted.

    Returns
    -------
    float
        The mean squared error that is also printed.
    """
    # Run the model on the test inputs.
    y_pred = model.predict(X_test)

    # Shape the targets as a column. `np.asarray` is used instead of `.values`
    # because `.values` exists only on pandas objects and the targets may
    # arrive as a plain NumPy array.
    y_test_reshaped = np.asarray(y_test).reshape(-1, 1)

    mse = mean_squared_error(y_test_reshaped, y_pred)
    print(f'Mean Squared Error on Test Data: {mse}')
    return mse


# Evaluate on sliding windows of 10 consecutive test transactions.
sequence_length = 10

# Only the feature windows are taken from the helper. The label it derives is
# the last column of the frame it receives, and X_test_scaled holds features
# only (isFraud was dropped), so that value is not the fraud target.
# The windows already have shape (n_windows, sequence_length, n_features), so
# no further reshape is needed.
X_test_scaled_reshaped = create_sequence_data(X_test_scaled, sequence_length)[0]

# True label of each window: isFraud of the window's last transaction.
# A separate name is used so `y_test` is not overwritten and the cell can be
# re-run; a pandas Series is also what `evaluate_model` expects.
first_label = sequence_length - 1
y_test_sequences = y_test.iloc[first_label:first_label + len(X_test_scaled_reshaped)]

# Evaluate the loaded model.
evaluate_model(loaded_model, X_test_scaled_reshaped, y_test_sequences)





In [None]:
# Display the dimensions of the test partition.
test.shape