In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import itertools
import warnings
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

Disclaimer: due to hardware limitations, I had to train the model with an under sample from the dataset. The complete set would have been used otherwise, as well as further optimization of hyper parameters and aditional machine learning models (randomforest, svc), and also a recurrent neural network.

Index:
    1 - Data preprocesing
    
    2 - Looking for patterns and relations between fraudulent transactions
    
    3 - Training the models
    
    4 - Conclusion

In [2]:
df = pd.read_csv('Fraud.csv')

In [3]:
%matplotlib inline
df.columns

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

In [4]:
df.describe()

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0,6362620.0
mean,243.3972,179861.9,833883.1,855113.7,1100702.0,1224996.0,0.00129082,2.514687e-06
std,142.332,603858.2,2888243.0,2924049.0,3399180.0,3674129.0,0.0359048,0.001585775
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,156.0,13389.57,0.0,0.0,0.0,0.0,0.0,0.0
50%,239.0,74871.94,14208.0,0.0,132705.7,214661.4,0.0,0.0
75%,335.0,208721.5,107315.2,144258.4,943036.7,1111909.0,0.0,0.0
max,743.0,92445520.0,59585040.0,49585040.0,356015900.0,356179300.0,1.0,1.0


In [5]:
# Filter rows that nameDest starts with 'M' (Merchant)
merchant_rows = df[df['nameDest'].str.startswith('M')]

print(merchant_rows['newbalanceDest'])

0          0.0
1          0.0
4          0.0
5          0.0
6          0.0
          ... 
6362312    0.0
6362314    0.0
6362316    0.0
6362318    0.0
6362319    0.0
Name: newbalanceDest, Length: 2151495, dtype: float64


At first glance, the dataset had no missing values, so no null treatment was done. However, given that Merchant movements doesn't have meaningful information, all the M customers were removed.

In [6]:
df = df[~df['nameDest'].str.startswith('M')]
df.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [7]:
df = df.sort_values(by='step')

train_split = 575

train = df[df['step'] <= train_split]
test = df[df['step'] > train_split]
train.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [8]:
train['train_test'] = 1
test['train_test'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['train_test'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['train_test'] = 0


In [9]:
true_count = df['isFraud'].sum()
false_count = len(df) - true_count

print(f'True values (1): {true_count}')
print(f'False values (0): {false_count}')

True values (1): 8213
False values (0): 4202912


In [10]:
def apply_transformations(df):
    categories = ['CASH_IN', 'CASH_OUT', 'TRANSFER', 'DEBIT']  # Lista de categorías conocidas
    
    # Reemplazamos cualquier categoría desconocida por 'PAYMENT' (o la categoría que desees)
    df['type'] = df['type'].apply(lambda x: x if x in categories else 'PAYMENT')
    
    category_mapping = {'CASH_IN': 1, 'CASH_OUT': 2, 'TRANSFER': 3, 'DEBIT': 4}
    
    df['type_encoded'] = df['type'].map(category_mapping)
    
    numeric_columns = ['step', 'amount', 
                       'oldbalanceOrg', 'newbalanceOrig', 
                       'oldbalanceDest', 'newbalanceDest']
    
    scaler = StandardScaler()
    df[numeric_columns] = scaler.fit_transform(df[numeric_columns])
    
    df_encoded = df.drop(['type'], axis=1)
    
    return df_encoded

# Aplicar transformaciones a los conjuntos de datos
training_final = apply_transformations(train.copy())
test_final = apply_transformations(test.copy())



numeric_columns = ['step', 'amount',
                   'oldbalanceOrg', 'newbalanceOrig',
                   'oldbalanceDest', 'newbalanceDest']

scaler = StandardScaler()
X_train_scaled = training_final.drop(['isFraud', 'nameOrig', 'nameDest', 'isFlaggedFraud'], axis=1)
X_train_scaled[numeric_columns] = scaler.fit_transform(X_train_scaled[numeric_columns])
y_train = training_final['isFraud']
X_train_scaled.columns = X_train_scaled.columns.astype(str)

In [11]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.utils import shuffle
from sklearn.pipeline import Pipeline

# Imputación y escalado
preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

X_train_scaled_imputed = preprocessor.fit_transform(X_train_scaled)

# Sobremuestreo con SMOTE
smote = SMOTE(sampling_strategy=0.0075, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled_imputed, y_train)

# Convertir a DataFrames
X_train_resampled_df = pd.DataFrame(X_train_resampled, columns=X_train_scaled.columns)
y_train_resampled_df = pd.Series(y_train_resampled, name='isFraud')

# Crear conjunto de prueba
X_test_scaled = test_final.drop(['isFraud', 'nameOrig', 'nameDest', 'isFlaggedFraud'], axis=1)
X_test_scaled.columns = X_test_scaled.columns.astype(str)
y_test = test_final['isFraud']

# Mostrar información sobre el tamaño del conjunto de entrenamiento y prueba
print("Train set transactions:", X_train_resampled_df.shape[0])
print("Test set transactions:", X_test_scaled.shape[0])

# Procesamiento por lotes para el conjunto de entrenamiento
batch_size = 10000
num_batches = len(X_train_resampled_df) // batch_size

X_train_batches = []
y_train_batches = []

for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = (i + 1) * batch_size
    X_batch = X_train_resampled_df.iloc[start_idx:end_idx, :]
    y_batch = y_train_resampled_df.iloc[start_idx:end_idx]
    X_train_batches.append(X_batch)
    y_train_batches.append(y_batch)

# Si hay un lote final con un tamaño menor que batch_size
if len(X_train_resampled_df) % batch_size != 0:
    X_batch = X_train_resampled_df.iloc[num_batches * batch_size:, :]
    y_batch = y_train_resampled_df.iloc[num_batches * batch_size:]
    X_train_batches.append(X_batch)
    y_train_batches.append(y_batch)

# Combinar todos los lotes
X_train_final = pd.concat(X_train_batches)
y_train_final = pd.concat(y_train_batches)

Train set transactions: 4128735
Test set transactions: 106766


In [12]:
print("Forma de X_train:", X_train_final.shape)
print("Forma de y_train:", y_train_final.shape)

Forma de X_train: (4128735, 8)
Forma de y_train: (4128735,)


In [13]:

# 2. Creación de secuencias de datos para el modelo LSTM
def create_sequence_data(data, sequence_length):
    X, y = [], []
    for i in range(len(data) - sequence_length):
        X.append(data[i:i+sequence_length])
        y.append(data[i+sequence_length])
    return np.array(X), np.array(y)

# 3. Construcción del modelo LSTM
def build_lstm_model(input_shape):
    model = tf.keras.Sequential([
            tf.keras.layers.LSTM(50, return_sequences=True, input_shape=input_shape),
            tf.keras.layers.LSTM(50, return_sequences=False),
            tf.keras.layers.Dense(1)
            ])
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# 4. Entrenamiento del modelo LSTM
def train_lstm_model(model, X_train, y_train, epochs):
    history = model.fit(X_train, y_train, epochs=epochs, verbose=1)
    return history

# 5. Visualización de resultados (opcional)
def visualize_lstm_results(model, X_test, y_test, scaler):
    y_pred = model.predict(X_test)
    
    # Asegúrate de que y_pred tiene la misma cantidad de columnas que y_test
    if y_pred.shape[1] != y_test.shape[1]:
        y_pred = scaler.inverse_transform(y_pred.reshape(-1, 1))  # Ajusta la forma si es necesario
        y_test = scaler.inverse_transform(y_test.reshape(-1, 1))
    else:
        y_pred = scaler.inverse_transform(y_pred)
        y_test = scaler.inverse_transform(y_test)

    plt.figure(figsize=(12, 6))
    plt.plot(y_test, label='Observado')
    plt.plot(y_pred, label='Predicción', color='red')
    plt.legend(loc='best')
    plt.title('Resultados del Modelo LSTM')
    plt.show()

# Modificaciones para usar tus datos
sequence_length = 10  # Ajusta la longitud de la secuencia según tus necesidades
X_train, y_train = create_sequence_data(X_train_final.values, sequence_length)
X_test, y_test = create_sequence_data(X_test_scaled.values, sequence_length)

# Ajustes para procesamiento por lotes
batch_size = 10000  # Ajusta el tamaño del lote según tus necesidades
epochs = 10  # Ajusta el número de épocas según tus necesidades
input_shape = (X_train.shape[1], X_train.shape[2])
lstm_model = build_lstm_model(input_shape)



In [14]:
print("Forma de X_train:", X_train.shape)
print("Forma de y_train:", y_train.shape)

Forma de X_train: (4128725, 10, 8)
Forma de y_train: (4128725, 8)


In [15]:
history = lstm_model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1)





Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [30]:
# 5. Visualización de resultados (opcional)
def visualize_lstm_results(model, X_test, y_test, scaler):
    y_pred = model.predict(X_test)
    
    # Asegúrate de que y_pred tiene la misma cantidad de columnas que y_test
    if y_pred.shape[1] != y_test.shape[1]:
        # Si las dimensiones no coinciden, ajusta directamente la forma de y_pred
        y_pred = y_pred[:, :y_test.shape[1]]
    
    # Invierte la transformación solo para las columnas relevantes
    y_pred[:, :] = scaler.inverse_transform(y_pred[:, :])
    
    y_test = scaler.inverse_transform(y_test)

    plt.figure(figsize=(12, 6))
    plt.plot(y_test, label='Observado')
    plt.plot(y_pred, label='Predicción', color='red')
    plt.legend(loc='best')
    plt.title('Resultados del Modelo LSTM')
    plt.show()





In [31]:
# Visualización de resultados
visualize_lstm_results(lstm_model, X_test, y_test, scaler)



ValueError: non-broadcastable output operand with shape (106756,1) doesn't match the broadcast shape (106756,6)

In [17]:
# Crear secuencias para el conjunto de prueba
X_test_sequences, y_test_sequences = create_sequence_data(X_test_scaled.values, sequence_length)
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Asumiendo que y_train_final es una serie temporal unidimensional
sarimax_model = SARIMAX(y_train_final, order=(p, d, q), seasonal_order=(P, D, Q, s))
sarimax_results = sarimax_model.fit()

# Realizar predicciones
sarimax_predictions = sarimax_results.get_forecast(steps=len(X_test_sequences))
# Suponiendo que sarimax_predictions.values contiene las predicciones
combined_predictions = np.concatenate([sarimax_predictions.values, lstm_predictions], axis=1)


NameError: name 'p' is not defined

In [33]:
# 5. Guardar el modelo entrenado
def save_model(model, filename):
    model.save(filename)

save_model(lstm_model, 'modelo_entrenado.h5')

  saving_api.save_model(


In [34]:
def load_model(filename):
    return tf.keras.models.load_model(filename)

# Cargar el modelo en futuras sesiones
loaded_model = load_model('modelo_entrenado.h5')

In [37]:

from sklearn.metrics import mean_squared_error
import numpy as np
# 7. Evaluación del modelo cargado
def evaluate_model(model, X_test, y_test, scaler):
    # Realiza predicciones en los datos de prueba
    y_pred = model.predict(X_test)

    # Ajusta la forma de y_pred si es necesario
    if y_pred.shape[1] != y_test.shape[1]:
        # Si las dimensiones no coinciden, ajusta directamente la forma de y_pred
        y_pred = y_pred[:, :y_test.shape[1]]

    # Invierte la transformación solo para las columnas relevantes
    y_pred_inverse = scaler.inverse_transform(y_pred)

    # Invierte la transformación de y_test
    y_test_inverse = scaler.inverse_transform(y_test)

    # Evalúa el rendimiento del modelo (puedes usar diferentes métricas según tu problema)
    mse = mean_squared_error(y_test_inverse, y_pred_inverse)
    print(f'Mean Squared Error on Test Data: {mse}')

# Ejemplo de uso para evaluar el modelo cargado
# Suponiendo que tienes un conjunto de prueba X_test y y_test
# Puedes cargar el modelo entrenado y evaluar su rendimiento en los datos de prueba

# Cargar el modelo
loaded_model = load_model('modelo_entrenado.h5')

# Preparar los datos de prueba (puedes cargarlos de tu conjunto de datos)
X_test, y_test = create_sequence_data(test, sequence_length)

# Escalar los datos de prueba usando el mismo scaler utilizado durante el entrenamiento
X_test_scaled = scaler.transform(X_test)

# Evaluar el modelo cargado
evaluate_model(loaded_model, X_test_scaled, y_test, scaler)








KeyError: 10