In [12]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from tensorflow.keras.callbacks import EarlyStopping


#This is for the RNN Model.

# Load the transaction dataset
data = pd.read_csv('mock_transactions.csv')

# Preprocessing the dataset
# Selecting relevant columns for RNN training
features = ['Amount', 'City', 'State', 'Latitude', 'Longitude', 'Time']
target = 'Is Fraud?'

# Encode categorical columns (City and State)
label_encoders = {}
for col in ['City', 'State']:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Extract time-based features (hour, day of week, etc.)
data['Time'] = pd.to_datetime(data['Time'])
data['Hour'] = data['Time'].dt.hour
data['DayOfWeek'] = data['Time'].dt.dayofweek
data['Day'] = data['Time'].dt.day

# Update features list to include new time-based features
features = ['Amount', 'City', 'State', 'Latitude', 'Longitude', 'Hour', 'DayOfWeek', 'Day']

# Scaling numerical features
scaler = MinMaxScaler()
data[['Amount', 'Latitude', 'Longitude', 'Hour', 'DayOfWeek', 'Day']] = scaler.fit_transform(
    data[['Amount', 'Latitude', 'Longitude', 'Hour', 'DayOfWeek', 'Day']]
)

# Create feature and target arrays
X = data[features].values
y = data[target].values

# Reshape target for compatibility
y = np.reshape(y, (-1, 1))

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Reshape the input data for LSTM (samples, timesteps, features)
# Assuming each transaction is independent (timesteps=1)
X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))

# Building the RNN model
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))

model.add(LSTM(units=50, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(units=64, activation='relu'))  # Hidden dense layer
model.add(Dropout(0.2))
model.add(Dense(units=1, activation='sigmoid'))  # Output layer


# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Add EarlyStopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping])

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

# Make predictions on the test set
y_pred = (model.predict(X_test) > 0.5)

# Print classification report and AUC
print(classification_report(y_test, y_pred))
print(f"AUC: {roc_auc_score(y_test, y_pred)}")

# Save the model
model.save('fraud_detection_rnn.h5')


  data['Time'] = pd.to_datetime(data['Time'])


Epoch 1/50


  super().__init__(**kwargs)


[1m3833/3833[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - accuracy: 0.7750 - loss: 0.5382 - val_accuracy: 0.7747 - val_loss: 0.5342
Epoch 2/50
[1m3833/3833[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.7790 - loss: 0.5286 - val_accuracy: 0.7766 - val_loss: 0.5003
Epoch 3/50
[1m3833/3833[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.7891 - loss: 0.5029 - val_accuracy: 0.8192 - val_loss: 0.4650
Epoch 4/50
[1m3833/3833[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.8078 - loss: 0.4806 - val_accuracy: 0.8223 - val_loss: 0.4617
Epoch 5/50
[1m3833/3833[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.8106 - loss: 0.4779 - val_accuracy: 0.8028 - val_loss: 0.4864
Epoch 6/50
[1m3833/3833[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.8135 - loss: 0.4733 - val_accuracy: 0.8160 - val_loss: 0.4706
Epoch 7/50
[1m3833/3833[



              precision    recall  f1-score   support

           0       0.83      0.97      0.90     23754
           1       0.79      0.33      0.46      6909

    accuracy                           0.83     30663
   macro avg       0.81      0.65      0.68     30663
weighted avg       0.82      0.83      0.80     30663

AUC: 0.65110034168069


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

from sklearn.model_selection import GridSearchCV

#This is for the Random Forest Model

# Define hyperparameters grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# Initialize Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Apply GridSearchCV
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters
print(f"Best Hyperparameters: {grid_search.best_params_}")

# Use best model
best_rf_model = grid_search.best_estimator_
best_rf_model.fit(X_train, y_train)


# Load the dataset
data = pd.read_csv('mock_transactions.csv')

# Drop sensitive features
data = data.drop(['Card Number', 'CVV', 'Expires'], axis=1)

# Encode categorical features
label_encoders = {}
for col in ['Errors?', 'Has Chip', 'City', 'State']:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Extract time-based features from 'Time'
data['Time'] = pd.to_datetime(data['Time'])
data['Hour'] = data['Time'].dt.hour
data['DayOfWeek'] = data['Time'].dt.dayofweek
data['Day'] = data['Time'].dt.day

# Update feature list
features = ['Amount', 'Errors?', 'Use Chip', 'Has Chip', 'City', 'State', 'Latitude', 'Longitude', 'Hour', 'DayOfWeek', 'Day']
target = 'Is Fraud?'

# Separate features and target
X = data[features]
y = data[target]

# Normalize numerical features
scaler = StandardScaler()
X[['Amount', 'Latitude', 'Longitude']] = scaler.fit_transform(X[['Amount', 'Latitude', 'Longitude']])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))
print(f"AUC: {roc_auc_score(y_test, y_pred)}")


Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best Hyperparameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}


  data['Time'] = pd.to_datetime(data['Time'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[['Amount', 'Latitude', 'Longitude']] = scaler.fit_transform(X[['Amount', 'Latitude', 'Longitude']])


              precision    recall  f1-score   support

           0       0.89      0.99      0.94     27243
           1       0.38      0.05      0.08      3446

    accuracy                           0.88     30689
   macro avg       0.64      0.52      0.51     30689
weighted avg       0.83      0.88      0.84     30689

AUC: 0.5185518431960637
