In [2]:
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras import layers
from scikeras.wrappers import KerasClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
from sklearn.metrics._plot.confusion_matrix import ConfusionMatrixDisplay
import pickle as pkl


file_path="../data/transaction_detail.csv"
def load_data(file_path):
    df = pd.read_csv("../data/transaction_detail.csv")
    return df



def train_models(X_train, y_train):
    # XGBoost model
    xgb_model = xgb.XGBClassifier()
    xgb_model.fit(X_train, y_train)

    # Random Forest model
    rf_model = RandomForestClassifier()
    rf_model.fit(X_train, y_train)

    # Neural Network model
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    model_nn = keras.Sequential([
        layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        layers.Dense(32, activation='relu'),
        layers.Dense(16, activation='relu'),
        layers.Dense(8, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])

    model_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model_nn.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=0)

    return xgb_model, rf_model, model_nn

def train_ensemble(xgb_model, rf_model, model_nn, X_train, y_train):
    # Ensemble model
    ensemble_model = VotingClassifier(estimators=[
        ('xgb', xgb_model),
        ('rf', rf_model),
        ('nn', KerasClassifier(build_fn=model_nn, epochs=1, batch_size=32, verbose=0))
    ], voting='soft')  # Use 'soft' for averaging probabilities, 'hard' for voting

    # Fit the ensemble model
    ensemble_model.fit(X_train, y_train)

    return ensemble_model

def save_model(model, file_name):
    with open(file_name, 'wb') as model_file:
        pkl.dump(model, model_file)


def preprocess_input(input_data):
    # Create a DataFrame from the input data
    processed_input = pd.DataFrame(input_data)

    # Convert 'Transaction_Date' to datetime
    processed_input['Transaction_Date'] = pd.to_datetime(processed_input['Transaction_Date'], format='%Y-%m-%d')

    # Extract additional features from datetime columns if needed
    processed_input['Transaction_Year'] = processed_input['Transaction_Date'].dt.year
    processed_input['Transaction_Month'] = processed_input['Transaction_Date'].dt.month
    processed_input['Transaction_Day'] = processed_input['Transaction_Date'].dt.day
    processed_input['Transaction_Hour'] = processed_input['Transaction_Date'].dt.hour
    processed_input['Transaction_Minute'] = processed_input['Transaction_Date'].dt.minute

    # Drop original datetime columns
    processed_input = processed_input.drop(['Transaction_Date'], axis=1)

    # Perform one-hot encoding for categorical columns
    processed_input = pd.get_dummies(processed_input)

    return processed_input



def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)

    return accuracy, conf_matrix, class_report

# Load and preprocess data
file_path = '../data/transaction_detail.csv'
df = load_data(file_path)
data = preprocess_input(df)

# Split the data into features (X) and target variable (y)
X = data.drop('Fraud_Label', axis=1)
y = df['Fraud_Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)




In [3]:
X_train.shape

(54, 248)

In [4]:
def predict_fraud(ensemble_model, input_data):
    # Load the ensemble model from the pickle file
    with open(ensemble_model, 'rb') as model_file:
        loaded_model = pkl.load(model_file)

    # Preprocess the input data
    processed_input = input_data # preprocess_input(input_data)

    # Make predictions using the loaded model
    fraud_prediction = loaded_model.predict(processed_input)

    return fraud_prediction


In [5]:

# Provided input data
input_data1 ={
    'Transaction_Amount': [1500],
    'User_Account_ID': [104],
    'Transaction_Date': ['2022-11-15'],
    'Payment_Method': ['Credit Card'],
    'Billing_Location': ['Bangalore'],
    'Shipping_Location': ['Hyderabad'],
    'Device_IP_Address': ['192.168.1.40'],
    'Session_Duration': [500],
    'Transaction_Time': [500],
    'Frequency_of_Transactions': [7],
    'Time_Between_Transactions': [80],
    'Unusual_Time_of_Transaction': [0],
    'Unusual_Transaction_Amounts': [0],
    'IP_Address_History': ['192.168.1.40'],
}

a=preprocess_input(input_data1)

# Create a DataFrame from the input data
input_df = pd.DataFrame(a)

# Convert the DataFrame to a NumPy array
input_array = input_df.to_numpy()

# Display the NumPy array
print(input_array)


[[1500 104 500 500 7 80 0 0 2022 11 15 0 0 True True True True True]]


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from tensorflow import keras
from tensorflow.keras import layers
from scikeras.wrappers import KerasClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle as pkl

# Load data from a CSV file
def load_data(file_path):
    df = pd.read_csv("../data/transaction_detail.csv")
    return df

# Preprocess the input data
def preprocess_input(input_data):
    # Drop unnecessary columns
    cols_to_drop = ['Transaction_ID', 'Transaction_Time', 'User_Account_ID', 'Account_Creation_Date', 'IP_Address_History']
    processed_input = input_data.drop(cols_to_drop, axis=1)

    # Convert 'Transaction_Date' to datetime
    processed_input['Transaction_Date'] = pd.to_datetime(processed_input['Transaction_Date'], format='%Y-%m-%d')

    # Perform one-hot encoding for categorical columns
    processed_input = pd.get_dummies(processed_input, columns=['Payment_Method', 'Billing_Location', 'Shipping_Location'])

    return processed_input

# Train models using XGBoost, Random Forest, and Neural Network
def train_models(X_train, y_train):
    # XGBoost model
    xgb_model = XGBClassifier()
    xgb_model.fit(X_train, y_train)

    # Random Forest model
    rf_model = RandomForestClassifier()
    rf_model.fit(X_train, y_train)

    # Neural Network model
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    model_nn = keras.Sequential([
        layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        layers.Dense(32, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])

    model_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model_nn.fit(X_train_scaled, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=0)

    return xgb_model, rf_model, model_nn

# Train an ensemble model using the trained models
def train_ensemble(xgb_model, rf_model, model_nn, X_train, y_train):
    ensemble_model = VotingClassifier(estimators=[
        ('xgb', xgb_model),
        ('rf', rf_model),
        ('nn', KerasClassifier(build_fn=model_nn, epochs=1, batch_size=32, verbose=0))
    ], voting='soft')

    ensemble_model.fit(X_train, y_train)
    return ensemble_model

# Save the trained ensemble model to a file
def save_model(model, file_name):
    with open(file_name, 'wb') as model_file:
        pkl.dump(model, model_file)

# Load the dataset
file_path = 'path_to_your_file.csv'  # Replace with the actual path to your dataset file
df = load_data(file_path)

# Preprocess the data
data = preprocess_input(df)

# Split the data into features (X) and target variable (y)
X = data.drop('Fraud_Label', axis=1)
y = data['Fraud_Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Train models
xgb_model, rf_model, model_nn = train_models(X_train, y_train)

# Train the ensemble model
ensemble_model = train_ensemble(xgb_model, rf_model, model_nn, X_train, y_train)

# Save the ensemble model
save_model(ensemble_model, 'ensemble_model.pkl')

# Predict with new input data
def predict_with_new_input(new_input_data):
    new_data = preprocess_input(pd.DataFrame(new_input_data))
    new_data_aligned = new_data.reindex(columns=X.columns, fill_value=0)
    predictions = ensemble_model.predict(new_data_aligned)
    return predictions

# Example usage of predict_with_new_input function
new_input_data = {
    'Transaction_Amount': [1200],
    'Transaction_Date': ['2023-06-01'],
    # Add other relevant columns similar to your dataset
}

new_predictions = predict_with_new_input(new_input_data)
print(new_predictions)


KeyError: "None of [Index(['Payment_Method', 'Other_Categorical_Columns'], dtype='object')] are in the [columns]"