In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import VotingClassifier

In [2]:
# Read the data
data=pd.read_csv("../data/synthetic_dataset.csv")
data.sample(5)
# Drop unnecessary columns
data1 = data.drop(columns=["Transaction_ID", "User_Account_ID"])

In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def preprocess_data(df):
    # Convert Transaction_Date to numerical (year, month, day)
    df['Transaction_Date'] = pd.to_datetime(df['Transaction_Date'])
    df['Transaction_Year'] = df['Transaction_Date'].dt.year
    df['Transaction_Month'] = df['Transaction_Date'].dt.month
    df['Transaction_Day'] = df['Transaction_Date'].dt.day

    # Convert Transaction_Time to numerical (hour, minute, second)
    df['Transaction_Time'] = pd.to_datetime(df['Transaction_Time'])
    df['Transaction_Hour'] = df['Transaction_Time'].dt.hour
    df['Transaction_Minute'] = df['Transaction_Time'].dt.minute
    df['Transaction_Second'] = df['Transaction_Time'].dt.second

    # Convert Account_Creation_Date to numerical (year, month, day)
    df['Account_Creation_Date'] = pd.to_datetime(df['Account_Creation_Date'])
    df['Account_Creation_Year'] = df['Account_Creation_Date'].dt.year
    df['Account_Creation_Month'] = df['Account_Creation_Date'].dt.month
    df['Account_Creation_Day'] = df['Account_Creation_Date'].dt.day

    # Encoding categorical column 'Payment_Method' using Label Encoding
    label_encoder = LabelEncoder()
    df['Payment_Method'] = label_encoder.fit_transform(df['Payment_Method'])
    df['Billing_Location']=label_encoder.fit_transform(df['Billing_Location'])
    df['Shipping_Location']=label_encoder.fit_transform(df['Shipping_Location'])
    # Convert Session_Duration and Time_Between_Transactions to numerical (remove 'seconds' suffix)
    df['Session_Duration'] = df['Session_Duration'].apply(lambda x: int(x.split()[0]))
    df['Time_Between_Transactions'] = df['Time_Between_Transactions'].apply(lambda x: int(x.split()[0]))
    
    

    # Convert IP_Address_History and Device_IP_Address to numerical (last part)
    df['IP_Address_History'] = df['IP_Address_History'].apply(lambda x: int(x.split('.')[-1]))
    df['Device_IP_Address'] = df['Device_IP_Address'].apply(lambda x: int(x.split('.')[-1]))

    # Dropping original non-useful columns after preprocessing
    columns_to_drop = ['Transaction_Date', 'Transaction_Time', 'Account_Creation_Date',]
    df.drop(columns=columns_to_drop, inplace=True)

    return df

# Usage example:
preprocess_data(data1)


  df['Transaction_Time'] = pd.to_datetime(df['Transaction_Time'])


Unnamed: 0,Transaction_Amount,Payment_Method,Billing_Location,Shipping_Location,Device_IP_Address,Session_Duration,Frequency_of_Transactions,Time_Between_Transactions,Unusual_Time_of_Transaction,Unusual_Transaction_Amounts,...,Fraud_Label,Transaction_Year,Transaction_Month,Transaction_Day,Transaction_Hour,Transaction_Minute,Transaction_Second,Account_Creation_Year,Account_Creation_Month,Account_Creation_Day
0,19122.08,0,31,31,225,454,4,466,0,1,...,0,2022,12,18,14,32,43,2022,5,4
1,2090.68,2,0,0,8,806,2,203,3,4,...,0,2022,2,24,7,58,38,2022,11,16
2,9247.22,0,30,30,158,462,3,1422,2,3,...,0,2022,8,9,12,59,59,2022,12,29
3,7686.80,1,68,68,60,363,1,1895,1,3,...,0,2022,12,22,0,20,42,2022,2,14
4,3693.87,1,95,69,188,263,8,1281,4,0,...,0,2022,2,21,19,20,27,2022,8,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,270.93,1,15,95,72,793,10,1082,2,4,...,0,2022,10,15,23,45,15,2022,8,9
9996,17618.49,2,70,70,94,780,1,883,1,2,...,0,2022,1,6,16,13,36,2022,6,1
9997,14286.99,0,100,100,184,275,1,106,4,4,...,0,2022,9,8,13,4,17,2022,6,11
9998,3036.76,1,1,71,6,602,4,1997,2,2,...,0,2022,12,17,0,42,10,2022,1,22


In [4]:
# Splitting into features (X) and target variable (y)
X = data1.drop(columns=['Fraud_Label'])
y = data1['Fraud_Label']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
# Initialize Random Forest, XGBoost, and Neural Network classifiers
rf_classifier = RandomForestClassifier(random_state=42)
xgb_classifier = XGBClassifier(random_state=42)
nn_classifier = MLPClassifier(random_state=42)

In [7]:
# Train the classifiers
rf_classifier.fit(X_train, y_train)



In [8]:
xgb_classifier.fit(X_train, y_train)


In [9]:
nn_classifier.fit(X_train, y_train)

In [10]:
# Make predictions on the test set for each classifier
y_pred_rf = rf_classifier.predict(X_test)
y_pred_xgb = xgb_classifier.predict(X_test)
y_pred_nn = nn_classifier.predict(X_test)

In [11]:
# Evaluate Random Forest model performance
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf:.2f}")
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.87
Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.86      1.00      0.92      1586
           1       1.00      0.36      0.53       414

    accuracy                           0.87      2000
   macro avg       0.93      0.68      0.73      2000
weighted avg       0.89      0.87      0.84      2000



In [12]:
# Evaluate XGBoost model performance
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Accuracy: {accuracy_xgb:.2f}")
print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb))

XGBoost Accuracy: 0.95
XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1586
           1       1.00      0.78      0.88       414

    accuracy                           0.95      2000
   macro avg       0.97      0.89      0.92      2000
weighted avg       0.96      0.95      0.95      2000



In [13]:
# Evaluate Neural Network model performance
accuracy_nn = accuracy_score(y_test, y_pred_nn)
print(f"Neural Network Accuracy: {accuracy_nn:.2f}")
print("Neural Network Classification Report:")
print(classification_report(y_test, y_pred_nn))

Neural Network Accuracy: 0.78
Neural Network Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.89      0.87      1586
           1       0.47      0.36      0.41       414

    accuracy                           0.78      2000
   macro avg       0.66      0.63      0.64      2000
weighted avg       0.77      0.78      0.77      2000



In [14]:
# Initialize an ensemble using VotingClassifier
ensemble_classifier = VotingClassifier(estimators=[
    ('random_forest', rf_classifier),
    ('xgboost', xgb_classifier),
    ('neural_network', nn_classifier)
], voting='soft')


In [15]:
# Train the ensemble
ensemble_classifier.fit(X_train, y_train)

In [16]:

# Make predictions on the test set for the ensemble
y_pred_ensemble = ensemble_classifier.predict(X_test)

In [17]:
# Evaluate the ensemble model performance
accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)
print(f"Ensemble Accuracy: {accuracy_ensemble:.2f}")
print("Ensemble Classification Report:")
print(classification_report(y_test, y_pred_ensemble))

Ensemble Accuracy: 0.88
Ensemble Classification Report:
              precision    recall  f1-score   support

           0       0.87      1.00      0.93      1586
           1       1.00      0.42      0.59       414

    accuracy                           0.88      2000
   macro avg       0.93      0.71      0.76      2000
weighted avg       0.90      0.88      0.86      2000



In [19]:
sample_input = {
    'Transaction_Amount': 1230.6,
    'Transaction_Date': '2023-06-08',
    'Transaction_Time': '15:00:00',
    'Account_Creation_Date': '2022-08-01',
    'Payment_Method': 'UPI',
    'Billing_Location': 'Mumbai',
    'Shipping_Location': 'Mumbai',
    'Device_IP_Address': '203.45.67.113',
    'Session_Duration': '720 seconds',
    'Frequency_of_Transactions': 9,
    'Time_Between_Transactions': '150 seconds',
    'Unusual_Time_of_Transaction': 1,
    'Unusual_Transaction_Amounts': 0,
    'IP_Address_History': '203.45.67.113'}
sample_df = pd.DataFrame([sample_input])
sample_df_processed = preprocess_data(sample_df)
fraud_prediction_ensemble = ensemble_classifier.predict(sample_df_processed)
print(f"Ensemble Predicted Fraud Label for Sample Input: {fraud_prediction_ensemble[0]}")

Ensemble Predicted Fraud Label for Sample Input: 0
