# Key Features

- **21 features** capturing various aspects of a financial transaction
- **Realistic structure** with numerical, categorical, and temporal data
- **Binary fraud labels** (`0 = Not Fraud`, `1 = Fraud`)
- **Useful for** anomaly detection, risk analysis, and security research


| Column Name                  | Description                                                         |
| ---------------------------- | ------------------------------------------------------------------- |
| Transaction_ID               | Unique identifier for each transaction                              |
| User_ID                      | Unique identifier for the user                                      |
| Transaction_Amount           | Amount of money involved in the transaction                         |
| Transaction_Type             | Type of transaction (Online, In-Store, ATM, etc.)                   |
| Timestamp                    | Date and time of the transaction                                    |
| Account_Balance              | User's current account balance before the transaction               |
| Device_Type                  | Type of device used (Mobile, Desktop, etc.)                         |
| Location                     | Geographical location of the transaction                            |
| Merchant_Category            | Type of merchant (Retail, Food, Travel, etc.)                       |
| IP_Address_Flag              | Whether the IP address was flagged as suspicious (0 or 1)           |
| Previous_Fraudulent_Activity | Number of past fraudulent activities by the user                    |
| Daily_Transaction_Count      | Number of transactions made by the user that day                    |
| Avg_Transaction_Amount_7d    | User's average transaction amount in the past 7 days                |
| Failed_Transaction_Count_7d  | Count of failed transactions in the past 7 days                     |
| Card_Type                    | Type of payment card used (Credit, Debit, Prepaid, etc.)            |
| Card_Age                     | Age of the card in months                                           |
| Transaction_Distance         | Distance between the user's usual location and transaction location |
| Authentication_Method        | How the user authenticated (PIN, Biometric, etc.)                   |
| Risk_Score                   | Fraud risk score computed for the transaction                       |
| Is_Weekend                   | Whether the transaction occurred on a weekend (0 or 1)              |
| Fraud_Label                  | Target variable (`0 = Not Fraud`, `1 = Fraud`)                      |


# Importing the Dependencies

In [None]:
!pip install xgboost


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification
from xgboost import XGBClassifier
import joblib
from sklearn.metrics import ConfusionMatrixDisplay


In [None]:
# Suppress every FutureWarning for the rest of the session so that
# cell output stays readable.
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Data Collection

In [None]:
# Read the synthetic fraud dataset from a CSV file (relative path) into a DataFrame
file_path = 'synthetic_fraud_dataset.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Data Exploration

In [None]:
# Preview the first five rows of the data
df.head()

In [None]:
# Preview the last five rows of the data
df.tail()

In [None]:
# Print the number of columns and rows.
# NOTE: df.shape[1] counts every column, so the "features" figure also includes
# the identifier columns and the Fraud_Label target.
print("number of features in the Dataset:",df.shape[1])
print("number of instances in the Dataset:",df.shape[0])

In [None]:
# Print the column names
print(df.columns)

In [None]:
# Print a concise summary of the DataFrame (columns, dtypes, non-null counts)
df.info()

In [None]:
# Parse the Timestamp column into datetimes. errors='coerce' converts values that
# cannot be parsed into NaT instead of raising, so bad timestamps show up as
# missing values rather than stopping the notebook.
df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')

# Check result
print(df['Timestamp'].head())
print(df['Timestamp'].dtype)

In [None]:
# Descriptive statistics for the dataset
df.describe()

In [None]:
# Count rows that are exact duplicates of an earlier row
print('Number of Duplicated Rows :',df.duplicated().sum())

In [None]:
# Count missing values per column
print(df.isna().sum())

In [None]:
# Mean of every numeric column, computed separately for each fraud label
numeric_cols = df.select_dtypes(include='number').columns
df.groupby('Fraud_Label')[numeric_cols].mean()

In [None]:
# Names of the numeric columns
numeric_cols

In [None]:
# Columns stored with object dtype, treated here as the categorical columns
categorical_cols = df.select_dtypes(include='object').columns

In [None]:
# Names of the object-dtype columns
categorical_cols

# Data Visualization

In [None]:
# Count how many transactions fall into each class of the 'Fraud_Label' column
status_counts = df['Fraud_Label'].value_counts()
print(status_counts)

# 0 = Not Fraud
# 1 = Fraud

In [None]:
# Pie chart of the class distribution (0 = Not Fraud, 1 = Fraud),
# drawn from the status_counts computed in the previous cell.
plt.figure(figsize=(6,6))
plt.pie(status_counts, labels=status_counts.index, autopct='%1.2f%%', startangle=90, colors=['lightpink', 'lightgreen'])
# The chart shows the fraud label, so the title must say so
plt.title('Distribution of Fraud Label')
plt.tight_layout()
plt.legend()
plt.show()

 **Data seems imbalanced**

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap
df_numeric = df.select_dtypes(include='number')
correlations = df_numeric.corr()
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(correlations, annot=True, ax=ax)
ax.set_title('Correlation Heatmap of Dataset', fontsize=16)
plt.show()

# Preprocessing

In [None]:
# Deleting non-relevant columns: the transaction and user identifiers are removed
# so they are not used as model features
df = df.drop(columns=['Transaction_ID', 'User_ID'])

In [None]:
# Derive calendar features from the transaction timestamp
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
df['Hour'] = df['Timestamp'].dt.hour
df['DayOfWeek'] = df['Timestamp'].dt.dayofweek  # Monday=0 ... Sunday=6
df['Month'] = df['Timestamp'].dt.month
# Saturday/Sunday flag. Stored as 0/1 integers (not bool) so the column matches its
# documented encoding and stays numeric for the resampling and scaling steps.
df['Is_Weekend'] = (df['DayOfWeek'] >= 5).astype(int)

# Drop the raw Timestamp
df = df.drop(columns=['Timestamp'])

In [None]:
# Re-check columns and dtypes after the preprocessing steps so far
df.info()

In [None]:
from sklearn.preprocessing import LabelEncoder

categorical_cols = [
    'Transaction_Type',
    'Device_Type',
    'Location',
    'Merchant_Category',
    'Card_Type',
    'Authentication_Method'
]

# Fit one LabelEncoder per column and keep it in label_encoders, so every integer
# code can be mapped back to its category (inverse_transform) or re-applied to new
# data at inference time. A single shared encoder would only remember the classes
# of the last column it was fitted on.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le



# Balancing Dataset

In [None]:
# Count how many rows belong to each class of the 'Fraud_Label' column before balancing
status_counts = df['Fraud_Label'].value_counts()
print(status_counts)

In [None]:
# Separating the features (X) and the target (y)
X = df.drop(columns='Fraud_Label')
y = df['Fraud_Label'] 

In [None]:
print('Feature Shape Before Balancing :', X.shape)
print('Target Shape Before Balancing :', y.shape)

In [None]:
# Intialising SMOTE Object
sm = SMOTE(random_state=300)

In [None]:
# Resampling The  Data
X, y = sm.fit_resample(X, y)

In [None]:
print('Feature Shape After Balancing :', X.shape)
print('Target Shape After Balancing :', y.shape)

In [None]:
# Count the occurrences of each category in the 'status' column
status_counts = y.value_counts()
print(status_counts)

In [None]:
# Initialize the MinMaxScaler
scaler = MinMaxScaler((-1,1))

In [None]:
# Apply the scaler to X (features only)
X_features = scaler.fit_transform(X)
y_labels =y

In [None]:
# splitting the dataset into traning and testing sets 
X_train , X_test , y_train , y_test = train_test_split(X_features, y_labels , test_size=0.2, random_state=20)

In [None]:
print('X_train shape :', X_train.shape)
print('X_test shape :', X_test.shape)
print('y_train shape :', y_train.shape)
print('y_test shape :', y_test.shape)

# ML Model Training

# Logistic Regression

In [None]:
# Initializing the logistic regression model with default settings
lrmodel = LogisticRegression()

In [None]:
# Fit on the training split and predict labels for the test split
lrmodel.fit(X_train, y_train)
y_test_predlr = lrmodel.predict(X_test)

In [None]:
# Classification report on the test split
print(classification_report(y_test, y_test_predlr))

In [None]:
# Predictions on the training split, used for the training accuracy
y_train_predlr = lrmodel.predict(X_train)

In [None]:
# Accuracy of logistic regression on the training split and on the test split
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_predlr)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_predlr)

# Report both scores rounded to two decimals
for split_name, score in (('Training', train_accuracy), ('Test', test_accuracy)):
    print(f'{split_name} Accuracy: {score:.2f}')

### Confusion matrix of Logistic regression model

In [None]:
# Confusion matrix of the logistic regression predictions on the test split
cm = confusion_matrix(y_test, y_test_predlr)
disp = ConfusionMatrixDisplay(cm, display_labels=lrmodel.classes_)

# Draw it with the blue colour map and a title lifted slightly above the axes
disp.plot(cmap='Blues')
disp.ax_.set_title('Confusion Matrix for Logistic Regression', y=1.1)
plt.show()

### AUC of Logistic regression model

In [None]:
# ROC curve and AUC for logistic regression, computed from the second
# predict_proba column (probability of the class listed second in classes_)
y_pred_proba = lrmodel.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr, tpr, label="data 1, auc="+str(auc))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Curve')
# A single legend call suffices: loc=4 and "lower right" name the same position
plt.legend(loc="lower right")
# Render explicitly, like the other plotting cells, instead of echoing the Legend object
plt.show()

In [None]:
# Persist the fitted logistic regression model to 'lrmodel.pkl' with joblib
joblib.dump(lrmodel, 'lrmodel.pkl')

# DecisionTreeClassifier

In [None]:
# Initializing the DecisionTreeClassifier model with default settings
# NOTE(review): no random_state is passed, so results may vary between runs — confirm
DTmodel=DecisionTreeClassifier()

In [None]:
# Fit on the training split and predict labels for the test split
DTmodel.fit(X_train , y_train)
y_test_predDT = DTmodel.predict(X_test)

In [None]:
# Classification report on the test split
print(classification_report(y_test, y_test_predDT))

In [None]:
# Predictions on the training split, used for the training accuracy
y_train_predDT = DTmodel.predict(X_train)

In [None]:
# Accuracy of the decision tree on the training split and on the test split
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_predDT)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_predDT)

# Report both scores rounded to two decimals
for split_name, score in (('Training', train_accuracy), ('Test', test_accuracy)):
    print(f'{split_name} Accuracy: {score:.2f}')

# Confusion Matrix for DecisionTreeClassifier

In [None]:
# Confusion matrix of the decision tree predictions on the test split
cm = confusion_matrix(y_test, y_test_predDT)
disp = ConfusionMatrixDisplay(cm, display_labels=DTmodel.classes_)

# Draw it with the blue colour map and a title lifted slightly above the axes
disp.plot(cmap='Blues')
disp.ax_.set_title('Confusion Matrix for DecisionTreeClassifier', y=1.1)
plt.show()

# AUC of DecisionTreeClassifier

In [None]:
# ROC curve and AUC for the decision tree, computed from the second
# predict_proba column (probability of the class listed second in classes_)
y_pred_proba = DTmodel.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr, tpr, label="data 1, auc="+str(auc))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Curve')
# A single legend call suffices: loc=4 and "lower right" name the same position
plt.legend(loc="lower right")
# Render explicitly, like the other plotting cells, instead of echoing the Legend object
plt.show()

# KNN Classifier

In [None]:
# Sweep k = 2 .. Ks-1 and record the test accuracy of a KNN classifier for each k.
# NOTE(review): k is compared on the test split here; a validation split or
# cross-validation would avoid tuning on the data used for the final evaluation.
Ks = 10
mean_acc = []
for n in range(2, Ks):
    # Train model and predict
    neigh = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)
    y_pred = neigh.predict(X_test)
    mean_acc.append(metrics.accuracy_score(y_test, y_pred))
print('Neighbor Accuracy List')
print(mean_acc)

In [None]:
# Test accuracy as a function of the number of neighbours (green line)
k_values = range(2, Ks)
plt.plot(k_values, mean_acc, 'g')
plt.xlabel('Number of Neighbours (K)')
plt.ylabel('Accuracy ')
plt.tight_layout()
plt.show()

In [None]:
# Initializing a KNN model with 10 neighbours
# NOTE(review): k=10 was not evaluated in the sweep above (range(2, Ks) stops at 9) — confirm this choice
knn_model=KNeighborsClassifier(10)

In [None]:
# Fit on the training split
knn_model.fit(X_train,y_train)

In [None]:
# Predictions for the test and training splits
y_test_predknn =knn_model.predict(X_test)
y_train_predknn=knn_model.predict(X_train)

In [None]:
# Classification report on the test split
print(classification_report(y_test, y_test_predknn))

In [None]:
# Accuracy of the KNN model on the training split and on the test split
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_predknn)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_predknn)

# Report both scores rounded to two decimals
for split_name, score in (('Training', train_accuracy), ('Test', test_accuracy)):
    print(f'{split_name} Accuracy: {score:.2f}')

# Confusion Matrix for KNeighborsClassifier

In [None]:
# Confusion matrix of the KNN predictions on the test split
cm = confusion_matrix(y_test, y_test_predknn)
disp = ConfusionMatrixDisplay(cm, display_labels=knn_model.classes_)

# Draw it with the blue colour map and a title lifted slightly above the axes
disp.plot(cmap='Blues')
disp.ax_.set_title('Confusion Matrix for KNeighborsClassifier', y=1.1)
plt.show()

In [None]:
# ROC curve and AUC for the KNN model, computed from the second
# predict_proba column (probability of the class listed second in classes_)
y_pred_proba = knn_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr, tpr, label="data 1, auc="+str(auc))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Curve')
# A single legend call suffices: loc=4 and "lower right" name the same position
plt.legend(loc="lower right")
# Render explicitly, like the other plotting cells, instead of echoing the Legend object
plt.show()

In [None]:
# Persist the fitted KNN classifier to 'knn_model.pkl' with joblib
joblib.dump(knn_model, 'knn_model.pkl')

# Random Forest Classifier

In [None]:
# Initializing a Random Forest Classifier model with default settings
# NOTE(review): no random_state is passed, so results may vary between runs — confirm
rf_model=RandomForestClassifier()

In [None]:
# Fit on the training split
rf_model.fit(X_train , y_train)

In [None]:
# Predictions for the test and training splits
y_test_predrf=rf_model.predict(X_test)
y_train_predrf=rf_model.predict(X_train)

In [None]:
# Classification report on the test split
print(classification_report(y_test, y_test_predrf))

In [None]:
# Calculating accuracy scores for the Random Forest.
# These must score the Random Forest's own predictions (the *_predrf arrays),
# not the logistic-regression predictions.
training_accuracy = accuracy_score(y_train, y_train_predrf)
test_accuracy = accuracy_score(y_test, y_test_predrf)

# Printing accuracy scores
print("training accuracy: ", training_accuracy)
print("test accuracy: ", test_accuracy)

In [None]:
# Confusion matrix of the Random Forest predictions on the test split
cm = confusion_matrix(y_test, y_test_predrf)
disp = ConfusionMatrixDisplay(cm, display_labels=rf_model.classes_)

# Draw it with the blue colour map and a title lifted slightly above the axes
disp.plot(cmap='Blues')
disp.ax_.set_title('Confusion Matrix for Random Forest Classifier', y=1.1)
plt.show()

In [None]:
# ROC curve and AUC for the Random Forest, computed from the second
# predict_proba column (probability of the class listed second in classes_)
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr, tpr, label="data 1, auc="+str(auc))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Curve')
# A single legend call suffices: loc=4 and "lower right" name the same position
plt.legend(loc="lower right")
# Render explicitly, like the other plotting cells, instead of echoing the Legend object
plt.show()

In [None]:
# Persist the fitted Random Forest classifier to 'rf_model.pkl' with joblib
joblib.dump(rf_model, 'rf_model.pkl')

# XGBoost Classifier

In [None]:
# Defining Parameter Dictionary for the grid search.
# 4 depths x 5 eta values x 5 reg_lambda values x 3 seeds = 300 combinations.
# NOTE(review): random_state is searched as if it were a hyperparameter, which
# triples the grid — confirm this is intended.
param_dict = {'max_depth': range(4,8), 'eta' : [0.1, 0.2, 0.3, 0.4, 0.5],
              'reg_lambda' : [0.8, 0.9, 1, 1.1, 1.2],
              'random_state': [300, 600, 900]}

In [None]:
# Exhaustive search over param_dict, scored by F1 with 3-fold cross-validation
# on the training split
XGB = GridSearchCV(XGBClassifier(), param_grid = param_dict,
                   scoring = 'f1', cv = 3, verbose = 1)
XGB.fit(X_train, y_train)

In [None]:
# Best cross-validated F1 score and the parameters that produced it
print('Best Score :', XGB.best_score_)
print('Best Parameters :', XGB.best_params_)

In [None]:
# Extracting Best Classifier From GridSearchCV
XGB_model= XGB.best_estimator_

In [None]:
# best_estimator_ has already been refitted on the full training split by
# GridSearchCV (refit=True is its default), so it is used directly rather than
# trained a second time.
y_test_predx = XGB_model.predict(X_test)
y_train_predx = XGB_model.predict(X_train)

In [None]:
# Accuracy of the tuned XGBoost model on the training split and on the test split
training_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_predx)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_predx)

for caption, value in (("training accuracy: ", training_accuracy),
                       ("test accuracy: ", test_accuracy)):
    print(caption, value)

In [None]:
# Classification report for the XGBoost predictions on the test split
print(classification_report(y_test, y_test_predx))

# Confusion Matrix for XGBoost

In [None]:
# Calculating the confusion matrix for XGB from the XGBoost test predictions
# (y_test_predx), not the Random Forest ones
cm = confusion_matrix(y_test, y_test_predx)

# Creating a ConfusionMatrixDisplay object
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=XGB_model.classes_)

# Plotting the confusion matrix
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix for XGBoost model', y=1.1)
plt.show()

In [None]:
# ROC curve and AUC for the XGBoost model, computed from the second
# predict_proba column (probability of the class listed second in classes_)
y_pred_proba = XGB_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr, tpr, label="data 1, auc="+str(auc))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Curve')
# A single legend call suffices: loc=4 and "lower right" name the same position
plt.legend(loc="lower right")
# Render explicitly, like the other plotting cells, instead of echoing the Legend object
plt.show()

In [None]:
# Persist the tuned XGBoost classifier to 'XGB_model.pkl' with joblib
joblib.dump(XGB_model, 'XGB_model.pkl')

# Comparison table

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score

# Dictionary of the models fitted in the sections above
models = {
    "Decision Tree": DTmodel,
    "Random Forest": rf_model,
    "Logistic Regression": lrmodel,
    "KNN": knn_model,
    "XGBoost": XGB_model
}

# Evaluate each already-fitted model on the test split. The models are deliberately
# NOT refitted here: refitting would retrain the unseeded estimators (Decision Tree,
# Random Forest) into different models from the ones evaluated and saved earlier,
# so this table would no longer describe them.
results = []

for name, model in models.items():
    y_pred = model.predict(X_test)
    # AUC needs class probabilities; it is skipped for models that cannot provide them
    y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba) if y_pred_proba is not None else None
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results.append({
        "Model": name,
        "Accuracy": accuracy,
        "AUC": auc,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1
    })

# Displaying results for each model (pandas is already imported at the top of the notebook)
results_df = pd.DataFrame(results)
print(results_df)

# XGBoost performs best overall, with the highest accuracy, precision, F1, and AUC, while also achieving strong recall.