In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score

Load Data Set

In [2]:
data = pd.read_csv("C:\\Users\\unkno\\Fraud Detection System\\train_hsbc_df.csv")

Drop Irrevalent Columns

In [3]:
data = data.drop(['step', 'merchant', 'zipcodeOri', 'zipMerchant'], axis=1)

Split Data Into Features and Target

In [4]:
X = data.drop(['fraud'], axis=1)
y = data['fraud']

Identify Category and numerical attributes

In [5]:
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

Split data for training and testing

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Replace missing values in numerical attributes with mean

In [7]:
imputer_numeric = SimpleImputer(strategy='mean')
X_train[numeric_cols] = imputer_numeric.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = imputer_numeric.transform(X_test[numeric_cols])

 Impute missing values in categorical columns with the most frequent value

In [8]:
imputer_categorical = SimpleImputer(strategy='most_frequent')
X_train[categorical_cols] = imputer_categorical.fit_transform(X_train[categorical_cols])
X_test[categorical_cols] = imputer_categorical.transform(X_test[categorical_cols])

Apply Label Encoding to categorical features

In [9]:
encoder = LabelEncoder()
for col in categorical_cols:
    X_train[col] = encoder.fit_transform(X_train[col])
    X_test[col] = encoder.transform(X_test[col])

Scale the numeric features

In [10]:
scaler = MinMaxScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

Create AdaBoost classifier with RandomForest as the base estimator

In [11]:
model = AdaBoostClassifier(
    estimator=RandomForestClassifier(max_depth=1),
    n_estimators=100,
    learning_rate=1.0
)

Train the model

In [12]:
model.fit(X_train, y_train)



Test the model

In [13]:
predictions = model.predict(X_test)

Evaluate the model

In [14]:
accuracy = accuracy_score(y_test, predictions)
print(f'Model Accuracy: {accuracy * 100:.2f}%')

Model Accuracy: 99.39%


Saving predictions

In [16]:
predictions_df = pd.DataFrame({
    'customer_id': X_test.index,
    'predicted_fraud': predictions
})
predictions_df.to_csv('C:\\Users\\unkno\\Downloads\\Fraud Detection\\predictions.csv', index=False)

print("Predictions saved to predictions.csv")

Predictions saved to predictions.csv
