In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix


In [15]:
# Explicit dtypes handed to pd.read_csv so that every column is parsed
# with a fixed type instead of being inferred from the file contents.
column_types = dict(
    step=int,
    type=str,
    amount=float,
    nameOrig=str,
    oldbalanceOrg=float,
    newbalanceOrig=float,
    nameDest=str,
    oldbalanceDest=float,
    newbalanceDest=float,
    isFraud=int,
    isFlaggedFraud=int,
)


In [18]:
# Load the transactions from 'Fraud.csv' (path is relative to the working
# directory), parsing each column with the dtype declared in column_types.
df = pd.read_csv(filepath_or_buffer='Fraud.csv', dtype=column_types)
# Bare last expression: the notebook renders the loaded frame as cell output.
df

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0,0
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,1,0
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.00,C776919290,0.00,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.00,C1881841831,0.00,0.00,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.00,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.00,C2080388513,0.00,0.00,1,0


In [28]:
# Monetary columns that are imputed here; the same list is reused later
# by the outlier-filtering cell.
numerical_cols = [
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
]

# Replace missing values in each numeric column with that column's median.
column_medians = df[numerical_cols].median()
df[numerical_cols] = df[numerical_cols].fillna(column_medians)


In [32]:
# Handling outliers based on z-score for numerical columns.
#
# numpy was used below (np.abs) without ever being imported in this
# notebook, which raises NameError on a fresh kernel; import it here next
# to the scipy import this cell already carried.
import numpy as np
from scipy import stats

# Column-wise z-scores of the numeric features.
z_scores = stats.zscore(df[numerical_cols])
abs_z_scores = np.abs(z_scores)

# Keep a row only if every numeric feature lies within 3 standard
# deviations of its column mean.
filtered_entries = (abs_z_scores < 3).all(axis=1)

# NOTE(review): this filter runs on the full dataset before the train/test
# split and drops rows regardless of their isFraud label — confirm that
# removing extreme-valued transactions is intended for this task.
df = df[filtered_entries]


In [34]:
# One-hot encode the transaction 'type' column, then discard the two
# account-identifier columns, which are not used as model inputs.
df = (
    pd.get_dummies(df, columns=['type'])
    .drop(columns=['nameOrig', 'nameDest'])
)

# Target vector: the fraud label. Feature matrix: every remaining column.
y = df['isFraud']
X = df.drop(columns='isFraud')


In [36]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Standardise the features. The scaler learns its statistics from the
# training rows only and then applies them to both partitions.
scaler = StandardScaler()
X_train = scaler.fit(X_train).transform(X_train)
X_test = scaler.transform(X_test)

In [38]:
# Encode the target (isFraud) as integer class indices. The encoder is
# fitted on the training labels and then applied to both label sets.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

In [40]:
# Initialize the Random Forest Classifier.
#   n_estimators=100 : number of trees in the forest.
#   random_state=42  : fixed seed so repeated runs build the same forest.
#   n_jobs=-1        : build/query the trees in parallel on all available
#                      cores; the training set has millions of rows, so a
#                      single-threaded fit is needlessly slow.
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)


In [42]:
# Fit the forest on the scaled training features and encoded labels.
# fit() returns the estimator, so the notebook shows it as the cell output.
model.fit(X=X_train, y=y_train)


In [44]:
# Predicted class labels for the held-out (scaled) test features.
y_pred = model.predict(X=X_test)



In [46]:
# Compare the test-set predictions with the true labels: first the raw
# confusion matrix, then per-class precision / recall / F1.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")



Confusion Matrix:
[[1216843      28]
 [    356     891]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1216871
           1       0.97      0.71      0.82      1247

    accuracy                           1.00   1218118
   macro avg       0.98      0.86      0.91   1218118
weighted avg       1.00      1.00      1.00   1218118



In [48]:

# Rank the input features by the importance scores of the fitted forest.
# X.columns supplies the feature names, since the scaled training data
# passed to the model no longer carries them.
importances = model.feature_importances_
feature_importance = (
    pd.DataFrame({'importance': importances}, index=X.columns)
    .sort_values(by='importance', ascending=False)
)
print("\nFeature Importance:")
print(feature_importance)


Feature Importance:
                importance
newbalanceDest    0.324243
oldbalanceOrg     0.234522
amount            0.137470
step              0.114818
oldbalanceDest    0.082408
type_TRANSFER     0.043493
newbalanceOrig    0.031320
type_CASH_OUT     0.022389
type_CASH_IN      0.006615
type_PAYMENT      0.002452
type_DEBIT        0.000227
isFlaggedFraud    0.000044
