In [1]:
# Importing the necessary libraries
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, confusion_matrix

# Load the dataset.
# The location can be overridden without editing the notebook by setting the
# FRAUD_DATA_PATH environment variable. When it is unset (or empty) the original
# hard-coded location is used - replace it with the correct one on your system.
data_path = os.environ.get("FRAUD_DATA_PATH") or r"C:\Users\SYED ANAS\Downloads\fraudTest.csv\fraudTest.csv"
try:
    data = pd.read_csv(data_path)
except FileNotFoundError:
    # Best-effort: report the problem and leave `data` as None instead of raising,
    # so code that checks `data is not None` can skip the analysis.
    print("The file was not found. Please check the path and try again.")
    data = None

# Exploratory analysis and Random Forest fraud classifier.
# Everything below runs only when the CSV was loaded (`data` is a DataFrame);
# when loading failed, `data` is None and the whole section is skipped.
if data is not None:
    # Let's take a look at the column names to understand our dataset better
    print("Column names in the dataset:")
    print(data.columns)

    # Showing the first few rows of our dataset
    print("\nHere are the first few rows of the dataset:")
    print(data.head())

    # Checking the dimensions of our dataset
    print(f"\nThe dataset contains {data.shape[0]} rows and {data.shape[1]} columns.")

    # We need to find the target column that indicates fraud cases.
    # Candidates are tried in order; the original six names come first so a
    # dataset that matched before still selects the same column. 'is_fraud' is
    # the name used by the fraudTest.csv file this notebook loads.
    potential_target_columns = ['Class', 'class', 'Target', 'target', 'Fraud', 'fraud',
                                'is_fraud', 'isFraud', 'IsFraud']
    target_column = next((col for col in potential_target_columns if col in data.columns), None)

    # If no target column is found, we skip further processing
    if target_column is None:
        print("No target column found in the dataset. Please ensure the dataset contains a 'Class' or similar column indicating fraud cases.")
    else:
        # Identifying the number of fraud and valid cases (target is expected to be 1 / 0)
        fraud_cases = data[data[target_column] == 1]
        valid_cases = data[data[target_column] == 0]
        # Guard the ratio: with zero valid rows the division would raise ZeroDivisionError.
        if len(valid_cases) > 0:
            outlier_fraction = len(fraud_cases) / float(len(valid_cases))
        else:
            outlier_fraction = float("nan")
        print(f"\nOutlier fraction (fraud cases to valid transactions): {outlier_fraction:.4f}")
        print(f"Number of fraud cases: {len(fraud_cases)}")
        print(f"Number of valid transactions: {len(valid_cases)}")

        # If there's an amount column, let's check out the stats for fraudulent and valid
        # transactions. 'Amount' keeps priority; 'amt' is the name used by fraudTest.csv.
        amount_column = next((col for col in ('Amount', 'amount', 'amt') if col in data.columns), None)
        if amount_column is not None:
            print("\nDetails of fraudulent transactions:")
            print(fraud_cases[amount_column].describe())
            print("\nDetails of valid transactions:")
            print(valid_cases[amount_column].describe())
        else:
            print("'Amount' column not found in the dataset. Skipping amount statistics.")

        # Only numeric/boolean columns can be correlated or fed to the classifier.
        # Text columns (merchant, category, names, dates, ...) make DataFrame.corr()
        # raise on recent pandas versions and make RandomForestClassifier.fit() fail
        # when it tries to convert the strings to floats.
        numeric_data = data.select_dtypes(include=[np.number, "bool"])

        # Creating a correlation matrix to understand the relationships between variables
        print("\nGenerating a correlation matrix to understand the relationships between variables...")
        correlation_matrix = numeric_data.corr()
        plt.figure(figsize=(12, 9))
        sns.heatmap(correlation_matrix, vmax=.8, square=True, cmap='coolwarm')
        plt.title("Correlation Matrix")
        plt.show()

        # Splitting the data into features (X) and the target variable (Y)
        # errors="ignore": the target is absent from numeric_data when it is not numeric.
        X = numeric_data.drop(columns=[target_column], errors="ignore")
        Y = data[target_column]
        excluded_columns = [col for col in data.columns
                            if col != target_column and col not in X.columns]
        if excluded_columns:
            print(f"\nNon-numeric columns excluded from the features: {excluded_columns}")
        print(f"\nFeatures shape: {X.shape}")
        print(f"Target shape: {Y.shape}")

        # Splitting the data into training and testing sets.
        # Fraud is a rare class, so keep the class ratio identical in both splits.
        # Stratification needs at least two samples of every class; otherwise fall
        # back to the plain random split.
        class_counts = Y.value_counts()
        can_stratify = len(class_counts) > 1 and class_counts.min() >= 2
        X_train, X_test, Y_train, Y_test = train_test_split(
            X.values,
            Y.values,
            test_size=0.2,
            random_state=42,
            stratify=Y.values if can_stratify else None,
        )

        # Building the Random Forest Classifier
        print("\nTraining the Random Forest Classifier...")
        rfc = RandomForestClassifier(random_state=42)
        rfc.fit(X_train, Y_train)

        # Making predictions on the test set
        print("\nMaking predictions on the test set...")
        Y_pred = rfc.predict(X_test)

        # Evaluating the classifier using different metrics
        print("\nEvaluating the classifier...")
        accuracy = accuracy_score(Y_test, Y_pred)
        precision = precision_score(Y_test, Y_pred)
        recall = recall_score(Y_test, Y_pred)
        f1 = f1_score(Y_test, Y_pred)
        mcc = matthews_corrcoef(Y_test, Y_pred)

        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1-Score: {f1:.4f}")
        print(f"Matthews correlation coefficient: {mcc:.4f}")

        # Generating the confusion matrix to visualize the performance of our model
        print("\nGenerating the confusion matrix...")
        labels = ['Valid', 'Fraud']
        # Pin the label order to [0, 1] so the matrix is always 2x2 and lines up with
        # the 'Valid' / 'Fraud' tick labels, even if one class is missing from the test set.
        conf_matrix = confusion_matrix(Y_test, Y_pred, labels=[0, 1])
        plt.figure(figsize=(8, 6))
        sns.heatmap(conf_matrix, xticklabels=labels, yticklabels=labels, annot=True, fmt="d", cmap='Blues')
        plt.title("Confusion Matrix")
        plt.ylabel('True Class')
        plt.xlabel('Predicted Class')
        plt.show()


Column names in the dataset:
Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

Here are the first few rows of the dataset:
   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2020-06-21 12:14:25  2291163933867244   
1           1   2020-06-21 12:14:33  3573030041201292   
2           2   2020-06-21 12:14:53  3598215285024754   
3           3   2020-06-21 12:15:15  3591919803438423   
4           4   2020-06-21 12:15:17  3526826139003047   

                               merchant        category    amt   first  \
0                 fraud_Kirlin and Sons   personal_care   2.86    Jeff   
1                  fraud_Sporer-Keebler   personal_care  29.84  Joanne   
2  fraud_Swaniawski, Nitzsche and Welch  health_fitness  41.28 