In [1]:
# ===============================
# Cell 1: Imports and Warnings
# ===============================

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

sns.set()  # Seaborn style
%matplotlib inline

In [2]:
# ===============================
# Cell 2: Read the Dataset
# ===============================

# Extra CSV files
location_df = pd.read_csv('/kaggle/input/fraudulent-financial-transaction-prediction/Geo_scores.csv')
propindex_df = pd.read_csv('/kaggle/input/fraudulent-financial-transaction-prediction/Lambda_wts.csv')
nettat_df = pd.read_csv('/kaggle/input/fraudulent-financial-transaction-prediction/Qset_tats.csv')
vqs_df = pd.read_csv('/kaggle/input/fraudulent-financial-transaction-prediction/instance_scores.csv')

# Main training set
dataset_df = pd.read_csv('/kaggle/input/fraudulent-financial-transaction-prediction/train.csv')

# Main test set
test_df = pd.read_csv('/kaggle/input/fraudulent-financial-transaction-prediction/test_share.csv')

print("Training Data Shape:", dataset_df.shape)
print("Test Data Shape:", test_df.shape)


Training Data Shape: (227845, 28)
Test Data Shape: (56962, 27)


In [3]:
# ===============================
# Cell 3: Quick Info and Dropping Insignificant Columns
# ===============================

print("--- TRAINING DATA INFO ---")
dataset_df.info()

print("\n--- TEST DATA INFO ---")
test_df.info()

# The columns 'id' and 'Group' were found insignificant in previous analysis
# We drop them from both training and test data
dataset_df.drop(columns=['id','Group'], inplace=True, errors='ignore')
test_df.drop(columns=['id','Group'], inplace=True, errors='ignore')

# Check the columns after dropping
print("\nTraining columns:", dataset_df.columns.tolist())
print("Test columns:", test_df.columns.tolist())

--- TRAINING DATA INFO ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227845 entries, 0 to 227844
Data columns (total 28 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              227845 non-null  int64  
 1   Group           227845 non-null  object 
 2   Per1            227845 non-null  float64
 3   Per2            227845 non-null  float64
 4   Per3            227845 non-null  float64
 5   Per4            227845 non-null  float64
 6   Per5            227845 non-null  float64
 7   Per6            227845 non-null  float64
 8   Per7            227845 non-null  float64
 9   Per8            227845 non-null  float64
 10  Per9            227845 non-null  float64
 11  Dem1            227845 non-null  float64
 12  Dem2            227845 non-null  float64
 13  Dem3            227845 non-null  float64
 14  Dem4            227845 non-null  float64
 15  Dem5            227845 non-null  float64
 16  Dem6            227845 non-nu

In [4]:
# ===============================
# Cell 4: Train/Validation Split
# ===============================

# 'Target' is our Y
train_df, validation_df = train_test_split(
    dataset_df, 
    train_size=0.7, 
    random_state=1, 
    stratify=dataset_df['Target']
)

print("Train shape:", train_df.shape)
print("Validation shape:", validation_df.shape)

# Separate features and target
X_train = train_df.drop('Target', axis=1)
y_train = train_df['Target']

X_val = validation_df.drop('Target', axis=1)
y_val = validation_df['Target']


Train shape: (159491, 26)
Validation shape: (68354, 26)


In [5]:
# ===============================
# Cell 5: Handling Class Imbalance
# ===============================

print("--- Class distribution in y_train before oversampling ---")
print(y_train.value_counts())

ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train, y_train)

print("\n--- Class distribution in y_train after oversampling ---")
print(y_train_res.value_counts())

--- Class distribution in y_train before oversampling ---
Target
0    159215
1       276
Name: count, dtype: int64

--- Class distribution in y_train after oversampling ---
Target
0    159215
1    159215
Name: count, dtype: int64


In [6]:
# ===============================
# Cell 6: Outlier Capping (IQR)
# ===============================

def iqr_capping(df):
    """
    Caps values outside the [Q1-1.5*IQR, Q3+1.5*IQR] range for each numeric column.
    """
    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            q1 = df[col].quantile(0.25)
            q3 = df[col].quantile(0.75)
            iqr = q3 - q1
            lower_limit = q1 - (1.5 * iqr)
            upper_limit = q3 + (1.5 * iqr)
            df[col] = np.where(df[col] < lower_limit, lower_limit, df[col])
            df[col] = np.where(df[col] > upper_limit, upper_limit, df[col])

# Apply outlier capping to training, validation, and test sets
iqr_capping(X_train_res)
iqr_capping(X_val)
iqr_capping(test_df)

In [7]:
# ===============================
# Cell 7: Scaling the Features
# ===============================

scaler = StandardScaler()

# Fit on oversampled X_train
scaler.fit(X_train_res)

# Transform X_train_res, X_val, and test_df
X_train_scl = pd.DataFrame(scaler.transform(X_train_res), columns=X_train_res.columns)
X_val_scl   = pd.DataFrame(scaler.transform(X_val),        columns=X_val.columns)
X_test_scl  = pd.DataFrame(scaler.transform(test_df),      columns=test_df.columns)

In [8]:
# ===============================
# Cell 8: Random Forest Model Training
# ===============================

# Initialize the Random Forest
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42
)

# Train on the oversampled, outlier-capped, scaled data
rf_model.fit(X_train_scl, y_train_res)

print("Random Forest training complete!")

Random Forest training complete!


In [9]:
# ===============================
# Cell 9: Evaluate on Validation Set
# ===============================

y_val_pred = rf_model.predict(X_val_scl)

print("Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))

print("\nClassification Report:")
print(classification_report(y_val, y_val_pred, digits=4))

Confusion Matrix:
[[68224    12]
 [   37    81]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9995    0.9998    0.9996     68236
           1     0.8710    0.6864    0.7678       118

    accuracy                         0.9993     68354
   macro avg     0.9352    0.8431    0.8837     68354
weighted avg     0.9992    0.9993    0.9992     68354



In [10]:
# ===============================
# Cell 10: Predict on the Test Set
# ===============================

test_predictions = rf_model.predict(X_test_scl)

# Convert to a DataFrame (optional if you want to save)
prediction_df = pd.DataFrame({
    "Prediction": test_predictions
})

print("--- Test Predictions (Head) ---")
display(prediction_df.head())

# Optionally, save predictions:
# prediction_df.to_csv("test_predictions.csv", index=False)
print("Done!")


--- Test Predictions (Head) ---


Unnamed: 0,Prediction
0,0
1,0
2,0
3,0
4,0


Done!
