# Setup

In [1]:
!pip install python-dotenv google-api-python-client

from google.colab import drive
drive.mount('/content/drive')

import os
from dotenv import load_dotenv
load_dotenv("/content/drive/MyDrive/Professional/Portfolio/Fraud Detection/.envs/.env_github")
github_pat = os.getenv("GITHUB_PAT")
print(f"✅ GITHUB_PAT loaded successfully")
if not github_pat:
    raise ValueError("❌ Error: 'GITHUB_PAT' is missing or invalid in your .env file.")

!git clone https://{github_pat}@github.com/vmagdale2/Fraud-Detection.git

import sys
sys.path.append('/content/Fraud-Detection.git/')
%cd /content/Fraud-Detection/
!pwd
!ls

Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.0
Mounted at /content/drive
✅ GITHUB_PAT loaded successfully
Cloning into 'Fraud-Detection'...
remote: Enumerating objects: 10, done.[K
remote: Counting objects: 100% (10/10), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 10 (delta 1), reused 5 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (10/10), 16.63 KiB | 532.00 KiB/s, done.
Resolving deltas: 100% (1/1), done.
/content/Fraud-Detection
/content/Fraud-Detection
Data  LICENSE  README.md


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Location of the raw Base dataset on the mounted Drive.
file_path = "/content/drive/MyDrive/Professional/Portfolio/Fraud Detection/Data/Raw/Base.csv/Base.csv"

# Read the whole CSV into the working frame used by the following cells.
df = pd.read_csv(filepath_or_buffer=file_path)

In [4]:
# Detach the target column from the feature frame: `pop` returns the column
# as a Series and removes it from `df` in place.
y = df.pop('fraud_bool')

# Base

## Preprocessing

In [5]:
# Map `month` onto a 12-step circle and keep only the sine/cosine pair;
# the raw column is removed from the frame as it is read.
month_angle = 2 * np.pi * df.pop('month') / 12
df['month_sin'] = np.sin(month_angle)
df['month_cos'] = np.cos(month_angle)

In [6]:
# Split the remaining columns by dtype: int64/float64 columns are treated as
# numeric, object columns as categorical.
numeric_features = list(df.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(df.select_dtypes(include=['object']).columns)

In [7]:
# Numeric branch: a single standard-scaling step.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical branch: one-hot encoding, with unknown categories ignored
# rather than raising (handle_unknown='ignore').
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# Route each column group to its branch.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [8]:
# Fit the preprocessor on the full Base feature frame and transform it in one call.
# NOTE(review): no train/test split is visible before this point, so the fitted
# statistics come from every row — confirm this is intended for downstream evaluation.
X_scaled = preprocessor.fit_transform(df)

## Save

In [9]:
import joblib

# Write the fitted preprocessor plus the feature/target arrays for Base
# to the current working directory.
joblib.dump(value=preprocessor, filename="base_preprocessor.pkl")
np.save(file="X_scaled_base.npy", arr=X_scaled)
np.save(file="y_base.npy", arr=y.values)

In [10]:
# Persist the fitted Base preprocessor to Drive; as the cell's last expression,
# the returned list of written paths is displayed.
joblib.dump(
    value=preprocessor,
    filename="/content/drive/My Drive/Professional/Portfolio/Fraud Detection/Data/Preprocessed/Base/base_preprocessor.pkl",
)


['/content/drive/My Drive/Professional/Portfolio/Fraud Detection/Data/Preprocessed/Base/base_preprocessor.pkl']

In [11]:
# Persist the transformed Base features and the matching target vector to Drive.
np.save(
    file="/content/drive/My Drive/Professional/Portfolio/Fraud Detection/Data/Preprocessed/Base/X_scaled_base.npy",
    arr=X_scaled,
)
np.save(
    file="/content/drive/My Drive/Professional/Portfolio/Fraud Detection/Data/Preprocessed/Base/y_base.npy",
    arr=y.values,
)

# Variant IV

## Setup


In [13]:
# Location of the raw Variant IV dataset on the mounted Drive.
file_path = "/content/drive/MyDrive/Professional/Portfolio/Fraud Detection/Data/Raw/Variant IV.csv/Variant IV.csv"

# Read the whole CSV; this rebinds `df`, replacing the Base frame.
df = pd.read_csv(filepath_or_buffer=file_path)

In [14]:
# Detach the target column from the Variant IV frame: `pop` returns the column
# as a Series and removes it from `df` in place.
y = df.pop('fraud_bool')

## Preprocessing

In [15]:
# Map `month` onto a 12-step circle and keep only the sine/cosine pair;
# the raw column is removed from the frame as it is read.
month_angle = 2 * np.pi * df.pop('month') / 12
df['month_sin'] = np.sin(month_angle)
df['month_cos'] = np.cos(month_angle)

# Split the remaining columns by dtype: int64/float64 columns are treated as
# numeric, object columns as categorical.
numeric_features = list(df.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(df.select_dtypes(include=['object']).columns)

# Numeric branch: a single standard-scaling step.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical branch: one-hot encoding, with unknown categories ignored
# rather than raising (handle_unknown='ignore').
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# Route each column group to its branch; this rebinds `preprocessor`
# to a fresh, unfitted transformer for Variant IV.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [16]:
# Fit the Variant IV preprocessor on the full feature frame and transform it in one call.
# NOTE(review): no train/test split is visible before this point, so the fitted
# statistics come from every row — confirm this is intended for downstream evaluation.
X_scaled = preprocessor.fit_transform(df)

## Save


In [17]:
import joblib

# Write the fitted preprocessor plus the feature/target arrays for Variant IV
# to the current working directory.
joblib.dump(value=preprocessor, filename="variantiv_preprocessor.pkl")
np.save(file="X_scaled_variantiv.npy", arr=X_scaled)
np.save(file="y_variantiv.npy", arr=y.values)

In [18]:
# Persist the fitted Variant IV preprocessor to Drive; as the cell's last
# expression, the returned list of written paths is displayed.
# NOTE(review): the destination is the `Preprocessed/Base/` folder even though
# this is the Variant IV artefact — confirm that is intended.
joblib.dump(
    value=preprocessor,
    filename="/content/drive/My Drive/Professional/Portfolio/Fraud Detection/Data/Preprocessed/Base/variantiv_preprocessor.pkl",
)


['/content/drive/My Drive/Professional/Portfolio/Fraud Detection/Data/Preprocessed/Base/variantiv_preprocessor.pkl']

In [19]:
# Persist the transformed Variant IV features and the matching target vector to Drive.
# NOTE(review): both files land in the `Preprocessed/Base/` folder — confirm
# that is intended for Variant IV output.
np.save(
    file="/content/drive/My Drive/Professional/Portfolio/Fraud Detection/Data/Preprocessed/Base/X_scaled_variantiv.npy",
    arr=X_scaled,
)
np.save(
    file="/content/drive/My Drive/Professional/Portfolio/Fraud Detection/Data/Preprocessed/Base/y_variantiv.npy",
    arr=y.values,
)