# Clean & Process

## Setup

In [1]:
!pip install python-dotenv google-api-python-client

from google.colab import drive
drive.mount('/content/drive')

import os
from dotenv import load_dotenv
load_dotenv("/content/drive/MyDrive/Professional/Portfolio/Fraud Detection/.envs/.env_github")
github_pat = os.getenv("GITHUB_PAT")
print(f"✅ GITHUB_PAT loaded successfully")
if not github_pat:
    raise ValueError("❌ Error: 'GITHUB_PAT' is missing or invalid in your .env file.")

!git clone https://{github_pat}@github.com/vmagdale2/Fraud-Detection.git

import sys
sys.path.append('/content/Fraud-Detection.git/')
%cd /content/Fraud-Detection/
!pwd
!ls

Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.0
Mounted at /content/drive
✅ GITHUB_PAT loaded successfully
Cloning into 'Fraud-Detection'...
remote: Enumerating objects: 10, done.[K
remote: Counting objects: 100% (10/10), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 10 (delta 1), reused 5 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (10/10), 16.63 KiB | 3.33 MiB/s, done.
Resolving deltas: 100% (1/1), done.
/content/Fraud-Detection
/content/Fraud-Detection
Data  LICENSE  README.md


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
file_path = "/content/drive/MyDrive/Professional/Portfolio/Fraud Detection/Data/Variant II.csv/fifar_variant_II.csv"

try:
  df = pd.read_csv(file_path)
  print("DataFrame loaded successfully.")
except FileNotFoundError:
  print(f"Error: File not found at {file_path}")
except pd.errors.ParserError:
  print(f"Error: Unable to parse the CSV file at {file_path}. Check the file format.")
except Exception as e:
  print(f"An unexpected error occurred: {e}")

DataFrame loaded successfully.


In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Exploration

In [5]:
df.shape
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 32 columns):
 #   Column                            Non-Null Count    Dtype  
---  ------                            --------------    -----  
 0   fraud_bool                        1000000 non-null  int64  
 1   income                            1000000 non-null  float64
 2   name_email_similarity             1000000 non-null  float64
 3   prev_address_months_count         1000000 non-null  int64  
 4   current_address_months_count      1000000 non-null  int64  
 5   customer_age                      1000000 non-null  int64  
 6   days_since_request                1000000 non-null  float64
 7   intended_balcon_amount            1000000 non-null  float64
 8   payment_type                      1000000 non-null  object 
 9   zip_count_4w                      1000000 non-null  int64  
 10  velocity_6h                       1000000 non-null  float64
 11  velocity_24h                      1000

In [6]:
df.head()

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,velocity_6h,velocity_24h,velocity_4w,bank_branch_count_8w,date_of_birth_distinct_emails_4w,employment_status,credit_risk_score,email_is_free,housing_status,phone_home_valid,phone_mobile_valid,bank_months_count,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
0,0,0.7,0.062,-1,24,50,0.017,-0.872,AB,3430,8645.19,5569.162,6258.579,615,5,CB,55,1,BC,1,1,30,1,200.0,0,INTERNET,6.804,other,0,1,0,0
1,0,0.9,0.098,-1,310,50,0.019,-1.024,AB,3492,1610.761,6214.332,6020.948,12,7,CA,268,1,BA,0,1,5,1,1500.0,0,INTERNET,1.412,macintosh,0,1,0,0
2,0,0.6,0.117,-1,189,60,0.047,-1.206,AB,4621,8382.98,6034.593,6772.412,7,8,CB,86,1,BC,1,1,30,0,200.0,0,INTERNET,14.489,other,1,1,0,0
3,0,0.3,0.059,10,40,60,0.008,-0.076,AA,1697,13872.725,7594.434,5961.464,29,14,CA,113,1,BC,0,1,2,1,200.0,0,INTERNET,6.152,linux,1,1,0,0
4,0,0.1,0.69,-1,128,30,2.514,-1.108,AD,1431,7755.173,5485.056,5993.34,10,10,CB,115,1,BC,0,1,30,0,200.0,0,INTERNET,5.6,other,0,1,0,0


In [7]:
df.describe()

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,zip_count_4w,velocity_6h,velocity_24h,velocity_4w,bank_branch_count_8w,date_of_birth_distinct_emails_4w,credit_risk_score,email_is_free,phone_home_valid,phone_mobile_valid,bank_months_count,has_other_cards,proposed_credit_limit,foreign_request,session_length_in_minutes,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,0.011,0.575,0.489,14.825,99.381,41.304,0.912,8.639,1567.401,5685.095,4787.41,4860.388,202.464,7.951,137.458,0.523,0.49,0.855,11.196,0.244,558.746,0.025,7.905,0.56,1.024,0.0,3.289
std,0.104,0.289,0.291,43.231,94.559,13.805,4.987,20.574,1009.619,3001.708,1470.369,916.809,474.134,4.961,72.199,0.499,0.5,0.352,12.109,0.43,513.848,0.155,8.338,0.496,0.197,0.0,2.21
min,0.0,0.1,0.0,-1.0,-1.0,10.0,0.0,-15.74,1.0,-174.11,1322.325,2870.592,0.0,0.0,-191.0,0.0,0.0,0.0,-1.0,0.0,190.0,0.0,-1.0,0.0,-1.0,0.0,0.0
25%,0.0,0.3,0.215,-1.0,26.0,30.0,0.007,-1.178,901.0,3470.243,3628.562,4271.193,1.0,4.0,87.0,0.0,0.0,1.0,1.0,0.0,200.0,0.0,3.21,0.0,1.0,0.0,1.0
50%,0.0,0.6,0.49,-1.0,64.0,50.0,0.016,-0.831,1236.0,5408.426,4765.968,4919.354,10.0,7.0,128.0,1.0,0.0,1.0,6.0,0.0,200.0,0.0,5.279,1.0,1.0,0.0,3.0
75%,0.0,0.8,0.754,-1.0,154.0,50.0,0.027,0.079,1909.0,7653.99,5750.775,5489.469,32.0,11.0,187.0,1.0,1.0,1.0,25.0,0.0,1000.0,0.0,9.42,1.0,1.0,0.0,5.0
max,1.0,0.9,1.0,399.0,429.0,90.0,76.578,112.703,6650.0,16801.34,9539.357,7019.201,2377.0,39.0,388.0,1.0,1.0,1.0,32.0,1.0,2100.0,1.0,87.237,1.0,2.0,0.0,7.0


In [8]:
df.columns

Index(['fraud_bool', 'income', 'name_email_similarity',
       'prev_address_months_count', 'current_address_months_count',
       'customer_age', 'days_since_request', 'intended_balcon_amount',
       'payment_type', 'zip_count_4w', 'velocity_6h', 'velocity_24h',
       'velocity_4w', 'bank_branch_count_8w',
       'date_of_birth_distinct_emails_4w', 'employment_status',
       'credit_risk_score', 'email_is_free', 'housing_status',
       'phone_home_valid', 'phone_mobile_valid', 'bank_months_count',
       'has_other_cards', 'proposed_credit_limit', 'foreign_request', 'source',
       'session_length_in_minutes', 'device_os', 'keep_alive_session',
       'device_distinct_emails_8w', 'device_fraud_count', 'month'],
      dtype='object')

# Cleaning


In [9]:
df.isnull().sum()

Unnamed: 0,0
fraud_bool,0
income,0
name_email_similarity,0
prev_address_months_count,0
current_address_months_count,0
customer_age,0
days_since_request,0
intended_balcon_amount,0
payment_type,0
zip_count_4w,0


# Feature Engineering and Transformation

In [10]:
df['month_sin'] = np.sin(2 * np.pi * df['month']/12)
df['month_cos'] = np.cos(2 * np.pi * df['month']/12)

In [11]:
df.columns

Index(['fraud_bool', 'income', 'name_email_similarity',
       'prev_address_months_count', 'current_address_months_count',
       'customer_age', 'days_since_request', 'intended_balcon_amount',
       'payment_type', 'zip_count_4w', 'velocity_6h', 'velocity_24h',
       'velocity_4w', 'bank_branch_count_8w',
       'date_of_birth_distinct_emails_4w', 'employment_status',
       'credit_risk_score', 'email_is_free', 'housing_status',
       'phone_home_valid', 'phone_mobile_valid', 'bank_months_count',
       'has_other_cards', 'proposed_credit_limit', 'foreign_request', 'source',
       'session_length_in_minutes', 'device_os', 'keep_alive_session',
       'device_distinct_emails_8w', 'device_fraud_count', 'month', 'month_sin',
       'month_cos'],
      dtype='object')

In [12]:
features = df.drop(columns=['fraud_bool', 'month'])

In [13]:
numerical_features = features.select_dtypes(include=np.number).columns
numerical_df = features[numerical_features]
print(numerical_df.head())

   income  name_email_similarity  prev_address_months_count  \
0   0.700                  0.062                         -1   
1   0.900                  0.098                         -1   
2   0.600                  0.117                         -1   
3   0.300                  0.059                         10   
4   0.100                  0.690                         -1   

   current_address_months_count  customer_age  days_since_request  \
0                            24            50               0.017   
1                           310            50               0.019   
2                           189            60               0.047   
3                            40            60               0.008   
4                           128            30               2.514   

   intended_balcon_amount  zip_count_4w  velocity_6h  velocity_24h  \
0                  -0.872          3430     8645.190      5569.162   
1                  -1.024          3492     1610.761      6214.332

In [14]:
categorical_features = features.select_dtypes(exclude=np.number).columns
categorical_df = features[categorical_features]
print(categorical_df.head())

  payment_type employment_status housing_status    source  device_os
0           AB                CB             BC  INTERNET      other
1           AB                CA             BA  INTERNET  macintosh
2           AB                CB             BC  INTERNET      other
3           AA                CA             BC  INTERNET      linux
4           AD                CB             BC  INTERNET      other


In [15]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])
X_scaled = preprocessor.fit_transform(features)
X_scaled_df = pd.DataFrame(X_scaled, columns=preprocessor.get_feature_names_out())
print(X_scaled_df.head())

   num__income  num__name_email_similarity  num__prev_address_months_count  \
0        0.434                      -1.463                          -0.366   
1        1.127                      -1.339                          -0.366   
2        0.088                      -1.275                          -0.366   
3       -0.951                      -1.474                          -0.112   
4       -1.643                       0.691                          -0.366   

   num__current_address_months_count  num__customer_age  \
0                             -0.797              0.630   
1                              2.227              0.630   
2                              0.948              1.354   
3                             -0.628              1.354   
4                              0.303             -0.819   

   num__days_since_request  num__intended_balcon_amount  num__zip_count_4w  \
0                   -0.180                       -0.462              1.845   
1                   

In [16]:
print(X_scaled_df.info())
print(X_scaled_df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 53 columns):
 #   Column                                 Non-Null Count    Dtype  
---  ------                                 --------------    -----  
 0   num__income                            1000000 non-null  float64
 1   num__name_email_similarity             1000000 non-null  float64
 2   num__prev_address_months_count         1000000 non-null  float64
 3   num__current_address_months_count      1000000 non-null  float64
 4   num__customer_age                      1000000 non-null  float64
 5   num__days_since_request                1000000 non-null  float64
 6   num__intended_balcon_amount            1000000 non-null  float64
 7   num__zip_count_4w                      1000000 non-null  float64
 8   num__velocity_6h                       1000000 non-null  float64
 9   num__velocity_24h                      1000000 non-null  float64
 10  num__velocity_4w                       1000

In [17]:
X_scaled_df.columns

Index(['num__income', 'num__name_email_similarity',
       'num__prev_address_months_count', 'num__current_address_months_count',
       'num__customer_age', 'num__days_since_request',
       'num__intended_balcon_amount', 'num__zip_count_4w', 'num__velocity_6h',
       'num__velocity_24h', 'num__velocity_4w', 'num__bank_branch_count_8w',
       'num__date_of_birth_distinct_emails_4w', 'num__credit_risk_score',
       'num__email_is_free', 'num__phone_home_valid',
       'num__phone_mobile_valid', 'num__bank_months_count',
       'num__has_other_cards', 'num__proposed_credit_limit',
       'num__foreign_request', 'num__session_length_in_minutes',
       'num__keep_alive_session', 'num__device_distinct_emails_8w',
       'num__device_fraud_count', 'num__month_sin', 'num__month_cos',
       'cat__payment_type_AA', 'cat__payment_type_AB', 'cat__payment_type_AC',
       'cat__payment_type_AD', 'cat__payment_type_AE',
       'cat__employment_status_CA', 'cat__employment_status_CB',
       '

In [18]:
df.shape

(1000000, 34)

In [19]:
features.shape

(1000000, 32)

In [20]:
X_scaled.shape

(1000000, 53)

In [21]:
save_path = "/content/drive/MyDrive/Professional/Portfolio/Fraud Detection/Data/Preprocessed/Variant II/X_scaled.csv"
X_scaled_df.to_csv(save_path, index=False)

In [22]:
import pickle

save_path = "/content/drive/MyDrive/Professional/Portfolio/Fraud Detection/Data/Preprocessed/Variant II/X_scaled.pkl"
with open(save_path, 'wb') as file:
    pickle.dump(X_scaled, file)


# Fraud Detection Included

Below, I created a transformed dataset where fraud_bool is included for visualization purposes.

## Clean & Process

## Setup

In [23]:
file_path = "/content/drive/MyDrive/Professional/Portfolio/Fraud Detection/Data/Variant II.csv/fifar_variant_II.csv"

try:
  df = pd.read_csv(file_path)
  print("DataFrame loaded successfully.")
except FileNotFoundError:
  print(f"Error: File not found at {file_path}")
except pd.errors.ParserError:
  print(f"Error: Unable to parse the CSV file at {file_path}. Check the file format.")
except Exception as e:
  print(f"An unexpected error occurred: {e}")

DataFrame loaded successfully.


In [24]:
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

## Feature Engineering and Transformation

In [25]:
df['month_sin'] = np.sin(2 * np.pi * df['month']/12)
df['month_cos'] = np.cos(2 * np.pi * df['month']/12)

In [26]:
df.columns

Index(['fraud_bool', 'income', 'name_email_similarity',
       'prev_address_months_count', 'current_address_months_count',
       'customer_age', 'days_since_request', 'intended_balcon_amount',
       'payment_type', 'zip_count_4w', 'velocity_6h', 'velocity_24h',
       'velocity_4w', 'bank_branch_count_8w',
       'date_of_birth_distinct_emails_4w', 'employment_status',
       'credit_risk_score', 'email_is_free', 'housing_status',
       'phone_home_valid', 'phone_mobile_valid', 'bank_months_count',
       'has_other_cards', 'proposed_credit_limit', 'foreign_request', 'source',
       'session_length_in_minutes', 'device_os', 'keep_alive_session',
       'device_distinct_emails_8w', 'device_fraud_count', 'month', 'month_sin',
       'month_cos'],
      dtype='object')

In [27]:
features = df.drop(columns=['month'])

In [28]:
numerical_features = features.select_dtypes(include=np.number).columns
numerical_df = features[numerical_features]
print(numerical_df.head())

   fraud_bool  income  name_email_similarity  prev_address_months_count  \
0           0   0.700                  0.062                         -1   
1           0   0.900                  0.098                         -1   
2           0   0.600                  0.117                         -1   
3           0   0.300                  0.059                         10   
4           0   0.100                  0.690                         -1   

   current_address_months_count  customer_age  days_since_request  \
0                            24            50               0.017   
1                           310            50               0.019   
2                           189            60               0.047   
3                            40            60               0.008   
4                           128            30               2.514   

   intended_balcon_amount  zip_count_4w  velocity_6h  velocity_24h  \
0                  -0.872          3430     8645.190      5569.1

In [29]:
categorical_features = features.select_dtypes(exclude=np.number).columns
categorical_df = features[categorical_features]
print(categorical_df.head())

  payment_type employment_status housing_status    source  device_os
0           AB                CB             BC  INTERNET      other
1           AB                CA             BA  INTERNET  macintosh
2           AB                CB             BC  INTERNET      other
3           AA                CA             BC  INTERNET      linux
4           AD                CB             BC  INTERNET      other


In [30]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])
X_scaled_full = preprocessor.fit_transform(features)
X_scaled_full_df = pd.DataFrame(X_scaled_full, columns=preprocessor.get_feature_names_out())
print(X_scaled_full_df.head())

   num__fraud_bool  num__income  num__name_email_similarity  \
0           -0.106        0.434                      -1.463   
1           -0.106        1.127                      -1.339   
2           -0.106        0.088                      -1.275   
3           -0.106       -0.951                      -1.474   
4           -0.106       -1.643                       0.691   

   num__prev_address_months_count  num__current_address_months_count  \
0                          -0.366                             -0.797   
1                          -0.366                              2.227   
2                          -0.366                              0.948   
3                          -0.112                             -0.628   
4                          -0.366                              0.303   

   num__customer_age  num__days_since_request  num__intended_balcon_amount  \
0              0.630                   -0.180                       -0.462   
1              0.630            

In [31]:
print(X_scaled_full_df.info())
print(X_scaled_full_df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 54 columns):
 #   Column                                 Non-Null Count    Dtype  
---  ------                                 --------------    -----  
 0   num__fraud_bool                        1000000 non-null  float64
 1   num__income                            1000000 non-null  float64
 2   num__name_email_similarity             1000000 non-null  float64
 3   num__prev_address_months_count         1000000 non-null  float64
 4   num__current_address_months_count      1000000 non-null  float64
 5   num__customer_age                      1000000 non-null  float64
 6   num__days_since_request                1000000 non-null  float64
 7   num__intended_balcon_amount            1000000 non-null  float64
 8   num__zip_count_4w                      1000000 non-null  float64
 9   num__velocity_6h                       1000000 non-null  float64
 10  num__velocity_24h                      1000

In [32]:
X_scaled_full_df.columns

Index(['num__fraud_bool', 'num__income', 'num__name_email_similarity',
       'num__prev_address_months_count', 'num__current_address_months_count',
       'num__customer_age', 'num__days_since_request',
       'num__intended_balcon_amount', 'num__zip_count_4w', 'num__velocity_6h',
       'num__velocity_24h', 'num__velocity_4w', 'num__bank_branch_count_8w',
       'num__date_of_birth_distinct_emails_4w', 'num__credit_risk_score',
       'num__email_is_free', 'num__phone_home_valid',
       'num__phone_mobile_valid', 'num__bank_months_count',
       'num__has_other_cards', 'num__proposed_credit_limit',
       'num__foreign_request', 'num__session_length_in_minutes',
       'num__keep_alive_session', 'num__device_distinct_emails_8w',
       'num__device_fraud_count', 'num__month_sin', 'num__month_cos',
       'cat__payment_type_AA', 'cat__payment_type_AB', 'cat__payment_type_AC',
       'cat__payment_type_AD', 'cat__payment_type_AE',
       'cat__employment_status_CA', 'cat__employment_s

In [33]:
df.shape

(1000000, 34)

In [34]:
features.shape

(1000000, 33)

In [35]:
X_scaled.shape

(1000000, 53)

In [36]:
X_scaled_full.shape

(1000000, 54)

In [37]:
save_path = "/content/drive/MyDrive/Professional/Portfolio/Fraud Detection/Data/Preprocessed/Variant II/X_scaled_full.csv"
X_scaled_full_df.to_csv(save_path, index=False)

In [38]:
import pickle

save_path = "/content/drive/MyDrive/Professional/Portfolio/Fraud Detection/Data/Preprocessed/Variant II/X_scaled_full.pkl"
with open(save_path, 'wb') as file:
    pickle.dump(X_scaled, file)
