# Import, Load, and Preprocess Data

In [1]:
import os
import sys
# Put the notebook's parent directory on the module search path so that the
# `scripts` package can be imported from here. The membership check keeps the
# cell idempotent: re-running it does not add the same entry again.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
from importlib import reload
from scripts import data_load_clean_transform, fraud_detection_preprocessor
# Reload the project modules so that edits made to the .py files on disk are
# picked up without restarting the kernel.
reload(data_load_clean_transform)
# Trailing semicolon: reload() returns the module object, and as the last
# expression of the cell its repr (which embeds the absolute local path of the
# file) would otherwise be rendered and saved into the notebook.
reload(fraud_detection_preprocessor);

<module 'scripts.fraud_detection_preprocessor' from 'c:\\ML and DS Files\\Kifiya AI\\Kaim-week-8-9\\scripts\\fraud_detection_preprocessor.py'>

In [3]:
from scripts.data_load_clean_transform import DataLoader, DataCleaner # import DataLoader and DataCleaner from scripts
from scripts.fraud_detection_preprocessor import FraudDetectionPreprocessor

In [4]:
# One loader instance is created here; the later loading cells reuse it.
Loader = DataLoader()

# Fraud dataset (Fraud_Data.csv), addressed by a relative path.
fraud_data_path = '../week 8-9 data/Data/Fraud_Data.csv'
fraud_data = Loader.load_csv(fraud_data_path)

2025-02-18 06:27:03,539 - INFO - Data successfully loaded from ../week 8-9 data/Data/Fraud_Data.csv
2025-02-18 06:27:03,547 - INFO - DataFrame Shape: (151112, 11)


In [5]:
# Instantiate a DataCleaner around the loaded fraud data; the cleaning and
# inspection calls in the following cells go through this object.
fraud_cleaner = DataCleaner(fraud_data)

In [6]:
# Run the datetime transformation on both timestamp columns, in the same
# order as before (signup first, then purchase).
for datetime_column in ('signup_time', 'purchase_time'):
    fraud_cleaner.transform_datetime(column=datetime_column)

2025-02-18 06:27:03,573 - INFO - Transforming datetime column 'signup_time' without timezone conversion.
2025-02-18 06:27:03,711 - INFO - Datetime transformation for column 'signup_time' completed.
2025-02-18 06:27:03,712 - INFO - Transforming datetime column 'purchase_time' without timezone conversion.
2025-02-18 06:27:03,829 - INFO - Datetime transformation for column 'purchase_time' completed.


In [7]:
# Per-column missing-value summary for the fraud data. As the last expression
# of the cell, the returned table (missing count, missing percentage and dtype
# per column) is rendered as the cell output.
fraud_cleaner.check_missing_values()

2025-02-18 06:27:03,843 - INFO - Checking for missing values in the DataFrame.
2025-02-18 06:27:03,875 - INFO - Missing values check completed.


Unnamed: 0,Column,Missing Values,Missing Percentage,Data Type
0,user_id,0,0.0,int64
1,signup_time,0,0.0,datetime64[ns]
2,purchase_time,0,0.0,datetime64[ns]
3,purchase_value,0,0.0,int64
4,device_id,0,0.0,object
5,source,0,0.0,object
6,browser,0,0.0,object
7,sex,0,0.0,object
8,age,0,0.0,int64
9,ip_address,0,0.0,float64


In [8]:
# Load the credit-card dataset (creditcard.csv) with the loader instance
# created in the fraud-data loading cell.
creditcard_data_path = '../week 8-9 data/Data/creditcard.csv'
creditcard_data = Loader.load_csv(creditcard_data_path)

2025-02-18 06:27:06,199 - INFO - Data successfully loaded from ../week 8-9 data/Data/creditcard.csv
2025-02-18 06:27:06,200 - INFO - DataFrame Shape: (284807, 31)


In [9]:
# Missing-value audit for the credit-card data: build the cleaner, keep the
# summary under a name, then show it as the cell result.
creditcard_cleaner = DataCleaner(creditcard_data)
creditcard_missing_report = creditcard_cleaner.check_missing_values()
creditcard_missing_report

2025-02-18 06:27:06,213 - INFO - Checking for missing values in the DataFrame.
2025-02-18 06:27:06,243 - INFO - Missing values check completed.


Unnamed: 0,Column,Missing Values,Missing Percentage,Data Type
0,Time,0,0.0,float64
1,V1,0,0.0,float64
2,V2,0,0.0,float64
3,V3,0,0.0,float64
4,V4,0,0.0,float64
5,V5,0,0.0,float64
6,V6,0,0.0,float64
7,V7,0,0.0,float64
8,V8,0,0.0,float64
9,V9,0,0.0,float64


In [10]:
# Load the IP-range-to-country lookup table (IpAddress_to_Country.csv) with the
# loader instance created in the fraud-data loading cell.
ip_address_path = '../week 8-9 data/Data/IpAddress_to_Country.csv'
ip_address_data = Loader.load_csv(ip_address_path)

2025-02-18 06:27:06,363 - INFO - Data successfully loaded from ../week 8-9 data/Data/IpAddress_to_Country.csv
2025-02-18 06:27:06,364 - INFO - DataFrame Shape: (138846, 3)


In [11]:
# Missing-value audit for the IP lookup table: build the cleaner, keep the
# summary under a name, then show it as the cell result.
# NOTE(review): the rendered summary reports float64 for
# lower_bound_ip_address but int64 for upper_bound_ip_address - confirm the
# range comparison downstream copes with the mixed dtypes.
ip_cleaner = DataCleaner(ip_address_data)
ip_missing_report = ip_cleaner.check_missing_values()
ip_missing_report

2025-02-18 06:27:06,376 - INFO - Checking for missing values in the DataFrame.
2025-02-18 06:27:06,388 - INFO - Missing values check completed.


Unnamed: 0,Column,Missing Values,Missing Percentage,Data Type
0,lower_bound_ip_address,0,0.0,float64
1,upper_bound_ip_address,0,0.0,int64
2,country,0,0.0,object


# Preprocessing for Training

In [12]:
# Instantiate the preprocessor with the fraud data and the IP lookup table.
# NOTE(review): this passes `fraud_data`, not something returned by
# `fraud_cleaner`. If DataCleaner works on a copy rather than mutating the
# frame it was given, the datetime transformation done earlier is not
# reflected in what the preprocessor sees - confirm.
preprocessor = FraudDetectionPreprocessor(fraud_data, ip_address_data)

In [13]:
# Convert the IP addresses to integer format before merging.
# Presumably this is what produces the `ip_int` column seen in the processed
# data preview (e.g. 52093.496895 -> 52093) - verify in the preprocessor module.
preprocessor.convert_ip_to_int()

2025-02-18 06:27:06,447 - INFO - Converting IP addresses to integer format.
2025-02-18 06:27:06,451 - INFO - IP address conversion completed successfully.


In [14]:
# Merge the fraud data (Fraud_Data.csv) with the IP lookup table
# (IpAddress_to_Country.csv). The processed frame previewed afterwards carries
# `lower_bound_ip_address`, `upper_bound_ip_address` and `country` columns,
# presumably added by this step - the merge logic itself lives in the
# preprocessor module.
preprocessor.merge_with_ip_data()

2025-02-18 06:27:06,467 - INFO - Merging fraud_data with ip_address_data.
2025-02-18 06:27:06,585 - INFO - Merging completed successfully.


In [15]:
# Pull the merged frame back out of the preprocessor.
processed_data = preprocessor.get_processed_data()

# The five-row preview alone cannot show whether the merge matched anything:
# the rows it displays have the smallest `ip_int` values and an empty
# `country`. Report the overall share of rows that did receive a country so a
# merge that matched nothing is visible immediately.
country_match_rate = processed_data['country'].notna().mean()
print(f"Rows with a matched country: {country_match_rate:.2%}")

# Preview of the merged frame (rendered as the cell output).
processed_data.head(5)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,ip_int,lower_bound_ip_address,upper_bound_ip_address,country
0,62421,2015-02-16 00:17:05,2015-03-08 10:00:39,46,ZCLZTAJPCRAQX,Direct,Safari,M,36,52093.496895,0,52093,,,
1,173212,2015-03-08 04:03:22,2015-03-20 17:23:45,33,YFGYOALADBHLT,Ads,IE,F,30,93447.138961,0,93447,,,
2,242286,2015-05-17 16:45:54,2015-05-26 08:54:34,33,QZNVQTUITFTHH,Direct,FireFox,F,32,105818.501505,0,105818,,,
3,370003,2015-03-03 19:58:39,2015-05-28 21:09:13,33,PIBUQMBIELMMG,Ads,IE,M,40,117566.664867,0,117566,,,
4,119824,2015-03-20 00:31:27,2015-04-05 07:31:46,55,WFIIFCPIOGMHT,Ads,Safari,M,38,131423.789042,0,131423,,,
