# Import the necessary libraries

In [9]:
import pandas as pd
import sys
import os
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import pandas as pd
 
# Configure logging
# `filename` has to name a file: a bare directory path such as '../logs/'
# cannot be opened for writing by the FileHandler that basicConfig creates.
LOG_DIR = '../logs'
os.makedirs(LOG_DIR, exist_ok=True)  # the handler does not create missing directories
logging.basicConfig(filename=os.path.join(LOG_DIR, 'preprocessing.log'),
                    level=logging.INFO,
                    format='%(asctime)s:%(levelname)s:%(message)s')
  

# Put the sibling ../scripts directory (resolved against the current working
# directory) on the import path so the project's loader module can be imported.
scripts_dir = os.path.abspath(os.path.join(os.getcwd(), '../scripts'))
sys.path.append(scripts_dir)
from load_csv_data import Load_CSV_Data


# Load the data sets

In [10]:
# Read the cleaned credit card data through the project's CSV loader.
# NOTE(review): get_data() presumably returns a pandas DataFrame — confirm
# against scripts/load_csv_data.py.
credit_loader = Load_CSV_Data('../data/cleaned_credit_card_data.csv')
credit_loader.load_csv_data()
credit_df = credit_loader.get_data()

Data successfully loaded from ../data/cleaned_credit_card_data.csv


In [11]:
# Read the cleaned fraud data through the project's CSV loader.
# NOTE(review): get_data() presumably returns a pandas DataFrame — confirm
# against scripts/load_csv_data.py.
fraud_loader = Load_CSV_Data('../data/cleaned_fraud_data.csv')
fraud_loader.load_csv_data()
fraud_df = fraud_loader.get_data()

Data successfully loaded from ../data/cleaned_fraud_data.csv


In [12]:
# Read the cleaned IP-range-to-country table through the project's CSV loader.
# NOTE(review): get_data() presumably returns a pandas DataFrame — confirm
# against scripts/load_csv_data.py.
ip_loader = Load_CSV_Data('../data/cleaned_IpAddress_to_Country.csv')
ip_loader.load_csv_data()
ip_df = ip_loader.get_data()

Data successfully loaded from ../data/cleaned_IpAddress_to_Country.csv


# Merge the data sets

In [14]:
# Cast 'ip_address' in fraud_df to float before the IP-range merge below.
# NOTE(review): presumably this is done so the key dtype matches the bound
# columns of ip_df (pandas.merge_asof requires identical key dtypes) — confirm.
fraud_df['ip_address'] = fraud_df['ip_address'].astype(float)

# Merge fraud data with IP address geolocation data based on IP address range
def merge_ip_geolocation(fraud_df, ip_df):
    """
    Attach IP-range geolocation columns to the fraud records.

    Each fraud row is paired with the range in ``ip_df`` that has the largest
    ``lower_bound_ip_address`` not exceeding the row's ``ip_address``. Rows
    whose address lies above that range's ``upper_bound_ip_address``, or that
    have no candidate range at all, are dropped from the result.

    Neither input frame is modified.

    Parameters:
    fraud_df (pd.DataFrame): Fraud dataset with a numeric 'ip_address' column.
    ip_df (pd.DataFrame): IP geolocation dataset with 'lower_bound_ip_address'
        and 'upper_bound_ip_address' columns.

    Returns:
    pd.DataFrame: Fraud rows that fall inside an IP range, sorted by
        'ip_address', with the columns of the matching ``ip_df`` row appended.
    """
    # merge_asof needs both frames sorted on their respective keys.
    fraud_sorted = fraud_df.sort_values('ip_address')
    ranges_sorted = ip_df.sort_values('lower_bound_ip_address')

    # merge_asof rejects keys of different dtypes (e.g. int64 vs float64), so
    # bring numeric keys to a common float64 when they disagree. Matching
    # dtypes are left untouched so existing results are unchanged.
    ip_dtype = fraud_sorted['ip_address'].dtype
    bound_dtype = ranges_sorted['lower_bound_ip_address'].dtype
    if (ip_dtype != bound_dtype
            and pd.api.types.is_numeric_dtype(ip_dtype)
            and pd.api.types.is_numeric_dtype(bound_dtype)):
        fraud_sorted = fraud_sorted.astype({'ip_address': 'float64'})
        ranges_sorted = ranges_sorted.astype({'lower_bound_ip_address': 'float64'})

    merged_data = pd.merge_asof(
        fraud_sorted,
        ranges_sorted,
        left_on='ip_address',  # Key from the fraud data
        right_on='lower_bound_ip_address',  # Key from the IP geolocation data
        direction='backward'  # Take the closest lower bound at or below the IP
    )

    # The backward match only guarantees lower_bound <= ip; keep the rows that
    # also satisfy the upper bound (comparisons against NaN are False, which
    # drops rows that found no range).
    merged_data = merged_data[merged_data['ip_address'] <= merged_data['upper_bound_ip_address']]

    return merged_data

# Run the range-based geolocation merge on the loaded fraud and IP frames
df_merged = merge_ip_geolocation(fraud_df, ip_df)

# Preview call; its result is not assigned or printed here
df_merged.head()

# Persist the merged frame to disk without the index column
merged_csv_path = "../data/merged_fraud_geolocation_data.csv"
df_merged.to_csv(merged_csv_path, index=False)
print("Merged dataset saved as 'merged_fraud_geolocation_data.csv'")

# Feature Engineering 

In [16]:
import pandas as pd

# Re-read the merged fraud/geolocation data written by the merge step
df_merged = pd.read_csv("../data/merged_fraud_geolocation_data.csv")

# Both timestamp columns come back from CSV as text; parse them to datetimes
for time_col in ('signup_time', 'purchase_time'):
    df_merged[time_col] = pd.to_datetime(df_merged[time_col])

# Feature 1: seconds elapsed between signup and purchase
signup_to_purchase = df_merged['purchase_time'] - df_merged['signup_time']
df_merged['transaction_velocity'] = signup_to_purchase.dt.total_seconds()

# Feature 2: hour of day and day of week of the purchase
purchase_dt = df_merged['purchase_time'].dt
df_merged['purchase_hour'] = purchase_dt.hour
df_merged['purchase_day_of_week'] = purchase_dt.dayofweek

# Write the frame with the new feature columns, without the index column
df_merged.to_csv("../data/feature_engineered_data.csv", index=False)
print("Feature-engineered data saved as 'feature_engineered_data.csv'")


PermissionError: [Errno 13] Permission denied: '../data/feature_engineered_data.csv'

In [17]:
df_merged.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,lower_bound_ip_address,upper_bound_ip_address,country,transaction_velocity,purchase_hour,purchase_day_of_week
0,247547,2015-06-28 03:00:34,2015-08-09 03:57:29,47,KIXYSVCHIPQBR,SEO,Safari,F,30,16778860.0,0,16778240.0,16779263.0,Australia,3632215.0,3,6
1,220737,2015-01-28 14:21:11,2015-02-11 20:28:28,15,PKYOWQKWGJNJI,SEO,Chrome,F,34,16842050.0,0,16809984.0,16842751.0,Thailand,1231637.0,20,2
2,390400,2015-03-19 20:49:09,2015-04-11 23:41:23,44,LVCSXLISZHVUO,Ads,IE,M,29,16843660.0,0,16843264.0,16843775.0,China,1997534.0,23,5
3,69592,2015-02-24 06:11:57,2015-05-23 16:40:14,55,UHAUHNXXUADJE,Direct,Chrome,F,30,16938730.0,0,16924672.0,16941055.0,China,7640897.0,16,5
4,174987,2015-07-07 12:58:11,2015-11-03 04:04:30,51,XPGPMOHIDRMGE,SEO,Chrome,F,37,16971980.0,0,16941056.0,16973823.0,Thailand,10249579.0,4,1


# Normalization and Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Load the feature-engineered data
df_featured = pd.read_csv("../data/feature_engineered_data.csv")

# Columns to standardize. The fraud data carries the transaction amount in
# 'purchase_value'; it has no 'Amount' column, so selecting 'Amount' would
# raise a KeyError.
numeric_columns = ['transaction_velocity', 'purchase_value']

# Initialize a StandardScaler
scaler = StandardScaler()

# Fit the scaler on the selected columns and overwrite them with the
# standardized values; all other columns are left as they are.
df_featured[numeric_columns] = scaler.fit_transform(df_featured[numeric_columns])

# Save the normalized and scaled data
df_featured.to_csv("../data/normalized_scaled_data.csv", index=False)
print("Normalized and scaled data saved as 'normalized_scaled_data.csv'")


# Encoding categorical columns

In [19]:

# Categorical features that get expanded into indicator columns
categorical_columns = ['source', 'browser', 'sex', 'country']

# Read the feature-engineered data.
# NOTE(review): this reads the unscaled file, not normalized_scaled_data.csv —
# confirm that is the intended input.
df = pd.read_csv("../data/feature_engineered_data.csv")

# One-hot encode the listed features, dropping the first level of each
df_encoded = pd.get_dummies(data=df, columns=categorical_columns, drop_first=True)

# Write the encoded frame without the index column.
# NOTE(review): the file lands in the working directory, unlike the other
# outputs under ../data/ — confirm the location is intentional.
df_encoded.to_csv("encoded_fraud_data.csv", index=False)
print("Categorical features encoded and saved as 'encoded_fraud_data.csv'")


Categorical features encoded and saved as 'encoded_fraud_data.csv'
