# Import the necessary libraries

In [5]:
import pandas as pd
import sys
import os
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import pandas as pd
 
# Configure logging
# logging.basicConfig(filename=...) opens the given path as a file, so it must
# name a log file rather than a directory. Make sure the log directory exists
# first, then write INFO-and-above records to a file inside it.
LOG_DIR = '../logs'
os.makedirs(LOG_DIR, exist_ok=True)
logging.basicConfig(filename=os.path.join(LOG_DIR, 'merge_datasets.log'),
                    level=logging.INFO,
                    format='%(asctime)s:%(levelname)s:%(message)s')
  

# Make the sibling ../scripts directory (resolved against the current working
# directory) importable, then pull in the CSV loader defined there.
scripts_dir = os.path.abspath(os.path.join(os.getcwd(), '../scripts'))
sys.path.append(scripts_dir)
from load_csv_data import Load_CSV_Data


# Load the data sets

In [6]:
# Load the cleaned credit-card data set. The loader object and the loaded
# data get separate names so `credit_df` only ever refers to the data.
# NOTE(review): get_data() presumably returns a pandas DataFrame - confirm in
# scripts/load_csv_data.py.
credit_loader = Load_CSV_Data('../data/cleaned_credit_card_data.csv')
credit_loader.load_csv_data()
credit_df = credit_loader.get_data()

Data successfully loaded from ../data/cleaned_credit_card_data.csv


In [7]:
# Load the cleaned fraud data set. The loader object and the loaded data get
# separate names so `fraud_df` only ever refers to the data.
# NOTE(review): get_data() presumably returns a pandas DataFrame - confirm in
# scripts/load_csv_data.py.
fraud_loader = Load_CSV_Data('../data/cleaned_fraud_data.csv')
fraud_loader.load_csv_data()
fraud_df = fraud_loader.get_data()

Data successfully loaded from ../data/cleaned_fraud_data.csv


In [8]:
# Load the cleaned IP-range-to-country lookup table. The loader object and the
# loaded data get separate names so `ip_df` only ever refers to the data.
# NOTE(review): get_data() presumably returns a pandas DataFrame - confirm in
# scripts/load_csv_data.py.
ip_loader = Load_CSV_Data('../data/cleaned_IpAddress_to_Country.csv')
ip_loader.load_csv_data()
ip_df = ip_loader.get_data()

Data successfully loaded from ../data/cleaned_IpAddress_to_Country.csv


# Merge the data sets

In [9]:
# Cast fraud_df's 'ip_address' column to float before the range merge below.
# NOTE(review): presumably done so its dtype matches the IP-range bounds in
# ip_df, which pd.merge_asof requires of its key columns - confirm.
ip_address_as_float = fraud_df['ip_address'].astype(float)
fraud_df['ip_address'] = ip_address_as_float

# Merge fraud data with IP address geolocation data based on IP address range
def merge_ip_geolocation(fraud_df, ip_df):
    """
    Merges fraud dataset with IP geolocation dataset.

    Each fraud row is paired with the IP range whose lower bound is the
    largest one not exceeding the row's 'ip_address'; the pairing is kept
    only if the address is also <= that range's upper bound. Fraud rows that
    fall in no range are dropped from the result. Neither input is modified.

    Parameters:
    fraud_df (pd.DataFrame): Fraud dataset with 'ip_address' column.
    ip_df (pd.DataFrame): IP geolocation dataset with 'lower_bound_ip_address' and 'upper_bound_ip_address' columns.

    Returns:
    pd.DataFrame: Merged dataset (fraud columns followed by the matched
    geolocation columns), ordered by 'ip_address'.
    """
    fraud_sorted = fraud_df.sort_values('ip_address')  # merge_asof needs sorted keys
    ranges_sorted = ip_df.sort_values('lower_bound_ip_address')

    # merge_asof raises MergeError when the two key columns have different
    # dtypes (e.g. int64 addresses vs float64 bounds). Align them on float64,
    # which represents every IPv4 integer exactly. Keys that already share a
    # dtype are left untouched so the output dtypes stay as they were.
    if fraud_sorted['ip_address'].dtype != ranges_sorted['lower_bound_ip_address'].dtype:
        fraud_sorted = fraud_sorted.assign(
            ip_address=fraud_sorted['ip_address'].astype('float64')
        )
        ranges_sorted = ranges_sorted.assign(
            lower_bound_ip_address=ranges_sorted['lower_bound_ip_address'].astype('float64')
        )

    merged_data = pd.merge_asof(
        fraud_sorted,
        ranges_sorted,
        left_on='ip_address',  # Key from the fraud data
        right_on='lower_bound_ip_address',  # Key from the IP geolocation data
        direction='backward'  # Nearest lower bound at or below the address
    )

    # Keep only rows whose address is inside the matched range. Rows with no
    # candidate range have a NaN upper bound, compare False, and are dropped.
    within_range = merged_data['ip_address'] <= merged_data['upper_bound_ip_address']
    return merged_data[within_range]

# Apply the merging function
df_merged = merge_ip_geolocation(fraud_df=fraud_df, ip_df=ip_df)

# Preview call; its result is not the cell's last expression, so nothing is
# displayed from here.
df_merged.head()

# Persist the merged data (without the index column) and confirm on stdout
merged_output_path = "../data/merged_fraud_geolocation_data.csv"
df_merged.to_csv(merged_output_path, index=False)
print("Merged dataset saved as 'merged_fraud_geolocation_data.csv'")

Merged dataset saved as 'merged_fraud_geolocation_data.csv'
