In [18]:
# Import necessary libraries
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Add the path where telecom_analysis.py is located
sys.path.append(os.path.abspath('../scripts'))

# Import functions from telecom_analysis.py
from data_preprocessing import (
    load_data,
    data_overview,
)

In [39]:
creditcard_file_path = '../data/creditcard.csv'
fraud_data_file_path = '../data/Fraud_Data.csv'
ipaddress_to_country_file_path = '../data/IpAddress_to_Country.csv'

# Load the CSV files into DataFrames
creditcard_df = pd.read_csv(creditcard_file_path)
fraud_data_df = pd.read_csv(fraud_data_file_path)
ipaddress_to_country_df = pd.read_csv(ipaddress_to_country_file_path)

In [21]:
creditcard_df.head(4)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0


In [22]:
fraud_data_df.head(4)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0


In [23]:
ipaddress_to_country_df.head(4)

Unnamed: 0,lower_bound_ip_address,upper_bound_ip_address,country
0,16777216.0,16777471,Australia
1,16777472.0,16777727,China
2,16777728.0,16778239,China
3,16778240.0,16779263,Australia


In [24]:
# Check for missing values in each dataset
print("Missing values in Credit Card data:\n", creditcard_df.isnull().sum())
print("Missing values in Fraud Data:\n", fraud_data_df.isnull().sum())
print("Missing values in IP Address to Country Data:\n", ipaddress_to_country_df.isnull().sum())


Missing values in Credit Card data:
 Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64
Missing values in Fraud Data:
 user_id           0
signup_time       0
purchase_time     0
purchase_value    0
device_id         0
source            0
browser           0
sex               0
age               0
ip_address        0
class             0
dtype: int64
Missing values in IP Address to Country Data:
 lower_bound_ip_address    0
upper_bound_ip_address    0
country                   0
dtype: int64


In [25]:
# For this example, we will drop rows with missing values (you can use imputation if needed)
creditcard_df.dropna(inplace=True)
fraud_data_df.dropna(inplace=True)
ipaddress_to_country_df.dropna(inplace=True)

# 2. Data Cleaning
# Remove duplicates in each dataset
creditcard_df.drop_duplicates(inplace=True)
fraud_data_df.drop_duplicates(inplace=True)
ipaddress_to_country_df.drop_duplicates(inplace=True)

In [26]:
# Convert data types where appropriate
# For Fraud Data: Convert 'signup_time' and 'purchase_time' to datetime format
fraud_data_df['signup_time'] = pd.to_datetime(fraud_data_df['signup_time'])
fraud_data_df['purchase_time'] = pd.to_datetime(fraud_data_df['purchase_time'])

In [27]:
print("Data types in Fraud Data:\n", fraud_data_df.dtypes)

Data types in Fraud Data:
 user_id                    int64
signup_time       datetime64[ns]
purchase_time     datetime64[ns]
purchase_value             int64
device_id                 object
source                    object
browser                   object
sex                       object
age                        int64
ip_address               float64
class                      int64
dtype: object


In [None]:


# 3. Exploratory Data Analysis (EDA)

# --- Credit Card Dataset EDA ---
# Univariate Analysis - Distribution of transaction amounts
plt.figure(figsize=(10,6))
sns.histplot(creditcard_df['Amount'], bins=50, kde=True)
plt.title('Distribution of Transaction Amounts (Credit Card Data)')
plt.show()

# Univariate Analysis - Count of fraudulent vs non-fraudulent transactions
plt.figure(figsize=(6,4))
sns.countplot(x='Class', data=creditcard_df)
plt.title('Fraudulent vs Non-Fraudulent Transactions (Credit Card Data)')
plt.show()

# --- Fraud Data EDA ---
# Univariate Analysis - Distribution of purchase values
plt.figure(figsize=(10,6))
sns.histplot(fraud_data_df['purchase_value'], bins=50, kde=True)
plt.title('Distribution of Purchase Values (Fraud Data)')
plt.show()

# Univariate Analysis - Distribution of age
plt.figure(figsize=(10,6))
sns.histplot(fraud_data_df['age'], bins=30, kde=True)
plt.title('Distribution of Age (Fraud Data)')
plt.show()



In [None]:
# Bivariate Analysis - Correlation between numeric features in Credit Card Data
plt.figure(figsize=(12,8))
corr_matrix = creditcard_df.corr()
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Heatmap (Credit Card Data)')
plt.show()

# Bivariate Analysis - Fraud occurrence by gender in Fraud Data
plt.figure(figsize=(6,4))
sns.countplot(x='sex', hue='class', data=fraud_data_df)
plt.title('Fraud Occurrence by Gender (Fraud Data)')
plt.show()

# Time difference between signup and purchase
fraud_data_df['time_diff'] = (fraud_data_df['purchase_time'] - fraud_data_df['signup_time']).dt.total_seconds() / 3600
plt.figure(figsize=(10,6))
sns.histplot(fraud_data_df['time_diff'], bins=50, kde=True)
plt.title('Distribution of Time Difference between Signup and Purchase (Fraud Data)')
plt.show()

# --- IP Address to Country Dataset EDA ---
# Check distribution of countries
plt.figure(figsize=(12,6))
top_countries = ipaddress_to_country_df['country'].value_counts().head(10)
sns.barplot(x=top_countries.index, y=top_countries.values)
plt.title('Top 10 Countries in IP Address Range')
plt.xticks(rotation=45)
plt.show()

In [40]:
import pandas as pd


def int_to_ip(ip_int):
    """
    Converts an integer IP address back to the dot-decimal format (e.g., '192.168.0.1').
    """
    # Ensure ip_int is an integer
    if not isinstance(ip_int, int):
        raise ValueError(f"Expected an integer, got {type(ip_int).__name__}")

    return '.'.join([str((ip_int >> (i * 8)) & 0xFF) for i in range(3, -1, -1)])

# Ensure the columns are treated as integers
ipaddress_to_country_df['lower_bound_ip_address'] = ipaddress_to_country_df['lower_bound_ip_address'].fillna(0).astype(int)
ipaddress_to_country_df['upper_bound_ip_address'] = ipaddress_to_country_df['upper_bound_ip_address'].fillna(0).astype(int)

# Convert the integer IP addresses to dot-decimal format and overwrite the original columns
ipaddress_to_country_df['lower_bound_ip_address'] = ipaddress_to_country_df['lower_bound_ip_address'].apply(int_to_ip)
ipaddress_to_country_df['upper_bound_ip_address'] = ipaddress_to_country_df['upper_bound_ip_address'].apply(int_to_ip)

# Inspect the result
print(ipaddress_to_country_df)


       lower_bound_ip_address upper_bound_ip_address    country
0                     1.0.0.0              1.0.0.255  Australia
1                     1.0.1.0              1.0.1.255      China
2                     1.0.2.0              1.0.3.255      China
3                     1.0.4.0              1.0.7.255  Australia
4                     1.0.8.0             1.0.15.255      China
...                       ...                    ...        ...
138841          223.255.240.0        223.255.243.255  Hong Kong
138842          223.255.244.0        223.255.247.255      India
138843          223.255.252.0        223.255.253.255      China
138844          223.255.254.0        223.255.254.255  Singapore
138845          223.255.255.0        223.255.255.255  Australia

[138846 rows x 3 columns]


In [None]:
import pandas as pd

def int_to_ip(ip_int):
    """
    Converts an integer IP address back to the dot-decimal format (e.g., '192.168.0.1').
    """
    # Ensure ip_int is an integer
    if not isinstance(ip_int, int):
        raise ValueError(f"Expected an integer, got {type(ip_int).__name__}")

    return '.'.join([str((ip_int >> (i * 8)) & 0xFF) for i in range(3, -1, -1)])

# Sample DataFrames for demonstration (replace with your actual DataFrames)
# fraud_data_df = pd.DataFrame({
#     'user_id': [22058, 333320, 1359, 150084],
#     'signup_time': pd.to_datetime(['2015-02-24 22:55:49', '2015-06-07 20:39:50', '2015-01-01 18:52:44', '2015-04-28 21:13:25']),
#     'purchase_time': pd.to_datetime(['2015-04-18 02:47:11', '2015-06-08 01:38:54', '2015-01-01 18:52:45', '2015-05-04 13:54:50']),
#     'purchase_value': [34, 16, 15, 44],
#     'device_id': ['QVPSPJUOCKZAR', 'EOGFQPIZPYXFZ', 'YSSKYOSJHPPLJ', 'ATGTXKYKUDUQN'],
#     'source': ['SEO', 'Ads', 'SEO', 'SEO'],
#     'browser': ['Chrome', 'Chrome', 'Opera', 'Safari'],
#     'sex': ['M', 'F', 'M', 'M'],
#     'age': [39, 53, 53, 41],
#     'ip_address': [732758400, 350311400, 2621474000, 3840542000],  # Example IP addresses as integers
#     'class': [0, 0, 1, 0]
# })

# ipaddress_to_country_df = pd.DataFrame({
#     'lower_bound_ip_address': [16777216, 16777472, 16777728, 16778240],
#     'upper_bound_ip_address': [16777471, 16777727, 16778239, 16779263],
#     'country': ['Australia', 'China', 'China', 'Australia']
# })

# Step 1: Ensure the IP address columns in both DataFrames are treated as integers
fraud_data_df['ip_address'] = fraud_data_df['ip_address'].fillna(0).astype(int)

ipaddress_to_country_df['lower_bound_ip_address'] = ipaddress_to_country_df['lower_bound_ip_address'].fillna(0).astype(int)
ipaddress_to_country_df['upper_bound_ip_address'] = ipaddress_to_country_df['upper_bound_ip_address'].fillna(0).astype(int)

# Step 2: Convert the integer IP addresses in ipaddress_to_country_df to dot-decimal format
ipaddress_to_country_df['lower_bound_ip_address'] = ipaddress_to_country_df['lower_bound_ip_address'].apply(int_to_ip)
ipaddress_to_country_df['upper_bound_ip_address'] = ipaddress_to_country_df['upper_bound_ip_address'].apply(int_to_ip)

# Step 3: Inspect the results
print("Fraud Data:")
print(fraud_data_df[['user_id', 'ip_address']].head())  # Show the fraud data IPs
print("\nIP Address to Country Data:")
print(ipaddress_to_country_df.head())  # Show the converted IP address ranges


In [41]:
ipaddress_to_country_df.head(8)

Unnamed: 0,lower_bound_ip_address,upper_bound_ip_address,country
0,1.0.0.0,1.0.0.255,Australia
1,1.0.1.0,1.0.1.255,China
2,1.0.2.0,1.0.3.255,China
3,1.0.4.0,1.0.7.255,Australia
4,1.0.8.0,1.0.15.255,China
5,1.0.16.0,1.0.31.255,Japan
6,1.0.32.0,1.0.63.255,China
7,1.0.64.0,1.0.127.255,Japan


In [42]:
# Step 1: Convert 'ip_address' in fraud_data_df to integer if it's float
fraud_data_df['ip_address'] = fraud_data_df['ip_address'].fillna(0).astype('int64')

In [43]:
fraud_data_df.head(8)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758368,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311387,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621473820,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542443,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583117,0
5,159135,2015-05-21 06:03:03,2015-07-09 08:05:14,42,ALEYXFXINSXLZ,Ads,Chrome,M,18,2809315199,0
6,50116,2015-08-01 22:40:52,2015-08-27 03:37:57,11,IWKVZHJOCLPUR,Ads,Chrome,F,19,3987484328,0
7,360585,2015-04-06 07:35:45,2015-05-25 17:21:14,27,HPUCUYLMJBYFW,Ads,Opera,M,34,1692458727,0


In [None]:

# -------------------- Step 2: Merge Fraud_Data with IpAddress_to_Country --------------------

# We will use the pandas merge_asof function to merge based on IP address ranges
# This assumes that the 'ip_address_int' should fall between 'lower_bound_ip_address_int' and 'upper_bound_ip_address_int'

# First, sort the dataframes by IP addresses
fraud_data_df = fraud_data_df.sort_values('ip_address_int')
ipaddress_to_country_df = ipaddress_to_country_df.sort_values('lower_bound_ip_address_int')

# Perform the merge_asof operation
merged_df = pd.merge_asof(fraud_data_df, ipaddress_to_country_df,
                          left_on='ip_address_int',
                          right_on='lower_bound_ip_address_int',
                          direction='backward',  # We want to match the IP address to the closest lower bound
                          suffixes=('', '_country'))

# Now, we have the `country` column in the merged_df dataset
# Check the result
print(merged_df[['ip_address', 'ip_address_int', 'country']].head())


In [None]:

# -------------------- Step 3: Feature Engineering --------------------

# Feature 1: Transaction frequency and velocity (for each device_id)
# Transaction count per user and device
merged_df['user_transaction_count'] = merged_df.groupby('user_id')['user_id'].transform('count')
merged_df['device_transaction_count'] = merged_df.groupby('device_id')['device_id'].transform('count')

# Feature 2: Time-based features
# Convert 'signup_time' and 'purchase_time' to datetime format if not already
merged_df['signup_time'] = pd.to_datetime(merged_df['signup_time'])
merged_df['purchase_time'] = pd.to_datetime(merged_df['purchase_time'])

# Time difference between signup and purchase (in hours)
merged_df['time_diff_hours'] = (merged_df['purchase_time'] - merged_df['signup_time']).dt.total_seconds() / 3600

# Extract hour of the day and day of the week from purchase_time
merged_df['hour_of_day'] = merged_df['purchase_time'].dt.hour
merged_df['day_of_week'] = merged_df['purchase_time'].dt.dayofweek  # Monday=0, Sunday=6

# Feature 3: Device and IP uniqueness
# Number of unique users per device
merged_df['unique_users_per_device'] = merged_df.groupby('device_id')['user_id'].transform('nunique')

# Number of unique devices per IP address
merged_df['unique_devices_per_ip'] = merged_df.groupby('ip_address_int')['device_id'].transform('nunique')

# -------------------- Step 4: Inspect the final DataFrame --------------------
print(merged_df.head())

# Save the merged DataFrame with new features for further analysis
merged_df.to_csv('../data/merged_fraud_data_with_features.csv', index=False)

# -------------------- Step 5: Basic Summary of Feature Engineering --------------------
# Display basic statistics of new features
print("Transaction Frequency and Velocity Statistics:\n", merged_df[['user_transaction_count', 'device_transaction_count']].describe())
print("Time-Based Feature Statistics:\n", merged_df[['time_diff_hours', 'hour_of_day', 'day_of_week']].describe())
print("Device and IP Uniqueness Statistics:\n", merged_df[['unique_users_per_device', 'unique_devices_per_ip']].describe())
