In [1]:
# Import necessary libraries
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Add the path where telecom_analysis.py is located
sys.path.append(os.path.abspath('../scripts'))


# Import functions from telecom_analysis.py
from data_preprocessing import (
    load_data,
    data_overview,
    standardize_numerical_features,
    encode_categorical_features
)

In [2]:
creditcard_file_path = '../data/creditcard.csv'
fraud_data_file_path = '../data/Fraud_Data.csv'
ipaddress_to_country_file_path = '../data/IpAddress_to_Country.csv'

# Load the CSV files into DataFrames
creditcard_df = pd.read_csv(creditcard_file_path)
fraud_data_df = pd.read_csv(fraud_data_file_path)
ipaddress_to_country_df = pd.read_csv(ipaddress_to_country_file_path)

In [None]:
creditcard_df.head(4)


In [None]:
fraud_data_df.head(4)

In [None]:
ipaddress_to_country_df.head(4)

In [3]:
# Check for missing values in each dataset
print("Missing values in Credit Card data:\n", creditcard_df.isnull().sum())
print("Missing values in Fraud Data:\n", fraud_data_df.isnull().sum())
print("Missing values in IP Address to Country Data:\n", ipaddress_to_country_df.isnull().sum())


Missing values in Credit Card data:
 Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64
Missing values in Fraud Data:
 user_id           0
signup_time       0
purchase_time     0
purchase_value    0
device_id         0
source            0
browser           0
sex               0
age               0
ip_address        0
class             0
dtype: int64
Missing values in IP Address to Country Data:
 lower_bound_ip_address    0
upper_bound_ip_address    0
country                   0
dtype: int64


In [4]:
# For this example, we will drop rows with missing values (you can use imputation if needed)
creditcard_df.dropna(inplace=True)
fraud_data_df.dropna(inplace=True)
ipaddress_to_country_df.dropna(inplace=True)

# 2. Data Cleaning
# Remove duplicates in each dataset
creditcard_df.drop_duplicates(inplace=True)
fraud_data_df.drop_duplicates(inplace=True)
ipaddress_to_country_df.drop_duplicates(inplace=True)

In [5]:
# Convert data types where appropriate
# For Fraud Data: Convert 'signup_time' and 'purchase_time' to datetime format
fraud_data_df['signup_time'] = pd.to_datetime(fraud_data_df['signup_time'])
fraud_data_df['purchase_time'] = pd.to_datetime(fraud_data_df['purchase_time'])

In [6]:
print("Data types in Fraud Data:\n", fraud_data_df.dtypes)

Data types in Fraud Data:
 user_id                    int64
signup_time       datetime64[ns]
purchase_time     datetime64[ns]
purchase_value             int64
device_id                 object
source                    object
browser                   object
sex                       object
age                        int64
ip_address               float64
class                      int64
dtype: object


In [None]:


# 3. Exploratory Data Analysis (EDA)

# --- Credit Card Dataset EDA ---
# Univariate Analysis - Distribution of transaction amounts
plt.figure(figsize=(10,6))
sns.histplot(creditcard_df['Amount'], bins=50, kde=True)
plt.title('Distribution of Transaction Amounts (Credit Card Data)')
plt.show()

# Univariate Analysis - Count of fraudulent vs non-fraudulent transactions
plt.figure(figsize=(6,4))
sns.countplot(x='Class', data=creditcard_df)
plt.title('Fraudulent vs Non-Fraudulent Transactions (Credit Card Data)')
plt.show()

# --- Fraud Data EDA ---
# Univariate Analysis - Distribution of purchase values
plt.figure(figsize=(10,6))
sns.histplot(fraud_data_df['purchase_value'], bins=50, kde=True)
plt.title('Distribution of Purchase Values (Fraud Data)')
plt.show()

# Univariate Analysis - Distribution of age
plt.figure(figsize=(10,6))
sns.histplot(fraud_data_df['age'], bins=30, kde=True)
plt.title('Distribution of Age (Fraud Data)')
plt.show()



In [None]:
# Bivariate Analysis - Correlation between numeric features in Credit Card Data
plt.figure(figsize=(12,8))
corr_matrix = creditcard_df.corr()
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Heatmap (Credit Card Data)')
plt.show()

# Bivariate Analysis - Fraud occurrence by gender in Fraud Data
plt.figure(figsize=(6,4))
sns.countplot(x='sex', hue='class', data=fraud_data_df)
plt.title('Fraud Occurrence by Gender (Fraud Data)')
plt.show()

# Time difference between signup and purchase
fraud_data_df['time_diff'] = (fraud_data_df['purchase_time'] - fraud_data_df['signup_time']).dt.total_seconds() / 3600
plt.figure(figsize=(10,6))
sns.histplot(fraud_data_df['time_diff'], bins=50, kde=True)
plt.title('Distribution of Time Difference between Signup and Purchase (Fraud Data)')
plt.show()

# --- IP Address to Country Dataset EDA ---
# Check distribution of countries
plt.figure(figsize=(12,6))
top_countries = ipaddress_to_country_df['country'].value_counts().head(10)
sns.barplot(x=top_countries.index, y=top_countries.values)
plt.title('Top 10 Countries in IP Address Range')
plt.xticks(rotation=45)
plt.show()

In [7]:
import pandas as pd

# Function to convert an integer to a dot-decimal IP format
def int_to_ip(ip_int):
    """
    Converts an integer IP address back to the dot-decimal format (e.g., '192.168.0.1').
    """
    # Ensure ip_int is an integer
    if not isinstance(ip_int, int):
        raise ValueError(f"Expected an integer, got {type(ip_int).__name__}")

    return '.'.join([str((ip_int >> (i * 8)) & 0xFF) for i in range(3, -1, -1)])


# Step 1: Ensure the IP address columns in both DataFrames are treated as integers
fraud_data_df['ip_address'] = fraud_data_df['ip_address'].fillna(0).astype(int)
ipaddress_to_country_df['lower_bound_ip_address'] = ipaddress_to_country_df['lower_bound_ip_address'].fillna(0).astype(int)
ipaddress_to_country_df['upper_bound_ip_address'] = ipaddress_to_country_df['upper_bound_ip_address'].fillna(0).astype(int)

# Step 2: Convert the integer IP addresses to dot-decimal format (overwrite the columns)
ipaddress_to_country_df['lower_bound_ip_address'] = ipaddress_to_country_df['lower_bound_ip_address'].apply(int_to_ip)
ipaddress_to_country_df['upper_bound_ip_address'] = ipaddress_to_country_df['upper_bound_ip_address'].apply(int_to_ip)
fraud_data_df['ip_address'] = fraud_data_df['ip_address'].apply(int_to_ip)

# Step 3: Inspect the results
print("Fraud Data (with dot-decimal IPs):")
print(fraud_data_df[['user_id', 'ip_address']].head())  # Show the fraud data IPs
print("\nIP Address to Country Data:")
print(ipaddress_to_country_df.head())  # Show the converted IP address ranges


Fraud Data (with dot-decimal IPs):
   user_id     ip_address
0    22058    43.173.1.96
1   333320  20.225.83.219
2     1359  156.64.132.28
3   150084  228.234.6.235
4   221365  24.197.75.141

IP Address to Country Data:
  lower_bound_ip_address upper_bound_ip_address    country
0                1.0.0.0              1.0.0.255  Australia
1                1.0.1.0              1.0.1.255      China
2                1.0.2.0              1.0.3.255      China
3                1.0.4.0              1.0.7.255  Australia
4                1.0.8.0             1.0.15.255      China


In [8]:
import pandas as pd

# Function to convert IP address from dot-decimal format to integer format
def ip_to_int(ip_str):
    """
    Converts a dot-decimal IP address (e.g., '192.168.0.1') to its integer equivalent.
    """
    octets = ip_str.split('.')
    return (int(octets[0]) << 24) + (int(octets[1]) << 16) + (int(octets[2]) << 8) + int(octets[3])


# Step 2: Convert IP addresses to integer format
fraud_data_df['ip_address'] = fraud_data_df['ip_address'].apply(ip_to_int)
ipaddress_to_country_df['lower_bound_ip_address'] = ipaddress_to_country_df['lower_bound_ip_address'].apply(ip_to_int)
ipaddress_to_country_df['upper_bound_ip_address'] = ipaddress_to_country_df['upper_bound_ip_address'].apply(ip_to_int)

# Step 3: Merge the two datasets based on IP address ranges
# We want to find the rows where fraud_data_df['ip_address'] is between lower_bound_ip_address and upper_bound_ip_address
fraud_data_df= pd.merge_asof(
    fraud_data_df.sort_values('ip_address'),
    ipaddress_to_country_df.sort_values('lower_bound_ip_address'),
    left_on='ip_address',
    right_on='lower_bound_ip_address',
    direction='backward'  # Ensures we find the closest lower_bound_ip_address <= ip_address
)

# Step 4: Filter rows where ip_address falls between lower_bound and upper_bound
fraud_data_df = fraud_data_df[(fraud_data_df['ip_address'] >= fraud_data_df['lower_bound_ip_address']) & 
                      (fraud_data_df['ip_address'] <= fraud_data_df['upper_bound_ip_address'])]

# Step 5: Clean up (optional, based on your needs)
# You may drop unnecessary columns or rename them as required
fraud_data_df = fraud_data_df.drop(columns=['lower_bound_ip_address', 'upper_bound_ip_address'])

# Now, fraud_data_df contains fraud data along with the corresponding country information for each IP address
print(fraud_data_df.head())


     user_id         signup_time       purchase_time  purchase_value  \
634   247547 2015-06-28 03:00:34 2015-08-09 03:57:29              47   
635   220737 2015-01-28 14:21:11 2015-02-11 20:28:28              15   
636   390400 2015-03-19 20:49:09 2015-04-11 23:41:23              44   
637    69592 2015-02-24 06:11:57 2015-05-23 16:40:14              55   
638   174987 2015-07-07 12:58:11 2015-11-03 04:04:30              51   

         device_id  source browser sex  age  ip_address  class    country  
634  KIXYSVCHIPQBR     SEO  Safari   F   30    16778864      0  Australia  
635  PKYOWQKWGJNJI     SEO  Chrome   F   34    16842045      0   Thailand  
636  LVCSXLISZHVUO     Ads      IE   M   29    16843656      0      China  
637  UHAUHNXXUADJE  Direct  Chrome   F   30    16938732      0      China  
638  XPGPMOHIDRMGE     SEO  Chrome   F   37    16971984      0   Thailand  


In [9]:
fraud_data_df.head(5)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country
634,247547,2015-06-28 03:00:34,2015-08-09 03:57:29,47,KIXYSVCHIPQBR,SEO,Safari,F,30,16778864,0,Australia
635,220737,2015-01-28 14:21:11,2015-02-11 20:28:28,15,PKYOWQKWGJNJI,SEO,Chrome,F,34,16842045,0,Thailand
636,390400,2015-03-19 20:49:09,2015-04-11 23:41:23,44,LVCSXLISZHVUO,Ads,IE,M,29,16843656,0,China
637,69592,2015-02-24 06:11:57,2015-05-23 16:40:14,55,UHAUHNXXUADJE,Direct,Chrome,F,30,16938732,0,China
638,174987,2015-07-07 12:58:11,2015-11-03 04:04:30,51,XPGPMOHIDRMGE,SEO,Chrome,F,37,16971984,0,Thailand


In [16]:

# # Convert the `purchase_time` and `signup_time` to datetime objects for easier manipulation
# fraud_data_df['purchase_time'] = pd.to_datetime(fraud_data_df['purchase_time'])
# fraud_data_df['signup_time'] = pd.to_datetime(fraud_data_df['signup_time'])

# # Step 1: Transaction Frequency per User
# # Group by `user_id` to count the number of transactions each user made
# fraud_data_df['transaction_count'] = fraud_data_df.groupby('user_id')['user_id'].transform('count')

# # Step 2: Transaction Velocity
# # Sort data by user_id and purchase_time to calculate velocity
# fraud_data_df = fraud_data_df.sort_values(by=['user_id', 'purchase_time'])

# # Calculate time difference between consecutive purchases for the same user
# fraud_data_df['purchase_diff'] = fraud_data_df.groupby('user_id')['purchase_time'].diff().dt.total_seconds()

# # Step 3: Extracting Hour of the Day and Day of the Week from `purchase_time`
# fraud_data_df['hour_of_day'] = fraud_data_df['purchase_time'].dt.hour
# fraud_data_df['day_of_week'] = fraud_data_df['purchase_time'].dt.dayofweek  # Monday=0, Sunday=6




In [10]:
# Example: Calculate frequency
fraud_data_df['transaction_frequency'] = fraud_data_df.groupby('user_id')['purchase_time'].transform('count')
fraud_data_df['transaction_velocity'] = (fraud_data_df['purchase_time'] - fraud_data_df['signup_time']).dt.total_seconds()

# Extracting hour and day from purchase time
fraud_data_df['hour_of_day'] = fraud_data_df['purchase_time'].dt.hour
fraud_data_df['day_of_week'] = fraud_data_df['purchase_time'].dt.dayofweek

In [None]:
# # Step 4: Save the enhanced DataFrame with new features
# fraud_data_df.to_csv('data/Fraud_Data_with_features.csv', index=False)

# Display the new DataFrame head
fraud_data_df.head(5)

In [11]:
# Columns you don't want to scale, including 'class'
exclude_columns = ['device_id', 'source', 'browser', 'sex', 'ip_address', 'class', 'country']

# Apply the function
df = standardize_numerical_features(fraud_data_df, exclude_columns)

In [16]:
fraud_data_df = standardize_numerical_features(fraud_data_df)

creditcard_df = standardize_numerical_features(creditcard_df)
  


In [12]:
fraud_data_df.head(5)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country,transaction_frequency,transaction_velocity,hour_of_day,day_of_week
634,0.411032,2015-06-28 03:00:34,2015-08-09 03:57:29,0.549607,KIXYSVCHIPQBR,SEO,Safari,F,-0.363124,16778864,0,Australia,0.0,-0.4138,3,6
635,0.178626,2015-01-28 14:21:11,2015-02-11 20:28:28,-1.197335,PKYOWQKWGJNJI,SEO,Chrome,F,0.101168,16842045,0,Thailand,0.0,-1.180852,20,2
636,1.649372,2015-03-19 20:49:09,2015-04-11 23:41:23,0.385831,LVCSXLISZHVUO,Ads,IE,M,-0.479197,16843656,0,China,0.0,-0.936126,23,5
637,-1.131594,2015-02-24 06:11:57,2015-05-23 16:40:14,0.986342,UHAUHNXXUADJE,Direct,Chrome,F,-0.363124,16938732,0,China,0.0,0.867086,16,5
638,-0.217963,2015-07-07 12:58:11,2015-11-03 04:04:30,0.767974,XPGPMOHIDRMGE,SEO,Chrome,F,0.449387,16971984,0,Thailand,0.0,1.700633,4,1


In [21]:
fraud_data_df = encode_categorical_features(fraud_data_df)
  


In [22]:
# # Identify categorical columns
# categorical_columns = fraud_data_df.select_dtypes(include=['object', 'category']).columns.tolist()

# # One-Hot Encode categorical features
# fraud_data_df = pd.get_dummies(fraud_data_df, columns=categorical_columns, drop_first=True)


In [14]:
from sklearn.preprocessing import LabelEncoder

# Identify categorical columns
categorical_columns = fraud_data_df.select_dtypes(include=['object', 'category']).columns.tolist()

# Initialize LabelEncoder
label_encoders = {}

# Apply Label Encoding for each categorical column
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    fraud_data_df[col] = label_encoders[col].fit_transform(fraud_data_df[col])



In [13]:
creditcard_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 283726 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    283726 non-null  float64
 1   V1      283726 non-null  float64
 2   V2      283726 non-null  float64
 3   V3      283726 non-null  float64
 4   V4      283726 non-null  float64
 5   V5      283726 non-null  float64
 6   V6      283726 non-null  float64
 7   V7      283726 non-null  float64
 8   V8      283726 non-null  float64
 9   V9      283726 non-null  float64
 10  V10     283726 non-null  float64
 11  V11     283726 non-null  float64
 12  V12     283726 non-null  float64
 13  V13     283726 non-null  float64
 14  V14     283726 non-null  float64
 15  V15     283726 non-null  float64
 16  V16     283726 non-null  float64
 17  V17     283726 non-null  float64
 18  V18     283726 non-null  float64
 19  V19     283726 non-null  float64
 20  V20     283726 non-null  float64
 21  V21     283726 

In [17]:
fraud_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 129146 entries, 634 to 131728
Data columns (total 16 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   user_id                129146 non-null  float64       
 1   signup_time            129146 non-null  datetime64[ns]
 2   purchase_time          129146 non-null  datetime64[ns]
 3   purchase_value         129146 non-null  float64       
 4   device_id              129146 non-null  int64         
 5   source                 129146 non-null  int64         
 6   browser                129146 non-null  int64         
 7   sex                    129146 non-null  int64         
 8   age                    129146 non-null  float64       
 9   ip_address             129146 non-null  int64         
 10  fraud_class            129146 non-null  int64         
 11  country                129146 non-null  int64         
 12  transaction_frequency  129146 non-null  float64

In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler
import time
import warnings
from sklearn.exceptions import ConvergenceWarning

# Ignore convergence warnings for MLP
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Rename the 'class' column to avoid conflict with Python's reserved keywords
fraud_data_df.rename(columns={'class': 'fraud_class'}, inplace=True)

# Data Preparation function (removing standardization, as you've done it already)
def prepare_data(df, target_col, drop_cols):
    X = df.drop(drop_cols, axis=1)
    y = df[target_col]
    
    return X, y

# Model Training and Evaluation function
def evaluate_models(X_train, X_test, y_train, y_test, models):
    for name, model in models.items():
        try:
            print(f"Training {name}...")
            start_time = time.time()
            model.fit(X_train, y_train)
            end_time = time.time()

            y_pred = model.predict(X_test)
            print(f"Model: {name}")
            print(f"Training Time: {end_time - start_time:.2f} seconds")
            print(classification_report(y_test, y_pred))

            # ROC AUC score calculation (for models that support predict_proba)
            if hasattr(model, "predict_proba"):
                y_pred_proba = model.predict_proba(X_test)[:, 1]
                print(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba)}\n")
            else:
                print(f"{name} does not support probability prediction.\n")
        
        except Exception as e:
            print(f"Error with model {name}: {str(e)}\n")

# Define models
models = {
    'Logistic Regression': LogisticRegression(max_iter=2000, class_weight='balanced'),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(class_weight='balanced'),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=50),  # Simplified for faster training
    'MLP': MLPClassifier(max_iter=1000),  # Increased iterations
}

# Fraud data processing
X_fraud, y_fraud = prepare_data(fraud_data_df, 'fraud_class', ['fraud_class', 'signup_time', 'purchase_time', 'ip_address'])

# Convert target variable to integers (in case they are float)
y_fraud = y_fraud.astype(int)

# Standardize the features for MLP and models that require scaling
scaler = StandardScaler()
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(X_fraud, y_fraud, test_size=0.2, random_state=42)
X_train_fraud_scaled = scaler.fit_transform(X_train_fraud)
X_test_fraud_scaled = scaler.transform(X_test_fraud)

# Evaluate models on fraud data (use scaled data for MLP)
print("Fraud Data Results:\n")
evaluate_models(X_train_fraud_scaled, X_test_fraud_scaled, y_train_fraud, y_test_fraud, models)


Fraud Data Results:

Model: Logistic Regression
Training Time: 18.39 seconds
              precision    recall  f1-score   support

           0       0.95      0.65      0.77     23427
           1       0.17      0.70      0.27      2403

    accuracy                           0.65     25830
   macro avg       0.56      0.67      0.52     25830
weighted avg       0.88      0.65      0.72     25830

ROC AUC: 0.7581122229844558

Model: Decision Tree
Training Time: 4.24 seconds
              precision    recall  f1-score   support

           0       0.95      0.94      0.95     23427
           1       0.48      0.57      0.52      2403

    accuracy                           0.90     25830
   macro avg       0.72      0.75      0.73     25830
weighted avg       0.91      0.90      0.91     25830

ROC AUC: 0.7523471189250088

Model: Random Forest
Training Time: 60.04 seconds
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     23427
   

In [None]:
print(y_fraud.unique())

In [None]:
print(fraud_data_df['fraud_class'].unique())

In [None]:
fraud_data_df.head(5)

In [None]:

# Credit card data processing
# Assuming creditcard_data is a DataFrame and 'Class' is the target
X_creditcard, y_creditcard = prepare_data(creditcard_data, 'Class', ['Class'])
X_train_creditcard, X_test_creditcard, y_train_creditcard, y_test_creditcard = train_test_split(X_creditcard, y_creditcard, test_size=0.2, random_state=42)

# Evaluate models on credit card data
print("Credit Card Data Results:\n")
evaluate_models(X_train_creditcard, X_test_creditcard, y_train_creditcard, y_test_creditcard, models)
