# KAIM Week 8 and 9 Challenges

## **Task 2: MOdel Building**

## Import Necessary Libraries

In [30]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import warnings

warnings.filterwarnings('ignore')

# Set plot style for better visuals
sns.set(style="whitegrid")

## Load Datasets

In [2]:
# Load the datasets
fraud_data = pd.read_csv('../data/Fraud_Data.csv')
ip_data = pd.read_csv('../data/IpAddress_to_Country.csv')
credit_data = pd.read_csv('../data/creditcard.csv')

## Data Overview

In [4]:
fraud_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   user_id         151112 non-null  int64  
 1   signup_time     151112 non-null  object 
 2   purchase_time   151112 non-null  object 
 3   purchase_value  151112 non-null  int64  
 4   device_id       151112 non-null  object 
 5   source          151112 non-null  object 
 6   browser         151112 non-null  object 
 7   sex             151112 non-null  object 
 8   age             151112 non-null  int64  
 9   ip_address      151112 non-null  float64
 10  class           151112 non-null  int64  
dtypes: float64(1), int64(4), object(6)
memory usage: 12.7+ MB


## Handling Missing Values

In [10]:
# Check for missing values in fraud_data
print(fraud_data.isnull().sum())

user_id           0
signup_time       0
purchase_time     0
purchase_value    0
device_id         0
source            0
browser           0
sex               0
age               0
ip_address        0
class             0
dtype: int64


**No Missing value to handle**

## Data Cleaning

### Duplicate Vaues

In [8]:
# Remove duplicates
fraud_data = fraud_data.drop_duplicates()
credit_data = credit_data.drop_duplicates()
ip_data = ip_data.drop_duplicates()
# Confirm if duplicates are removed
print(f"Remaining duplicates in fraud_data: {fraud_data.duplicated().sum()}")
print(f"Remaining duplicates in credit_data: {credit_data.duplicated().sum()}")
print(f"Remaining duplicates in ip_data: {ip_data.duplicated().sum()}")

Remaining duplicates in fraud_data: 0
Remaining duplicates in credit_data: 0
Remaining duplicates in ip_data: 0


## Correct Data Type

In [9]:
# Check and convert data types
fraud_data['signup_time'] = pd.to_datetime(fraud_data['signup_time'])
fraud_data['purchase_time'] = pd.to_datetime(fraud_data['purchase_time'])

# Ensure categorical features are properly encoded
fraud_data['source'] = fraud_data['source'].astype('category')
fraud_data['browser'] = fraud_data['browser'].astype('category')
fraud_data['sex'] = fraud_data['sex'].astype('category')

In [10]:
# Confirm data types
fraud_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   user_id         151112 non-null  int64         
 1   signup_time     151112 non-null  datetime64[ns]
 2   purchase_time   151112 non-null  datetime64[ns]
 3   purchase_value  151112 non-null  int64         
 4   device_id       151112 non-null  object        
 5   source          151112 non-null  category      
 6   browser         151112 non-null  category      
 7   sex             151112 non-null  category      
 8   age             151112 non-null  int64         
 9   ip_address      151112 non-null  float64       
 10  class           151112 non-null  int64         
dtypes: category(3), datetime64[ns](2), float64(1), int64(4), object(1)
memory usage: 9.7+ MB


In [11]:
# Encode categorical variables using LabelEncoder
label_encoder = LabelEncoder()

fraud_data['source'] = label_encoder.fit_transform(fraud_data['source'])
fraud_data['browser'] = label_encoder.fit_transform(fraud_data['browser'])
fraud_data['sex'] = label_encoder.fit_transform(fraud_data['sex'])


# Check the dataset after encoding
fraud_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   user_id         151112 non-null  int64         
 1   signup_time     151112 non-null  datetime64[ns]
 2   purchase_time   151112 non-null  datetime64[ns]
 3   purchase_value  151112 non-null  int64         
 4   device_id       151112 non-null  object        
 5   source          151112 non-null  int64         
 6   browser         151112 non-null  int64         
 7   sex             151112 non-null  int64         
 8   age             151112 non-null  int64         
 9   ip_address      151112 non-null  float64       
 10  class           151112 non-null  int64         
dtypes: datetime64[ns](2), float64(1), int64(7), object(1)
memory usage: 12.7+ MB


## Merge Dataset for Geolocation Analysis

### Convert IP Address into Integer format

In [12]:
def convert_ip_to_int(ip_address):
    try:
        # Ensure IP address is a valid string before conversion
        if isinstance(ip_address, str):
            return int(ip_address.replace('.', ''))
        else:
            return None
    except Exception as e:
        print(f"Error converting IP address {ip_address}: {e}")
        return None

In [13]:
 # Ensure 'ip_address' is treated as a string to handle missing values
fraud_data['ip_address'] = fraud_data['ip_address'].astype(str)

# Convert IP addresses to integer format, handling errors
fraud_data['ip_address'] = fraud_data['ip_address'].apply(lambda x: convert_ip_to_int(x) if x != 'nan' else None)

In [14]:
fraud_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   user_id         151112 non-null  int64         
 1   signup_time     151112 non-null  datetime64[ns]
 2   purchase_time   151112 non-null  datetime64[ns]
 3   purchase_value  151112 non-null  int64         
 4   device_id       151112 non-null  object        
 5   source          151112 non-null  int64         
 6   browser         151112 non-null  int64         
 7   sex             151112 non-null  int64         
 8   age             151112 non-null  int64         
 9   ip_address      151112 non-null  int64         
 10  class           151112 non-null  int64         
dtypes: datetime64[ns](2), int64(8), object(1)
memory usage: 12.7+ MB


In [16]:
# Ensure both 'lower_bound_ip_address' and 'upper_bound_ip_address' in ip_data are strings
ip_data['lower_bound_ip_address'] = ip_data['lower_bound_ip_address'].astype(str)
ip_data['upper_bound_ip_address'] = ip_data['upper_bound_ip_address'].astype(str)

# Convert the IP addresses in the IP-to-country dataset to integer format
ip_data['lower_bound_ip_addres'] = ip_data['lower_bound_ip_address'].apply(lambda x: convert_ip_to_int(x) if x != 'nan' else None)
ip_data['upper_bound_ip_adress'] = ip_data['upper_bound_ip_address'].apply(lambda x: convert_ip_to_int(x) if x != 'nan' else None)
ip_data = ip_data.iloc[:, 2:]
ip_data.head()

Unnamed: 0,country,lower_bound_ip_addres,upper_bound_ip_adress
0,Australia,167772160,16777471
1,China,167774720,16777727
2,China,167777280,16778239
3,Australia,167782400,16779263
4,China,167792640,16781311


### Merge Fraud Dataset with IP Dataset

In [17]:
merged_df = pd.concat([fraud_data, ip_data], axis = 1)


In [18]:
merged_df.isnull().sum()

user_id                      0
signup_time                  0
purchase_time                0
purchase_value               0
device_id                    0
source                       0
browser                      0
sex                          0
age                          0
ip_address                   0
class                        0
country                  12266
lower_bound_ip_addres    12266
upper_bound_ip_adress    12266
dtype: int64

In [19]:
fraud_data_combined = merged_df.dropna()
fraud_data_combined.shape

(138846, 14)

## Feature Engineering

### Tansaction Frequency and Velocity

In [20]:
# Calculate transaction frequency and velocity
fraud_data_combined['signup_purchase_diff'] = (fraud_data_combined['purchase_time'] - fraud_data_combined['signup_time']).dt.total_seconds()

# Calculate total transactions per user
fraud_data_combined['transaction_count'] = fraud_data_combined.groupby('user_id')['user_id'].transform('count')

### Time Based Features

In [21]:
# Extract hour of the day and day of the week
fraud_data_combined['hour_of_day'] = fraud_data_combined['purchase_time'].dt.hour
fraud_data_combined['day_of_week'] = fraud_data_combined['purchase_time'].dt.dayofweek

## Normalization and Scaling

In [22]:
# Normalize the transaction amount and signup_purchase_diff
scaler = StandardScaler()

fraud_data_combined[['purchase_value', 'signup_purchase_diff']] = scaler.fit_transform(fraud_data_combined[['purchase_value', 'signup_purchase_diff']])

## Encode Categorical Features

In [23]:
# Encode categorical variables using LabelEncoder
label_encoder = LabelEncoder()

fraud_data_combined['source'] = label_encoder.fit_transform(fraud_data_combined['source'])
fraud_data_combined['browser'] = label_encoder.fit_transform(fraud_data_combined['browser'])
fraud_data_combined['sex'] = label_encoder.fit_transform(fraud_data_combined['sex'])
fraud_data_combined['country'] = label_encoder.fit_transform(fraud_data_combined['country'])

# Check the dataset after encoding
fraud_data_combined.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country,lower_bound_ip_addres,upper_bound_ip_adress,signup_purchase_diff,transaction_count,hour_of_day,day_of_week
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,-0.159776,QVPSPJUOCKZAR,2,0,1,39,73275836879972,0,11,167772160.0,16777471.0,-0.136131,1,2,5
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,-1.142137,EOGFQPIZPYXFZ,0,0,0,53,350311387865908,0,42,167774720.0,16777727.0,-1.571694,1,1,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,-1.196713,YSSKYOSJHPPLJ,2,3,1,53,262147382011095,1,42,167777280.0,16778239.0,-1.577432,1,18,3
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,0.385981,ATGTXKYKUDUQN,2,4,1,41,384054244391396,0,11,167782400.0,16779263.0,-1.420057,1,13,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,0.113102,NAUITBZFJKHWW,0,4,1,45,415583117452712,0,42,167792640.0,16781311.0,-0.182575,1,18,2


In [24]:
fraud_data_combined.columns

Index(['user_id', 'signup_time', 'purchase_time', 'purchase_value',
       'device_id', 'source', 'browser', 'sex', 'age', 'ip_address', 'class',
       'country', 'lower_bound_ip_addres', 'upper_bound_ip_adress',
       'signup_purchase_diff', 'transaction_count', 'hour_of_day',
       'day_of_week'],
      dtype='object')

## Feature Importance (Preliminary Model Insights)

In [25]:
# Prepare data for the model (e-commerce)
X = fraud_data_combined.drop(columns=['class', 'device_id', 'signup_time', 'purchase_time'])
y = fraud_data_combined['class']

np.unique(y, return_counts=True)

(array([0, 1]), array([125849,  12997]))

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



### Logistic Regression

In [33]:
log_reg = LogisticRegression(C=1, solver='liblinear')
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)
print(classification_report(y_test, y_pred_log_reg))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95     25193
           1       0.00      0.00      0.00      2577

    accuracy                           0.91     27770
   macro avg       0.45      0.50      0.48     27770
weighted avg       0.82      0.91      0.86     27770



### Random Forest

In [34]:
rf_clf = RandomForestClassifier(n_estimators=30, max_depth=3)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.95      1.00      0.98     25193
           1       1.00      0.53      0.69      2577

    accuracy                           0.96     27770
   macro avg       0.98      0.76      0.83     27770
weighted avg       0.96      0.96      0.95     27770



### XGBoost 

In [35]:
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_clf.fit(X_train, y_train)
y_pred_xgb = xgb_clf.predict(X_test)
print(classification_report(y_test, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98     25193
           1       0.94      0.55      0.69      2577

    accuracy                           0.96     27770
   macro avg       0.95      0.77      0.84     27770
weighted avg       0.95      0.96      0.95     27770



#### Class Balancing using SMOTE

In [37]:
from imblearn.combine import SMOTETomek

smt = SMOTETomek(random_state=42)
X_train_res, y_train_res = smt.fit_resample(X_train, y_train)

np.unique(y_train_res, return_counts=True)

(array([0, 1]), array([88947, 88947]))

In [38]:
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_clf.fit(X_train_res, y_train_res)
y_pred_xgb = xgb_clf.predict(X_test)
print(classification_report(y_test, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.96      0.97      0.96     25193
           1       0.66      0.57      0.61      2577

    accuracy                           0.93     27770
   macro avg       0.81      0.77      0.79     27770
weighted avg       0.93      0.93      0.93     27770



## Experiments

In [39]:
models = [
    (
        "Logistic Regression", 
        LogisticRegression(C=1, solver='liblinear'), 
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "Random Forest", 
        RandomForestClassifier(n_estimators=30, max_depth=3), 
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "XGBClassifier",
        XGBClassifier(use_label_encoder=False, eval_metric='logloss'), 
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "XGBClassifier With SMOTE",
        XGBClassifier(use_label_encoder=False, eval_metric='logloss'), 
        (X_train_res, y_train_res),
        (X_test, y_test)
    )
]

In [40]:
reports = []

for model_name, model, train_set, test_set in models:
    X_train = train_set[0]
    y_train = train_set[1]
    X_test = test_set[0]
    y_test = test_set[1]
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    reports.append(report)

In [41]:
# Initialize MLflow
mlflow.set_experiment("Fraud Detection Models")
mlflow.set_tracking_uri("http://localhost:5000")

for i, element in enumerate(models):
    model_name = element[0]
    model = element[1]
    report = reports[i]
    
    with mlflow.start_run(run_name=model_name):        
        mlflow.log_param("model", model_name)
        mlflow.log_metric('accuracy', report['accuracy'])
        mlflow.log_metric('recall_class_1', report['1']['recall'])
        mlflow.log_metric('recall_class_0', report['0']['recall'])
        mlflow.log_metric('f1_score_macro', report['macro avg']['f1-score'])        
        
        if "XGB" in model_name:
            mlflow.xgboost.log_model(model, "model")
        else:
            mlflow.sklearn.log_model(model, "model")  

2024/10/19 09:45:02 INFO mlflow.tracking.fluent: Experiment with name 'Fraud Detection Models' does not exist. Creating a new experiment.
2024/10/19 09:45:08 INFO mlflow.tracking._tracking_service.client: 🏃 View run Logistic Regression at: http://localhost:5000/#/experiments/935012191474515353/runs/86dfdfcb50d44cc1bbc3d6ae0af4982a.
2024/10/19 09:45:08 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/935012191474515353.
2024/10/19 09:45:12 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest at: http://localhost:5000/#/experiments/935012191474515353/runs/51b22addefeb43389bf0f2a641f7e84a.
2024/10/19 09:45:12 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/935012191474515353.
2024/10/19 09:45:17 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBClassifier at: http://localhost:5000/#/experiments/935012191474515353/runs/d6b0c4e0da464649a6c5ff25cf05e5