# Import packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(color_codes=True)
import os

In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_curve, confusion_matrix, auc
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn import metrics

In [3]:
import warnings
warnings.filterwarnings("ignore")

# Data Summary

In [4]:
cwd = os.getcwd()
print("Current working directory:", cwd)

## Data Loading

In [5]:
df = pd.read_csv("./Base.csv",low_memory=False)

## Data Summary

In [6]:
df.head()

In [7]:
df.shape

In [8]:
df.columns

## Data Dictionary

## Data Types

In [9]:
dtypes = df.dtypes

# Tally how many columns are stored as int64, float64 and object respectively
int_count, float_count, object_count = (
    (dtypes == kind).sum() for kind in ("int64", "float64", "object")
)

# Report one line per storage type
for label, total in (("Int", int_count), ("Float", float_count), ("Object", object_count)):
    print(f"{label} columns:", total)

In [10]:
int_columns = df.select_dtypes(include=['int64']).columns.tolist()
float_columns = df.select_dtypes(include=['float64']).columns.tolist()
object_columns = df.select_dtypes(include=['object']).columns.tolist()

print("Int columns:", int_columns)
print("\n")
print("Float columns:", float_columns)
print("\n")
print("Object columns:", object_columns)

# Missing Values

In [11]:
# Columns in which the value -1 is converted to NaN below.
# NOTE(review): presumably -1 is the dataset's "missing" sentinel for these fields — confirm against the data dictionary.
columns_to_replace = ['prev_address_months_count', 'current_address_months_count', 'bank_months_count', 'session_length_in_minutes', 'device_distinct_emails_8w']

# Swap every exact -1 in the selected columns for NaN so the missing-value statistics computed later count them.
df[columns_to_replace] = df[columns_to_replace].replace(-1, np.nan)


In [12]:
df["prev_address_months_count"].value_counts(dropna=False)

In [13]:
df.loc[df['intended_balcon_amount'] < 0, 'intended_balcon_amount'] = np.nan

In [14]:
# Percentage of missing entries per column; display the ten most affected columns
non_null_share = df.count() / len(df)
missing = (1 - non_null_share) * 100
missing.sort_values(ascending=False).head(10)

Based on the missing value information, the following columns have missing values:

intended_balcon_amount
prev_address_months_count,
bank_months_count,
current_address_months_count,
session_length_in_minutes,
device_distinct_emails_8w.

Among these columns, intended_balcon_amount and prev_address_months_count each have more than 71% missing values. Therefore, it is recommended to remove these two columns from the dataset.

To address missing values in the remaining columns, we can consider various strategies such as:

Removing rows with missing values: If the number of missing values in the other columns is relatively small, we may choose to remove the rows containing missing values.

Imputation: If the missing values are a small portion of the dataset, We can fill them with appropriate values. This can include methods like mean, median, mode imputation, or using machine learning algorithms to predict missing values based on other variables.

It is important to assess the impact of missing values on the analysis or model being developed and select the most suitable approach for handling them.


In [15]:
df = df.drop(['intended_balcon_amount','prev_address_months_count'], axis = 1)

## Missing Values Treatment

In [16]:
# Fill the gaps in each listed column with that column's own median
for col in ['bank_months_count',
            'current_address_months_count',
            'session_length_in_minutes',
            'credit_risk_score',
            'device_distinct_emails_8w']:
    df[col] = df[col].fillna(df[col].median())

## Data after Missing Values Treatment

In [17]:
# Top ten columns with missing values %
missing = df.count()/len(df)
missing = (1-missing)*100
missing.sort_values(ascending=False).head(10)

In [18]:
int_columns = df.select_dtypes(include=['int64']).columns.tolist()
float_columns = df.select_dtypes(include=['float64']).columns.tolist()
object_columns = df.select_dtypes(include=['object']).columns.tolist()

print("Int columns:", int_columns)
print("\n")
print("Float columns:", float_columns)
print("\n")
print("Object columns:", object_columns)

# Univariate Analysis

## Numerical Data Analysis : Int

In [19]:
print("Int columns:", int_columns)

### Target variable

In [20]:
df['fraud_bool'].value_counts(dropna=False)

In [21]:
df["fraud_bool"].value_counts(1, dropna=False)*100

In [22]:
df["fraud_bool"].value_counts(dropna=False).plot(kind="barh")
plt.title("Bar Plot for fraud_bool")
plt.show()

Based on the provided information, the fraud_bool variable exhibits an imbalanced distribution. The percentage of instances labeled as 0 (non-fraudulent) is 98.8971%, while the percentage of instances labeled as 1 (fraudulent) is 1.1029%.

Imbalanced problems like this can pose challenges during analysis and modeling, as the minority class (fraudulent) may be underrepresented, making it harder for a model to learn and predict accurately.

### customer_age

In [23]:
df["customer_age"].value_counts(dropna=False)

In [24]:
df["customer_age"].value_counts(1, dropna=False)*100 

In [25]:
df["customer_age"].value_counts(dropna=False).plot(kind="barh")
plt.title("Bar Plot for customer_age")
plt.show()

The provided information describes the age distribution of applicants, indicating the following:

More than 93% of the applicants fall within the age range of 20-50.
Less than 1% of the applicants are aged 60 or above.
Approximately 2% of the applicants are in the age-10 group.

These statistics highlight the age composition of the applicant pool: a significant majority (over 93%) are between the ages of 20 and 50, while a very small proportion (less than 1%) are aged 60 or older. Around 2% of the applicants fall in the age-10 group; because customer_age appears to be recorded in ten-year bins, this most likely denotes applicants aged 10-19 rather than ten-year-olds.

### zip_count_4w

In [26]:
df["zip_count_4w"].value_counts(dropna=False)

In [27]:
df["zip_count_4w"].value_counts(1, dropna=False)*100 

### bank_branch_count_8w

In [28]:
df["bank_branch_count_8w"].value_counts(dropna=False)

In [29]:
df["bank_branch_count_8w"].value_counts(1, dropna=False)*100 

In the Bank Account Fraud dataset, the variable "bank_branch_count_8w" is the total number of applications submitted at the selected bank branch within the past 8 weeks (it is not the number of branches an individual has visited).

### date_of_birth_distinct_emails_4w

The variable "date_of_birth_distinct_emails_4w" in a dataset typically represents the count or number of distinct or unique email addresses associated with individuals who share the same date of birth within a 4-week period.

In [30]:
df["date_of_birth_distinct_emails_4w"].value_counts(dropna=False)

In [31]:
df["date_of_birth_distinct_emails_4w"].value_counts(1, dropna=False)*100 

In [32]:
df["date_of_birth_distinct_emails_4w"].value_counts(dropna=False).plot(kind="barh")
plt.title("Bar Plot for date_of_birth_distinct_emails_4w")
plt.show()

### credit_risk_score

In [33]:
df["credit_risk_score"].value_counts(dropna=False)

In [34]:
df["credit_risk_score"].value_counts(1, dropna=False)*100 

In [35]:
df["credit_risk_score"].value_counts(dropna=False).plot(kind="barh")
plt.title("Bar Plot for credit_risk_score")
plt.show()

### email_is_free

The variable "email_is_free" in a dataset typically represents whether an email address is associated with a free email service provider or not.

In many cases, individuals can choose between free email services (such as Gmail, Yahoo Mail, or Outlook.com) or paid email services offered by their employers or specific providers. The "email_is_free" variable is often a binary indicator that denotes whether an email address belongs to a free email service provider (1) or not (0).

In [36]:
df["email_is_free"].value_counts(dropna=False)

In [37]:
df["email_is_free"].value_counts(1, dropna=False)*100 

In [38]:
df["email_is_free"].value_counts(dropna=False).plot(kind="barh")
plt.title("Bar Plot for email_is_free")
plt.show()

### phone_home_valid

The variable "phone_home_valid" in a dataset typically represents whether a home phone number is considered valid or not.

In [39]:
df["phone_home_valid"].value_counts(dropna=False)

In [40]:
# Render the percentage table explicitly: a cell only auto-displays its last
# expression, and this cell ends with the plot, so a bare expression here would
# be computed and silently discarded.
display(df["phone_home_valid"].value_counts(normalize=True, dropna=False)*100)

# Horizontal bar chart of the raw frequencies (NaN counted as its own level)
df["phone_home_valid"].value_counts(dropna=False).plot(kind="barh")
plt.title("Bar Plot for phone_home_valid")
plt.show()

### has_other_cards   

In [41]:
df["has_other_cards"].value_counts(dropna=False)

In [42]:
df["has_other_cards"].value_counts(1, dropna=False)*100 

In [43]:
df["has_other_cards"].value_counts(dropna=False).plot(kind="barh")
plt.title("Bar Plot for has_other_cards")
plt.show()

The variable "has_other_cards" in a dataset typically represents whether an individual has other credit cards or not.

### foreign_request

In [44]:
df["foreign_request"].value_counts(dropna=False)

In [45]:
df["foreign_request"].value_counts(1, dropna=False)*100 

In [46]:
df["foreign_request"].value_counts(dropna=False).plot(kind="barh")
plt.title("Bar Plot for foreign_request")
plt.show()

### keep_alive_session

In [47]:
df["keep_alive_session"].value_counts(dropna=False)

In [48]:
df["keep_alive_session"].value_counts(1, dropna=False)*100 

In [49]:
# Horizontal bar chart of keep_alive_session frequencies (NaN counted as its own level)
df["keep_alive_session"].value_counts(dropna=False).plot(kind="barh")
# The title must name the plotted column; it previously read "customer_age" (copy-paste slip)
plt.title("Bar Plot for keep_alive_session")
plt.show()

### device_fraud_count

In [50]:
df["device_fraud_count"].value_counts(dropna=False)

In [51]:
df["device_fraud_count"].value_counts(1, dropna=False)*100 

In [52]:
df["device_fraud_count"].value_counts(dropna=False).plot(kind="barh")
plt.title("Bar Plot for device_fraud_count")
plt.show()

Since the "device_fraud_count" feature does not exhibit any variance in the data, meaning it contains only a single value or does not provide any distinguishing information, it would not contribute to the prediction or analysis tasks. Therefore, it is recommended to remove this feature from the dataset.

In [53]:
df = df.drop(['device_fraud_count'], axis = 1)

### month

In [54]:
df["month"].value_counts(dropna=False)

In [55]:
df["month"].value_counts(1, dropna=False)*100 

In [56]:
df["month"].value_counts(dropna=False).plot(kind="barh")
plt.title("Bar Plot for month")
plt.show()

## Numerical Data Analysis : Float

In [57]:
float_columns = df.select_dtypes(include=['float64']).columns.tolist()
float_columns

### Income

In [58]:
df["income"].value_counts(dropna=False)

In [59]:
df["income"].value_counts(1, dropna=False)*100 

In [60]:
df["income"].value_counts(dropna=False).plot(kind="barh")
plt.title("Bar Plot for Income")
plt.show()

### device_distinct_emails_8w

In [61]:
df["device_distinct_emails_8w"].value_counts(dropna=False)

In [62]:
df["device_distinct_emails_8w"].value_counts(1, dropna=False)*100 

In [63]:
df["device_distinct_emails_8w"].value_counts(dropna=False).plot(kind="barh")
plt.title("Bar Plot for device_distinct_emails_8w")
plt.show()

## Outliers Analysis

In [64]:
df[['name_email_similarity',
 'current_address_months_count',
 'days_since_request',
 'velocity_6h',
 'velocity_24h',
 'velocity_4w',
 'credit_risk_score',
 'bank_months_count',
 'proposed_credit_limit',
 'session_length_in_minutes']]

In [65]:
# Continuous columns to draw box plots for. 'bank_months_count' is included so
# that every column receiving outlier treatment is also inspected visually
# before and after the treatment.
f_l=['name_email_similarity',
 'current_address_months_count',
 'days_since_request',
 'velocity_6h',
 'velocity_24h',
 'velocity_4w',
 'credit_risk_score',
 'bank_months_count',
 'proposed_credit_limit',
 'session_length_in_minutes']

### Identification of Outliers

In [66]:
# Draw one box plot per column listed in f_l.
# The series is passed as x= explicitly: a bare positional argument is read as
# `x` by older seaborn releases but as `data` by newer ones, which can change
# the orientation of the plot between environments.
for col in f_l:
    sns.boxplot(x=df[col])
    plt.title(col)
    plt.show()

In [67]:
# Columns treated as containing outliers. 'intended_balcon_amount' is not listed
# because that column was dropped from df during missing-value handling, so
# indexing df with it would raise a KeyError.
Columns_with_Outliers=['current_address_months_count',
 'days_since_request',
 'velocity_6h',
 'velocity_24h',
 'credit_risk_score',
 'bank_months_count',
 'proposed_credit_limit',
 'session_length_in_minutes']

# Columns treated as outlier-free. 'bank_months_count' belongs only to the list
# above (it is part of the outlier treatment), so the two lists are disjoint.
Columns_without_Outliers = ['name_email_similarity','velocity_4w']

### Outlier Treatment

In [68]:
columns_to_impute_outliers=['current_address_months_count',
 'days_since_request',
 'velocity_6h',
 'velocity_24h',
 'credit_risk_score',
 'bank_months_count',
 'proposed_credit_limit',
 'session_length_in_minutes']

In [69]:
def impute_outliers_with_median(df, columns, threshold=3):
    """Replace outliers in the given columns with the column median, in place.

    A value counts as an outlier when its absolute deviation from the column
    median exceeds ``threshold`` times the median absolute deviation (MAD) of
    that column.

    Args:
        df: pandas DataFrame to clean. It is modified in place.
        columns: iterable of column labels in ``df`` to process.
        threshold: multiple of the MAD beyond which a value is replaced.

    Returns:
        The same DataFrame object that was passed in.

    Raises:
        KeyError: if a label in ``columns`` is not a column of ``df``.
    """
    for column in columns:
        median = df[column].median()
        abs_deviation = (df[column] - median).abs()
        mad = abs_deviation.median()

        # With a zero (or undefined) MAD every value that differs from the
        # median would be flagged, collapsing the column to a constant; there
        # is no usable scale, so leave such columns untouched.
        if not mad > 0:
            continue

        # Assign through a boolean mask rather than through index labels so
        # that, when the index holds duplicate labels, only the rows that are
        # actually outliers get overwritten. This also avoids materialising a
        # DataFrame copy of all outlier rows.
        is_outlier = abs_deviation > threshold * mad
        df.loc[is_outlier, column] = median

    return df

# Example usage
# Assuming df is your pandas DataFrame and columns_to_impute_outliers is a list of column names

df = impute_outliers_with_median(df, columns_to_impute_outliers)


In [70]:
df

### After Imputing Outliers with the Median (median absolute deviation rule)

In [71]:
# Redraw one box plot per column listed in f_l to inspect the treated data.
# The series is passed as x= explicitly: a bare positional argument is read as
# `x` by older seaborn releases but as `data` by newer ones, which can change
# the orientation of the plot between environments.
for col in f_l:
    sns.boxplot(x=df[col])
    plt.title(col)
    plt.show()

## Distribution Plots

In [72]:
'''
import seaborn as sns

# Iterate over each column with continuous numerical data
for column in float_columns:
    # Plot the distribution for the current column
    sns.histplot(df[column], kde=True)
    sns.kdeplot(df[column])
    
    # Add plot title and labels
    plt.title(f"Distribution of {column}")
    plt.xlabel(column)
    plt.ylabel("Density")
    
    # Show the plot
    plt.show()
'''

## Categorical Data Analysis

In [73]:
object_columns = df.select_dtypes(include=['object']).columns.tolist()
object_columns

### payment_type

In [74]:
df["payment_type"].value_counts(dropna=False)

In [75]:
df["payment_type"].value_counts(1, dropna=False)*100 

In [76]:
df["payment_type"].value_counts(dropna=False).plot(kind="barh")
plt.title("Bar Plot for payment_type")
plt.show()

### employment_status

In [77]:
df["employment_status"].value_counts(dropna=False)

In [78]:
df["employment_status"].value_counts(1, dropna=False)*100

In [79]:
df["employment_status"].value_counts(dropna=False).plot(kind="barh")
plt.title("Bar Plot for employment_status")
plt.show()

### housing_status

In [80]:
df["housing_status"].value_counts(dropna=False)

In [81]:
df["housing_status"].value_counts(1, dropna=False)*100 

In [82]:
df["housing_status"].value_counts(dropna=False).plot(kind="barh")
plt.title("Bar Plot for housing_status")
plt.show()

### source

In [83]:
df["source"].value_counts(dropna=False)

In [84]:
df["source"].value_counts(1, dropna=False)*100 

In [85]:
df["source"].value_counts(dropna=False).plot(kind="barh")
plt.title("Bar Plot for source")
plt.show()

### device_os

In [86]:
df["device_os"].value_counts(dropna=False)

In [87]:
df["device_os"].value_counts(1, dropna=False)*100 

In [88]:
df["device_os"].value_counts(dropna=False).plot(kind="barh")
plt.title("Bar Plot for device_os")
plt.show()

# Bivariate Analysis

In [89]:
int_columns = df.select_dtypes(include=['int64']).columns.tolist()
float_columns = df.select_dtypes(include=['float64']).columns.tolist()
object_columns = df.select_dtypes(include=['object']).columns.tolist()

print("Int columns:", int_columns)
print("\n")
print("Float columns:", float_columns)
print("\n")
print("Object columns:", object_columns)

In [90]:
num_cols=['fraud_bool', 'customer_age', 'zip_count_4w', 'bank_branch_count_8w', 'date_of_birth_distinct_emails_4w', 'email_is_free', 'phone_home_valid', 'phone_mobile_valid', 'has_other_cards', 'foreign_request', 'keep_alive_session', 'month','income', 'name_email_similarity', 'current_address_months_count', 'days_since_request', 'velocity_6h', 'velocity_24h', 'velocity_4w', 'credit_risk_score', 'bank_months_count', 'proposed_credit_limit', 'session_length_in_minutes', 'device_distinct_emails_8w']

## Correlation Graph

In [91]:
# Create a heatmap of the correlation matrix
corr =  df[num_cols].corr().round(2)
plt.figure(figsize = (15,10))
sns.heatmap(corr, annot = True)

## Stacked Bar Graphs

In [92]:
cat_cols=['payment_type', 'employment_status', 'housing_status', 'source', 'device_os']

In [93]:
# Stacked bar chart of the contingency table for every unordered pair of categorical columns
for i, row_var in enumerate(cat_cols):
    for col_var in cat_cols[i + 1:]:
        ct = pd.crosstab(df[row_var], df[col_var])
        ct.plot(kind='bar', stacked=True)
        plt.title('{} vs. {}'.format(row_var, col_var))
        plt.xticks(rotation=0)
        plt.show()

## Analysis with respect to Target

In [94]:
df.columns

### income

In [95]:
pd.crosstab(df["income"], df["fraud_bool"], normalize="index")*100

In [96]:
pd.crosstab(df["income"], df["fraud_bool"]).plot(kind="bar", figsize=(5, 5), stacked=True)
plt.title("income Against fraud_bool", fontsize=15);
plt.show()

### customer_age

In [97]:
pd.crosstab(df["customer_age"], df["fraud_bool"], normalize="index")*100

In [98]:
pd.crosstab(df["customer_age"], df["fraud_bool"]).plot(kind="bar", figsize=(5, 5), stacked=True)
plt.title("customer_age Against fraud_bool", fontsize=15);
plt.show()

###  zip_count_4w

In [99]:
pd.crosstab(df["zip_count_4w"], df["fraud_bool"], normalize="index")*100

### bank_branch_count_8w

In [100]:
pd.crosstab(df["bank_branch_count_8w"], df["fraud_bool"], normalize="index")*100

### date_of_birth_distinct_emails_4w

In [101]:
pd.crosstab(df["date_of_birth_distinct_emails_4w"], df["fraud_bool"], normalize="index")*100

In [102]:
pd.crosstab(df["date_of_birth_distinct_emails_4w"], df["fraud_bool"]).plot(kind="bar", figsize=(5, 5), stacked=True)
plt.title("date_of_birth_distinct_emails_4w Against fraud_bool", fontsize=15);
plt.show()

### email_is_free

In [103]:
pd.crosstab(df["email_is_free"], df["fraud_bool"], normalize="index")*100

In [104]:
pd.crosstab(df["email_is_free"], df["fraud_bool"]).plot(kind="bar", figsize=(5, 5), stacked=True)
plt.title("email_is_free Against fraud_bool", fontsize=15);
plt.show()

### phone_home_valid

In [105]:
pd.crosstab(df["phone_home_valid"], df["fraud_bool"], normalize="index")*100

In [106]:
pd.crosstab(df["phone_home_valid"], df["fraud_bool"]).plot(kind="bar", figsize=(5, 5), stacked=True)
plt.title("phone_home_valid Against fraud_bool", fontsize=15);
plt.show()

### has_other_cards

In [107]:
pd.crosstab(df["has_other_cards"], df["fraud_bool"], normalize="index")*100

In [108]:
pd.crosstab(df["has_other_cards"], df["fraud_bool"]).plot(kind="bar", figsize=(5, 5), stacked=True)
plt.title("has_other_cards Against fraud_bool", fontsize=15);
plt.show()

### foreign_request

In [109]:
pd.crosstab(df["foreign_request"], df["fraud_bool"], normalize="index")*100

In [110]:
pd.crosstab(df["foreign_request"], df["fraud_bool"]).plot(kind="bar", figsize=(5, 5), stacked=True)
plt.title("foreign_request Against fraud_bool", fontsize=15);
plt.show()

### keep_alive_session

In [111]:
pd.crosstab(df["keep_alive_session"], df["fraud_bool"], normalize="index")*100

In [112]:
pd.crosstab(df["keep_alive_session"], df["fraud_bool"]).plot(kind="bar", figsize=(5, 5), stacked=True)
plt.title("keep_alive_session Against fraud_bool", fontsize=15);
plt.show()

### month

In [113]:
pd.crosstab(df["month"], df["fraud_bool"], normalize="index")*100

In [114]:
pd.crosstab(df["month"], df["fraud_bool"]).plot(kind="bar", figsize=(5, 5), stacked=True)
plt.title("month Against fraud_bool", fontsize=15);
plt.show()

### device_distinct_emails_8w

In [115]:
pd.crosstab(df["device_distinct_emails_8w"], df["fraud_bool"], normalize="index")*100

In [116]:
pd.crosstab(df["device_distinct_emails_8w"], df["fraud_bool"]).plot(kind="bar", figsize=(5, 5), stacked=True)
plt.title("device_distinct_emails_8w Against fraud_bool", fontsize=15);
plt.show()

### payment_type

In [117]:
pd.crosstab(df["payment_type"], df["fraud_bool"], normalize="index")*100

In [118]:
pd.crosstab(df["payment_type"], df["fraud_bool"]).plot(kind="bar", figsize=(5, 5), stacked=True)
plt.title("payment_type Against fraud_bool", fontsize=15);
plt.show()

### employment_status

In [119]:
pd.crosstab(df["employment_status"], df["fraud_bool"], normalize="index")*100

In [120]:
pd.crosstab(df["employment_status"], df["fraud_bool"]).plot(kind="bar", figsize=(5, 5), stacked=True)
plt.title("employment_status Against fraud_bool", fontsize=15);
plt.show()

### housing_status

In [121]:
pd.crosstab(df["housing_status"], df["fraud_bool"], normalize="index")*100

In [122]:
pd.crosstab(df["housing_status"], df["fraud_bool"]).plot(kind="bar", figsize=(5, 5), stacked=True)
plt.title("housing_status Against fraud_bool", fontsize=15);
plt.show()

### source

In [123]:
pd.crosstab(df["source"], df["fraud_bool"], normalize="index")*100

In [124]:
pd.crosstab(df["source"], df["fraud_bool"]).plot(kind="bar", figsize=(5, 5), stacked=True)
plt.title("source Against fraud_bool", fontsize=15);
plt.show()

### device_os

In [125]:
pd.crosstab(df["device_os"], df["fraud_bool"], normalize="index")*100

In [126]:
pd.crosstab(df["device_os"], df["fraud_bool"]).plot(kind="bar", figsize=(5, 5), stacked=True)
plt.title("device_os Against fraud_bool", fontsize=15);
plt.show()

# Feature Engineering

## one hot encoding

In [127]:
df.shape

In [128]:
df = pd.get_dummies(df,drop_first=True) # one hot encoding

In [129]:
df.shape

In [130]:
df.columns

In [131]:
# Machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.metrics import ConfusionMatrixDisplay
from imblearn.over_sampling import SMOTE

# Model Building

## Train test Split

In [132]:
# Select the target and the features by name rather than by position, so the
# split does not silently depend on 'fraud_bool' being the first column of df.
X = df.drop(columns=['fraud_bool'])
y = df['fraud_bool']

# Hold out 20% for validation. Stratifying on the target keeps the (small)
# share of fraud cases the same in both partitions instead of leaving it to
# chance; random_state makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


## Data Scaling 

In [133]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Create an instance of the scaler
scaler = MinMaxScaler()  # or StandardScaler() for standardization
column_names = X_train.columns.tolist()

# Fit on the training partition only, so no validation statistics leak into the scaling
scaler.fit(X_train)

# transform() returns plain arrays. Rebuild the DataFrames with the original
# column names AND the original row index, so X_train / X_val stay label-aligned
# with y_train / y_val (a fresh RangeIndex would break any later index-based join).
X_train = pd.DataFrame(scaler.transform(X_train), columns=column_names, index=X_train.index)
X_val = pd.DataFrame(scaler.transform(X_val), columns=column_names, index=X_val.index)


In [134]:
value_counts = y_train.value_counts()
print(value_counts)

In [135]:
value_counts = y_val.value_counts()
print(value_counts)

In [136]:
X_train.columns

## ML Models training without Sampling Techniques

In [140]:
from imblearn.ensemble import BalancedBaggingClassifier, EasyEnsembleClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [141]:
# Baseline classifiers trained on the original (imbalanced) training data.
# Estimators that involve randomness get a fixed random_state so that rerunning
# the notebook reproduces the same metrics.
models = [
    ('Logistic Regression', LogisticRegression()),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(n_estimators=10, max_depth=10, random_state=42)),
    ('GBM', GradientBoostingClassifier(random_state=42)),
    ('LightGBM', LGBMClassifier(random_state=42))
]

# Fit each model and report its validation metrics, ROC curve and confusion matrix
for name, model in models:
    print(f"Evaluating {name}...")

    # Train the model
    model.fit(X_train, y_train)

    # Hard class labels: used for the threshold-dependent metrics and the confusion matrix
    y_pred = model.predict(X_val)

    # Probability of the positive class (second column, assuming labels 0/1):
    # used for the ROC curve, which needs a continuous score to sweep thresholds.
    y_score = model.predict_proba(X_val)[:, 1]

    # Calculate validation metrics
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    # Print the validation metrics
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-Score: {f1}")
    print()

    # Print the classification report
    report = classification_report(y_val, y_pred)
    print("Classification Report:")
    print(report)
    print()

    # Draw ROC curve from the scores; feeding hard 0/1 predictions would reduce
    # the curve to a single operating point and understate the AUC.
    fpr, tpr, thresholds = roc_curve(y_val, y_score)
    roc_auc = auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()

    # Plot confusion matrix
    cm = confusion_matrix(y_val, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.show()


## ML Models to handle Imbalance in the dataset

In [142]:
# Ensemble classifiers that rebalance the classes internally.
# Both resample at random, so a fixed random_state keeps reruns reproducible.
models = [
    ('Balanced Bagging', BalancedBaggingClassifier(random_state=42)),
    ('Easy Ensemble', EasyEnsembleClassifier(random_state=42))
]

# Fit each model and report its validation metrics, ROC curve and confusion matrix
for name, model in models:
    print(f"Evaluating {name}...")

    # Train the model
    model.fit(X_train, y_train)

    # Hard class labels: used for the threshold-dependent metrics and the confusion matrix
    y_pred = model.predict(X_val)

    # Probability of the positive class (second column, assuming labels 0/1):
    # used for the ROC curve, which needs a continuous score to sweep thresholds.
    y_score = model.predict_proba(X_val)[:, 1]

    # Calculate validation metrics
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    # Print the validation metrics
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-Score: {f1}")
    print()

    # Print the classification report
    report = classification_report(y_val, y_pred)
    print("Classification Report:")
    print(report)
    print()

    # Draw ROC curve from the scores; feeding hard 0/1 predictions would reduce
    # the curve to a single operating point and understate the AUC.
    fpr, tpr, thresholds = roc_curve(y_val, y_score)
    roc_auc = auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()

    # Plot confusion matrix
    cm = confusion_matrix(y_val, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.show()


# Feature Importance

In [None]:
# Fit a logistic regression and list the features with the largest coefficients
models = [
    ('Logistic Regression', LogisticRegression()),
]

for name, model in models:
    print(f"Evaluating {name}...")

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on the validation set
    y_pred = model.predict(X_val)

    # Print the classification report
    report = classification_report(y_val, y_pred)
    print("Classification Report:")
    print(report)
    print()

    # Get feature importance from the fitted coefficients (row 0 of coef_ is read below)
    if hasattr(model, 'coef_'):
        feature_importance = model.coef_
        print("Top 10 Important Features:")
        # Rank by absolute magnitude, strongest first: a large negative
        # coefficient matters as much as a large positive one, so sorting by the
        # signed value would hide the strongest negative effects.
        indices = np.argsort(np.abs(feature_importance[0]))[::-1][:10]
        for idx in indices:
            feature_name = X_train.columns[idx]  # X_train must be a DataFrame whose columns name the features
            importance = feature_importance[0][idx]  # signed coefficient, so the direction stays visible
            print(f"Feature: {feature_name}, Importance: {importance}")
    print()

In [None]:
from sklearn.tree import export_text
# Fit a decision tree and list its ten most important features.
# random_state is fixed so the fitted tree, and therefore the ranking, is the
# same on every run.
models = [
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
]

for name, model in models:
    print(f"Evaluating {name}...")

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on the validation set
    y_pred = model.predict(X_val)
    print()

    # Section header for the importance listing. The classification report and
    # the export_text() dump of the tree were previously computed here and then
    # discarded (their print calls were commented out), so that work is dropped.
    print("Decision Tree:")

    # Get feature importance
    if hasattr(model, 'feature_importances_'):
        feature_importance = model.feature_importances_

        # Indices of the ten largest importances, in descending order
        top_indices = feature_importance.argsort()[-10:][::-1]

        print("Top Ten Important Features:")
        for idx in top_indices:
            feature_name = X_train.columns[idx]  # X_train must be a DataFrame whose columns name the features
            importance = feature_importance[idx]
            print(f"Feature: {feature_name}, Importance: {importance}")
    print()