In [17]:
# Report which columns of a DataFrame contain NaN values
def check_nans(data):
    """Display the columns of ``data`` that contain at least one NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    None
        The summary is rendered with ``display``; nothing is returned.
    """
    # isna().sum() gives one NaN count per column, indexed by column name;
    # reset_index() moves those names into an ordinary column.
    nans = data.isna().sum().reset_index()
    # Label each column by what it actually holds: the source column's name
    # and the number of NaNs found in it.
    nans.columns = ['column', 'nan_count']
    display(nans[nans['nan_count'] > 0])
    return None


In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE

# Set display options for pandas dataframes
pd.set_option('display.max_columns', 15)
pd.set_option('display.max_rows', 15)

# Load target data and keep only the label column
y = pd.read_csv('target1.csv')
y = y['TARGET_B']

# Load numerical and categorical feature data
numerical = pd.read_csv('numerical1.csv')
categorical = pd.read_csv('categorical1.csv')

# Drop the 'Unnamed: 0' column from both feature frames
# (presumably a row index saved by an earlier to_csv — TODO confirm)
numerical = numerical.drop(columns=['Unnamed: 0'])
categorical = categorical.drop(columns=['Unnamed: 0'])

# Show which numerical columns contain NaN values
check_nans(numerical)

# Fill NaN values in numerical data
numerical['NEXTDATE'] = numerical['NEXTDATE'].fillna(0)

# Check for NaN values in categorical data
check_nans(categorical)

# Fill NaN values in categorical data
# NOTE(review): 13 and 'N' are hard-coded fill values — confirm they are the
# intended "missing" codes for these columns.
categorical['SOLIH'] = categorical['SOLIH'].fillna(13)
categorical['VETERANS'] = categorical['VETERANS'].fillna('N')

# Convert all values in categorical data to string.
# astype(str) is the vectorized equivalent of the element-wise applymap(str),
# which newer pandas releases deprecate.
categorical = categorical.astype(str)

# Combine numerical and categorical data into one dataset (columns side by side)
X = pd.concat([numerical, categorical], axis=1)

# Split data into training and testing sets (25% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)


Unnamed: 0,count,val
316,NEXTDATE,9973


Unnamed: 0,count,val
5,SOLIH,89212
6,VETERANS,84986


In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

# Separate the numeric columns from the object-dtype (categorical) columns in each split
X_train_num = X_train.select_dtypes(include=np.number)
X_test_num = X_test.select_dtypes(include=np.number)
X_train_cat = X_train.select_dtypes(include=object)
X_test_cat = X_test.select_dtypes(include=object)

# Fit the min-max scaler on the training split only, then apply it to both
# splits; the scaled values are wrapped back into DataFrames that keep the
# original column names.
transformer = MinMaxScaler()
transformer.fit(X_train_num)
X_train_num_scaled = pd.DataFrame(
    transformer.transform(X_train_num),
    columns=X_train_num.columns,
)
X_test_num_scaled = pd.DataFrame(
    transformer.transform(X_test_num),
    columns=X_test_num.columns,
)

# Encode the categorical data
def cat_encode(data, onehotencoder):
    """One-hot encode ``data`` with an already-fitted encoder.

    Parameters
    ----------
    data : pandas.DataFrame
        Categorical columns to encode. ``data.columns`` is handed to the
        encoder to build the output column names.
    onehotencoder : fitted encoder
        Object exposing ``transform`` and ``get_feature_names_out``.

    Returns
    -------
    pandas.DataFrame
        Encoded features with a fresh 0..n-1 index; the index of ``data`` is
        not carried over.
    """
    encoded = onehotencoder.transform(data)
    # A sparse result has to be densified before building the frame; a result
    # that is already dense has no toarray() and is used as is.
    if hasattr(encoded, 'toarray'):
        encoded = encoded.toarray()
    cols = onehotencoder.get_feature_names_out(input_features=data.columns)
    return pd.DataFrame(encoded, columns=cols).reset_index(drop=True)

# Fit the one-hot encoder on the training categoricals only, then encode both splits.
# NOTE(review): drop='first' is combined with handle_unknown='ignore' — confirm
# how categories unseen during fit should be represented.
onehotencoder = OneHotEncoder(drop='first', handle_unknown='ignore')
onehotencoder.fit(X_train_cat)
# cat_encode already returns frames with a fresh 0..n-1 index
X_train_cat_encoded = cat_encode(X_train_cat, onehotencoder)
X_test_cat_encoded = cat_encode(X_test_cat, onehotencoder)

# Place the encoded categoricals and the scaled numerics side by side (categoricals first)
X_train_scaled = pd.concat([X_train_cat_encoded, X_train_num_scaled], axis='columns')
X_test_scaled = pd.concat([X_test_cat_encoded, X_test_num_scaled], axis='columns')

# Give the targets a fresh 0..n-1 index as well, matching the rebuilt feature frames
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Define the logistic regression model
def logistic_regression_model(X, y, X_test):
    """Fit a logistic regression on ``(X, y)`` and predict labels for ``X_test``.

    Parameters
    ----------
    X : training features.
    y : training labels.
    X_test : features to predict on.

    Returns
    -------
    Whatever the fitted model's ``predict`` returns for ``X_test``.
    """
    # NOTE(review): multi_class is reported as deprecated in recent
    # scikit-learn releases — confirm against the installed version.
    classifier = LogisticRegression(
        random_state=0,
        solver='saga',
        multi_class='multinomial',
    )
    classifier.fit(X, y)
    return classifier.predict(X_test)




In [24]:
from sklearn.metrics import classification_report, confusion_matrix

def model_eval(Y, P, title='Logistic Regression'):
    """Print a confusion matrix and a classification report for predictions.

    Parameters
    ----------
    Y : true labels.
    P : predicted labels, in the same order as ``Y``.
    title : str, optional
        Text shown in the banner above the report. Defaults to
        ``'Logistic Regression'``.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    # ANSI escape codes: bold + bright red for the banner, then reset.
    print('\033[1m' + '\033[91m' + '      ' + title + '\n' + '\033[0m')
    conf_matrix = confusion_matrix(Y, P)
    print('Confusion Matrix:')
    print(conf_matrix)
    print('\nClassification Report:')
    print(classification_report(Y, P))
    return None

In [25]:
# Import required libraries
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE

# Re-attach the target to the processed training features, then split the rows by class
s_data = pd.concat([X_train_scaled, y_train], axis='columns')
cat_0 = s_data.loc[s_data['TARGET_B'].eq(0)]
cat_1 = s_data.loc[s_data['TARGET_B'].eq(1)]

In [26]:
# 1. Downsampling the majority class
# Draw, without replacement, as many class-0 rows as there are class-1 rows.
# A fixed random_state makes the draw — and therefore the reported scores —
# repeatable across runs.
cat_0_undersampled = resample(cat_0, replace=False, n_samples=len(cat_1), random_state=100)
s_data_downsampled = pd.concat([cat_0_undersampled, cat_1], axis=0)

# Split the data into features and target variables
X_down = s_data_downsampled.drop(columns=['TARGET_B'])
y_down = s_data_downsampled['TARGET_B']

# Predict target variable for X_test_scaled
predicted = logistic_regression_model(X_down, y_down, X_test_scaled)

# Evaluate the model
model_eval(y_test, predicted)

[1m[91m      Logistics Regression
[0m
Confusion Matrix:
[[13226  9426]
 [  523   678]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.58      0.73     22652
           1       0.07      0.56      0.12      1201

    accuracy                           0.58     23853
   macro avg       0.51      0.57      0.42     23853
weighted avg       0.92      0.58      0.70     23853





In [27]:
# 2. Upsampling the minority class
# Draw class-1 rows with replacement until there are as many as class-0 rows.
# A fixed random_state makes the draw — and therefore the reported scores —
# repeatable across runs.
cat_1_upsampled = resample(cat_1, replace=True, n_samples=len(cat_0), random_state=100)
s_data_upsampled = pd.concat([cat_1_upsampled, cat_0], axis=0)

# Split the data into features and target variables
X_up = s_data_upsampled.drop(columns=['TARGET_B'])
y_up = s_data_upsampled['TARGET_B']

# Predict target variable for X_test_scaled
predicted = logistic_regression_model(X_up, y_up, X_test_scaled)

# Evaluate the model
model_eval(y_test, predicted)

[1m[91m      Logistics Regression
[0m
Confusion Matrix:
[[14049  8603]
 [  552   649]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.62      0.75     22652
           1       0.07      0.54      0.12      1201

    accuracy                           0.62     23853
   macro avg       0.52      0.58      0.44     23853
weighted avg       0.92      0.62      0.72     23853





In [28]:
# 3. Synthetic Minority Over-sampling Technique (SMOTE)
# Target and features of the full (still imbalanced) training frame
y_smote = s_data['TARGET_B']
X_smote = s_data.drop(columns='TARGET_B')

# Balance the class distribution with SMOTE (3 neighbours, fixed seed)
smote = SMOTE(k_neighbors=3, random_state=100)
X_S, y_S = smote.fit_resample(X_smote, y_smote)

# Train on the resampled data and predict the target for X_test_scaled
predicted = logistic_regression_model(X_S, y_S, X_test_scaled)

# Report the results on the untouched test split
model_eval(y_test, predicted)

[1m[91m      Logistics Regression
[0m
Confusion Matrix:
[[14161  8491]
 [  593   608]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.63      0.76     22652
           1       0.07      0.51      0.12      1201

    accuracy                           0.62     23853
   macro avg       0.51      0.57      0.44     23853
weighted avg       0.91      0.62      0.72     23853



