In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE

# Limit rendered tables to 15 columns and 15 rows so wide frames stay readable.
pd.options.display.max_columns = 15
pd.options.display.max_rows = 15

In [2]:
# Load the target table and the two feature tables (numerical / categorical)
# from CSV files in the working directory, in that order.
y, numerical, categorical = (
    pd.read_csv(csv_path)
    for csv_path in ('target1.csv', 'numerical1.csv', 'categorical1.csv')
)

In [3]:
# Keep only the TARGET_B label column, and remove the 'Unnamed: 0' column from
# both feature tables (presumably a saved index from the CSV export - confirm).
y = y.loc[:, 'TARGET_B']
numerical = numerical.drop(columns=['Unnamed: 0'])
categorical = categorical.drop(columns=['Unnamed: 0'])

In [4]:
def nan_check(data):
    """Display the columns of ``data`` that contain at least one missing value.

    The displayed table has two columns whose labels are kept for
    compatibility with existing output: ``count`` holds the *column name*
    and ``val`` holds the *number of NaNs* in that column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The table is shown via ``display``; nothing is returned.
    """
    report = pd.DataFrame(data.isna().sum()).reset_index()
    report.columns = ['count', 'val']
    has_missing = report['val'] > 0
    display(report[has_missing])

# Show which numerical columns contain missing values (and how many).
nan_check(numerical)

Unnamed: 0,count,val
316,NEXTDATE,9973


In [5]:
# NEXTDATE is the only numerical column reported with NaNs; replace them with 0.
# NOTE(review): 0 is used as the "missing" marker - confirm it cannot collide with a real value.
numerical['NEXTDATE'] = numerical.NEXTDATE.fillna(value=0)

In [6]:
# Inspect the dtype of each categorical feature (some are stored as int/float).
categorical.dtypes


STATE        object
CLUSTER       int64
HOMEOWNR     object
GENDER       object
DATASRCE      int64
SOLIH       float64
VETERANS     object
RFA_2R       object
RFA_2A       object
GEOCODE2     object
DOMAIN_A     object
DOMAIN_B      int64
dtype: object

In [7]:
# NOTE(review): repeats the dtype inspection from the previous cell; nothing has
# modified `categorical` in between, so the output is the same.
categorical.dtypes


STATE        object
CLUSTER       int64
HOMEOWNR     object
GENDER       object
DATASRCE      int64
SOLIH       float64
VETERANS     object
RFA_2R       object
RFA_2A       object
GEOCODE2     object
DOMAIN_A     object
DOMAIN_B      int64
dtype: object

In [8]:
# Show which categorical columns contain missing values (and how many).
nan_check(categorical)


Unnamed: 0,count,val
5,SOLIH,89212
6,VETERANS,84986


In [9]:
# Frequency of each SOLIH value, with NaN counted as its own category.
categorical['SOLIH'].value_counts(dropna=False)


NaN     89212
12.0     5693
0.0       296
1.0        94
2.0        75
3.0        19
4.0        16
6.0         7
Name: SOLIH, dtype: int64

In [10]:
# Replace missing SOLIH with 13, a code that does not occur among the observed
# values (0, 1, 2, 3, 4, 6, 12), so "missing" becomes its own category.
categorical['SOLIH'] = categorical['SOLIH'].fillna(value=13)
categorical['SOLIH'].value_counts(dropna=False)

13.0    89212
12.0     5693
0.0       296
1.0        94
2.0        75
3.0        19
4.0        16
6.0         7
Name: SOLIH, dtype: int64

In [11]:

# Frequency of each VETERANS value, with NaN counted as its own category.
categorical['VETERANS'].value_counts(dropna=False)

NaN    84986
Y      10426
Name: VETERANS, dtype: int64

In [12]:
# The only observed VETERANS value is 'Y'; treat a missing flag as 'N'.
categorical['VETERANS'] = categorical['VETERANS'].fillna(value='N')
categorical['VETERANS'].value_counts(dropna=False)

N    84986
Y    10426
Name: VETERANS, dtype: int64

In [13]:

# Frequency of each DATASRCE code (stored as an integer).
categorical['DATASRCE'].value_counts()

3    64829
2    23455
1     7128
Name: DATASRCE, dtype: int64

In [14]:
# Cast every categorical column to string so the later `select_dtypes(object)`
# split treats all of them (including the int/float coded ones) as categorical.
# `astype(str)` is the vectorized equivalent of `applymap(str)`; `applymap` is
# deprecated in pandas >= 2.1, while `astype` works on old and new versions alike.
categorical = categorical.astype(str)
categorical.dtypes

STATE       object
CLUSTER     object
HOMEOWNR    object
GENDER      object
DATASRCE    object
SOLIH       object
VETERANS    object
RFA_2R      object
RFA_2A      object
GEOCODE2    object
DOMAIN_A    object
DOMAIN_B    object
dtype: object

In [15]:
# Join numerical and categorical features side by side, then hold out 25% of
# the rows as a test set (fixed seed so the split is reproducible).
X = pd.concat([numerical, categorical], axis='columns')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [16]:
# Partition each split into its numeric columns and its object (string) columns.
X_train_num, X_test_num = (
    split.select_dtypes(np.number) for split in (X_train, X_test)
)
X_train_cat, X_test_cat = (
    split.select_dtypes(object) for split in (X_train, X_test)
)

In [17]:
# Fit the min-max scaler on the training split only, then apply the same
# transformation to both splits. Wrapping the result in a DataFrame restores
# the column names and gives each frame a fresh 0..n-1 index.
transformer = MinMaxScaler()
transformer.fit(X_train_num)

X_train_num_scaled = pd.DataFrame(
    data=transformer.transform(X_train_num),
    columns=X_train_num.columns,
)
X_test_num_scaled = pd.DataFrame(
    data=transformer.transform(X_test_num),
    columns=X_test_num.columns,
)

In [18]:
def cat_encode(data, _onehotencoder):
    """One-hot encode ``data`` with an already-fitted encoder.

    Parameters
    ----------
    data : pandas.DataFrame
        Categorical columns to encode; its column names are passed to the
        encoder to build the output feature names.
    _onehotencoder : fitted encoder
        Object exposing ``transform`` and ``get_feature_names_out`` (e.g. a
        fitted ``OneHotEncoder``).

    Returns
    -------
    pandas.DataFrame
        Dense encoded frame with one column per encoder output feature and a
        fresh 0..n-1 index (the index of ``data`` is not preserved).
    """
    encoded = _onehotencoder.transform(data)
    # Encoders return a sparse matrix by default but a plain ndarray when
    # configured for dense output; only densify when there is something to densify.
    if hasattr(encoded, 'toarray'):
        encoded = encoded.toarray()
    cols = _onehotencoder.get_feature_names_out(input_features=data.columns)

    # A newly built DataFrame already has a 0..n-1 index, so no reset is needed.
    return pd.DataFrame(encoded, columns=cols)

# Fit the encoder on the training categories only. Categories unseen during
# fitting are ignored at transform time rather than raising.
onehotencoder = OneHotEncoder(drop='first', handle_unknown='ignore')
onehotencoder.fit(X_train_cat)

# cat_encode builds a new DataFrame, which already carries a 0..n-1 index.
X_train_cat_encoded = cat_encode(X_train_cat, onehotencoder)
X_test_cat_encoded = cat_encode(X_test_cat, onehotencoder)

In [19]:
# Re-assemble the model inputs: encoded categoricals first, scaled numericals
# second. Both parts carry a 0..n-1 index, so rows line up positionally.
X_train_scaled = pd.concat(
    [X_train_cat_encoded, X_train_num_scaled], axis='columns'
)
X_test_scaled = pd.concat(
    [X_test_cat_encoded, X_test_num_scaled], axis='columns'
)

In [20]:
# Give the labels the same 0..n-1 index as the rebuilt feature frames so that
# later column-wise concatenation aligns rows correctly.
y_train, y_test = (
    labels.reset_index(drop=True) for labels in (y_train, y_test)
)

In [21]:
# NOTE(review): this cell repeats the feature concat from two cells earlier and
# produces the same frames; it is a candidate for removal.
X_train_scaled = pd.concat(
    [X_train_cat_encoded, X_train_num_scaled], axis='columns'
)
X_test_scaled = pd.concat(
    [X_test_cat_encoded, X_test_num_scaled], axis='columns'
)

In [22]:
# NOTE(review): second repeat of the same feature concat; the inputs have not
# changed, so the result is identical. It is a candidate for removal.
X_train_scaled = pd.concat(
    [X_train_cat_encoded, X_train_num_scaled], axis='columns'
)
X_test_scaled = pd.concat(
    [X_test_cat_encoded, X_test_num_scaled], axis='columns'
)

In [23]:
def model(X, y, X_t):
    """Fit a logistic regression on ``(X, y)`` and predict labels for ``X_t``.

    Parameters
    ----------
    X : array-like
        Training features.
    y : array-like
        Training labels.
    X_t : array-like
        Features to predict on.

    Returns
    -------
    The value of ``predict(X_t)`` from the fitted classifier.
    """
    # NOTE(review): `multi_class` is deprecated in recent scikit-learn releases
    # (1.5+) - confirm against the pinned version before upgrading.
    classifier = LogisticRegression(
        random_state=0,
        solver='saga',
        multi_class='multinomial',
    )
    classifier.fit(X, y)
    return classifier.predict(X_t)

In [24]:
def model_eval(Y, P):
    """Print a confusion-matrix table, accuracy and F1 for a set of predictions.

    Parameters
    ----------
    Y : array-like
        True labels.
    P : array-like
        Predicted labels.

    Returns
    -------
    None
        Everything is written to stdout. Accuracy and F1 are printed as
        percentages (score * 100). Only the top-left 2x2 corner of the
        confusion matrix is shown; 'A' and 'B' are its first and second row/column.
    """
    rule = '             ------------------------'
    banner = '\033[1m' + '\033[91m' + '      Logistics Regression\n' + '\033[0m'

    print(banner)
    cm = confusion_matrix(Y, P)
    print('             Predicted Labels')
    print('             |      A     |     B')
    print(rule)
    print('True label A |   ', cm[0][0], '  |    ', cm[0][1])
    print(rule)
    print('           B |   ', cm[1][0], '   |    ', cm[1][1])
    print('\nModel Accuracy', accuracy_score(Y, P) * 100)
    print('Model F1', f1_score(Y, P) * 100)

In [25]:
def model_eval(Y, P):
    """Print a confusion-matrix table, accuracy and F1 for predictions ``P`` against ``Y``.

    Writes to stdout and returns None. Accuracy and F1 are shown as
    percentages (score * 100); only the top-left 2x2 corner of the confusion
    matrix is printed, labelled 'A' (first row/column) and 'B' (second).
    """
    # NOTE(review): this cell re-declares model_eval with the same behavior as
    # the definition in the preceding cell; the later definition is the one in effect.
    divider = '             ------------------------'

    print('\033[1m' '\033[91m' '      Logistics Regression\n' '\033[0m')
    counts = confusion_matrix(Y, P)
    print(
        '             Predicted Labels',
        '             |      A     |     B',
        divider,
        sep='\n',
    )
    print(f'True label A |    {counts[0][0]}   |     {counts[0][1]}')
    print(divider)
    print(f'           B |    {counts[1][0]}    |     {counts[1][1]}')
    accuracy_pct = accuracy_score(Y, P) * 100
    print('\nModel Accuracy', accuracy_pct)
    f1_pct = f1_score(Y, P) * 100
    print('Model F1', f1_pct)
    return None

In [27]:
# Attach the labels to the training features so each class can be resampled
# as whole rows, then separate the rows by TARGET_B value.
s_data = pd.concat([X_train_scaled, y_train], axis='columns')
cat_1 = s_data.loc[s_data['TARGET_B'].eq(1)]
cat_0 = s_data.loc[s_data['TARGET_B'].eq(0)]

In [28]:
# Down-sample the majority class (TARGET_B == 0) without replacement to the
# size of the minority class. The draw is seeded so that Restart & Run All
# selects the same rows every time; the metrics recorded further down came
# from an unseeded draw and may differ slightly after re-running.
cat_0_undersampled = resample(
    cat_0,
    replace=False,
    n_samples=len(cat_1),
    random_state=1,
)
s_data_downsampled = pd.concat([cat_0_undersampled, cat_1], axis=0)

In [29]:
# Sanity check: the row count should be twice the number of TARGET_B == 1 rows.
s_data_downsampled.shape


(7284, 413)

In [30]:
# Train on the balanced (down-sampled) training rows, evaluate on the
# untouched test split.
y_down = s_data_downsampled['TARGET_B']
X_down = s_data_downsampled.drop(columns='TARGET_B')

predicted = model(X_down, y_down, X_test_scaled)
model_eval(y_test, predicted)



[1m[91m      Logistics Regression
[0m
             Predicted Labels
             |      A     |     B
             ------------------------
True label A |    13149   |     9503
             ------------------------
           B |    544    |     657

Model Accuracy 57.87951201106779
Model F1 11.565883284922105


In [31]:
# Up-sample the minority class (TARGET_B == 1) with replacement to the size of
# the majority class. The draw is seeded so that Restart & Run All produces
# the same resampled training set every time.
cat_1_upsampled = resample(
    cat_1,
    replace=True,
    n_samples=len(cat_0),
    random_state=1,
)
s_data_upsampled = pd.concat([cat_1_upsampled, cat_0], axis=0)

In [32]:
# Sanity check: the row count should be twice the number of TARGET_B == 0 rows.
s_data_upsampled.shape


(135834, 413)

In [None]:
# Train on the balanced (up-sampled) training rows, evaluate on the untouched
# test split.
y_up = s_data_upsampled['TARGET_B']
X_up = s_data_upsampled.drop(columns='TARGET_B')

predicted = model(X_up, y_up, X_test_scaled)
model_eval(y_test, predicted)

In [None]:
# Balance the training data with SMOTE (synthetic minority samples built from
# 3 nearest neighbours, fixed seed), then train and evaluate on the untouched
# test split.
y_smote = s_data['TARGET_B']
X_smote = s_data.drop(columns='TARGET_B')

smote = SMOTE(random_state=100, k_neighbors=3)
X_S, y_S = smote.fit_resample(X_smote, y_smote)

predicted = model(X_S, y_S, X_test_scaled)
model_eval(y_test, predicted)