In [79]:
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from algo_implementation import logistic_regression

# Load train and test datasets
train_df = pd.read_csv('train_dataset2.csv').drop(columns=['Unnamed: 0'])
test_df = pd.read_csv('test_dataset2.csv').drop(columns=['Unnamed: 0'])

# Extract features and target variable
X_train, y_train = train_df.iloc[:, :-1], train_df.iloc[:, -1]
X_test, y_test = test_df.iloc[:, :-1], test_df.iloc[:, -1]

# Train and evaluate logistic regression
lr = logistic_regression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
lr_acc = accuracy_score(y_test, lr_pred)
lr_precision = precision_score(y_test, lr_pred, average='macro')
lr_recall = recall_score(y_test, lr_pred, average='macro')

# Train and evaluate LDA
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
lda_pred = lda.predict(X_test)
lda_acc = accuracy_score(y_test, lda_pred)
lda_precision = precision_score(y_test, lda_pred, average='macro')
lda_recall = recall_score(y_test, lda_pred, average='macro')

# Train and evaluate QDA
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)
qda_pred = qda.predict(X_test)
qda_acc = accuracy_score(y_test, qda_pred)
qda_precision = precision_score(y_test, qda_pred, average='macro')
qda_recall = recall_score(y_test, qda_pred, average='macro')

# Train and evaluate KNN
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
knn_acc = accuracy_score(y_test, knn_pred)
knn_precision = precision_score(y_test, knn_pred, average='macro')
knn_recall = recall_score(y_test, knn_pred, average='macro')

# Print evaluation results
print('Logistic Regression:')
print(f'Accuracy: {lr_acc}')
print(f'Precision: {lr_precision}')
print(f'Recall: {lr_recall}')
print()
print('LDA:')
print(f'Accuracy: {lda_acc}')
print(f'Precision: {lda_precision}')
print(f'Recall: {lda_recall}')
print()
print('QDA:')
print(f'Accuracy: {qda_acc}')
print(f'Precision: {qda_precision}')
print(f'Recall: {qda_recall}')
print()
print('KNN:')
print(f'Accuracy: {knn_acc}')
print(f'Precision: {knn_precision}')
print(f'Recall: {knn_recall}')


Logistic Regression:
Accuracy: 0.8541666666666666
Precision: 0.8303030303030303
Recall: 0.8303030303030303

LDA:
Accuracy: 0.8229166666666666
Precision: 0.8180952380952381
Recall: 0.753030303030303

QDA:
Accuracy: 0.8333333333333334
Precision: 0.806060606060606
Recall: 0.806060606060606

KNN:
Accuracy: 0.875
Precision: 0.8500948766603416
Recall: 0.8727272727272728


# Interactions

In [80]:
from power_qcut import qcut_fold_validation, prepare_cols_comb

In [81]:
X_train.columns = X_train.columns.str.replace(' ', '_')
X_test.columns = X_test.columns.str.replace(' ', '_')

In [4]:
basic_cols = list(X_train.columns)

In [5]:
col1 = prepare_cols_comb(list(prepare_cols_comb(basic_cols)) + basic_cols)

In [6]:
data = pd.concat([X_train, y_train], axis=1)

In [7]:
import numpy as np
res1 = qcut_fold_validation(data, 4, 'Car', col1, np.mean, 4)

100%|██████████| 1046/1046 [00:00<00:00, 1319.34it/s]


In [8]:
res1.reset_index(drop=True, inplace=True)

In [9]:
res1.sort_values("mono").feature[1]

'Monthly_Income * Finance_History + Monthly_Income - Years_of_Employment'

In [15]:
res1.sort_values("mono").feature[4]

'Monthly_Income * Number_of_Children + Monthly_Income - Years_of_Employment'

In [20]:
res1.sort_values("mono").feature.loc[11]

'Finance_Status / Years_of_Employment - Years_of_Employment - Finance_History'

In [21]:
X_train['Qcut_Var_1'] = X_train.eval('Monthly_Income * Finance_History')
X_train['Qcut_Var_1'] = (X_train['Qcut_Var_1'] - X_train['Qcut_Var_1'].mean()) / X_train['Qcut_Var_1'].std()
X_test['Qcut_Var_1'] = X_test.eval('Monthly_Income * Finance_History')
X_test['Qcut_Var_1'] = (X_test['Qcut_Var_1'] - X_train['Qcut_Var_1'].mean()) / X_train['Qcut_Var_1'].std()
X_train['Qcut_Var_2'] = X_train.eval('Monthly_Income * Number_of_Children')
X_train['Qcut_Var_2'] = (X_train['Qcut_Var_2'] - X_train['Qcut_Var_2'].mean()) / X_train['Qcut_Var_2'].std()
X_test['Qcut_Var_2'] = X_test.eval('Monthly_Income * Number_of_Children')
X_test['Qcut_Var_2'] = (X_test['Qcut_Var_2'] - X_train['Qcut_Var_2'].mean()) / X_train['Qcut_Var_2'].std()
X_train['Qcut_Var_3'] = X_train.eval('Finance_Status / Years_of_Employment')
X_train['Qcut_Var_3'] = (X_train['Qcut_Var_3'] - X_train['Qcut_Var_3'].mean()) / X_train['Qcut_Var_3'].std()
X_test['Qcut_Var_3'] = X_test.eval('Finance_Status / Years_of_Employment')
X_test['Qcut_Var_3'] = (X_test['Qcut_Var_3'] - X_train['Qcut_Var_3'].mean()) / X_train['Qcut_Var_3'].std()

In [22]:

# Train and evaluate logistic regression
lr = logistic_regression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
lr_acc = accuracy_score(y_test, lr_pred)
lr_precision = precision_score(y_test, lr_pred, average='macro')
lr_recall = recall_score(y_test, lr_pred, average='macro')

# Train and evaluate LDA
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
lda_pred = lda.predict(X_test)
lda_acc = accuracy_score(y_test, lda_pred)
lda_precision = precision_score(y_test, lda_pred, average='macro')
lda_recall = recall_score(y_test, lda_pred, average='macro')

# Train and evaluate QDA
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)
qda_pred = qda.predict(X_test)
qda_acc = accuracy_score(y_test, qda_pred)
qda_precision = precision_score(y_test, qda_pred, average='macro')
qda_recall = recall_score(y_test, qda_pred, average='macro')

# Train and evaluate KNN
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
knn_acc = accuracy_score(y_test, knn_pred)
knn_precision = precision_score(y_test, knn_pred, average='macro')
knn_recall = recall_score(y_test, knn_pred, average='macro')

# Print evaluation results
print('Logistic Regression:')
print(f'Accuracy: {lr_acc}')
print(f'Precision: {lr_precision}')
print(f'Recall: {lr_recall}')
print()
print('LDA:')
print(f'Accuracy: {lda_acc}')
print(f'Precision: {lda_precision}')
print(f'Recall: {lda_recall}')
print()
print('QDA:')
print(f'Accuracy: {qda_acc}')
print(f'Precision: {qda_precision}')
print(f'Recall: {qda_recall}')
print()
print('KNN:')
print(f'Accuracy: {knn_acc}')
print(f'Precision: {knn_precision}')
print(f'Recall: {knn_recall}')


Logistic Regression:
Accuracy: 0.8541666666666666
Precision: 0.8303030303030303
Recall: 0.8303030303030303

LDA:
Accuracy: 0.84375
Precision: 0.8377010125074449
Recall: 0.7863636363636364

QDA:
Accuracy: 0.7395833333333334
Precision: 0.7727272727272727
Recall: 0.8106060606060606

KNN:
Accuracy: 0.875
Precision: 0.8545454545454545
Recall: 0.8545454545454545


In [82]:
%%capture
new_X_train = pd.DataFrame()
new_X_test = pd.DataFrame()

for i, col_val in enumerate(list(res1.feature)[:20]):
    new_X_train[f'Q_var_{i}'] = X_train.eval(col_val)
    new_X_test[f'Q_var_{i}'] = X_test.eval(col_val)

In [83]:
from preprocessor import Preprocessor

p = Preprocessor()
vif_coefs = p.vif(new_X_train)
colnames = vif_coefs[vif_coefs['VIF'] >= 10]['variables']
X_train, X_test = new_X_train.drop(colnames, axis=1 ), new_X_test.drop(colnames, axis=1 )
print(colnames)

0      Q_var_0
1      Q_var_1
2      Q_var_2
3      Q_var_3
4      Q_var_4
5      Q_var_5
6      Q_var_6
7      Q_var_7
9      Q_var_9
12    Q_var_12
13    Q_var_13
14    Q_var_14
15    Q_var_15
16    Q_var_16
17    Q_var_17
18    Q_var_18
19    Q_var_19
Name: variables, dtype: object


  vif = 1. / (1. - r_squared_i)


In [84]:

# Train and evaluate logistic regression
lr = logistic_regression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
lr_acc = accuracy_score(y_test, lr_pred)
lr_precision = precision_score(y_test, lr_pred, average='macro')
lr_recall = recall_score(y_test, lr_pred, average='macro')

# Train and evaluate LDA
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
lda_pred = lda.predict(X_test)
lda_acc = accuracy_score(y_test, lda_pred)
lda_precision = precision_score(y_test, lda_pred, average='macro')
lda_recall = recall_score(y_test, lda_pred, average='macro')

# Train and evaluate QDA
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)
qda_pred = qda.predict(X_test)
qda_acc = accuracy_score(y_test, qda_pred)
qda_precision = precision_score(y_test, qda_pred, average='macro')
qda_recall = recall_score(y_test, qda_pred, average='macro')

# Train and evaluate KNN
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)
knn_acc = accuracy_score(y_test, knn_pred)
knn_precision = precision_score(y_test, knn_pred, average='macro')
knn_recall = recall_score(y_test, knn_pred, average='macro')

# Print evaluation results
print('Logistic Regression:')
print(f'Accuracy: {lr_acc}')
print(f'Precision: {lr_precision}')
print(f'Recall: {lr_recall}')
print()
print('LDA:')
print(f'Accuracy: {lda_acc}')
print(f'Precision: {lda_precision}')
print(f'Recall: {lda_recall}')
print()
print('QDA:')
print(f'Accuracy: {qda_acc}')
print(f'Precision: {qda_precision}')
print(f'Recall: {qda_recall}')
print()
print('KNN:')
print(f'Accuracy: {knn_acc}')
print(f'Precision: {knn_precision}')
print(f'Recall: {knn_recall}')


Logistic Regression:
Accuracy: 0.78125
Precision: 0.7484507042253521
Recall: 0.7227272727272727

LDA:
Accuracy: 0.78125
Precision: 0.7519356759976177
Recall: 0.7136363636363636

QDA:
Accuracy: 0.8125
Precision: 0.7916666666666667
Recall: 0.7545454545454545

KNN:
Accuracy: 0.90625
Precision: 0.8852813852813852
Recall: 0.9045454545454545
