In [77]:
import pandas as pd
import numpy as np

# read the files
df = pd.read_excel('Copy of split_datasets(1).xlsx')

In [78]:
# set X and y as the train and target
y_orgi = df['default']
X_orgi = df.drop(columns=['default'], axis=1)

In [79]:
# set seed
seed = 42

In [80]:
# split the training data into training and validation
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_orgi, y_orgi, test_size=0.3, random_state=seed)

In [81]:
X_train_continuous = X_train[['age', 'log_income', 'log_debtinc', 'log_creddebt', 'log_othdebt']]
X_train_discrete = X_train[['ed', 'address']]

X_test_continuous = X_test[['age', 'log_income', 'log_debtinc', 'log_creddebt', 'log_othdebt']]
X_test_discrete = X_test[['ed', 'address']]

In [82]:
# preform standardization on continuous features
from sklearn.preprocessing import StandardScaler
scaler_train = StandardScaler()
scalar_test = StandardScaler()
X_train_continuous_st = scaler_train.fit_transform(X_train_continuous)
X_test_continuous_st = scalar_test.fit_transform(X_test_continuous)
X_train_continuous_st

array([[-0.91474096,  0.93666804,  1.08548917,  2.22427816,  1.19586455],
       [-0.91474096, -0.04234708, -0.31445499, -0.13769812, -0.44120649],
       [-0.51052257, -0.10053665,  1.71081204,  0.19487173,  1.81827724],
       ...,
       [ 0.56739313,  1.16918876,  0.73534529,  2.38824784,  0.66729171],
       [-0.51052257, -1.26868561,  0.40118701, -1.008998  , -0.19771036],
       [-0.10630419, -0.35178907,  0.77943902,  0.36746568,  0.35065293]])

In [83]:
# perform PCA on the train set, and make test set align with them
from sklearn.decomposition import PCA
X_train_continuous_st_copy = X_train_continuous_st.copy()
pca = PCA(n_components=2)
X_train_continuous_st_copy_pca = pca.fit_transform(X_train_continuous_st_copy)

X_test_continuous_st_copy = X_test_continuous_st.copy()
pca = PCA(n_components=2)
X_test_continuous_st_copy_pca = pca.fit_transform(X_test_continuous_st_copy)


In [84]:
# combine the data together
X_train = np.hstack((X_train_discrete, X_train_continuous_st_copy_pca))
X_test = np.hstack((X_test_discrete, X_test_continuous_st_copy_pca))

In [85]:
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [86]:
# set the training data

X = X_train
y = y_train

In [87]:
# logistic regression

parameters = {
    'penalty': ['l1', 'l2', 'elasticnet'],
    'alpha': [0.001, 0.01, 0.1],
    'learning_rate': ['constant', 'optimal', 'adaptive', 'invscaling'],
    'eta0': [0.001, 0.01, 0.1],
}

model = SGDClassifier(
    loss='log_loss',
    class_weight='balanced',
    random_state=seed,
)

cv = GridSearchCV(model, parameters, n_jobs=-1, cv=5, scoring={'F1': 'f1', 'Acc': 'accuracy'}, refit='F1')
cv.fit(X, y)

print(f"The best F1 score: {cv.best_score_}, the std: {cv.cv_results_['std_test_F1'][cv.best_index_]}")
print(f"The corresponding acc score: {cv.cv_results_['mean_test_Acc'][cv.best_index_]}, the std: {cv.cv_results_['std_test_Acc'][cv.best_index_]}")
print(f"The best estimator params: {cv.best_params_}")


The best F1 score: 0.5315554958988561, the std: 0.061370197198731234
The corresponding acc score: 0.7305658381808566, the std: 0.07711830977587317
The best estimator params: {'alpha': 0.01, 'eta0': 0.01, 'learning_rate': 'constant', 'penalty': 'elasticnet'}


In [88]:
model = SGDClassifier(
    loss='log_loss',
    class_weight='balanced',
    random_state=seed,
    alpha=0.1,
    eta0=0.01,
    penalty="l2",
    learning_rate='adaptive',
)

model.fit(X, y)
model.score(X_test, y_test)

0.44360902255639095

In [89]:
# linear SVC

parameters = {
    'penalty': ['l1', 'l2'],
    'loss': ['hinge', 'squared_hinge'],
    'C': [0.001, 0.01, 0.1, 1],
}

model = LinearSVC(
    class_weight='balanced',
    random_state=seed,
)

cv = GridSearchCV(model, parameters, n_jobs=-1, cv=5, scoring={'Acc': 'accuracy', 'F1': 'f1'}, refit='Acc')
cv.fit(X, y)

print(f"The best F1 score: {cv.best_score_}, the std: {cv.cv_results_['std_test_F1'][cv.best_index_]}")
print(f"The corresponding acc score: {cv.cv_results_['mean_test_Acc'][cv.best_index_]}, the std: {cv.cv_results_['std_test_Acc'][cv.best_index_]}")
print(f"The best estimator params: {cv.best_params_}")
print(f"The acc of test set (final prediction) is {cv.score(X_test, y_test)}")


The best F1 score: 0.7594923320994182, the std: 0.1555689088943435
The corresponding acc score: 0.7594923320994182, the std: 0.07320464426310051
The best estimator params: {'C': 0.001, 'loss': 'hinge', 'penalty': 'l2'}
The acc of test set (final prediction) is 0.6165413533834586


40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\55479\PycharmProjects\EUR_QF_ML_1\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\55479\PycharmProjects\EUR_QF_ML_1\venv\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\55479\PycharmProjects\EUR_QF_ML_1\venv\lib\site-packages\sklearn\svm\_classes.py", line 326, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
  File "C:\Users\55479\PycharmProjects\EUR_QF_ML_1\ve

In [90]:
model = LinearSVC(
    random_state=seed,
    C=0.1,
    loss="hinge",
)

model.fit(X, y)
model.score(X_test, y_test)



0.7443609022556391

In [91]:
# random forest
parameters = {
}

model = RandomForestClassifier(
    bootstrap=True,
    criterion='entropy',
    n_estimators=100,
    class_weight='balanced',
    random_state=seed,
)

cv = GridSearchCV(model, parameters, n_jobs=-1, cv=10, scoring={'F1': 'f1', 'Acc': 'accuracy'}, refit='F1')
cv.fit(X, y)

print(f"The best F1 score: {cv.best_score_}, the std: {cv.cv_results_['std_test_F1'][cv.best_index_]}")
print(f"The corresponding acc score: {cv.cv_results_['mean_test_Acc'][cv.best_index_]}, the std: {cv.cv_results_['std_test_Acc'][cv.best_index_]}")
print(f"The best estimator params: {cv.best_params_}")

The best F1 score: 0.49804195804195805, the std: 0.1581039129995314
The corresponding acc score: 0.7990322580645162, the std: 0.05292780703509474
The best estimator params: {}


In [92]:
model = RandomForestClassifier(
    bootstrap=True,
    criterion='entropy',
    n_estimators=100,
    class_weight='balanced',
    random_state=seed,
)
model.fit(X, y)
model.score(X_test, y_test)

0.6917293233082706