# In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV


# Read the applications table from disk. header=None tells pandas that the
# file has no header row, so columns are labelled with integers.
cc_apps = pd.read_csv(
    'cc_approvals.data',
    header=None,
)
# Preview the first five rows; the result is not assigned to anything.
cc_apps.head(n=5)


# Replace missing values
# '?' is treated as the missing-value marker. It must be swapped for a real
# NaN (np.nan) rather than the *string* 'np.NaN'; otherwise pandas sees no
# missing cells, every fillna() below is a no-op, and the placeholder text
# ends up one-hot encoded as if it were a legitimate category.
cc_apps_cleaned = cc_apps.replace('?', np.nan)

# Impute on a copy so cc_apps_cleaned keeps the un-imputed values.
cc_apps_imputed = cc_apps_cleaned.copy()

for column in cc_apps_imputed.columns:
    # Branch on "is numeric" instead of "dtype == 'object'" so that text
    # columns stored with a dedicated string dtype are still mode-imputed
    # instead of hitting .mean().
    if pd.api.types.is_numeric_dtype(cc_apps_imputed[column]):
        # Numeric column: fill gaps with the column mean.
        mean_value = cc_apps_imputed[column].mean()
        cc_apps_imputed[column] = cc_apps_imputed[column].fillna(mean_value)
    else:
        # Non-numeric column: fill gaps with the most frequent value
        # (value_counts() leaves NaN out of the tally by default).
        most_frequent_value = cc_apps_imputed[column].value_counts().idxmax()
        cc_apps_imputed[column] = cc_apps_imputed[column].fillna(most_frequent_value)

# NOTE(review): a column that was only read as text because it contained '?'
# still holds numeric strings at this point and will be one-hot encoded below.
# If such a column is meant to be continuous, consider converting it with
# pd.to_numeric before imputing -- confirm against the dataset description.

# One-hot encode the non-numeric columns; drop_first=True omits the first
# level of each encoded column.
cc_apps_dummies = pd.get_dummies(cc_apps_imputed, drop_first=True)


# Split the encoded frame into a feature matrix (every column except the
# last) and a label vector (the last column), both as array values.
feature = cc_apps_dummies.iloc[:, :-1].values
target = cc_apps_dummies.iloc[:, -1].values

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    feature,
    target,
    random_state=42,
    test_size=0.2,
)


# Standardise the features. The scaler is fitted on the training split only
# and the same fitted scaler is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Build a logistic-regression classifier with default settings and fit it on
# the scaled training data; fit() returns the estimator itself, so the two
# steps are chained.
log_reg = LogisticRegression().fit(X_train_scaled, y_train)

# Predicted labels for the same (scaled) training rows the model was fitted on.
predictions = log_reg.predict(X_train_scaled)


# Candidate hyper-parameter values to search over: the `tol` and `max_iter`
# arguments of the logistic-regression estimator.
tolerance = [0.01, 0.001, 0.0001]
max_iteration = [100, 150, 200]

# Grid keyed by estimator parameter name; every combination is tried.
param_grid = {'tol': tolerance, 'max_iter': max_iteration}

# Exhaustive search over the grid using 5-fold cross-validation.
grid_model = GridSearchCV(log_reg, param_grid, cv=5)


# Run the grid search on the scaled training data.
grid_model_result = grid_model.fit(X_train_scaled, y_train)

# Report the best cross-validated score and the parameters that produced it.
best_train_score = grid_model_result.best_score_
best_train_param = grid_model_result.best_params_
print("Best: %f using %s" % (best_train_score, best_train_param))


# Take the best estimator found by the grid search and score it on the
# held-out (scaled) test split.
best_model = grid_model_result.best_estimator_
best_score = best_model.score(
    X_test_scaled,
    y_test,
)

print('Accuracy score:', best_score)


