### Importing packages and libraries

In [137]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

#### Load and showcase our Dataset

In [11]:
#human-readable names for the 16 columns of the raw file, in file order
column_names = [
    "Gender", "Age", "Debt", "Married", "BankCustomer", "EducationLevel",
    "Ethnicity", "YearsEmployed", "PriorDefault", "Employed", "CreditScore",
    "DriversLicense", "Citizen", "ZipCode", "Income", "ApprovalStatus",
]

#loading data
#NOTE(review): read_csv is called without header=None, so the first record of
#the file is consumed as the header row and never reaches the DataFrame.
#Passing header=None would keep it, but that changes the row count that later
#cells were written against -- verify downstream cells before switching.
cc_apps = pd.read_csv("crx.data")
print(cc_apps)

#renaming features to make visualising our dataframe much easier
cc_apps.columns = column_names
print(cc_apps)
#notice that we lost one row (the one used as header) but it wouldn't make much difference

     b  30.83       0  u  g   w   v  1.25  t t.1  01  f g.1  00202  0.1  +
0    a  58.67   4.460  u  g   q   h  3.04  t   t   6  f   g  00043  560  +
1    a  24.50   0.500  u  g   q   h  1.50  t   f   0  f   g  00280  824  +
2    b  27.83   1.540  u  g   w   v  3.75  t   t   5  t   g  00100    3  +
3    b  20.17   5.625  u  g   w   v  1.71  t   f   0  f   s  00120    0  +
4    b  32.08   4.000  u  g   m   v  2.50  t   f   0  t   g  00360    0  +
..  ..    ...     ... .. ..  ..  ..   ... ..  ..  .. ..  ..    ...  ... ..
684  b  21.08  10.085  y  p   e   h  1.25  f   f   0  f   g  00260    0  -
685  a  22.67   0.750  u  g   c   v  2.00  f   t   2  t   g  00200  394  -
686  a  25.25  13.500  y  p  ff  ff  2.00  f   t   1  t   g  00200    1  -
687  b  17.92   0.205  u  g  aa   v  0.04  f   f   0  f   g  00280  750  -
688  b  35.00   3.375  u  g   c   h  8.29  f   f   0  t   g  00000    0  -

[689 rows x 16 columns]
    Gender    Age    Debt Married BankCustomer EducationLevel Ethnicity  \


### Inspecting our data


In [12]:
#using pandas method describe to print summary statistics
cc_apps_description = cc_apps.describe()
print(cc_apps_description)

#print DataFrame information
#DataFrame.info() writes its report to stdout by itself and returns None, so
#wrapping the result in print() only adds a stray "None" line to the output
cc_apps_info = cc_apps.info()

#DriversLicense and ZipCode are not expected to affect the model's performance,
#so they are removed; errors="ignore" keeps this cell re-runnable once the
#columns are already gone (a plain drop would raise KeyError on a second run)
cc_apps = cc_apps.drop(columns=["DriversLicense", "ZipCode"], errors="ignore")
print(cc_apps)

             Debt  YearsEmployed  CreditScore         Income
count  689.000000     689.000000   689.000000     689.000000
mean     4.765631       2.224819     2.402032    1018.862119
std      4.978470       3.348739     4.866180    5213.743149
min      0.000000       0.000000     0.000000       0.000000
25%      1.000000       0.165000     0.000000       0.000000
50%      2.750000       1.000000     0.000000       5.000000
75%      7.250000       2.625000     3.000000     396.000000
max     28.000000      28.500000    67.000000  100000.000000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 689 entries, 0 to 688
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Gender          689 non-null    object 
 1   Age             689 non-null    object 
 2   Debt            689 non-null    float64
 3   Married         689 non-null    object 
 4   BankCustomer    689 non-null    object 
 5   EducationLevel  689 non-nu

### Handling missing values (part I)

In [13]:
#this dataset indicates missing values with '?' so we have to replace it with NaN
#np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0
cc_apps = cc_apps.replace("?", np.nan)

In [166]:
#column groups, listed in the same order in which they appear in cc_apps
num_columns = ["Age", "Debt", "YearsEmployed", "CreditScore", "Income"]
cat_columns = ["Gender", "Married", "BankCustomer", "EducationLevel", "Ethnicity",
               "PriorDefault", "Employed", "Citizen", "ApprovalStatus"]

#pull the two groups out as numpy arrays (ApprovalStatus stays last in cc_cat)
cc_num = cc_apps[num_columns].values
cc_cat = cc_apps[cat_columns].values

#Age feature is not a float despite looking like one, so convert column 0
cc_num[:, 0] = cc_num[:, 0].astype('float')

### Splitting the dataset into train and test sets

In [165]:
#split both arrays in a single call so that the numerical and categorical
#parts of every row end up on the same side of the split
cc_num_train, cc_num_test, cc_cat_train, cc_cat_test = train_test_split(
    cc_num, cc_cat, test_size = 0.3, random_state = 42
)

### Handling missing data (part II)

In [160]:
#handling categorical data

#missing categories are replaced by the most frequent value of each column;
#the imputer is fitted on the train split only and then applied to both splits
imp_cat = SimpleImputer(strategy = 'most_frequent')
imp_cat.fit(cc_cat_train)
cc_cat_train = imp_cat.transform(cc_cat_train)
cc_cat_test = imp_cat.transform(cc_cat_test)

In [161]:
#handling numerical data

#missing numbers are replaced by the column mean (SimpleImputer's default
#strategy); the means are learned from the train split only
imp_num = SimpleImputer(strategy = 'mean')
imp_num.fit(cc_num_train)
cc_num_train = imp_num.transform(cc_num_train)
cc_num_test = imp_num.transform(cc_num_test)

In [164]:
#combine train/test

#put the numerical columns first and the categorical columns after them
cc_train = np.hstack((cc_num_train, cc_cat_train))
cc_test = np.hstack((cc_num_test, cc_cat_test))

In [163]:
#checking if there are still some missing values

#NaN never compares equal to anything (not even to itself), so an
#`array == NaN` test is always False and cannot detect missing values;
#pd.isnull flags NaN/None element-wise and also works on object arrays
print(np.where(pd.isnull(cc_train)))
print(np.where(pd.isnull(cc_test)))
#empty index arrays mean that no missing values are left

(array([], dtype=int64), array([], dtype=int64))
(array([], dtype=int64), array([], dtype=int64))


### Encoding categorical values

In [147]:
#splitting to features and labels to make encoding easier

X_train, y_train = np.delete(cc_train, -1, 1), cc_train[:,-1] 
X_test, y_test = np.delete(cc_train, -1, 1), cc_train[:,-1] 

y_train = y_train.reshape(482, 1)
y_test = y_test.reshape(482, 1)


In [148]:
#Encoding the dependent variable

le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

y_train = y_train.reshape(482, 1)
y_test = y_test.reshape(482, 1)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [150]:
#Encoding features

#columns 0-4 of X are numerical, columns 5-12 are categorical and get one-hot encoded
#handle_unknown='ignore': a category that is absent from the train split is
#encoded as all zeros at transform time instead of raising an error
#sparse_threshold=0: always return a dense array, because np.array() cannot
#turn a scipy sparse matrix into a regular 2-D array
ct = ColumnTransformer(
    transformers = [('cat', OneHotEncoder(handle_unknown = 'ignore'), slice(5, 13))],
    remainder = 'passthrough',
    sparse_threshold = 0,
)
X_train = np.array(ct.fit_transform(X_train))
X_test = np.array(ct.transform(X_test))

### Scaling X values

In [152]:
# Rescale every feature to the [0, 1] range with MinMaxScaler.
# The minima/maxima are learned from X_train only and then applied to both sets.

scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X_train)
rescaledX_train = scaler.transform(X_train)
rescaledX_test = scaler.transform(X_test)

In [155]:
logreg = LogisticRegression()

# Fit logreg to the train set
# scikit-learn expects a 1-D target; passing a column vector triggers a
# DataConversionWarning, so the labels are flattened for the call
logreg.fit(rescaledX_train, np.ravel(y_train))

  y = column_or_1d(y, warn=True)


LogisticRegression()

In [156]:
# Predicted classes for the rescaled test features
y_pred = logreg.predict(rescaledX_test)

# Mean accuracy of the fitted model on the test features/labels
logreg_accuracy = logreg.score(rescaledX_test, y_test)
print("Accuracy of logistic regression classifier: ", logreg_accuracy)

# Confusion matrix (rows: true classes, columns: predicted classes);
# left as the last expression so the notebook displays it
confusion_matrix(y_test, y_pred)

Accuracy of logistic regression classifier:  0.8775933609958506


array([[189,  23],
       [ 36, 234]])