# Classification
Here we focus on three classifiers: logistic regression, linear discriminant, and quadratic discriminant. We will apply each classifier in two steps:

- When there is only one predictor.
- When there are two or more predictors.









# Load file
Load the credit default data set.
The data is stored in an Excel file, so it is read with the pandas library:
- pandas function `pd.read_excel`


In [None]:
import pandas as pd

# Build the workbook location from the data directory and the file name,
# then read it into a DataFrame.
path = 'data/'
filename = f'{path}Default.xlsx'
default_data = pd.read_excel(filename)

In [None]:
# Preview the first rows of the loaded table.
default_data.head()

#  Factorize
Machine learning algorithms cannot function with string variables. We need to transform 'Yes' and 'No' to some numerical values. For binary variables it is convenient to transform the categories to zero and one. Let's transform 'No' to zero and 'Yes' to one before going further.

In [None]:
# Encode the binary string columns explicitly so that 'No' is always 0 and
# 'Yes' is always 1. `factorize()` numbers categories in order of first
# appearance, so its codes depend on which label happens to come first in
# the file.
default_data['default_factor'] = (default_data['default'] == 'Yes').astype(int)
default_data['student_factor'] = (default_data['student'] == 'Yes').astype(int)

default_data.head(3)

In [None]:
# Print a per-column summary of the frame, including the added *_factor columns.
default_data.info()

# Simple logistic regression
Here we predict the default status ('No' = 0 or 'Yes' = 1) based only on the credit balance.

In [None]:
from sklearn.linear_model import LogisticRegression

# scikit-learn wants a 2-D feature table, so `balance` is selected with a
# list of column names (giving a one-column DataFrame); the target is 1-D.
y = default_data['default_factor']
X = default_data.loc[:, ['balance']]

lr = LogisticRegression()
lr.fit(X=X, y=y)

In [None]:
# Show the fitted intercept, then the coefficient array, one per line.
print(lr.intercept_, lr.coef_, sep='\n')

In [None]:
import numpy as np

# Two balances to score, laid out as a column (one row per observation).
X_pred = np.array([[1500], [2000]])

print(X_pred)


In [None]:
# Class probabilities for each row of X_pred (one column per class).
class_probabilities = lr.predict_proba(X_pred)
print(class_probabilities)


In [None]:
# Predicted class labels, shown as a column. Using -1 lets NumPy infer the
# row count, so the cell also works when X_pred holds more than two rows.
print(lr.predict(X_pred).reshape(-1, 1))


In [None]:
# Evaluate the fitted model on an evenly spaced grid of 100 balances between
# 0 and 3000 so the probability curve can be drawn.
X_pred = np.linspace(0, 3000, 100).reshape(-1, 1)
y_pred = lr.predict_proba(X_pred)
# Manual alternative for the class-1 probability, kept for reference:
# y_pred = (1+np.exp(-(lr.intercept_ + lr.coef_*X_pred)))**(-1)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(X, y, 'or', mfc='none');
plt.plot(X_pred, y_pred[:,1], '-b');
plt.xlabel('Balance');
plt.ylabel('Probability');

In [None]:
from sklearn.metrics import confusion_matrix

# Compare the in-sample predictions with the observed labels
# (rows: true class, columns: predicted class).
y_pred = lr.predict(X)
confusion_matrix(y_true=y, y_pred=y_pred)

# Lesson
Logistic regression tends to favour the over-represented class.
# Remedy
Cut the predicted probability at a more suitable threshold than the default 0.5, so that more of the rare class is detected.

In [None]:
# Flag an observation as class 1 when its predicted probability exceeds 0.03,
# then rebuild the confusion matrix with these labels.
default_probability = lr.predict_proba(X)[:, 1]
y_pred = (default_probability > 0.03) * 1
confusion_matrix(y, y_pred)

In [None]:
from sklearn.metrics import roc_curve

# The ROC curve sweeps over every possible threshold, so it needs the
# continuous class-1 probabilities. Feeding it already-thresholded 0/1 labels
# would collapse the curve to a single interior point.
log_fpr, log_tpr, log_thresholds = roc_curve(y, lr.predict_proba(X)[:, 1])

In [None]:
from sklearn.metrics import roc_auc_score

# AUC measures how well the scores rank positives above negatives, so it is
# computed from the continuous class-1 probabilities; thresholded 0/1 labels
# would throw the ranking information away.
log_AUC = roc_auc_score(y, lr.predict_proba(X)[:, 1])

print('AUC:%.3f'% log_AUC)

In [None]:
# ROC curve of the logistic model, with the chance diagonal and the ideal
# top-left path drawn for reference.
plt.plot(log_fpr, log_tpr, 'r-', label=f'LOG AUC: {log_AUC:.3f}')

plt.plot([0, 1], [0, 1], 'k-', label='random')
plt.plot([0, 0, 1, 1], [0, 1, 1, 1], 'g-', label='perfect')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend();

# Linear Discriminant
Linear discriminant analysis (lda) is one of the most popular classifiers.


In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Single-predictor setup: a one-column feature table (list-of-names
# selection keeps it 2-D) and a 1-D target.
y = default_data['default_factor']
X = default_data.loc[:, ['balance']]
lda = LinearDiscriminantAnalysis()
lda.fit(X=X, y=y)

In [None]:
import numpy as np

# Score two example balances (one per row) and show the inputs followed by
# their class probabilities.
X_pred = np.array([[1500], [2000]])
print(X_pred, lda.predict_proba(X_pred), sep='\n')

In [None]:
import matplotlib.pyplot as plt

# Class-1 probability from the LDA model over a grid of 100 balances,
# overlaid on the observed outcomes (hollow red circles).
X_pred = np.linspace(0, 3000, 100).reshape(-1, 1)
y_pred = lda.predict_proba(X_pred)
plt.plot(X, y, 'or', markerfacecolor='none')
plt.plot(X_pred, y_pred[:, 1], '-b')
plt.xlabel('Balance')
plt.ylabel('Probability');

In [None]:
from sklearn.metrics import confusion_matrix

# In-sample LDA predictions against the observed labels
# (rows: true class, columns: predicted class).
y_pred = lda.predict(X)
confusion_matrix(y_true=y, y_pred=y_pred)

# Lesson
Linear discriminant is a bit better in the case of unbalanced data. 

In [None]:
# Flag class 1 when the LDA probability exceeds 0.03, then rebuild the
# confusion matrix with these labels.
default_probability = lda.predict_proba(X)[:, 1]
y_pred = (default_probability > 0.03) * 1
confusion_matrix(y, y_pred)

# Quadratic Discriminant
Quadratic discriminant analysis (QDA) is used in the same way as the linear discriminant, so here we simply repeat the code above with the QDA classifier.


In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Single-predictor setup: a one-column feature table (list-of-names
# selection keeps it 2-D) and a 1-D target.
y = default_data['default_factor']
X = default_data.loc[:, ['balance']]
qda = QuadraticDiscriminantAnalysis()
qda.fit(X=X, y=y)

In [None]:
# Class probabilities from the QDA model for two example balances.
X_pred = np.array([[1500], [2000]])
qda_probabilities = qda.predict_proba(X_pred)
print(qda_probabilities)

In [None]:
# In-sample QDA predictions against the observed labels
# (rows: true class, columns: predicted class).
y_pred = qda.predict(X)
confusion_matrix(y_true=y, y_pred=y_pred)

In [None]:
# Flag class 1 when the QDA probability exceeds 0.03, then rebuild the
# confusion matrix with these labels.
default_probability = qda.predict_proba(X)[:, 1]
y_pred = (default_probability > 0.03) * 1
confusion_matrix(y, y_pred)

# Two predictors (or more)
Fitting the logistic regression with two predictors is very similar to the simple version. Just feed in the appropriate matrix X.


In [None]:
# Refit the logistic regression on two predictors: balance and income.
y = default_data['default_factor']
X = default_data.loc[:, ['balance', 'income']]

lr = LogisticRegression()
lr.fit(X=X, y=y)

# Fitted intercept, then the coefficient array, one per line.
print(lr.intercept_, lr.coef_, sep='\n')

In [None]:
# Two example rows with the same first value (1500) and different second
# values (20000 vs 25000); show the inputs followed by their class
# probabilities.
X_pred = np.array([[1500, 20000], [1500, 25000]])
print(X_pred, lr.predict_proba(X_pred), sep='\n')

# Logistic Regression with statsmodels
statsmodels always provides more statistical detail. Let's try fitting the logistic regression with statsmodels.

In [None]:
import statsmodels.formula.api as smf

# `statsmodels.formula.api` exposes the formula interface through lower-case
# helpers; `smf.logit` is the supported way to build a Logit model from a
# formula, whereas the class attribute `smf.Logit` is not exported by current
# releases and raises AttributeError.
# NOTE: this rebinds `lr` from the scikit-learn estimator to the fitted
# statsmodels results object.
lr = smf.logit(formula="default_factor~balance+income",
               data=default_data).fit()


In [None]:
# Display the summary of the fitted statsmodels logit model.
lr.summary()