[statinfer.com](https://statinfer.com/)

# Model Selection Cross validation

In [None]:
import sklearn as sk
import pandas as pd
import numpy as np

# Data Importing

In [None]:
import pandas as pd

# Read the cleaned credit-risk training CSV directly from its GitHub URL.
loans = pd.read_csv(
    "https://raw.githubusercontent.com/venkatareddykonasani/Datasets/master/Credit_Risk_Cleaned/Cleaned_cs_training.csv"
)

In [None]:
# Print a summary of the frame: column names, dtypes and non-null counts.
loans.info()

In [None]:
## Function for OLS formula
def ols_formula(df, dependent_var, *excluded_cols):
    """
    Build a ``y ~ x1 + x2 + ...`` formula string for statsmodels (patsy).

    df            : dataframe whose column names supply the terms
    dependent_var : name of the response column (left-hand side)
    excluded_cols : optional column names to keep out of the right-hand side

    Every remaining column of ``df`` becomes a predictor, in the dataframe's
    column order. Raises ValueError if ``dependent_var`` or an excluded name
    is not one of the columns.
    """
    predictors = list(df.columns.values)
    # list.remove drops the first match and raises ValueError when absent
    for unwanted in (dependent_var, *excluded_cols):
        predictors.remove(unwanted)
    rhs = ' + '.join(predictors)
    return ' ~ '.join([dependent_var, rhs])

In [None]:
# Skip the first column (positional slice) so it never enters the formula;
# every other non-target column becomes a predictor.
formula_columns = loans.iloc[:, 1:]
model_formula = ols_formula(formula_columns, dependent_var="SeriousDlqin2yrs")
print(model_formula)

# Sensitivity and Specificity

In [None]:
import statsmodels.formula.api as sm

# Logistic regression of the target on the predictors named in model_formula,
# fitted on the full loans frame.
risk_model = sm.logit(formula=model_formula, data=loans)
results = risk_model.fit()
print(results.summary())

In [None]:
# Predicted probabilities from the fitted logit model. predict() is called
# without new data, so it scores the data the model was fitted on.
predictions = results.predict()
print(predictions[:10])
len(predictions)

In [None]:
# Turn probabilities into 0/1 labels: below the cut-off -> 0, otherwise -> 1
threshold = 0.5
predicted_class1 = np.where(predictions < threshold, 0, 1).tolist()
print(predicted_class1[:10])

In [None]:
from sklearn.metrics import confusion_matrix

# sklearn layout: rows = actual class, columns = predicted class, labels sorted (0, 1)
#   cm1[0,0] = TN   cm1[0,1] = FP
#   cm1[1,0] = FN   cm1[1,1] = TP
# Class 1 is treated as the positive class, matching roc_curve's default.
cm1 = confusion_matrix(loans["SeriousDlqin2yrs"],predicted_class1)
print('Confusion Matrix : \n', cm1)

total1=cm1.sum()
# Accuracy: share of all cases that sit on the diagonal
accuracy1=(cm1[0,0]+cm1[1,1])/total1
print ('Accuracy : ', accuracy1)

# Sensitivity (true positive rate): actual class-1 cases predicted as 1.
# It must be computed from the class-1 row, not the class-0 row.
sensitivity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
print('Sensitivity : ', sensitivity1 )

# Specificity (true negative rate): actual class-0 cases predicted as 0
specificity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
print('Specificity : ', specificity1)

## Threshold

In [None]:
# Sensitivity vs specificity at a lower cut-off:
# re-label the same predicted probabilities with threshold 0.2
threshold=0.2
predicted_class1=[ 0 if x < threshold else 1 for x in predictions]

# sklearn layout: rows = actual class, columns = predicted class, labels sorted (0, 1)
#   cm1[0,0] = TN   cm1[0,1] = FP
#   cm1[1,0] = FN   cm1[1,1] = TP
# Class 1 is treated as the positive class, matching roc_curve's default.
cm1 = confusion_matrix(loans["SeriousDlqin2yrs"],predicted_class1)
print('Confusion Matrix : \n', cm1)

total1=cm1.sum()
# Accuracy: share of all cases that sit on the diagonal
accuracy1=(cm1[0,0]+cm1[1,1])/total1
print ('Accuracy : ', accuracy1)

# Sensitivity (true positive rate): actual class-1 cases predicted as 1.
# It must be computed from the class-1 row, not the class-0 row.
sensitivity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
print('Sensitivity : ', sensitivity1 )

# Specificity (true negative rate): actual class-0 cases predicted as 0
specificity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
print('Specificity : ', specificity1)

In [None]:
# Sensitivity vs specificity at a higher cut-off:
# re-label the same predicted probabilities with threshold 0.8
threshold=0.8
predicted_class1=[ 0 if x < threshold else 1 for x in predictions]

# sklearn layout: rows = actual class, columns = predicted class, labels sorted (0, 1)
#   cm1[0,0] = TN   cm1[0,1] = FP
#   cm1[1,0] = FN   cm1[1,1] = TP
# Class 1 is treated as the positive class, matching roc_curve's default.
cm1 = confusion_matrix(loans["SeriousDlqin2yrs"],predicted_class1)
print('Confusion Matrix : \n', cm1)

total1=cm1.sum()
# Accuracy: share of all cases that sit on the diagonal
accuracy1=(cm1[0,0]+cm1[1,1])/total1
print ('Accuracy : ', accuracy1)

# Sensitivity (true positive rate): actual class-1 cases predicted as 1.
# It must be computed from the class-1 row, not the class-0 row.
sensitivity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
print('Sensitivity : ', sensitivity1 )

# Specificity (true negative rate): actual class-0 cases predicted as 0
specificity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
print('Specificity : ', specificity1)

# Precision, Recall and F1 Score

In [None]:
# Score the fitted model again and label at the 0.5 cut-off
predictions = results.predict()
threshold = 0.5
predicted_class1 = np.where(predictions < threshold, 0, 1).tolist()

cm1 = confusion_matrix(loans["SeriousDlqin2yrs"], predicted_class1)
print('Confusion Matrix : \n', cm1)

# Unpack the 2x2 matrix row by row: first index = actual class, second = predicted class
act0_pred0, act0_pred1, act1_pred0, act1_pred1 = cm1.ravel()

total1 = cm1.sum()
accuracy1 = (act0_pred0 + act1_pred1) / total1
print ('Accuracy : ', accuracy1)

# Class 0: precision divides by everything predicted 0, recall by everything actually 0
Precision_Class0 = act0_pred0 / (act0_pred0 + act1_pred0)
print('Precision_Class0 : ', Precision_Class0 )

Recall_Class0 = act0_pred0 / (act0_pred0 + act0_pred1)
print('Recall_Class0 : ', Recall_Class0 )

# F1 is the harmonic mean of precision and recall
F1_Class0 = 2 / ((1 / Precision_Class0) + (1 / Recall_Class0))
print('F1_Class0 : ', F1_Class0 )


# Class 1: same three measures using the class-1 column and row
Precision_Class1 = act1_pred1 / (act0_pred1 + act1_pred1)
print('Precision_Class1 : ', Precision_Class1 )

Recall_Class1 = act1_pred1 / (act1_pred0 + act1_pred1)
print('Recall_Class1 : ', Recall_Class1 )

F1_Class1 = 2 / ((1 / Precision_Class1) + (1 / Recall_Class1))
print('F1_Class1 : ', F1_Class1 )



In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support, computed by sklearn
class_report = classification_report(loans["SeriousDlqin2yrs"], predicted_class1)
print(class_report)

# ROC

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import auc, roc_curve

# Compare observed labels with predicted probabilities; roc_curve returns the
# false positive rates, true positive rates and the thresholds they belong to.
actual = loans["SeriousDlqin2yrs"]
roc_points = roc_curve(actual, predictions)
false_positive_rate, true_positive_rate, thresholds = roc_points

for caption, series in zip(
    ("false_positive_rate", "true_positive_rate", "thresholds"), roc_points
):
    print(caption, series)

In [None]:
import matplotlib.pyplot as plt
plt.title('ROC Curve')
# ROC curve: true positive rate plotted against false positive rate
plt.plot(false_positive_rate, true_positive_rate)

# Both rates lie in [0, 1]
plt.xlim([0,1])
plt.ylim([0,1])

# Axis labels. The false positive rate equals 1 - specificity, not
# specificity itself, so the x label says so explicitly.
plt.ylabel('True Positive Rate(Sensitivity)')
plt.xlabel('False Positive Rate(1 - Specificity)')
plt.show()

## AUC

In [None]:
# Area under the ROC curve, integrated over the (FPR, TPR) points computed earlier.
roc_auc = auc(false_positive_rate, true_positive_rate)
# Bare expression so the notebook displays the value
roc_auc

# Handling Class Imbalance

In [None]:
print("Actual Data :", loans.shape)

# Class counts for the target, then the same counts as percentages of the total
freq = loans['SeriousDlqin2yrs'].value_counts()
print(freq)
print((freq / freq.sum()) * 100)

# Split the frame into one sub-frame per target class
target_flag = loans['SeriousDlqin2yrs']
credit_risk_class0 = loans[target_flag == 0]
credit_risk_class1 = loans[target_flag == 1]

print("Class0 Actual :", credit_risk_class0.shape)
print("Class1 Actual  :", credit_risk_class1.shape)


## Undersampling and Oversampling

In [None]:
# Undersample class 0: keep a random half of its rows (drawn without replacement)
n_class0_keep = len(credit_risk_class0) // 2
credit_risk_class0_under = credit_risk_class0.sample(n=n_class0_keep)
print("Class0 Undersample :", credit_risk_class0_under.shape)

# Oversample class 1: draw four times its size, with replacement
n_class1_draw = 4 * len(credit_risk_class1)
credit_risk_class1_over = credit_risk_class1.sample(n=n_class1_draw, replace=True)
print("Class1 Oversample :", credit_risk_class1_over.shape)

# Stack the two samples into the rebalanced frame
credit_risk_balanced = pd.concat([credit_risk_class0_under, credit_risk_class1_over])
print("Final Balannced Data :", credit_risk_balanced.shape)

# Class counts and percentages after rebalancing
freq = credit_risk_balanced['SeriousDlqin2yrs'].value_counts()
print(freq)
print((freq / freq.sum()) * 100)


### Model with balanced data

In [None]:
# Explicit patsy formula: the target on the left, the twelve predictor columns on the right.
model_formula="SeriousDlqin2yrs ~ util_new + age_new + num_30_59_dpd_new + DebtRatio_new_ind + DebtRatio_new + MonthlyIncome_ind + MonthlyIncome_new + Open_Credit_lines_new + num_90_dpd_new + Real_estate_loans_new + num_60_dpd_new + NumberOfDependents_new"

In [None]:
# Refit the same logit specification, this time on the rebalanced sample
risk_model = sm.logit(formula=model_formula, data=credit_risk_balanced)
results = risk_model.fit()
print(results.summary())

#### Updated Sensitivity and Specificity

In [None]:
# Predicted probabilities from the model fitted on the rebalanced data,
# converted to 0/1 labels at the 0.5 cut-off
predictions = results.predict()
threshold=0.5
predicted_class1=[ 0 if x < threshold else 1 for x in predictions]

# sklearn layout: rows = actual class, columns = predicted class, labels sorted (0, 1)
#   cm1[0,0] = TN   cm1[0,1] = FP
#   cm1[1,0] = FN   cm1[1,1] = TP
# Class 1 is treated as the positive class, matching roc_curve's default.
cm1 = confusion_matrix(credit_risk_balanced["SeriousDlqin2yrs"],predicted_class1)
print('Confusion Matrix : \n', cm1)

total1=cm1.sum()
# Accuracy: share of all cases that sit on the diagonal
accuracy1=(cm1[0,0]+cm1[1,1])/total1
print ('Accuracy : ', accuracy1)

# Sensitivity (true positive rate): actual class-1 cases predicted as 1.
# It must be computed from the class-1 row, not the class-0 row.
sensitivity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
print('Sensitivity : ', sensitivity1 )

# Specificity (true negative rate): actual class-0 cases predicted as 0
specificity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
print('Specificity : ', specificity1)

#### Updated Precision, Recall and F1-Score

In [None]:
# Unpack the 2x2 matrix row by row: first index = actual class, second = predicted class
act0_pred0, act0_pred1, act1_pred0, act1_pred1 = cm1.ravel()

accuracy1 = (act0_pred0 + act1_pred1) / total1
print ('Accuracy : ', accuracy1)

# Class 0: precision divides by everything predicted 0, recall by everything actually 0
Precision_Class0 = act0_pred0 / (act0_pred0 + act1_pred0)
print('Precision_Class0 : ', Precision_Class0 )

Recall_Class0 = act0_pred0 / (act0_pred0 + act0_pred1)
print('Recall_Class0 : ', Recall_Class0 )

# F1 is the harmonic mean of precision and recall
F1_Class0 = 2 / ((1 / Precision_Class0) + (1 / Recall_Class0))
print('F1_Class0 : ', F1_Class0 )


# Class 1: same three measures using the class-1 column and row
Precision_Class1 = act1_pred1 / (act0_pred1 + act1_pred1)
print('Precision_Class1 : ', Precision_Class1 )

Recall_Class1 = act1_pred1 / (act1_pred0 + act1_pred1)
print('Recall_Class1 : ', Recall_Class1 )

F1_Class1 = 2 / ((1 / Precision_Class1) + (1 / Recall_Class1))
print('F1_Class1 : ', F1_Class1 )

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support on the rebalanced sample
class_report = classification_report(credit_risk_balanced["SeriousDlqin2yrs"], predicted_class1)
print(class_report)

## SMOTE for Handling Class Imbalance

In [None]:
print("Actual Data :", loans.shape)

# Class counts for the target, then the same counts as percentages of the total
freq = loans['SeriousDlqin2yrs'].value_counts()
print(freq)
print((freq / freq.sum()) * 100)

# Split the frame into one sub-frame per target class
target_flag = loans['SeriousDlqin2yrs']
credit_risk_class0 = loans[target_flag == 0]
credit_risk_class1 = loans[target_flag == 1]

print("Class0 Actual :", credit_risk_class0.shape)
print("Class1 Actual  :", credit_risk_class1.shape)


### Visualizing Imbalanced Data

In [None]:
import matplotlib.pyplot as plt

# Random 5000-row sample of the original data for plotting
loans1 = loans.sample(5000)
fig = plt.figure()
ax1 = fig.add_subplot(111)

var1 = "util_new"
var2 = "MonthlyIncome_new"

# One scatter layer per class: blue for class 0, red for class 1
for flag, colour, legend_text in ((0, 'b', '0 Good'), (1, 'r', '1 Bad')):
    class_rows = loans1[loans1.SeriousDlqin2yrs == flag]
    ax1.scatter(class_rows[var1], class_rows[var2], s=50, c=colour, marker="o", label=legend_text)

# Axis limits span the sampled range of each variable
plt.xlim(min(loans1[var1]), max(loans1[var1]))
plt.ylim(min(loans1[var2]), max(loans1[var2]))
plt.legend(loc='upper left')

plt.show()

## SMOTE Function

In [None]:
print(loans.columns)

# Features: every column from the third position onward; target: the SeriousDlqin2yrs column
X_train = loans.iloc[:, 2:]
y_train = loans["SeriousDlqin2yrs"]

for part in (X_train, y_train):
    print(part.shape)

In [None]:
import collections

from imblearn.over_sampling import SMOTE

# SMOTE with default settings and a fixed seed
smote = SMOTE(random_state=2)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Class counts before and after resampling
for stage, labels in (("Before SMOTE", y_train), ("After SMOTE", y_train_smote)):
    print(stage, collections.Counter(labels))

### Mention the sample size

In [None]:
import collections

from imblearn.over_sampling import SMOTE

# Same seed, but with sampling_strategy=0.6: for a binary target this asks
# for a minority/majority ratio of 0.6 after resampling.
smote = SMOTE(sampling_strategy=0.6, random_state=2)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Class counts before and after resampling
for stage, labels in (("Before SMOTE", y_train), ("After SMOTE", y_train_smote)):
    print(stage, collections.Counter(labels))

In [None]:
# Combine the resampled features and target into one modelling frame.
# Work on a copy: a plain assignment would alias X_train_smote, and adding
# the target column would then put the label into the feature matrix itself.
loans_smote=X_train_smote.copy()
loans_smote["SeriousDlqin2yrs"]=y_train_smote

### Visualizing balanced data after SMOTE

In [None]:
import matplotlib.pyplot as plt

# Random 5000-row sample of the SMOTE-resampled data for plotting
loans1 = loans_smote.sample(5000)
fig = plt.figure()
ax1 = fig.add_subplot(111)

var1 = "util_new"
var2 = "MonthlyIncome_new"

# One scatter layer per class: blue for class 0, red for class 1
for flag, colour, legend_text in ((0, 'b', '0 Good'), (1, 'r', '1 Bad')):
    class_rows = loans1[loans1.SeriousDlqin2yrs == flag]
    ax1.scatter(class_rows[var1], class_rows[var2], s=50, c=colour, marker="o", label=legend_text)

# Axis limits span the sampled range of each variable
plt.xlim(min(loans1[var1]), max(loans1[var1]))
plt.ylim(min(loans1[var2]), max(loans1[var2]))
plt.legend(loc='upper left')

plt.show()

### Model with balanced data - SMOTE

In [None]:
# Explicit patsy formula: the target on the left, the twelve predictor columns on the right.
model_formula="SeriousDlqin2yrs ~ util_new + age_new + num_30_59_dpd_new + DebtRatio_new_ind + DebtRatio_new + MonthlyIncome_ind + MonthlyIncome_new + Open_Credit_lines_new + num_90_dpd_new + Real_estate_loans_new + num_60_dpd_new + NumberOfDependents_new"

In [None]:
# Point credit_risk_balanced at the SMOTE-resampled frame (same object, no copy)
# and refit the logit specification on it.
credit_risk_balanced = loans_smote
risk_model = sm.logit(formula=model_formula, data=credit_risk_balanced)
results = risk_model.fit()
print(results.summary())

#### Updated Sensitivity and Specificity

In [None]:
# Predicted probabilities from the model fitted on the SMOTE-resampled data,
# converted to 0/1 labels at the 0.5 cut-off
predictions = results.predict()
threshold=0.5
predicted_class1=[ 0 if x < threshold else 1 for x in predictions]

# sklearn layout: rows = actual class, columns = predicted class, labels sorted (0, 1)
#   cm1[0,0] = TN   cm1[0,1] = FP
#   cm1[1,0] = FN   cm1[1,1] = TP
# Class 1 is treated as the positive class, matching roc_curve's default.
cm1 = confusion_matrix(credit_risk_balanced["SeriousDlqin2yrs"],predicted_class1)
print('Confusion Matrix : \n', cm1)

total1=cm1.sum()
# Accuracy: share of all cases that sit on the diagonal
accuracy1=(cm1[0,0]+cm1[1,1])/total1
print ('Accuracy : ', accuracy1)

# Sensitivity (true positive rate): actual class-1 cases predicted as 1.
# It must be computed from the class-1 row, not the class-0 row.
sensitivity1 = cm1[1,1]/(cm1[1,0]+cm1[1,1])
print('Sensitivity : ', sensitivity1 )

# Specificity (true negative rate): actual class-0 cases predicted as 0
specificity1 = cm1[0,0]/(cm1[0,0]+cm1[0,1])
print('Specificity : ', specificity1)

#### Updated Precision, Recall and F1-Score

In [None]:
# Unpack the 2x2 matrix row by row: first index = actual class, second = predicted class
act0_pred0, act0_pred1, act1_pred0, act1_pred1 = cm1.ravel()

accuracy1 = (act0_pred0 + act1_pred1) / total1
print ('Accuracy : ', accuracy1)

# Class 0: precision divides by everything predicted 0, recall by everything actually 0
Precision_Class0 = act0_pred0 / (act0_pred0 + act1_pred0)
print('Precision_Class0 : ', Precision_Class0 )

Recall_Class0 = act0_pred0 / (act0_pred0 + act0_pred1)
print('Recall_Class0 : ', Recall_Class0 )

# F1 is the harmonic mean of precision and recall
F1_Class0 = 2 / ((1 / Precision_Class0) + (1 / Recall_Class0))
print('F1_Class0 : ', F1_Class0 )


# Class 1: same three measures using the class-1 column and row
Precision_Class1 = act1_pred1 / (act0_pred1 + act1_pred1)
print('Precision_Class1 : ', Precision_Class1 )

Recall_Class1 = act1_pred1 / (act1_pred0 + act1_pred1)
print('Recall_Class1 : ', Recall_Class1 )

F1_Class1 = 2 / ((1 / Precision_Class1) + (1 / Recall_Class1))
print('F1_Class1 : ', F1_Class1 )

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support on the SMOTE-resampled sample
class_report = classification_report(credit_risk_balanced["SeriousDlqin2yrs"], predicted_class1)
print(class_report)

# Bias-Variance Trade-off


## Model with huge Variance - Overfitted model

In [None]:
# Read the Fiberbits CSV directly from its GitHub URL.
Fiber_df = pd.read_csv(
    "https://raw.githubusercontent.com/venkatareddykonasani/Datasets/master/Fiberbits/Fiberbits_v1.csv"
)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import tree

# Every column except the target is a feature. The column is dropped with the
# `columns=` keyword: the positional form drop(labels, 1) was removed in
# pandas 2.0 and raises TypeError there.
features = list(Fiber_df.drop(columns=['active_cust']).columns)
X = np.array(Fiber_df[features])
y = np.array(Fiber_df['active_cust'])

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size = 0.8 , random_state=333)
# Decision tree with default settings (no depth limit), fitted on the training split
tree_model = tree.DecisionTreeClassifier()
tree_model.fit(X_train,y_train)

# Accuracy on the data the tree was trained on
train_acc= tree_model.score(X_train,y_train)
print("train_accuracy", train_acc)

# Accuracy on the held-out split
test_acc= tree_model.score(X_test,y_test)
print("test_accuracy", test_acc)

## Model with huge Bias - underfitted model

In [None]:
# Depth-1 tree (a single split), fitted on the same training split
tree_model = tree.DecisionTreeClassifier(max_depth=1)
tree_model.fit(X_train, y_train)

# Accuracy on the training split and on the held-out split
train_acc = tree_model.score(X_train, y_train)
test_acc = tree_model.score(X_test, y_test)
print("train_accuracy", train_acc)
print("test_accuracy", test_acc)

## Holdout data Cross validation

In [None]:
# Depth-3 tree, fitted on the training split and checked against the held-out split
tree_model = tree.DecisionTreeClassifier(max_depth=3)
tree_model.fit(X_train, y_train)

# Accuracy on the training split and on the held-out split
train_acc = tree_model.score(X_train, y_train)
test_acc = tree_model.score(X_test, y_test)
print("train_accuracy", train_acc)
print("test_accuracy", test_acc)

## K-Fold cross validation

In [None]:
from sklearn import model_selection
from sklearn.model_selection import KFold

# Full feature matrix and target; cross-validation does its own splitting
X = np.array(Fiber_df[features])
y = np.array(Fiber_df['active_cust'])

# Default decision tree scored on 10 folds (KFold defaults: consecutive, unshuffled)
tree_KF = tree.DecisionTreeClassifier()
kfold_models = KFold(n_splits=10)

scores = model_selection.cross_val_score(tree_KF, X, y, cv=kfold_models)
print(scores)
print("Avg K-Fold Accuracy", scores.mean())