# Phase 3

## Imports and options

In [96]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from operator import itemgetter


from sklearn.model_selection import train_test_split
from sklearn import tree, metrics
from sklearn.tree import _tree


from sklearn.ensemble import RandomForestRegressor 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import GradientBoostingRegressor 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

import warnings
warnings.filterwarnings("ignore")


In [97]:
import pprint
pp = pprint.PrettyPrinter(indent=4)

In [98]:
pd.set_option('display.max_colwidth', None)

In [99]:
plt.ioff()

<matplotlib.pyplot._IoffContext at 0x239ec27f1f0>

In [100]:
sns.set()

In [101]:
#df = pd.read_csv('imputed.csv')

## Impute + 1 hot encode

In [102]:
initial_df = pd.read_csv('HMEQ_Loss.csv')

In [103]:
#break into groups for initial run
numerical_columns = []
categorical_columns = []
other_cols = []

dt=initial_df.dtypes

TARGET_F = 'TARGET_BAD_FLAG'
TARGET_A = 'TARGET_LOSS_AMT'

for i in dt.index:
    if i in ([TARGET_A,TARGET_F]):
        continue
    elif dt[i] in (["object"]) : 
        categorical_columns.append(i)
    elif dt[i] in (["float64"]) : 
        numerical_columns.append(i)
    elif dt[i] in (["int64"]) : 
        numerical_columns.append(i)
    else:
        other_cols.append(i)


In [104]:
#impute the categorical columns
#drop the originals
imputed_df = initial_df.copy()
for i in categorical_columns :
    if imputed_df[i].isna().sum() == 0: 
        continue
    NAME = "IMP_"+i
    imputed_df[NAME] = imputed_df[i]
    imputed_df[NAME] = imputed_df[NAME].fillna(imputed_df[NAME].mode()[0])
    print("variable",i," has this many missing", imputed_df[i].isna().sum())
    print("variable",NAME," has this many missing", imputed_df[NAME].isna().sum() ,'\n')
    g = imputed_df.groupby(NAME)
    print(g[NAME].count())
    print("\n\n")
    imputed_df = imputed_df.drop(i, axis=1)

variable REASON  has this many missing 252
variable IMP_REASON  has this many missing 0 

IMP_REASON
DebtCon    4180
HomeImp    1780
Name: IMP_REASON, dtype: int64



variable JOB  has this many missing 279
variable IMP_JOB  has this many missing 0 

IMP_JOB
Mgr         767
Office      948
Other      2667
ProfExe    1276
Sales       109
Self        193
Name: IMP_JOB, dtype: int64





In [105]:
#impute the numerical columns
#drop the originals
missing_flag_cols = []

for i in numerical_columns:
    if imputed_df[i].isna().sum() == 0:
        continue
    FLAG = 'M_' + i
    IMP = 'IMP_' + i
    imputed_df[FLAG] = imputed_df[i].isna() + 0
    missing_flag_cols.append(FLAG)
    imputed_df[IMP] = imputed_df[i]
    imputed_df.loc[imputed_df[IMP].isna(),IMP] = imputed_df[i].median()
    imputed_df = imputed_df.drop(i, axis=1)

In [106]:
#break into groups again with the dropped cols
imp_numerical_columns = []
imp_categorical_columns = []
imp_other_cols = []

dt=imputed_df.dtypes

TARGET_F = 'TARGET_BAD_FLAG'
TARGET_A = 'TARGET_LOSS_AMT'

for i in dt.index:
    if i in ([TARGET_A,TARGET_F]):
        continue
    elif dt[i] in (missing_flag_cols):
        imp_other_cols.append(i)
    elif dt[i] in (["object"]) : 
        imp_categorical_columns.append(i)
    elif dt[i] in (["float64"]) : 
        imp_numerical_columns.append(i)
    elif dt[i] in (["int64"]) : 
        imp_numerical_columns.append(i)
    else:
        imp_other_cols.append(i)

In [107]:
#now one hot encode the categorical columns
for i in imp_categorical_columns:
    prefix_name = 'z_' + i
    y = pd.get_dummies(imputed_df[i],prefix=prefix_name,drop_first=True)
    imputed_df = pd.concat([imputed_df,y],axis=1)
    imputed_df = imputed_df.drop(i,axis=1)


## Remove the outliers from the dataset


In [108]:
# from the imputed DF, let's keep only things within 3 standard deviations
# we'll use the scipy stats package to get the zscore
outlier_vis = imputed_df.copy()
outlier_vis['zscore'] = stats.zscore(outlier_vis[TARGET_A],nan_policy='omit')

# histogram of zscore
plot1 = sns.histplot(x=outlier_vis['zscore'], data=outlier_vis)
fig = plot1.get_figure()
fig.savefig('zscore_hist.png', bbox_inches='tight')
# get the absolute value of the zscore so we don't have to do > & < to get both upper and lower
# then save that as another df to use later on
# super important here is the or statement in filtering the df
# if you don't use the or part, you end up dropping all the nondefaults
imputed_no_outlier = imputed_df[(np.abs(stats.zscore(imputed_df[TARGET_A],nan_policy='omit')) < 3)\
                                 | ~imputed_df[TARGET_F].isna()].copy()

## Split the Data Into Test & Train

In [109]:
# drop the targets
# keep the predictors
# drop the missing flags as well
# uncomment to use no outlier
X = imputed_no_outlier.copy()
#X = imputed_df.copy()
X = X.drop(TARGET_F,axis=1)
X = X.drop(TARGET_A,axis=1)

for column in imputed_no_outlier.columns.values:
    if "M_" in column:
        X.drop(column,axis=1,inplace=True)

# the target we're predicting
Y = imputed_no_outlier[[TARGET_F,TARGET_A]]

# keep 80% for training
# keep 20% for testing
# use random_state=1 if you need the results to be the same (this sets our seed)
# X_train and Y_train should match up on the index
# X_test and Y_test should match up on the index
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, \
                                                    train_size=0.8, \
                                                    test_size=.2) #random_state=1

print("Training = ", X_train.shape)
print("Testing = ", X_test.shape)

Training =  (4768, 16)
Testing =  (1192, 16)


## Important Stuff

### Functions

In [110]:
# get tree related important variables

def getTreeVars(TREE, varNames) :
    tree_ = TREE.tree_
    varName = [ varNames[i] if i != _tree.TREE_UNDEFINED else "undefined!" for i in tree_.feature ]

    nameSet = set()
    for i in tree_.feature :
        if i != _tree.TREE_UNDEFINED :
            nameSet.add(i)
    nameList = list(nameSet)
    parameter_list = list()
    for i in nameList :
        parameter_list.append(varNames[i])
    return parameter_list

def getEnsembleTreeVars(ENSTREE, varNames) :
    importance = ENSTREE.feature_importances_
    index = np.argsort(importance)
    theList = []
    for i in index :
        imp_val = importance[i]
        if imp_val > np.average(ENSTREE.feature_importances_) :
            v = int(imp_val / np.max(ENSTREE.feature_importances_) * 100)
            theList.append((varNames[i], v))
    theList = sorted(theList,key=itemgetter(1),reverse=True)
    return theList

In [111]:
# get Regression model important variables
def getCoefLogit( MODEL, TRAIN_DATA ) :
    varNames = list( TRAIN_DATA.columns.values )
    coef_dict = {}
    coef_dict["INTERCEPT"] = MODEL.intercept_[0]
    for coef, feat in zip(MODEL.coef_[0],varNames):
        coef_dict[feat] = coef
    return coef_dict

def getCoefLinear( MODEL, TRAIN_DATA ) :
    varNames = list( TRAIN_DATA.columns.values )
    coef_dict = {}
    coef_dict["INTERCEPT"] = MODEL.intercept_
    for coef, feat in zip(MODEL.coef_,varNames):
        coef_dict[feat] = coef
    return coef_dict

### Classes

In [112]:
class AmountModel:
    # on creation
    def __init__(self, name, x_train, x_test, y_train, y_test,depth=5,random_state=511,estimators=100):
        self.name = name
        self.depth = depth
        self.r_state = random_state
        self.n_estimators = estimators
        self.features = list(x_train.columns.values)
        # get no null index
        amount_train_index = ~y_train[TARGET_A].isna()
        amount_test_index = ~y_test[TARGET_A].isna()

        # init amount variables
        self.X_Train = x_train[amount_train_index]
        self.Y_Train = y_train[amount_train_index]
        self.Y_Train = self.Y_Train[TARGET_A]
        self.X_Test = x_test[amount_test_index]
        self.Y_Test = y_test[amount_test_index]
        self.Y_Test = self.Y_Test[TARGET_A]
        print("After initailization, do a .setFlavor(pick a model), then a .fit(), then a .getAccuracy()")

    def setFlavor(self,flavor):
        if flavor == 'Tree':
            self.model = tree.DecisionTreeRegressor(max_depth=self.depth,random_state=self.r_state)
            self.flavor = flavor
        elif flavor == 'Random Forest':
            self.model = RandomForestRegressor(n_estimators = self.n_estimators, max_depth=self.depth,\
                                                random_state=self.r_state)
            self.flavor = flavor
        elif flavor == 'Gradient Boost':
            self.model = GradientBoostingRegressor(n_estimators = self.n_estimators, max_depth=self.depth,\
                                                random_state=self.r_state)
            self.flavor = flavor
        else:
            print('ERROR, please input Tree, Random Forest, Gradient Boost, or  Regression')

    def fit(self):
        self.model = self.model.fit(self.X_Train,self.Y_Train)
        self.Y_Pred_Train = self.model.predict(self.X_Train)
        self.Y_Pred_Test = self.model.predict(self.X_Test)

    def getVars(self):
        if self.flavor == 'Tree':
            self.important_variables = getTreeVars(self.model,self.features)
            #return self.important_variables
        elif self.flavor == 'Gradient Boost' or self.flavor == 'Random Forest':
            holder = getEnsembleTreeVars(self.model,self.features)
            self.important_variables = []
            for i in holder:
                smallhold = i[0]
                self.important_variables.append(smallhold)
            self.import_variables_matrix = holder
            #return self.important_variables
        else: 
            print('error')
            
    def getAccuracy(self):
        self.RMSE_train = math.sqrt(metrics.mean_squared_error(self.Y_Train,self.Y_Pred_Train))
        self.RMSE_test = math.sqrt(metrics.mean_squared_error(self.Y_Test,self.Y_Pred_Test))
        #print(f"{self.name} RMSE Train: ", self.RMSE_train)
        #print(f"{self.name} RMSE Test: ", self.RMSE_test)

In [113]:
class FlagModel:
    # on creation
    def __init__(self, name, x_train, x_test, y_train, y_test,depth=5,random_state=511,estimators=100):
        self.name = name
        self.depth = depth
        self.r_state = random_state
        self.n_estimators = estimators
        self.features = list(x_train.columns.values)
        
        # init flag variables
        self.X_Train = x_train
        self.Y_Train = y_train[TARGET_F]
        self.X_Test = x_test
        self.Y_Test = y_test[TARGET_F]
        print("After initailization, do a .setFlavor(pick a model), then a .fit(), then a .getAccuracy() and a .makeROCplot()")

    def setFlavor(self,flavor):
        if flavor == 'Tree':
            self.model = tree.DecisionTreeClassifier(max_depth=self.depth,random_state=self.r_state)
            self.flavor = flavor
        elif flavor == 'Random Forest':
            self.model = RandomForestClassifier(n_estimators = self.n_estimators, max_depth=self.depth,\
                                                random_state=self.r_state)
            self.flavor = flavor
        elif flavor == 'Gradient Boost':
            self.model = GradientBoostingClassifier(n_estimators = self.n_estimators, max_depth=self.depth,\
                                                random_state=self.r_state)
            self.flavor = flavor
        else:
            print('ERROR, please input Tree, Random Forest, or Gradient Boost')


    def fit(self):
        self.model = self.model.fit(self.X_Train,self.Y_Train)
        self.Y_Pred_Train = self.model.predict(self.X_Train)
        self.Y_Pred_Test = self.model.predict(self.X_Test)

    def getVars(self):
        if self.flavor == 'Tree':
            self.important_variables = getTreeVars(self.model,self.features)
            return self.important_variables
        elif self.flavor == 'Gradient Boost' or self.flavor == 'Random Forest':
            holder = getEnsembleTreeVars(self.model,self.features)
            self.important_variables = []
            for i in holder:
                smallhold = i[0]
                self.important_variables.append(smallhold)
            self.import_variables_matrix = holder
            return self.important_variables
        else: 
            print('error')
    

    def getAccuracy(self):
        self.train_acc = metrics.accuracy_score(self.Y_Train, self.Y_Pred_Train)
        self.test_acc = metrics.accuracy_score(self.Y_Test, self.Y_Pred_Test)
        #print("Accuracy Train: ", self.train_acc)
        #print("Accuracy Test: ", self.test_acc)

    def makeROCplot(self,filename):
        self.train_probs = self.model.predict_proba(self.X_Train)
        self.test_probs = self.model.predict_proba(self.X_Test)
        self.train_probs = self.train_probs[:,1]
        self.test_probs = self.test_probs[:,1]
        self.fpr_train, self.tpr_train, self.train_threshold = metrics.roc_curve(self.Y_Train, self.train_probs)
        self.fpr_test, self.tpr_test, self.test_threshold = metrics.roc_curve(self.Y_Test, self.test_probs)
        self.roc_auc_train = metrics.auc(self.fpr_train, self.tpr_train)
        self.roc_auc_test = metrics.auc(self.fpr_test, self.tpr_test)
        self.roc_plot = plt.figure(figsize=(5,5))
        plt.title(self.name + " ROC Curve")
        plt.plot(self.fpr_train, self.tpr_train,'b',label = 'AUC Train = %0.2f' % self.roc_auc_train)
        plt.plot(self.fpr_test, self.tpr_test,'r',label = 'AUC Test = %0.2f' % self.roc_auc_test)
        plt.legend(loc = 'lower right')
        plt.plot([0,1],[0,1],'k--')
        plt.xlim([0,1])
        plt.ylim([0,1])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.savefig(filename+'.png', bbox_inches='tight')
        #plt.show()

    

        


    

In [114]:
class StepWiseModel():
   def __init__(self, name, x_train, x_test, y_train, y_test,flavor,
                solver='newton-cg',maxiter=100,kmax=1,forward=True,floating=False,scoring=None,cv=3):
      self.name = name
      self.solver = solver
      self.maxiterations = maxiter
      self.features = list(x_train.columns.values)
      self.flavor = flavor
      self.kmax = kmax
      self.forward=forward
      self.floating=floating
      self.scoring=scoring
      self.cv=cv

      if flavor == "Logistic":
         self.X_Train = x_train
         self.Y_Train = y_train[TARGET_F]
         self.X_Test = x_test
         self.Y_Test = y_test[TARGET_F]
      else:
         # get no null index
         amount_train_index = ~y_train[TARGET_A].isna()
         amount_test_index = ~y_test[TARGET_A].isna()

         # init amount variables
         self.X_Train = x_train[amount_train_index]
         self.Y_Train = y_train[amount_train_index]
         self.Y_Train = self.Y_Train[TARGET_A]
         self.X_Test = x_test[amount_test_index]
         self.Y_Test = y_test[amount_test_index]
         self.Y_Test = self.Y_Test[TARGET_A]

   def fit(self):
      if self.flavor == "Logistic":
         self.model = SFS(LogisticRegression(solver=self.solver,max_iter=self.maxiterations),
                          k_features = (1,self.kmax),
                          forward=self.forward,
                          floating=self.floating,
                          cv=self.cv)
         self.model.fit(self.X_Train.values,self.Y_Train.values)

      elif self.flavor == "Linear":
         self.model = SFS(LinearRegression(),
                          k_features = (1,self.kmax),
                          forward=self.forward,
                          floating=self.floating,
                          scoring=self.scoring,
                          cv=self.cv)
         self.model.fit(self.X_Train.values,self.Y_Train.values)

      else:
         print('error')

   def plot(self,filename):
      self.fig = plot_sfs(self.model.get_metric_dict(), kind = None)
      plt.title(self.name + ' Sequential Forward Selection (w. StdErr)')
      plt.grid()
      plt.savefig(filename+'.png', bbox_inches='tight')
      #plt.show()
   

   def getVars(self):
      dfm = pd.DataFrame.from_dict(self.model.get_metric_dict()).T
      dfm = dfm[['feature_names', 'avg_score']]
      dfm['avg_score'] = dfm['avg_score'].astype(float)
      self.dfm = dfm
      maxIndex = dfm['avg_score'].argmax()
      self.accuracy = maxIndex
      stepVars = dfm.iloc[maxIndex,]
      stepVars = stepVars['feature_names']
      self.important_variables_matrix = stepVars
      holder = []
      for i in stepVars :
         index = int(i)
         try :
            theName = self.features[index]
            holder.append(theName)
         except :
            pass
      self.important_variables = holder

In [115]:
class LogModel(FlagModel):
    def __init__(self,name, x_train, x_test, y_train, y_test, solver='newton-cg',maxiter=1000):
        super().__init__(name, x_train, x_test, y_train, y_test,depth=5,random_state=511,estimators=100)
        self.solver = solver
        self.maxiterations = maxiter
        self.flavor = 'Logistic'
    
    def fit(self):
        self.model = LogisticRegression(solver = self.solver, max_iter = self.maxiterations)
        self.model = self.model.fit(self.X_Train,self.Y_Train)
        self.Y_Pred_Train = self.model.predict(self.X_Train)
        self.Y_Pred_Test = self.model.predict(self.X_Test)
    
    def getVars(self):
        self.important_variables = getCoefLogit(self.model,self.X_Train)
        #return self.important_variables
    
class LinModel(AmountModel):
    def __init__(self,name, x_train, x_test, y_train, y_test, estimators=100,solver='newton-cg',maxiter=1000):
        super().__init__(name, x_train, x_test, y_train, y_test,depth=5,random_state=511)
        self.n_estimators = estimators
        self.solver = solver
        self.maxiterations = maxiter
        self.flavor = 'Logistic'
    
    def fit(self):
        self.model = LinearRegression()
        self.model = self.model.fit(self.X_Train,self.Y_Train)
        self.Y_Pred_Train = self.model.predict(self.X_Train)
        self.Y_Pred_Test = self.model.predict(self.X_Test)
    
    def getVars(self):
        self.important_variables = getCoefLinear(self.model,self.X_Train)
        return self.important_variables

## Flag Models

### Simple Tree

In [116]:
tree_flag_01 = FlagModel('Tree_Flag_01',X_train, X_test, Y_train, Y_test)
tree_flag_01.setFlavor('Tree')
tree_flag_01.fit()
tree_flag_01.getAccuracy()
tree_flag_01.makeROCplot(tree_flag_01.name)
tree_flag_01.getVars()

After initailization, do a .setFlavor(pick a model), then a .fit(), then a .getAccuracy() and a .makeROCplot()


['IMP_VALUE',
 'IMP_DEROG',
 'IMP_DELINQ',
 'IMP_CLAGE',
 'IMP_DEBTINC',
 'z_IMP_JOB_Office']

### Random Forest

In [117]:
rf_flag_01 = FlagModel('RF_Flag_01',X_train, X_test, Y_train, Y_test)
rf_flag_01.setFlavor('Random Forest')
rf_flag_01.fit()
rf_flag_01.getAccuracy()
rf_flag_01.makeROCplot(rf_flag_01.name)
rf_flag_01.getVars()

After initailization, do a .setFlavor(pick a model), then a .fit(), then a .getAccuracy() and a .makeROCplot()


['IMP_DEBTINC', 'IMP_DELINQ', 'IMP_DEROG', 'IMP_CLAGE']

### Gradient Boost

In [118]:
gb_flag_01 = FlagModel('GB_Flag_01',X_train, X_test, Y_train, Y_test)
gb_flag_01.setFlavor('Gradient Boost')
gb_flag_01.fit()
gb_flag_01.getAccuracy()
gb_flag_01.makeROCplot(gb_flag_01.name)
gb_flag_01.getVars()

After initailization, do a .setFlavor(pick a model), then a .fit(), then a .getAccuracy() and a .makeROCplot()


['IMP_DEBTINC', 'IMP_DELINQ', 'IMP_CLAGE']

### Stepwise

In [119]:
sw_flag_01 = StepWiseModel('SW_Flag_01',X_train, X_test, Y_train, Y_test,'Logistic',kmax=5)
sw_flag_01.fit()
sw_flag_01.plot(sw_flag_01.name)
sw_flag_01.getVars()

### Log Models of each flag model

#### Log model - all

In [120]:
log_flag_all = LogModel('Log_Flag_AllVars',X_train, X_test, Y_train, Y_test)
log_flag_all.fit()
log_flag_all.getAccuracy()
log_flag_all.makeROCplot(log_flag_all.name)
log_flag_all.getVars()

After initailization, do a .setFlavor(pick a model), then a .fit(), then a .getAccuracy() and a .makeROCplot()


#### Log model - tree

In [121]:
log_flag_tree = LogModel('Log_Flag_Tree',X_train[tree_flag_01.important_variables]
                         , X_test[tree_flag_01.important_variables], 
                         Y_train, 
                         Y_test)
log_flag_tree.fit()
log_flag_tree.getAccuracy()
log_flag_tree.makeROCplot(log_flag_tree.name)
log_flag_tree.getVars()

After initailization, do a .setFlavor(pick a model), then a .fit(), then a .getAccuracy() and a .makeROCplot()


#### Log model - rf

In [122]:
log_flag_rf = LogModel('Log_Flag_RF',X_train[rf_flag_01.important_variables]
                         , X_test[rf_flag_01.important_variables], 
                         Y_train, 
                         Y_test)
log_flag_rf.fit()
log_flag_rf.getAccuracy()
log_flag_rf.makeROCplot(log_flag_rf.name)
log_flag_rf.getVars()

After initailization, do a .setFlavor(pick a model), then a .fit(), then a .getAccuracy() and a .makeROCplot()


#### Log model - gb

In [123]:
log_flag_gb = LogModel('Log_Flag_GB',X_train[gb_flag_01.important_variables]
                         , X_test[gb_flag_01.important_variables], 
                         Y_train, 
                         Y_test)
log_flag_gb.fit()
log_flag_gb.getAccuracy()
log_flag_gb.makeROCplot(log_flag_gb.name)
log_flag_gb.getVars()

After initailization, do a .setFlavor(pick a model), then a .fit(), then a .getAccuracy() and a .makeROCplot()


#### Log model - sws

In [124]:
log_flag_sw = LogModel('Log_Flag_SW',X_train[sw_flag_01.important_variables]
                         , X_test[sw_flag_01.important_variables], 
                         Y_train, 
                         Y_test)
log_flag_sw.fit()
log_flag_sw.getAccuracy()
log_flag_sw.makeROCplot(log_flag_sw.name)
log_flag_sw.getVars()

After initailization, do a .setFlavor(pick a model), then a .fit(), then a .getAccuracy() and a .makeROCplot()


## Amount Models

### Tree Model

In [125]:
tree_amount_01 = AmountModel('Tree_Amount_01',X_train, X_test, Y_train, Y_test)
tree_amount_01.setFlavor('Tree')
tree_amount_01.fit()
tree_amount_01.getAccuracy()
tree_amount_01.getVars()

After initailization, do a .setFlavor(pick a model), then a .fit(), then a .getAccuracy()


### Random Forest

In [126]:
rf_amount_01 = AmountModel('RF_Amount_01',X_train, X_test, Y_train, Y_test)
rf_amount_01.setFlavor('Random Forest')
rf_amount_01.fit()
rf_amount_01.getAccuracy()
rf_amount_01.getVars()

After initailization, do a .setFlavor(pick a model), then a .fit(), then a .getAccuracy()


### Gradient Boost

In [127]:
gb_amount_01 = FlagModel('GB_Amount_01',X_train, X_test, Y_train, Y_test)
gb_amount_01.setFlavor('Gradient Boost')
gb_amount_01.fit()
gb_amount_01.getAccuracy()
gb_amount_01.getVars()

After initailization, do a .setFlavor(pick a model), then a .fit(), then a .getAccuracy() and a .makeROCplot()


['IMP_DEBTINC', 'IMP_DELINQ', 'IMP_CLAGE']

### Stepwise

In [128]:
sw_amount_01 = StepWiseModel('SW_Amount_01',X_train, X_test, Y_train, Y_test,'Linear',kmax=5)
sw_amount_01.fit()
sw_flag_01.plot(sw_flag_01.name)
sw_amount_01.getVars()

### Lin Models of each flag model

#### Lin model - all

In [129]:
lin_amount_all = LinModel('Lin_Amount_AllVars',X_train, X_test, Y_train, Y_test)
lin_amount_all.fit()
lin_amount_all.getAccuracy()
lin_amount_all.getVars()

After initailization, do a .setFlavor(pick a model), then a .fit(), then a .getAccuracy()


{'INTERCEPT': -3259.8582061653724,
 'LOAN': 0.7837434742020188,
 'IMP_MORTDUE': 0.0054417346392702185,
 'IMP_VALUE': -0.01301940120913818,
 'IMP_YOJ': -72.1621920600278,
 'IMP_DEROG': 362.950968868329,
 'IMP_DELINQ': 792.0840350651588,
 'IMP_CLAGE': -19.47233906682193,
 'IMP_NINQ': 58.066167480768016,
 'IMP_CLNO': 221.06924789313203,
 'IMP_DEBTINC': 60.88106182008491,
 'z_IMP_REASON_HomeImp': -1344.5442529307306,
 'z_IMP_JOB_Office': 86.30631311595481,
 'z_IMP_JOB_Other': 117.76173355799428,
 'z_IMP_JOB_ProfExe': -54.5256611671593,
 'z_IMP_JOB_Sales': 1224.0655807096866,
 'z_IMP_JOB_Self': 2826.7032634130987}

#### Lin model - tree

In [130]:
lin_amount_tree = LinModel('Lin_Amount_Tree',X_train[tree_amount_01.important_variables]
                         , X_test[tree_amount_01.important_variables], 
                         Y_train, 
                         Y_test)
lin_amount_tree.fit()
lin_amount_tree.getAccuracy()
lin_amount_tree.getVars()

After initailization, do a .setFlavor(pick a model), then a .fit(), then a .getAccuracy()


{'INTERCEPT': -4644.549790490546,
 'LOAN': 0.7782955737327324,
 'IMP_MORTDUE': -0.004857612121459846,
 'IMP_DEROG': 359.571671091941,
 'IMP_DELINQ': 785.3707195917693,
 'IMP_CLAGE': -20.902866277351308,
 'IMP_CLNO': 230.56385214370056,
 'IMP_DEBTINC': 67.50737611427107}

#### Lin model - rf

In [131]:
lin_amount_rf = LinModel('Lin_Amount_RF',X_train[rf_amount_01.important_variables]
                         , X_test[rf_amount_01.important_variables], 
                         Y_train, 
                         Y_test)
lin_amount_rf.fit()
lin_amount_rf.getAccuracy()
lin_amount_rf.getVars()

After initailization, do a .setFlavor(pick a model), then a .fit(), then a .getAccuracy()


{'INTERCEPT': -4540.168822589738,
 'LOAN': 0.7679350254145798,
 'IMP_CLNO': 237.448782016605}

#### Lin model - gb

In [132]:
lin_amount_gb = LinModel('Lin_Amount_GB',X_train[gb_amount_01.important_variables]
                         , X_test[gb_amount_01.important_variables], 
                         Y_train, 
                         Y_test)
lin_amount_gb.fit()
lin_amount_gb.getAccuracy()
lin_amount_gb.getVars()

After initailization, do a .setFlavor(pick a model), then a .fit(), then a .getAccuracy()


{'INTERCEPT': 7114.595460192209,
 'IMP_DEBTINC': 160.86423596443953,
 'IMP_DELINQ': 1424.985358169077,
 'IMP_CLAGE': -5.835161893793384}

#### Lin model - sws

In [133]:
lin_amount_sw = LinModel('Lin_Amount_SW',X_train[sw_amount_01.important_variables]
                         , X_test[sw_amount_01.important_variables], 
                         Y_train, 
                         Y_test)
lin_amount_sw.fit()
lin_amount_sw.getAccuracy()
lin_amount_sw.getVars()

After initailization, do a .setFlavor(pick a model), then a .fit(), then a .getAccuracy()


{'INTERCEPT': -4539.761858275338,
 'LOAN': 0.7697944599628972,
 'IMP_DELINQ': 817.8555155417754,
 'IMP_CLAGE': -21.399209026455537,
 'IMP_CLNO': 234.36867532201543,
 'IMP_DEBTINC': 64.98268425488469}

## Comparions

### Log Models

In [134]:
# list of all log models
loglist = [log_flag_all,log_flag_sw,log_flag_gb,log_flag_rf,log_flag_tree]

<table><tr>
<td><img src="Log_Flag_AllVars.png" alt="Drawing" style="width: 400px;"/></td>
<td><img src="Log_Flag_Tree.png" alt="Drawing" style="width: 400px;"/></td>
<td><img src="Log_Flag_RF.png" alt="Drawing" style="width: 400px;"/></td>
</tr>
<tr>
<td><img src="Log_Flag_GB.png" alt="Drawing" style="width: 400px;"/></td>
<td><img src="Log_Flag_SW.png" alt="Drawing" style="width: 400px;"/></td>
</tr>
</table>

In [135]:
log_variables = {}
for model in loglist:
    holder = []
    for i in model.important_variables:
        smallholder = i
        holder.append(smallholder)
    log_variables[model.name] = holder[1:]
log_var_df = pd.DataFrame(log_variables.items(),columns=['Model','Variables'])
log_var_df

Unnamed: 0,Model,Variables
0,Log_Flag_AllVars,"[LOAN, IMP_MORTDUE, IMP_VALUE, IMP_YOJ, IMP_DEROG, IMP_DELINQ, IMP_CLAGE, IMP_NINQ, IMP_CLNO, IMP_DEBTINC, z_IMP_REASON_HomeImp, z_IMP_JOB_Office, z_IMP_JOB_Other, z_IMP_JOB_ProfExe, z_IMP_JOB_Sales, z_IMP_JOB_Self]"
1,Log_Flag_SW,"[IMP_DEROG, IMP_DELINQ, IMP_CLAGE, IMP_DEBTINC, z_IMP_JOB_Office]"
2,Log_Flag_GB,"[IMP_DEBTINC, IMP_DELINQ, IMP_CLAGE]"
3,Log_Flag_RF,"[IMP_DEBTINC, IMP_DELINQ, IMP_DEROG, IMP_CLAGE]"
4,Log_Flag_Tree,"[IMP_VALUE, IMP_DEROG, IMP_DELINQ, IMP_CLAGE, IMP_DEBTINC, z_IMP_JOB_Office]"


In [136]:
log_accuracies = {}
for model in loglist:
    log_accuracies[model.name] = model.test_acc
log_acc_df = pd.DataFrame(log_accuracies.items(),columns=['Model','Accuracy'])
log_acc_df

Unnamed: 0,Model,Accuracy
0,Log_Flag_AllVars,0.842282
1,Log_Flag_SW,0.838926
2,Log_Flag_GB,0.832215
3,Log_Flag_RF,0.837248
4,Log_Flag_Tree,0.838926


In [137]:
log_combined_df = pd.merge(left=log_acc_df,right=log_var_df,how="left",on="Model")
log_combined_df

Unnamed: 0,Model,Accuracy,Variables
0,Log_Flag_AllVars,0.842282,"[LOAN, IMP_MORTDUE, IMP_VALUE, IMP_YOJ, IMP_DEROG, IMP_DELINQ, IMP_CLAGE, IMP_NINQ, IMP_CLNO, IMP_DEBTINC, z_IMP_REASON_HomeImp, z_IMP_JOB_Office, z_IMP_JOB_Other, z_IMP_JOB_ProfExe, z_IMP_JOB_Sales, z_IMP_JOB_Self]"
1,Log_Flag_SW,0.838926,"[IMP_DEROG, IMP_DELINQ, IMP_CLAGE, IMP_DEBTINC, z_IMP_JOB_Office]"
2,Log_Flag_GB,0.832215,"[IMP_DEBTINC, IMP_DELINQ, IMP_CLAGE]"
3,Log_Flag_RF,0.837248,"[IMP_DEBTINC, IMP_DELINQ, IMP_DEROG, IMP_CLAGE]"
4,Log_Flag_Tree,0.838926,"[IMP_VALUE, IMP_DEROG, IMP_DELINQ, IMP_CLAGE, IMP_DEBTINC, z_IMP_JOB_Office]"


In [138]:
#coefficient matrix
somedict = {}
for model in loglist:
    somedict[model.name] = model.important_variables
pp.pprint(somedict)

{   'Log_Flag_AllVars': {   'IMP_CLAGE': -0.005772753472952367,
                            'IMP_CLNO': -0.011546709721235596,
                            'IMP_DEBTINC': 0.0694353523190654,
                            'IMP_DELINQ': 0.7415194837597069,
                            'IMP_DEROG': 0.5855009192991077,
                            'IMP_MORTDUE': -4.11337379772705e-06,
                            'IMP_NINQ': 0.15898364016205252,
                            'IMP_VALUE': 3.6395200142305243e-06,
                            'IMP_YOJ': -0.010062635169284169,
                            'INTERCEPT': -3.066241893413238,
                            'LOAN': -1.9465672371148186e-05,
                            'z_IMP_JOB_Office': -0.5749735974105891,
                            'z_IMP_JOB_Other': 0.10811691182781577,
                            'z_IMP_JOB_ProfExe': 0.026799499827995208,
                            'z_IMP_JOB_Sales': 0.6760434094599762,
                            'z_IMP_J

### Lin Models

In [139]:
# list of all lin models
linlist = [lin_amount_all,lin_amount_sw,lin_amount_gb,lin_amount_rf,lin_amount_tree]

In [140]:
#for RMSE
lindict = {}
for lin in linlist:
    lindict[lin.name] = lin.RMSE_test

pp.pprint(lindict)

{   'Lin_Amount_AllVars': 4434.363962005251,
    'Lin_Amount_GB': 9480.720904232108,
    'Lin_Amount_RF': 5035.154398354714,
    'Lin_Amount_SW': 4470.0925767736035,
    'Lin_Amount_Tree': 4492.842275516375}


In [141]:
lincoefdict = {}
for model in linlist:
    lincoefdict[model.name] = model.important_variables
pp.pprint(lincoefdict)

{   'Lin_Amount_AllVars': {   'IMP_CLAGE': -19.47233906682193,
                              'IMP_CLNO': 221.06924789313203,
                              'IMP_DEBTINC': 60.88106182008491,
                              'IMP_DELINQ': 792.0840350651588,
                              'IMP_DEROG': 362.950968868329,
                              'IMP_MORTDUE': 0.0054417346392702185,
                              'IMP_NINQ': 58.066167480768016,
                              'IMP_VALUE': -0.01301940120913818,
                              'IMP_YOJ': -72.1621920600278,
                              'INTERCEPT': -3259.8582061653724,
                              'LOAN': 0.7837434742020188,
                              'z_IMP_JOB_Office': 86.30631311595481,
                              'z_IMP_JOB_Other': 117.76173355799428,
                              'z_IMP_JOB_ProfExe': -54.5256611671593,
                              'z_IMP_JOB_Sales': 1224.0655807096866,
                              'z_

# Bingo Bonus

## Bulk Simulation

### Gradient Boost Randomness

In [142]:
rf_amount_01.RMSE_test

3653.5801500232105