In [1]:
#importing libraries
import pandas as pd
import numpy as np
import pickle

# Let wide frames display up to 300 columns before pandas truncates them.
pd.options.display.max_columns = 300

In [2]:
#reading in data
# The column pandas labels 'Unnamed: 0' is used as the row index.
df = pd.read_csv(
    'data/testing_data.csv',
    index_col='Unnamed: 0',
)
print(df.shape)
df.head()

(7501, 23)


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,X14,X15,X16,X17,X18,X19,X20,X21,X22,X23
5501,180000,2,2,1,44,0,0,0,0,0,0,161186,167080,170788,174764,162667,166953,10000,8000,7000,6000,7000,10000
28857,130000,2,2,1,48,-2,-2,-2,-2,-2,-2,0,1240,1487,1279,749,440,1240,1487,1279,749,440,849
11272,60000,2,1,1,43,-1,3,2,0,0,-1,495,330,495,330,165,340,0,330,0,0,340,0
8206,240000,1,1,1,42,0,0,0,0,0,0,72339,91045,91027,51508,51127,0,20000,2213,1030,1023,6790,10893
6362,100000,2,2,1,28,2,0,0,0,0,2,73073,74739,70844,63924,57326,59654,3500,3003,1910,2400,3300,0


In [3]:
#bringing in the model
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# load pickles that come from a trusted source.
# The context manager closes the file even if unpickling raises.
with open("model.pickle", 'rb') as infile:
    model = pickle.load(infile)

In [4]:
# Show the hyperparameters stored on the unpickled model.
model.get_params()

{'C': 0.01,
 'class_weight': 'balanced',
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'sag',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [5]:
#bringing in the saved variables
# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# load pickles that come from a trusted source.
# The context manager closes the file even if unpickling raises.
with open("saved_variables.pickle", 'rb') as infile:
    saved_variables = pickle.load(infile)

In [6]:
# Display the unpickled object to inspect what it contains.
saved_variables

[array(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0',
        'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1',
        'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
        'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5',
        'PAY_AMT6', 'default payment next month'], dtype=object),
 816379.3359375,
 StandardScaler()]

In [7]:
#assigning the saved variables to new names
columns, five_sd, scaler = (
    saved_variables[0],  # column names; the last entry is the target column
    saved_variables[1],  # threshold used further down to cap LIMIT_BAL (name suggests 5 standard deviations - TODO confirm)
    saved_variables[2],  # the scaler
)

In [8]:
#checking for null values
# .mean() of the boolean mask gives the fraction of nulls per column;
# .head(2) means only the first two columns are displayed here.
df.isna().mean().head(2)

X1    0.0
X2    0.0
dtype: float64

In [9]:
#Changing the column names
# The saved name list ends with the DEFAULT (target) column, which is left off here.
df.columns = columns[:-1]

#renaming some of the other columns for clearer descriptions:
df = df.rename(columns={
    # repayment status per month
    'PAY_0': 'PAYSTATUS_SEP',
    'PAY_2': 'PAYSTATUS_AUG',
    'PAY_3': 'PAYSTATUS_JUL',
    'PAY_4': 'PAYSTATUS_JUN',
    'PAY_5': 'PAYSTATUS_MAY',
    'PAY_6': 'PAYSTATUS_APR',
    # bill amount per month
    'BILL_AMT1': 'BILLAMT_SEP',
    'BILL_AMT2': 'BILLAMT_AUG',
    'BILL_AMT3': 'BILLAMT_JUL',
    'BILL_AMT4': 'BILLAMT_JUN',
    'BILL_AMT5': 'BILLAMT_MAY',
    'BILL_AMT6': 'BILLAMT_APR',
    # payment amount per month
    'PAY_AMT1': 'PAYAMT_SEP',
    'PAY_AMT2': 'PAYAMT_AUG',
    'PAY_AMT3': 'PAYAMT_JUL',
    'PAY_AMT4': 'PAYAMT_JUN',
    'PAY_AMT5': 'PAYAMT_MAY',
    'PAY_AMT6': 'PAYAMT_APR',
})

In [10]:
# Peek at the dtypes of the first two columns.
df.dtypes.head(2)

LIMIT_BAL    int64
SEX          int64
dtype: object

In [11]:
#converting continuous to their appropriate dtypes
# dict.fromkeys keeps each column name listed exactly once with a shared dtype.
df = df.astype(dict.fromkeys(
    ['LIMIT_BAL',
     'BILLAMT_SEP', 'BILLAMT_AUG', 'BILLAMT_JUL',
     'BILLAMT_JUN', 'BILLAMT_MAY', 'BILLAMT_APR',
     'PAYAMT_SEP', 'PAYAMT_AUG', 'PAYAMT_JUL',
     'PAYAMT_JUN', 'PAYAMT_MAY', 'PAYAMT_APR'],
    np.float32,
))
#converting the categorical ones
# All six PAYSTATUS_* months must appear here: a repeated 'PAYSTATUS_JUL' key
# is silently collapsed by the dict literal and would leave PAYSTATUS_MAY as int64.
# NOTE(review): confirm the training notebook cast PAYSTATUS_MAY the same way,
# so the features built from these dtypes match what the model was fitted on.
df = df.astype(dict.fromkeys(
    ['SEX', 'EDUCATION', 'MARRIAGE', 'AGE',
     'PAYSTATUS_SEP', 'PAYSTATUS_AUG', 'PAYSTATUS_JUL',
     'PAYSTATUS_JUN', 'PAYSTATUS_MAY', 'PAYSTATUS_APR'],
    object,
))
              

In [12]:
#CLEANING THE DATA IN THE COLUMNS

#df['SEX']
# Code 1 stays 1; every other value becomes 0.
df['SEX'] = np.where(df['SEX'].eq(1), 1, 0)

#df['EDUCATION']
# Codes 1-4 are kept as they are; any other code falls back to 2.
choices = [1, 2, 3, 4]
conditions = [df['EDUCATION'].eq(level) for level in choices]
df['EDUCATION'] = np.select(conditions, choices, default=2)

#df['MARRIAGE']
# Code 0 is recoded to 2; all other codes are left untouched.
df['MARRIAGE'] = np.where(df['MARRIAGE'].eq(0), 2, df['MARRIAGE'])

#df['AGE']
# Back to integers so the range-based bucketing below can be applied.
df['AGE'] = df['AGE'].astype(int)

def age_groups(x):
    """Map an age to its bucket label.

    Returns '2' when x is in range(20, 30), '3' when x is in range(30, 40),
    and '4' for every other value (this includes ages below 20).
    """
    buckets = (
        ('2', range(20, 30)),
        ('3', range(30, 40)),
    )
    for label, span in buckets:
        if x in span:
            return label
    return '4'

# Replace each age with the bucket label returned by age_groups.
df['AGE'] = df['AGE'].apply(age_groups)

#df['LIM_BAL']
# Values above five_sd are replaced by five_sd; everything else is kept.
df['LIMIT_BAL'] = np.where(df['LIMIT_BAL'].gt(five_sd), five_sd, df['LIMIT_BAL'])

#PAYSTATS
def impute(cols, frame=None):
    """Replace every -2 with 0 in the given columns.

    Parameters
    ----------
    cols : iterable of column labels
        Columns to rewrite.
    frame : pandas.DataFrame, optional
        Frame to modify in place. When omitted, the notebook-level ``df``
        is used.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was modified.
    """
    if frame is None:
        frame = df  # fall back to the notebook-level DataFrame
    for col in cols:
        # Only -2 is rewritten; every other value is left unchanged.
        frame[col] = np.where(frame[col] == -2, 0, frame[col])
    return frame

# Positions 5-10 are the six PAYSTATUS_* columns (SEP through APR), given the
# column order assigned to df.columns earlier in the notebook.
df = impute(list(df.columns[5:11]))