# Week 8 Bank Data Case Study

## Load Packages

In [1]:
import pandas as pd
import numpy as np

## Read the Data

In this section we read in the data. 

In [2]:
# Load the case-study CSV (the path is relative to the notebook's working
# directory) and preview the first rows.
df = pd.read_csv("../../../case_8.csv")
df.head()

Unnamed: 0,ID,target,v1,v2,v3,v4,v5,v6,v7,v8,...,v122,v123,v124,v125,v126,v127,v128,v129,v130,v131
0,3,1,1.335739,8.727474,C,3.921026,7.915266,2.599278,3.176895,0.012941,...,8.0,1.98978,0.035754,AU,1.804126,3.113719,2.024285,0,0.636365,2.857144
1,4,1,1.630686,7.464411,C,4.145098,9.191265,2.436402,2.483921,2.30163,...,6.822439,3.549938,0.598896,AF,1.672658,3.239542,1.957825,0,1.925763,1.739389
2,5,1,0.943877,5.310079,C,4.410969,5.326159,3.979592,3.928571,0.019645,...,9.333333,2.477596,0.013452,AE,1.773709,3.922193,1.120468,2,0.883118,1.176472
3,6,1,0.797415,8.304757,C,4.22593,11.627438,2.0977,1.987549,0.171947,...,7.018256,1.812795,0.002267,CJ,1.41523,2.954381,1.990847,1,1.677108,1.034483
4,8,1,1.630686,7.464411,C,4.145098,8.742359,2.436402,2.483921,1.496569,...,6.822439,3.549938,0.919812,Z,1.672658,3.239542,2.030373,0,1.925763,1.739389


No obvious issues such as parsing errors or missing values. Let's see what data types we have.

In [4]:
# verbose=True lists every column instead of a truncated summary.
df.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114321 entries, 0 to 114320
Data columns (total 133 columns):
 #   Column  Dtype  
---  ------  -----  
 0   ID      int64  
 1   target  int64  
 2   v1      float64
 3   v2      float64
 4   v3      object 
 5   v4      float64
 6   v5      float64
 7   v6      float64
 8   v7      float64
 9   v8      float64
 10  v9      float64
 11  v10     float64
 12  v11     float64
 13  v12     float64
 14  v13     float64
 15  v14     float64
 16  v15     float64
 17  v16     float64
 18  v17     float64
 19  v18     float64
 20  v19     float64
 21  v20     float64
 22  v21     float64
 23  v22     object 
 24  v23     float64
 25  v24     object 
 26  v25     float64
 27  v26     float64
 28  v27     float64
 29  v28     float64
 30  v29     float64
 31  v30     object 
 32  v31     object 
 33  v32     float64
 34  v33     float64
 35  v34     float64
 36  v35     float64
 37  v36     float64
 38  v37     float64
 39  v38     int64  
 40  v

We see mostly floats. There are some object columns that we should probably recast. We have 114K observations, which is plenty to work with. The variables have no descriptive names, as expected. He said there are no missing values, but let's check anyway.

In [5]:
# True if any cell anywhere in the frame is null.
df.isnull().values.any()

False

Fine, he told the truth. How about the target variable?

In [6]:
# Class frequencies of the binary target, followed by the share of class 0.
counts = df["target"].value_counts()
print(counts)
print(round(counts.loc[0] / counts.sum(), 4))

1    87021
0    27300
Name: target, dtype: int64
0.2388


The target is binary and a little unbalanced, but not terrible.

## Data Cleaning

In [7]:
# Show up to 500 columns and render floats with five decimals so the wide
# summary table below is not truncated or shown in scientific notation.
pd.set_option('display.max_columns', 500)
pd.set_option('display.float_format', lambda x: '%.5f' % x)


# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,ID,target,v1,v2,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16,v17,v18,v19,v20,v21,v23,v25,v26,v27,v28,v29,v32,v33,v34,v35,v36,v37,v38,v39,v40,v41,v42,v43,v44,v45,v46,v48,v49,v50,v51,v53,v54,v55,v57,v58,v59,v60,v61,v62,v63,v64,v65,v67,v68,v69,v70,v72,v73,v76,v77,v78,v80,v81,v82,v83,v84,v85,v86,v87,v88,v89,v90,v92,v93,v94,v95,v96,v97,v98,v99,v100,v101,v102,v103,v104,v105,v106,v108,v109,v111,v114,v115,v116,v117,v118,v119,v120,v121,v122,v123,v124,v126,v127,v128,v129,v130,v131
count,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0
mean,114228.92823,0.7612,1.63069,7.46441,4.1451,8.74236,2.4364,2.48392,1.49657,9.03186,1.88305,15.44741,6.8813,3.7984,12.09428,2.08091,4.92322,3.83227,0.84105,0.2223,17.77359,7.02974,1.09309,1.69813,1.87603,2.74345,5.09333,8.20642,1.62215,2.16163,6.40624,8.12239,13.3756,0.74147,0.09093,1.23718,10.46593,7.18255,12.92497,2.2166,10.79517,9.14223,1.63053,12.53802,8.01655,1.50426,7.19816,15.7113,1.25386,1.55956,4.07783,7.70165,10.58794,1.71429,14.58303,1.03069,1.68733,6.34371,15.84756,9.28728,17.56412,9.44934,12.26996,1.43177,2.4333,2.40506,7.30737,13.33448,2.2097,7.28717,6.20836,2.17381,1.60796,2.82225,1.22018,10.18022,1.92418,1.51843,0.96691,0.58237,5.47518,3.85288,0.66576,6.45795,7.62255,7.66762,1.25072,12.09162,6.86641,2.89029,5.29672,2.64283,1.08105,11.79136,2.15262,4.18128,3.36531,13.57445,10.54805,2.29122,8.30386,8.36465,3.16897,1.29122,2.7376,6.82244,3.54994,0.91981,1.67266,3.23954,2.03037,0.31014,1.92576,1.73939
std,65934.48736,0.42635,0.81326,2.22504,0.86266,1.54344,0.45061,0.44271,2.10979,1.44954,1.39347,0.59338,0.92415,0.88317,1.44392,0.55045,1.34464,1.43607,0.46286,0.12868,0.86743,1.0694,2.98732,2.24158,0.41398,0.62666,2.01131,0.96545,0.42324,0.7397,2.0242,1.00628,1.78573,0.40657,0.58348,1.77108,3.16764,0.75443,0.7488,0.48667,1.58586,1.55058,2.19532,1.64993,0.67797,1.16789,1.87306,0.60036,1.7546,0.62668,0.50925,5.13806,1.5564,0.40378,1.59344,0.69624,2.24951,1.89742,1.4105,0.84371,1.71983,1.4267,1.75436,0.92227,0.59981,1.03956,0.94339,1.38423,0.80726,1.68567,2.78821,0.79785,0.70691,1.06186,0.34985,2.27357,0.78753,2.13245,0.13438,0.1804,1.23201,0.64216,0.19835,0.84155,1.44498,1.76276,0.34655,5.17341,1.76901,1.35412,0.92291,0.66527,1.70317,2.21935,0.69222,2.81395,1.11715,2.61288,1.42744,0.5034,2.74269,1.50358,3.1636,0.55455,1.0186,1.3487,1.94343,1.59155,0.37791,1.22123,0.81434,0.69326,0.94964,0.85182
min,3.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,1.51678,0.10618,-0.0,0.04104,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.06935,-0.0,-0.0,-0.0,-0.0,-0.0,0.01306,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.05306,-0.0,0.6593,-0.0,1.50136,-0.0,0.42709,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.8724,-0.0,0.02237,-0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,9e-05,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.01914,-0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0
25%,57280.0,1.0,1.34615,6.57577,4.0687,8.39409,2.34097,2.37659,0.26531,8.81356,1.05033,15.39823,6.32262,3.46409,11.25602,1.90569,4.70588,3.37983,0.71949,0.19241,17.77359,6.41875,0.0,0.27448,1.75553,2.56647,4.74236,8.11437,1.49129,1.83074,5.0558,7.89615,13.09935,0.59072,0.0,0.30522,8.41039,7.06762,12.81317,2.06897,10.54256,8.88634,0.2563,12.15693,7.914,0.65879,6.83727,15.66772,0.20819,1.2766,3.97665,4.06136,10.2168,1.5996,14.58303,1.0,0.27094,5.85294,15.84756,9.16798,17.56412,9.27007,12.08748,1.0,2.23612,2.06003,7.19023,13.15175,1.95122,7.20499,3.59298,1.81535,1.31804,2.44395,1.10841,9.55184,1.62351,0.22454,0.94962,0.51793,5.13587,3.65008,0.59461,6.36225,7.18954,7.29994,1.17271,12.09162,6.34055,2.32905,4.98815,2.43202,0.17355,11.70203,1.85201,2.72122,2.92386,11.99667,10.26667,2.13904,7.604,7.86517,1.16943,1.05263,2.28261,6.51961,2.57105,0.08471,1.57097,2.7625,1.68126,0.0,1.44948,1.46341
50%,114189.0,1.0,1.63069,7.46441,4.1451,8.74236,2.4364,2.48392,1.49657,9.03186,1.31291,15.44741,6.61324,3.7984,11.96783,2.08091,4.92322,3.83227,0.84105,0.2223,17.77359,7.03937,0.33059,1.69813,1.87603,2.74345,5.09333,8.20642,1.62215,2.16163,6.53443,8.12239,13.3756,0.74147,0.0,1.23718,10.33934,7.18255,12.92497,2.2166,10.79517,9.14223,1.63053,12.53802,8.01655,1.21194,7.19816,15.7113,1.25386,1.55956,4.07783,7.70165,10.58794,1.71429,14.58303,1.0,1.68733,6.34371,15.84756,9.28728,17.56412,9.44934,12.26996,1.0,2.4333,2.40506,7.30737,13.33448,2.2097,7.28717,6.20836,2.17381,1.60796,2.82225,1.22018,10.18022,1.92418,1.51843,0.96691,0.58237,5.47518,3.85288,0.66576,6.45795,7.62255,7.66762,1.25072,12.09162,6.86641,2.89029,5.29672,2.64283,1.08105,11.79136,2.15262,4.18128,3.36531,14.03888,10.54805,2.29122,8.30386,8.36465,3.16897,1.29122,2.7376,6.82244,3.54994,0.91981,1.67266,3.23954,2.03037,0.0,1.92576,1.73939
75%,171206.0,1.0,1.63069,7.5515,4.34023,8.9248,2.4847,2.52845,1.49657,9.30233,2.10066,15.5939,7.0194,3.7984,12.71577,2.08091,5.14286,3.83227,0.84105,0.2223,18.1546,7.66652,1.09309,1.69813,1.89891,2.7791,5.33034,8.47939,1.62215,2.16163,7.70145,8.25076,14.32492,0.74147,0.0,1.23718,12.76246,7.34477,13.04965,2.23749,11.0221,9.41516,1.63053,12.67463,8.13559,2.00572,7.41788,15.87156,1.25386,1.55956,4.15366,7.70165,10.83954,1.73502,15.31291,1.0,1.68733,6.3844,16.47085,9.46899,18.4375,9.73384,12.9166,2.0,2.43665,2.40506,7.55221,13.55932,2.24359,7.82301,6.20836,2.17381,1.60796,2.82225,1.22018,10.43359,1.92418,1.51843,0.9901,0.58237,5.47518,3.85288,0.66576,6.669,7.71084,8.00612,1.30167,15.69721,6.93119,2.89029,5.29672,2.64283,1.08105,12.44363,2.15262,4.18128,3.36531,15.37219,10.71895,2.31017,8.64537,8.41772,3.16897,1.29122,2.7376,7.0,3.54994,0.91981,1.67266,3.23954,2.03037,0.0,1.92576,1.73939
max,228713.0,1.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,18.53392,20.0,18.71055,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,19.29605,20.0,20.0,20.0,20.0,19.84819,20.0,17.56098,20.0,20.0,20.0,20.0,20.0,12.0,19.91553,20.0,20.0,20.0,20.0,19.83168,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,18.84696,7.0,20.0,20.0,20.0,20.0,20.0,20.0,19.81631,12.0,20.0,20.0,15.97351,20.0,20.0,20.0,20.0,20.0,20.0,20.0,17.56098,19.84275,20.0,20.0,6.30577,8.92384,20.0,19.01631,9.07054,20.0,20.0,19.0588,20.0,20.0,20.0,20.0,18.77525,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,10.39427,20.0,20.0,19.68607,20.0,15.63161,20.0,20.0,11.0,20.0,20.0


In [8]:
# Summary (count / unique / top / freq) for the object-typed columns only.
df.describe(include='object')

Unnamed: 0,v3,v22,v24,v30,v31,v47,v52,v56,v66,v71,v74,v75,v79,v91,v107,v110,v112,v113,v125
count,114321,114321,114321,114321,114321,114321,114321,114321,114321,114321,114321,114321,114321,114321,114321,114321,114321,114321,114321
unique,3,18210,5,7,3,10,12,122,3,9,3,4,18,7,7,3,22,36,90
top,C,AGDF,E,C,A,C,J,BW,A,F,B,D,C,A,E,A,F,G,BM
freq,114041,2886,55177,92288,91804,55425,11106,18233,70353,75094,113560,75087,34561,27082,27082,55688,22053,71556,5836


In [3]:
#https://github.com/Sundar0989/WOE-and-IV/blob/master/WOE_IV.ipynb
import pandas.core.algorithms as algos
from pandas import Series
import scipy.stats.stats as stats
import re
import traceback
import string

max_bin = 20
force_bin = 3

# define a binning function
def mono_bin(Y, X, n = max_bin):
    """Bin a numeric predictor and compute per-bin Weight of Evidence / IV.

    The non-missing values of ``X`` are quantile-binned with ``pd.qcut``,
    starting at ``n`` bins and removing one bin per attempt, until the Spearman
    correlation between the per-bin mean of ``X`` and the per-bin mean of ``Y``
    has absolute value 1 (a NaN correlation also ends the search).  If the
    search ends with at most one bin, ``force_bin`` quantile edges are used
    with ``pd.cut`` instead; if even that yields fewer than two distinct edges
    (constant or empty ``X``) all non-missing rows are placed in a single bin.
    Rows where ``X`` is null are reported as one extra row with NaN bounds.

    Parameters
    ----------
    Y : array-like
        Target values aligned with ``X``; summed per bin to count events.
    X : array-like
        Numeric predictor values.
    n : int, default ``max_bin``
        Number of quantile bins to try first.

    Returns
    -------
    pandas.DataFrame
        One row per bin with columns VAR_NAME, MIN_VALUE, MAX_VALUE, COUNT,
        EVENT, EVENT_RATE, NONEVENT, NON_EVENT_RATE, DIST_EVENT,
        DIST_NON_EVENT, WOE and IV.  Infinite values are replaced by 0 and the
        IV column holds the variable's total IV on every row.
    """
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[df1.X.isnull()]
    notmiss = df1[df1.X.notnull()]

    r = 0
    d2 = None
    # Bounded search: the original loop had no lower limit on n, so it could
    # spin forever when every qcut/spearmanr attempt raised.
    while np.abs(r) < 1 and n >= 1:
        try:
            d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.qcut(notmiss.X, n)})
            # observed=False keeps empty quantile buckets, matching the
            # long-standing pandas default this code was written against.
            d2 = d1.groupby('Bucket', as_index=True, observed=False)
            r, _ = stats.spearmanr(d2["X"].mean(), d2["Y"].mean())
        except Exception:
            # Best effort: a failed attempt (e.g. duplicate bin edges) simply
            # moves on to the next, smaller bin count.
            pass
        n = n - 1

    if d2 is None or len(d2) == 1:
        n = force_bin
        if notmiss.empty:
            bins = np.array([])
        else:
            bins = np.quantile(notmiss.X, np.linspace(0, 1, n))
        edges = np.unique(bins)
        if len(edges) == 2:
            # Only two distinct edges: prepend an extra edge and halve the
            # lowest one so that pd.cut receives more than one interval.
            bins = np.insert(bins, 0, 1)
            bins[1] = bins[1]-(bins[1]/2)
            edges = np.unique(bins)
        if len(edges) < 2:
            # Constant (or empty) predictor: pd.cut cannot build an interval
            # from a single edge, so use one catch-all bucket.
            bucket = 0
        else:
            bucket = pd.cut(notmiss.X, edges, include_lowest=True)
        d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": bucket})
        d2 = d1.groupby('Bucket', as_index=True, observed=False)

    d3 = pd.DataFrame({
        "MIN_VALUE": d2["X"].min(),
        "MAX_VALUE": d2["X"].max(),
        "COUNT": d2["Y"].count(),
        "EVENT": d2["Y"].sum(),
    })
    d3["NONEVENT"] = d3["COUNT"] - d3["EVENT"]
    d3 = d3.reset_index(drop=True)

    if len(justmiss.index) > 0:
        missing_count = justmiss["Y"].count()
        missing_event = justmiss["Y"].sum()
        d4 = pd.DataFrame({
            "MIN_VALUE": [np.nan],
            "MAX_VALUE": [np.nan],
            "COUNT": [missing_count],
            "EVENT": [missing_event],
            "NONEVENT": [missing_count - missing_event],
        })
        # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent.
        d3 = pd.concat([d3, d4], ignore_index=True)

    d3["EVENT_RATE"] = d3["EVENT"] / d3["COUNT"]
    d3["NON_EVENT_RATE"] = d3["NONEVENT"] / d3["COUNT"]
    d3["DIST_EVENT"] = d3["EVENT"] / d3["EVENT"].sum()
    d3["DIST_NON_EVENT"] = d3["NONEVENT"] / d3["NONEVENT"].sum()
    # log(0) and division by zero are expected for pure bins; the resulting
    # infinities are zeroed below, so silence the NumPy warnings here.
    with np.errstate(divide="ignore", invalid="ignore"):
        log_ratio = np.log(d3["DIST_EVENT"] / d3["DIST_NON_EVENT"])
        d3["WOE"] = log_ratio
        d3["IV"] = (d3["DIST_EVENT"] - d3["DIST_NON_EVENT"]) * log_ratio
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]
    d3 = d3.replace([np.inf, -np.inf], 0)
    # Every row carries the variable-level IV (sum over bins).
    d3["IV"] = d3["IV"].sum()

    return(d3)

def char_bin(Y, X):
    """Compute Weight of Evidence / IV for a categorical predictor.

    Each distinct non-null value of ``X`` is its own bin.  Rows where ``X`` is
    null are reported as one extra row whose MIN_VALUE/MAX_VALUE are NaN.

    Parameters
    ----------
    Y : array-like
        Target values aligned with ``X``; summed per bin to count events.
    X : array-like
        Category labels.

    Returns
    -------
    pandas.DataFrame
        One row per category with columns VAR_NAME, MIN_VALUE, MAX_VALUE,
        COUNT, EVENT, EVENT_RATE, NONEVENT, NON_EVENT_RATE, DIST_EVENT,
        DIST_NON_EVENT, WOE and IV.  MIN_VALUE and MAX_VALUE both hold the
        category label.  Infinite values are replaced by 0 and the IV column
        holds the variable's total IV on every row.
    """
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[df1.X.isnull()]
    notmiss = df1[df1.X.notnull()]
    grouped = notmiss.groupby('X', as_index=True)

    d3 = pd.DataFrame({
        "COUNT": grouped["Y"].count(),
        "EVENT": grouped["Y"].sum(),
    })
    d3["MIN_VALUE"] = d3.index
    d3["MAX_VALUE"] = d3["MIN_VALUE"]
    d3["NONEVENT"] = d3["COUNT"] - d3["EVENT"]

    if len(justmiss.index) > 0:
        missing_count = justmiss["Y"].count()
        missing_event = justmiss["Y"].sum()
        d4 = pd.DataFrame({
            "MIN_VALUE": [np.nan],
            "MAX_VALUE": [np.nan],
            "COUNT": [missing_count],
            "EVENT": [missing_event],
            "NONEVENT": [missing_count - missing_event],
        })
        # DataFrame.append was removed in pandas 2.0; pd.concat is equivalent.
        d3 = pd.concat([d3, d4], ignore_index=True)

    # Totals are taken per column: a frame-wide d3.sum() would also try to add
    # up the label column, which fails once NaN and string labels are mixed.
    total_event = d3["EVENT"].sum()
    total_nonevent = d3["NONEVENT"].sum()

    d3["EVENT_RATE"] = d3["EVENT"] / d3["COUNT"]
    d3["NON_EVENT_RATE"] = d3["NONEVENT"] / d3["COUNT"]
    d3["DIST_EVENT"] = d3["EVENT"] / total_event
    d3["DIST_NON_EVENT"] = d3["NONEVENT"] / total_nonevent
    # log(0) and division by zero are expected for pure categories; the
    # resulting infinities are zeroed below, so silence the NumPy warnings.
    with np.errstate(divide="ignore", invalid="ignore"):
        log_ratio = np.log(d3["DIST_EVENT"] / d3["DIST_NON_EVENT"])
        d3["WOE"] = log_ratio
        d3["IV"] = (d3["DIST_EVENT"] - d3["DIST_NON_EVENT"]) * log_ratio
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]
    d3 = d3.replace([np.inf, -np.inf], 0)
    # Every row carries the variable-level IV (sum over categories).
    d3["IV"] = d3["IV"].sum()
    d3 = d3.reset_index(drop=True)

    return(d3)

def data_vars(df1, target):
    """Compute WOE tables and Information Value for every predictor in ``df1``.

    Numeric columns with more than two distinct values are binned with
    ``mono_bin``; all other columns go through ``char_bin``.  The target
    column itself is skipped.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame holding the predictors (it may also contain the target column).
    target : pandas.Series or array-like
        Target values aligned with the rows of ``df1``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``iv_df`` - the per-bin tables of all variables stacked together, and
        ``iv`` - one row per VAR_NAME with its IV.

    Raises
    ------
    ValueError
        If ``df1`` contains no predictor column to score.
    """
    # Identify the target column so it is not scored against itself.  Prefer
    # the Series name; parsing the caller's source line (the original
    # approach) is kept only as a fallback because it breaks on multi-line
    # calls and when no source text is available.
    target_name = getattr(target, "name", None)
    if target_name not in df1.columns:
        try:
            code = traceback.extract_stack()[-2].line
            vars_name = re.compile(r'\((.*?)\).*$').search(code).groups()[0]
            target_name = (re.findall(r"[\w']+", vars_name))[-1]
        except (AttributeError, IndexError, TypeError):
            target_name = None

    # Exact, case-insensitive comparison; the original substring test could
    # silently drop any column whose name is contained in the target's name.
    skip = None if target_name is None else str(target_name).upper()

    tables = []
    for col in df1.columns:
        if skip is not None and str(col).upper() == skip:
            continue
        values = df1[col]
        if pd.api.types.is_numeric_dtype(values) and len(values.unique()) > 2:
            conv = mono_bin(target, values)
        else:
            conv = char_bin(target, values)
        conv["VAR_NAME"] = col
        tables.append(conv)

    if not tables:
        raise ValueError("data_vars: no predictor columns to score")

    # A single concat replaces the removed DataFrame.append and avoids
    # re-copying the growing frame on every iteration.
    iv_df = pd.concat(tables, ignore_index=True)

    iv = pd.DataFrame({'IV':iv_df.groupby('VAR_NAME').IV.max()})
    iv = iv.reset_index()
    return(iv_df,iv)

In [4]:
# Allow up to 500 rows in rendered tables.
pd.set_option('display.max_rows', 500)
# v22 is the highest-cardinality object column (18210 unique values in the
# describe output above), so score it on its own together with the target.
forWOE = df[["v22","target"]].copy()

# final_iv: per-category WOE table; IV: one row per variable with its IV.
final_iv, IV = data_vars(forWOE , forWOE.target)
final_iv.sort_values("WOE",ascending=False)

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,VAR_NAME,MIN_VALUE,MAX_VALUE,COUNT,EVENT,EVENT_RATE,NONEVENT,NON_EVENT_RATE,DIST_EVENT,DIST_NON_EVENT,WOE,IV
9055,v22,JEA,JEA,36,35,0.972222,1,0.027778,0.000402,0.000037,2.396085,0.297343
14740,v22,TPX,TPX,25,24,0.960000,1,0.040000,0.000276,0.000037,2.018791,0.297343
2624,v22,AEUR,AEUR,23,22,0.956522,1,0.043478,0.000253,0.000037,1.931780,0.297343
12366,v22,PEF,PEF,23,22,0.956522,1,0.043478,0.000253,0.000037,1.931780,0.297343
16518,v22,WWN,WWN,22,21,0.954545,1,0.045455,0.000241,0.000037,1.885260,0.297343
...,...,...,...,...,...,...,...,...,...,...,...,...
731,v22,ABIM,ABIM,6,1,0.166667,5,0.833333,0.000011,0.000183,-2.768701,0.297343
4876,v22,BMV,BMV,6,1,0.166667,5,0.833333,0.000011,0.000183,-2.768701,0.297343
1112,v22,ACBD,ACBD,6,1,0.166667,5,0.833333,0.000011,0.000183,-2.768701,0.297343
12304,v22,PAZ,PAZ,7,1,0.142857,6,0.857143,0.000011,0.000220,-2.951022,0.297343


## Data Prep

In [58]:
from sklearn.preprocessing import StandardScaler
def transform_data(data):
    """Winsorize, one-hot encode and standardize a modelling frame.

    Steps, in order:
      1. every numeric column other than ``ID`` and ``target`` is clipped to
         its own 1st and 99th percentiles;
      2. every object-typed column is one-hot encoded with ``pd.get_dummies``;
      3. the clipped numeric columns are standardized with ``StandardScaler``.

    The input frame is not modified; all work happens on a copy.  Expects
    ``data`` to contain ``ID`` and ``target`` columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame to transform.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``data_OHE`` - the encoded and standardized frame, and the clipped
        (but neither encoded nor scaled) copy of ``data``.
    """
    # Work on a private copy: the clipping below assigns in place and would
    # otherwise silently overwrite the caller's DataFrame.
    data = data.copy()

    # Object columns are one-hot encoded; numeric feature columns are clipped
    # and later scaled.
    label_encode = data.select_dtypes(include='object').columns
    normalize = data.drop(columns=["ID","target"]).select_dtypes(include='number').columns

    for col in normalize:
        low, high = np.quantile(data[col], [0.01, 0.99])
        data.loc[data[col] < low, col] = low
        data.loc[data[col] > high, col] = high

    data_OHE = pd.get_dummies(data, columns=label_encode)

    scaler = StandardScaler()
    data_OHE[normalize] = scaler.fit_transform(data_OHE[normalize])

    return data_OHE, data

In [59]:
# Cast v22 to category so transform_data's select_dtypes(include='object')
# does not one-hot encode it.
df['v22'] = df['v22'].astype('category')
df2, test2 = transform_data(df)
# Attach each row's WOE by matching v22 to the category label stored in
# final_iv.MIN_VALUE, then drop the raw label columns.
# NOTE(review): final_iv was computed from the full dataset, before the
# train/validation split further down; WOE is derived from the target, so
# confirm this leakage into the validation rows is acceptable.
df3 = df2.merge(final_iv[["MIN_VALUE","WOE"]], how='left', left_on="v22",right_on="MIN_VALUE")
preModel_data = df3.drop(columns=["v22","MIN_VALUE"]).copy()

In [13]:
# Preview the encoded, scaled modelling frame.
preModel_data.head()

Unnamed: 0,ID,target,v1,v2,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16,v17,v18,v19,v20,v21,v23,v25,v26,v27,v28,v29,v32,v33,v34,v35,v36,v37,v38,v39,v40,v41,v42,v43,v44,v45,v46,v48,v49,v50,v51,v53,v54,v55,v57,v58,v59,v60,v61,v62,v63,v64,v65,v67,v68,v69,v70,v72,v73,v76,v77,v78,v80,v81,v82,v83,v84,v85,v86,v87,v88,v89,v90,v92,v93,v94,v95,v96,v97,v98,v99,v100,v101,v102,v103,v104,v105,v106,v108,v109,v111,v114,v115,v116,v117,v118,v119,v120,v121,v122,v123,v124,v126,v127,v128,v129,v130,v131,v3_A,v3_B,v3_C,v24_A,v24_B,v24_C,v24_D,v24_E,v30_A,v30_B,v30_C,v30_D,v30_E,v30_F,v30_G,v31_A,v31_B,v31_C,v47_A,v47_B,v47_C,v47_D,v47_E,v47_F,v47_G,v47_H,v47_I,v47_J,v52_A,v52_B,v52_C,v52_D,v52_E,v52_F,v52_G,v52_H,v52_I,v52_J,v52_K,v52_L,v56_A,v56_AA,v56_AB,v56_AC,v56_AE,v56_AF,v56_AG,v56_AH,v56_AI,v56_AJ,v56_AK,v56_AL,v56_AM,v56_AN,v56_AO,v56_AP,v56_AR,v56_AS,v56_AT,v56_AU,v56_AV,v56_AW,v56_AX,v56_AY,v56_AZ,v56_B,v56_BA,v56_BC,v56_BD,v56_BE,v56_BF,v56_BG,v56_BH,v56_BI,v56_BJ,v56_BK,v56_BL,v56_BM,v56_BN,v56_BO,v56_BP,v56_BQ,v56_BR,v56_BS,v56_BT,v56_BU,v56_BV,v56_BW,v56_BX,v56_BY,v56_BZ,v56_C,v56_CA,v56_CB,v56_CC,v56_CD,v56_CE,v56_CF,v56_CG,v56_CH,v56_CI,v56_CJ,v56_CK,v56_CL,v56_CM,v56_CN,v56_CO,v56_CP,v56_CQ,v56_CS,v56_CT,v56_CV,v56_CW,v56_CX,v56_CY,v56_CZ,v56_D,v56_DA,v56_DB,v56_DC,v56_DD,v56_DE,v56_DF,v56_DG,v56_DH,v56_DI,v56_DJ,v56_DK,v56_DL,v56_DM,v56_DN,v56_DO,v56_DP,v56_DQ,v56_DR,v56_DS,v56_DT,v56_DU,v56_DV,v56_DW,v56_DX,v56_DY,v56_DZ,v56_E,v56_F,v56_G,v56_H,v56_I,v56_L,v56_M,v56_N,v56_O,v56_P,v56_Q,v56_R,v56_T,v56_U,v56_V,v56_W,v56_X,v56_Y,v56_Z,v66_A,v66_B,v66_C,v71_A,v71_B,v71_C,v71_D,v71_F,v71_G,v71_I,v71_K,v71_L,v74_A,v74_B,v74_C,v75_A,v75_B,v75_C,v75_D,v79_A,v79_B,v79_C,v79_D,v79_E,v79_F,v79_G,v79_H,v79_I,v79_J,v79_K,v79_L,v79_M,v79_N,v79_O,v79_P,v79_Q,v79_R,v91_A,v91_B,v91_C,v91_D,v91_E,v91_F,v91_G,v107_A,v107_B,v107_C,v107_D,v107_E,v107_F,v107_G,v110_A,v110_B,v110_C,v112_A,v112_B,v112_C,v112_D,v112_E,v112_F,v112_G,v112_H,v112_I,v112_J,v112_K,v112_L,v112_M,v112_N,v112_O,v112_
P,v112_Q,v112_R,v112_S,v112_T,v112_U,v112_V,v113_A,v113_AA,v113_AB,v113_AC,v113_AD,v113_AE,v113_AF,v113_AG,v113_AH,v113_AI,v113_AJ,v113_AK,v113_B,v113_C,v113_D,v113_E,v113_F,v113_G,v113_H,v113_I,v113_J,v113_L,v113_M,v113_N,v113_O,v113_P,v113_Q,v113_R,v113_S,v113_T,v113_U,v113_V,v113_W,v113_X,v113_Y,v113_Z,v125_A,v125_AA,v125_AB,v125_AC,v125_AD,v125_AE,v125_AF,v125_AG,v125_AH,v125_AI,v125_AJ,v125_AK,v125_AL,v125_AM,v125_AN,v125_AO,v125_AP,v125_AQ,v125_AR,v125_AS,v125_AT,v125_AU,v125_AV,v125_AW,v125_AX,v125_AY,v125_AZ,v125_B,v125_BA,v125_BB,v125_BC,v125_BD,v125_BE,v125_BF,v125_BG,v125_BH,v125_BI,v125_BJ,v125_BK,v125_BL,v125_BM,v125_BN,v125_BO,v125_BP,v125_BQ,v125_BR,v125_BS,v125_BT,v125_BU,v125_BV,v125_BW,v125_BX,v125_BY,v125_BZ,v125_C,v125_CA,v125_CB,v125_CC,v125_CD,v125_CE,v125_CF,v125_CG,v125_CH,v125_CI,v125_CJ,v125_CK,v125_CL,v125_D,v125_E,v125_F,v125_G,v125_H,v125_I,v125_J,v125_K,v125_L,v125_M,v125_N,v125_O,v125_P,v125_Q,v125_R,v125_S,v125_T,v125_U,v125_V,v125_W,v125_X,v125_Y,v125_Z,WOE
0,3,1,-0.36267,0.56766,-0.25975,-0.53588,0.36146,1.56529,-0.70322,0.6679,-0.99017,1.66284,-0.8609,-1.0548,-0.31712,-1.31874,2.71315,-0.11275,-1.58649,-0.57054,1.26315,0.65568,-0.36591,-0.69537,-0.37492,1.03733,-2.23895,0.69859,-1.27378,-1.55579,0.42679,0.25149,-1.14744,-0.70572,-0.15584,1.5668,-0.86957,0.62416,-0.29031,-0.4132,-0.18717,0.4556,-0.691,-0.222,0.10339,-0.5179,0.04252,1.72675,-0.69347,-0.41475,-0.20954,-1.39583,0.19471,-0.31167,0.80023,-0.04409,-0.68187,0.01026,1.7415,0.03177,-1.35614,5.39255,-0.27669,-0.46816,-1.36429,-0.16749,-0.01584,-3.44096,0.97899,0.14306,0.95162,-1.91013,-0.43615,-1.04999,-1.01117,-0.27639,1.77406,-0.66719,-0.45817,-0.7767,0.27503,-0.52193,-1.02718,1.16318,-1.50037,0.68631,-0.17136,1.42626,0.86084,-0.09816,-0.99947,-1.60655,-0.63044,0.35498,0.33237,-0.08897,-2.62463,0.78858,-5.38792,-0.6754,-0.62415,-1.63315,-1.0017,-0.41766,-1.89871,0.87311,-0.80279,-0.55547,0.34788,-0.10303,-0.00748,-0.44737,-1.35778,1.3122,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
1,4,1,0.0,0.0,0.0,0.29085,0.0,0.0,0.38159,0.0,-0.40915,0.0,-0.40433,0.0,-0.31712,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.24933,-0.0,0.60583,0.0,-0.0,0.0,0.0,0.0,0.0,-1.3789,0.0,0.67417,0.0,-0.15584,-0.0,1.21221,-0.0,-0.0,-0.0,0.0,-0.0,0.37327,-0.0,0.0,-0.10708,-0.0,0.0,-0.07089,-0.0,-0.0,0.0,-0.0,0.0,-0.0,1.3922,0.38116,0.0,0.0,-0.0,0.0,0.0,-0.12347,0.61613,0.0,0.0,0.0,0.0,0.0,-0.00565,-0.99622,0.0,0.0,0.0,0.0,-0.14612,-0.0,0.54405,0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.36099,-0.0,0.0,0.0,0.0,0.0,0.0,0.24912,0.0,-0.47277,0.02366,0.0,-1.25012,0.0,-0.0,0.8355,0.0,0.0,0.0,0.0,-0.0,0.0,-0.20164,0.0,-0.0,-0.08909,-0.44737,-0.0,-0.0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.02343
2,5,1,-0.84451,-0.96823,0.3082,-2.21338,3.42466,3.26318,-0.70004,2.50757,-0.80173,-1.16504,-0.5374,-1.46383,-1.72499,-0.17583,0.7133,-0.46058,-1.28873,-0.60648,0.20606,-1.66889,-0.36591,-0.70671,0.89102,4.08945,-2.1167,-0.7317,-0.3971,-0.57719,-1.16707,-0.16218,-0.36124,-1.18487,-0.15584,3.46784,0.82436,-1.33756,-0.77195,1.45938,-1.19659,-2.44968,-0.68532,0.10504,-1.74021,-0.77042,1.3024,-1.01483,-0.66584,-1.26739,-0.09271,-0.66644,-0.9525,1.08244,0.9367,-0.04409,-0.69512,-0.43521,0.37551,-1.09034,-3.79321,-2.50017,-2.17135,1.70042,-0.03282,-0.4243,-1.47236,-1.13405,1.39191,1.7247,0.73806,-0.80617,-0.10941,-0.36949,-0.4252,-0.76213,1.83253,-0.65982,-1.15688,-1.72334,-0.25862,0.56382,-1.79533,-0.94432,-1.14087,2.22449,-1.17988,0.65715,-0.55798,0.29724,0.68018,-1.46471,-0.62987,-0.05442,-1.12229,-1.06508,0.00182,-0.90662,1.67652,1.66475,-1.75921,-1.41721,-1.0017,1.52829,-0.48968,1.86172,-0.55178,-0.56948,0.26739,0.55899,-1.11736,2.43755,-1.09794,-0.66084,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0
3,6,1,-1.0246,0.37768,0.0937,1.86926,-0.75165,-1.1212,-0.62785,-0.04577,3.34392,1.51685,2.99234,0.11878,1.38543,-0.24683,0.44177,-0.15423,0.82761,0.0725,0.69495,0.45576,-0.36591,-0.69116,-1.37146,-0.70185,1.9059,0.69202,-0.08153,-0.66915,1.13493,0.77124,-1.16076,-0.75691,-0.15584,-0.53632,0.33372,0.99213,0.0145,-1.5323,1.20655,0.3409,-0.69336,-0.20799,0.84872,1.56258,-1.29084,1.51642,-0.63498,-0.6086,-0.22054,-1.16183,0.76457,-1.20128,0.84369,-0.04409,-0.68774,-0.02674,0.82531,0.49281,0.58379,-0.01686,0.75513,0.61613,-0.26802,-0.20861,0.96062,0.08221,-0.3251,-1.47675,2.54913,-0.61763,-0.32345,-1.16326,0.06469,0.24936,-0.65539,-0.69375,0.56192,1.00643,0.01925,-0.66797,0.84075,1.09029,-0.60594,-0.41164,0.24208,1.19162,0.92757,-0.28597,-0.46002,-0.10396,-0.56781,0.34376,0.11288,-0.77868,-0.64596,0.07777,0.01868,-1.54977,-1.22298,-0.78741,-0.82295,-0.2253,-0.76681,0.14519,-0.89386,-0.57651,-0.68118,-0.2335,-0.04854,0.99509,-0.26184,-0.82753,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.30707
4,8,1,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,-0.59759,0.0,-0.60728,0.0,-0.76402,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.57525,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,-0.15961,0.0,-0.0,0.0,-0.15584,-0.0,-0.10323,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.11964,-0.0,0.0,0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,-0.04409,-0.0,0.0,0.0,-0.0,0.0,0.0,0.0,-0.46816,0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.20003,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,-0.0,0.0,-0.0,0.0,-0.44737,-0.0,-0.0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1.23863


In [61]:
# Summary statistics of the modelling frame.
preModel_data.describe()

Unnamed: 0,ID,target,v1,v2,v4,v5,v6,v7,v8,v9,...,v125_R,v125_S,v125_T,v125_U,v125_V,v125_W,v125_X,v125_Y,v125_Z,WOE
count,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,...,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0
mean,114228.928228,0.761199,-1.617637e-15,-3.022e-16,6.527068e-16,1.394996e-15,1.083279e-15,6.562815e-16,6.347794e-17,-4.342164e-15,...,0.010645,0.007077,0.00538,0.009473,0.028289,0.007768,0.007584,0.007698,0.013952,-0.060753
std,65934.487362,0.426353,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,1.000004,...,0.102627,0.083825,0.073148,0.096869,0.165797,0.087791,0.086755,0.087398,0.117292,0.534756
min,3.0,0.0,-2.148987,-2.318007,-3.18292,-2.759645,-2.848109,-2.780425,-0.7979532,-3.140482,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.105173
25%,57280.0,1.0,-0.3633102,-0.4058841,-0.08736342,-0.2349301,-0.2248939,-0.2530763,-0.6560571,-0.1583005,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.263175
50%,114189.0,1.0,0.01412333,0.002681007,0.005015314,0.002708458,0.00620456,0.004141715,0.02458013,0.001054637,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,171206.0,1.0,0.01412333,0.04272183,0.2409559,0.1271938,0.1231602,0.1108399,0.02458013,0.1984919,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.152924
max,228713.0,1.0,3.910348,3.245598,2.967975,3.982378,3.43202,3.328073,5.413679,3.024204,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.396085


In [8]:
# True if the left merge left any row without a WOE value (or any other null).
preModel_data.isnull().values.any()

False

In [36]:
# Keep only rows where every numeric column has |z-score| < 3.  ID and target
# are numeric and are not dropped here, so they take part in the filter too.
test = df[(np.abs(stats.zscore(df.select_dtypes(include='number'))) < 3).all(axis=1)]

In [60]:
# Summary statistics of the rows that survived the z-score filter.
test.describe()

Unnamed: 0,ID,target,v1,v2,v4,v5,v6,v7,v8,v9,...,v121,v122,v123,v124,v126,v127,v128,v129,v130,v131
count,69772.0,69772.0,69772.0,69772.0,69772.0,69772.0,69772.0,69772.0,69772.0,69772.0,...,69772.0,69772.0,69772.0,69772.0,69772.0,69772.0,69772.0,69772.0,69772.0,69772.0
mean,114217.428295,0.75053,1.571544,7.540392,4.153586,8.668124,2.436794,2.492936,1.235355,9.097977,...,2.656286,6.883837,3.336465,0.7220649,1.651744,3.232834,1.963055,0.218741,1.834024,1.698761
std,65869.779051,0.432709,0.472245,1.425072,0.449381,0.955557,0.27273,0.257753,0.8124518,0.899288,...,0.540644,0.844063,0.81164,0.4912286,0.168427,0.754564,0.383011,0.519221,0.426076,0.4939189
min,4.0,0.0,-9.996497e-07,1.996901,1.757221,4.129119,1.092564,1.334033,-4.233768e-07,4.687501,...,0.375,2.81407,0.366438,-9.943133e-07,0.744772,0.134831,0.586567,0.0,0.227273,-9.932825e-07
25%,57533.5,1.0,1.630686,7.464411,4.145098,8.742359,2.436402,2.483921,0.6140613,9.031859,...,2.737596,6.822439,3.42001,0.2505038,1.672658,3.239542,2.030373,0.0,1.913446,1.739389
50%,114177.5,1.0,1.630686,7.464411,4.145098,8.742359,2.436402,2.483921,1.496569,9.031859,...,2.737596,6.822439,3.549938,0.919812,1.672658,3.239542,2.030373,0.0,1.925763,1.739389
75%,171049.5,1.0,1.630686,7.464411,4.145098,8.742359,2.436402,2.483921,1.496569,9.031859,...,2.737596,6.822439,3.549938,0.919812,1.672658,3.239542,2.030373,0.0,1.925763,1.739389
max,228713.0,1.0,4.07,14.13898,6.730156,13.362475,3.785289,3.807635,7.602383,13.380282,...,5.752774,10.86207,8.862303,5.111015,2.802905,6.900308,4.445235,2.0,4.666667,4.285715


## Model prep

In [9]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score #https://scikit-learn.org/stable/modules/model_evaluation.html
from sklearn.svm import SVC #https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
from sklearn.svm import LinearSVC #https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import log_loss, accuracy_score, classification_report, confusion_matrix 
import pickle
import time

In [42]:
# Hold out a random half of the rows for validation (seeded, without
# replacement); every row that was not drawn becomes the training pool.
validation = preModel_data.sample(frac=0.5, random_state=1)
forTrain = preModel_data.drop(index=validation.index)

# Report (rows, columns) for each half.
for split_frame in (validation, forTrain):
    print(split_frame.shape)

(57160, 479)
(57161, 479)


In [43]:
# Draw training subsamples of increasing size from the training pool.
# Each draw is without replacement and uses the same seed.
n1k, n2k, n5k, n10k, n20k = (
    forTrain.sample(n=n_rows, replace=False, random_state=1)
    for n_rows in (1000, 2000, 5000, 10000, 20000)
)

In [44]:
# Build the feature matrices (every column except ID and target) and the
# label vectors for each training subsample and for the full training pool.
_training_frames = (n1k, n2k, n5k, n10k, n20k, forTrain)

X_1k, X_2k, X_5k, X_10k, X_20k, X_57k = (
    frame.drop(columns=["ID", "target"]) for frame in _training_frames
)

y_1k, y_2k, y_5k, y_10k, y_20k, y_57k = (
    frame["target"].copy() for frame in _training_frames
)


In [None]:
# Class balance of the target in the 10k training subsample.
y_10k.value_counts()

1    7580
0    2420
Name: target, dtype: int64

## SVC
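The randomized SVC search does not finish in a reasonable time on the larger samples (the 5k run below was interrupted). The number of features needs to be reduced before SVC is practical here.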

In [63]:
# Candidate hyper-parameter values sampled by the randomized SVC searches.
param_grid = dict(
    C=[1, 10, 0.5, 0.1],
    gamma=[1.0, 0.1, 0.001, 0.0001],
    kernel=['linear', 'poly', 'rbf'],
    tol=[0.01, 0.001, 0.0001],
)

# Base estimator handed to every search: balanced class weights, a fixed
# seed and an enlarged kernel cache.
svc = SVC(
    cache_size=4000,
    class_weight='balanced',
    random_state=42,
)

In [14]:
# Randomized SVC hyper-parameter search on the 1k training subsample:
# time the search, persist the fitted search object, then score it on the
# validation half.
start = time.time()

n_iter_search = 20
svc_random_search = RandomizedSearchCV(
    svc,
    param_distributions=param_grid,
    cv=2,
    random_state=42,
    n_iter=n_iter_search,
    refit=True,
    n_jobs=-1)

svc_random_search.fit(X_1k, y_1k)

end = time.time()
time_1k = end - start  # wall-clock seconds spent fitting the search
print(time_1k)

# Persist the fitted search. The context manager guarantees the file is
# flushed and closed; a bare open() left the handle to the garbage collector.
filename = 'svc_random_search_1k.p'
with open(filename, 'wb') as pickle_file:
    pickle.dump(svc_random_search, pickle_file)

# Score the refit best estimator on the validation rows.
preds = svc_random_search.predict(validation.drop(columns=["ID","target"]))
print("accuracy: " + str(round(accuracy_score(y_pred=preds,y_true=validation.target),2)))
pd.crosstab(preds,validation.target)



87.58772087097168
accuracy: 0.27


target,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,13539,41353
1,98,2170


In [64]:
# Randomized SVC hyper-parameter search on the 2k training subsample:
# time the search, persist the fitted search object, then score it on the
# validation half.
start = time.time()

n_iter_search = 20
svc_random_search = RandomizedSearchCV(
    svc,
    param_distributions=param_grid,
    cv=2,
    random_state=42,
    n_iter=n_iter_search,
    refit=True,
    n_jobs=-1)

svc_random_search.fit(X_2k, y_2k)

end = time.time()
time_2k = end - start  # wall-clock seconds spent fitting the search
print(time_2k)

# Persist the fitted search. The context manager guarantees the file is
# flushed and closed; a bare open() left the handle to the garbage collector.
filename = 'svc_random_search_2k.p'
with open(filename, 'wb') as pickle_file:
    pickle.dump(svc_random_search, pickle_file)

# Score the refit best estimator on the validation rows.
preds = svc_random_search.predict(validation.drop(columns=["ID","target"]))
print("accuracy: " + str(round(accuracy_score(y_pred=preds,y_true=validation.target),2)))
pd.crosstab(preds,validation.target)



1431.365990638733
accuracy: 0.76


target,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2,1
1,13635,43522


In [16]:
# Randomized SVC hyper-parameter search on the 5k training subsample:
# time the search, persist the fitted search object, then score it on the
# validation half.
start = time.time()

n_iter_search = 20
svc_random_search = RandomizedSearchCV(
    svc,
    param_distributions=param_grid,
    cv=2,
    random_state=42,
    n_iter=n_iter_search,
    refit=True,
    n_jobs=-1)

svc_random_search.fit(X_5k, y_5k)

end = time.time()
time_5k = end - start  # wall-clock seconds spent fitting the search
print(time_5k)

# Persist the fitted search. The context manager guarantees the file is
# flushed and closed; a bare open() left the handle to the garbage collector.
filename = 'svc_random_search_5k.p'
with open(filename, 'wb') as pickle_file:
    pickle.dump(svc_random_search, pickle_file)

# Score the refit best estimator on the validation rows.
preds = svc_random_search.predict(validation.drop(columns=["ID","target"]))
print("accuracy: " + str(round(accuracy_score(y_pred=preds,y_true=validation.target),2)))
pd.crosstab(preds,validation.target)

KeyboardInterrupt: 

In [None]:
# Randomized SVC hyper-parameter search on the 10k training subsample:
# time the search, persist the fitted search object, then score it on the
# validation half.
start = time.time()

n_iter_search = 20
svc_random_search = RandomizedSearchCV(
    svc,
    param_distributions=param_grid,
    cv=2,
    random_state=42,
    n_iter=n_iter_search,
    refit=True,
    n_jobs=-1)

svc_random_search.fit(X_10k, y_10k)

end = time.time()
time_10k = end - start  # wall-clock seconds spent fitting the search
print(time_10k)

# Persist the fitted search. The context manager guarantees the file is
# flushed and closed; a bare open() left the handle to the garbage collector.
filename = 'svc_random_search_10k.p'
with open(filename, 'wb') as pickle_file:
    pickle.dump(svc_random_search, pickle_file)

# Score the refit best estimator on the validation rows.
preds = svc_random_search.predict(validation.drop(columns=["ID","target"]))
print("accuracy: " + str(round(accuracy_score(y_pred=preds,y_true=validation.target),2)))
pd.crosstab(preds,validation.target)

In [None]:
# Randomized SVC hyper-parameter search on the 20k training subsample:
# time the search, persist the fitted search object, then score it on the
# validation half.
start = time.time()

n_iter_search = 20
svc_random_search = RandomizedSearchCV(
    svc,
    param_distributions=param_grid,
    cv=2,
    random_state=42,
    n_iter=n_iter_search,
    refit=True,
    n_jobs=-1)

svc_random_search.fit(X_20k, y_20k)

end = time.time()
time_20k = end - start  # wall-clock seconds spent fitting the search
print(time_20k)

# Persist the fitted search. The context manager guarantees the file is
# flushed and closed; a bare open() left the handle to the garbage collector.
filename = 'svc_random_search_20k.p'
with open(filename, 'wb') as pickle_file:
    pickle.dump(svc_random_search, pickle_file)

# Score the refit best estimator on the validation rows.
preds = svc_random_search.predict(validation.drop(columns=["ID","target"]))
print("accuracy: " + str(round(accuracy_score(y_pred=preds,y_true=validation.target),2)))
pd.crosstab(preds,validation.target)

In [None]:
# Randomized SVC hyper-parameter search on the full training pool, with
# separate timings for the search, the pickling step and the prediction step.
start = time.time()

n_iter_search = 20
svc_random_search = RandomizedSearchCV(
    svc,
    param_distributions=param_grid,
    cv=2,
    random_state=42,
    n_iter=n_iter_search,
    refit=True,
    n_jobs=-1)

svc_random_search.fit(X_57k, y_57k)

end = time.time()
time_57k = end - start  # wall-clock seconds spent fitting the search
print(time_57k/60)      # reported in minutes

# Time the pickling step. Elapsed time is end - start; the previous
# start - end printed a negative duration. The context manager also closes
# the file handle that the bare open() used to leak.
start = time.time()
filename = 'svc_random_search_57k.p'
with open(filename, 'wb') as pickle_file:
    pickle.dump(svc_random_search, pickle_file)
end = time.time()
print(end - start)

# Time prediction and scoring on the validation rows (same sign fix).
start = time.time()
preds = svc_random_search.predict(validation.drop(columns=["ID","target"]))
print("accuracy: " + str(round(accuracy_score(y_pred=preds,y_true=validation.target),2)))
end = time.time()
print(end - start)

# The confusion table is now the cell's last expression so the notebook
# renders it; mid-cell, its result was computed and silently discarded.
pd.crosstab(preds,validation.target)

In [25]:
# Scratch arithmetic: 1380 / 60 = 23.0.
# NOTE(review): presumably a seconds-to-minutes conversion for one of the timed runs — confirm or remove.
1380/60

23.0

In [54]:
# Predict validation labels with whatever `svc_random_search` is currently bound to,
# using every column except ID and target as features.
preds = svc_random_search.predict(validation.drop(columns=["ID","target"]))

In [60]:
# Validation accuracy, followed by the distinct labels the model predicted.
# Keyword arguments make the roles explicit: the positional call passed the
# predictions in the y_true slot. Accuracy is symmetric, so the number is
# unchanged, but this now matches the other evaluation cells.
print("accuracy:", round(accuracy_score(y_true=validation.target, y_pred=preds), 2))
np.unique(preds)

accuracy: 0.76


array([1], dtype=int64)

In [52]:
# Hand-configured RBF-kernel SVC with every constructor argument spelled out.
# NOTE(review): presumably copied from a search's best_estimator_ repr — confirm.
best = SVC(
    C=10,
    kernel='rbf',
    gamma=0.0001,
    degree=3,
    coef0=0.0,
    tol=0.001,
    shrinking=True,
    probability=False,
    class_weight=None,
    cache_size=200,
    max_iter=-1,
    decision_function_shape='ovr',
    random_state=42,
    verbose=False,
)

In [53]:
# Fit the hand-configured SVC and predict labels for the validation features.
# NOTE(review): `X` and `y` are not defined in the visible part of this notebook —
# confirm which training subset they refer to before re-running on a fresh kernel.
best.fit(X,y)
preds2 = best.predict(validation.drop(columns=["ID","target"]))

In [55]:
# Cross-tabulate the search's validation predictions against those of the hand-configured SVC.
pd.crosstab(preds,preds2)

col_0,1
row_0,Unnamed: 1_level_1
1,57160


In [52]:
# Per-candidate cross-validation results of the SVC search, best-ranked candidate first.
pd.DataFrame(svc_random_search.cv_results_).sort_values('rank_test_score')



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_tol,param_kernel,param_gamma,param_C,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,mean_train_score,std_train_score
11,215.48913,1.80422,181.06287,3.93296,0.01,rbf,1.0,0.5,"{'tol': 0.01, 'kernel': 'rbf', 'gamma': 1.0, '...",0.75992,0.76008,0.76,8e-05,1,1.0,1.0,1.0,0.0
10,223.19722,12.16083,160.58147,2.67261,0.01,rbf,0.1,10.0,"{'tol': 0.01, 'kernel': 'rbf', 'gamma': 0.1, '...",0.75432,0.76028,0.7573,0.00298,2,0.9999,0.9999,0.9999,0.0
12,217.88237,4.24804,175.76268,5.3058,0.001,poly,0.001,0.5,"{'tol': 0.001, 'kernel': 'poly', 'gamma': 0.00...",0.75822,0.75398,0.7561,0.00212,3,0.75748,0.75742,0.75745,3e-05
18,204.05205,0.86173,153.48479,1.9202,0.0001,rbf,0.1,0.1,"{'tol': 0.0001, 'kernel': 'rbf', 'gamma': 0.1,...",0.72743,0.73817,0.7328,0.00537,4,0.75318,0.89251,0.82284,0.06967
5,345.84377,5.4236,105.59209,0.1308,0.01,poly,0.1,1.0,"{'tol': 0.01, 'kernel': 'poly', 'gamma': 0.1, ...",0.70403,0.70157,0.7028,0.00123,5,0.9992,0.9995,0.99935,0.00015
0,234.64795,12.23269,118.61389,3.77499,0.01,linear,0.1,0.1,"{'tol': 0.01, 'kernel': 'linear', 'gamma': 0.1...",0.66213,0.68377,0.67295,0.01082,6,0.69957,0.69663,0.6981,0.00147
16,237.61662,1.00785,101.076,2.49059,0.01,linear,0.0001,1.0,"{'tol': 0.01, 'kernel': 'linear', 'gamma': 0.0...",0.66193,0.68087,0.6714,0.00947,7,0.70187,0.70133,0.7016,0.00027
9,259.41874,2.50456,115.15389,0.21691,0.01,linear,0.001,1.0,"{'tol': 0.01, 'kernel': 'linear', 'gamma': 0.0...",0.66193,0.68087,0.6714,0.00947,7,0.70187,0.70133,0.7016,0.00027
2,294.99839,0.62014,125.40264,2.82204,0.001,linear,0.1,0.5,"{'tol': 0.001, 'kernel': 'linear', 'gamma': 0....",0.66243,0.67967,0.67105,0.00862,9,0.70157,0.70113,0.70135,0.00022
14,223.14077,1.42725,86.74306,2.392,0.001,linear,0.0001,0.5,"{'tol': 0.001, 'kernel': 'linear', 'gamma': 0....",0.66243,0.67967,0.67105,0.00862,9,0.70157,0.70113,0.70135,0.00022


## Linear SVC

In [65]:
# Search space for the linear SVM: L2 penalty only, both hinge losses and a
# range of C values. This rebinds `param_grid`.
param_grid = dict(
    penalty=['l2'],
    loss=['hinge', 'squared_hinge'],
    C=[1, 10, 100, 10000, 0.5, 0.1],
)

# Base linear estimator: balanced class weights, fixed seed and a raised
# iteration cap.
lsvc = LinearSVC(
    class_weight='balanced',
    random_state=42,
    max_iter=100000,
)

In [66]:
# Two-candidate randomized search for LinearSVC with 3-fold CV on the 5k
# subsample, keeping the train-fold scores alongside the test-fold scores.
n_iter_search = 2
lsvc_random_search = RandomizedSearchCV(
    lsvc,
    param_distributions=param_grid,
    n_iter=n_iter_search,
    cv=3,
    random_state=42,
    return_train_score=True,
)
lsvc_random_search.fit(X_5k, y_5k)

RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=100000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0),
          fit_params=None, iid='warn', n_iter=2, n_jobs=None,
          param_distributions={'penalty': ['l2'], 'loss': ['hinge', 'squared_hinge'], 'C': [1, 10, 100, 10000, 0.5, 0.1]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score=True, scoring=None, verbose=0)

In [48]:
# Persist the fitted LinearSVC search. The context manager guarantees the
# file is flushed and closed; a bare open() left the handle to the garbage
# collector.
filename = 'lsvc_random_search.p'
with open(filename, 'wb') as pickle_file:
    pickle.dump(lsvc_random_search, pickle_file)



In [2]:
# Reload the LinearSVC search saved with pickle.dump. The object is a
# scikit-learn search, not a pandas object, so it is read back with
# pickle.load (the counterpart of the dump) inside a context manager.
# WARNING: unpickling executes arbitrary code — only load files produced by this notebook.
with open('lsvc_random_search.p', 'rb') as pickle_file:
    lsvc_random_search = pickle.load(pickle_file)

In [67]:
# Per-candidate cross-validation results of the LinearSVC search.
pd.DataFrame(lsvc_random_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_penalty,param_loss,param_C,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.580203,0.038849,0.005001,2.247832e-07,l2,hinge,0.1,"{'penalty': 'l2', 'loss': 'hinge', 'C': 0.1}",0.665468,0.657863,0.680072,0.6678,0.009214,2,0.732293,0.724655,0.723755,0.726901,0.00383
1,20.556804,0.712806,0.005002,1.072147e-06,l2,squared_hinge,0.5,"{'penalty': 'l2', 'loss': 'squared_hinge', 'C'...",0.667866,0.666267,0.686675,0.6736,0.009265,1,0.741297,0.746251,0.739052,0.7422,0.003007


In [20]:
# Smaller search space (C and penalty only) for the second LinearSVC search; rebinds `param_grid`.
# NOTE(review): in scikit-learn versions where LinearSVC defaults to dual=True, penalty='l1'
# with the default squared_hinge loss is rejected at fit time — confirm against the installed version.
param_grid = {'C':[1,10],'penalty':['l1', 'l2']}

In [30]:
# Named scorers for model evaluation. The cell only displayed `scoring`,
# which is not defined in the visible notebook; it is defined here with the
# value shown in the recorded output so the cell survives a fresh kernel.
scoring = {'Accuracy': 'accuracy', 'Log Loss': 'neg_log_loss'}
scoring

{'Accuracy': 'accuracy', 'Log Loss': 'neg_log_loss'}

In [45]:
n_iter_search = 2
lsvc_random_search_2 = RandomizedSearchCV(LinearSVC(), param_distributions=param_grid, random_state=42, scoring= 'neg_log_loss',
                                   n_iter=n_iter_search,cv=2)
lsvc_random_search_2.fit(X_train, y_train)

filename = 'lsvc_random_search_2.p'
pickle.dump(lsvc_random_search_2, open(filename, 'wb'))

NameError: name 'X_train' is not defined

In [26]:
# Per-candidate cross-validation results of the second LinearSVC search.
pd.DataFrame(lsvc_random_search_2.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_penalty,param_C,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,mean_train_score,std_train_score
0,1.465384,0.037277,0.012004,0.001000524,l2,1,"{'penalty': 'l2', 'C': 1}",0.759269,0.759206,0.759237,3.1e-05,1,0.805693,0.805744,0.805719,2.5e-05
1,1.591785,0.059439,0.011003,3.576279e-07,l2,10,"{'penalty': 'l2', 'C': 10}",0.694517,0.724471,0.709492,0.014977,2,0.735962,0.762402,0.749182,0.01322
