# Week 8 Bank Data Case Study

## Load Packages

In [1]:
import pandas as pd
import numpy as np

## Read the Data

In this section we read in the data. 

In [2]:
df = pd.read_csv("../../../case_8.csv")
df.head()

Unnamed: 0,ID,target,v1,v2,v3,v4,v5,v6,v7,v8,...,v122,v123,v124,v125,v126,v127,v128,v129,v130,v131
0,3,1,1.335739,8.727474,C,3.921026,7.915266,2.599278,3.176895,0.012941,...,8.0,1.98978,0.035754,AU,1.804126,3.113719,2.024285,0,0.636365,2.857144
1,4,1,1.630686,7.464411,C,4.145098,9.191265,2.436402,2.483921,2.30163,...,6.822439,3.549938,0.598896,AF,1.672658,3.239542,1.957825,0,1.925763,1.739389
2,5,1,0.943877,5.310079,C,4.410969,5.326159,3.979592,3.928571,0.019645,...,9.333333,2.477596,0.013452,AE,1.773709,3.922193,1.120468,2,0.883118,1.176472
3,6,1,0.797415,8.304757,C,4.22593,11.627438,2.0977,1.987549,0.171947,...,7.018256,1.812795,0.002267,CJ,1.41523,2.954381,1.990847,1,1.677108,1.034483
4,8,1,1.630686,7.464411,C,4.145098,8.742359,2.436402,2.483921,1.496569,...,6.822439,3.549938,0.919812,Z,1.672658,3.239542,2.030373,0,1.925763,1.739389


No obvious issues like parsing errors or missings. Lets see what we have for data types.

In [3]:
df.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114321 entries, 0 to 114320
Data columns (total 133 columns):
 #   Column  Dtype  
---  ------  -----  
 0   ID      int64  
 1   target  int64  
 2   v1      float64
 3   v2      float64
 4   v3      object 
 5   v4      float64
 6   v5      float64
 7   v6      float64
 8   v7      float64
 9   v8      float64
 10  v9      float64
 11  v10     float64
 12  v11     float64
 13  v12     float64
 14  v13     float64
 15  v14     float64
 16  v15     float64
 17  v16     float64
 18  v17     float64
 19  v18     float64
 20  v19     float64
 21  v20     float64
 22  v21     float64
 23  v22     object 
 24  v23     float64
 25  v24     object 
 26  v25     float64
 27  v26     float64
 28  v27     float64
 29  v28     float64
 30  v29     float64
 31  v30     object 
 32  v31     object 
 33  v32     float64
 34  v33     float64
 35  v34     float64
 36  v35     float64
 37  v36     float64
 38  v37     float64
 39  v38     int64  
 40  v

We see mostly floats. There are some object data types we should probably recast. We have 114K observations, plenty to work with.  No variable names, as expected. He said there are no missings but lets check anyway.

In [4]:
df.isnull().values.any()

False

Fine. He told the truth. How about that target variable

In [5]:
counts = df.target.value_counts()
print(counts)
print(round(counts[0]/sum(counts),4))

1    87021
0    27300
Name: target, dtype: int64
0.2388


The target is binary and a little unbalanced, but not terrible.

## Data Cleaning

In [4]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.float_format', lambda x: '%.5f' % x)


df.describe()

Unnamed: 0,ID,target,v1,v2,v4,v5,v6,v7,v8,v9,v10,v11,v12,v13,v14,v15,v16,v17,v18,v19,v20,v21,v23,v25,v26,v27,v28,v29,v32,v33,v34,v35,v36,v37,v38,v39,v40,v41,v42,v43,v44,v45,v46,v48,v49,v50,v51,v53,v54,v55,v57,v58,v59,v60,v61,v62,v63,v64,v65,v67,v68,v69,v70,v72,v73,v76,v77,v78,v80,v81,v82,v83,v84,v85,v86,v87,v88,v89,v90,v92,v93,v94,v95,v96,v97,v98,v99,v100,v101,v102,v103,v104,v105,v106,v108,v109,v111,v114,v115,v116,v117,v118,v119,v120,v121,v122,v123,v124,v126,v127,v128,v129,v130,v131
count,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0,114321.0
mean,114228.92823,0.7612,1.63069,7.46441,4.1451,8.74236,2.4364,2.48392,1.49657,9.03186,1.88305,15.44741,6.8813,3.7984,12.09428,2.08091,4.92322,3.83227,0.84105,0.2223,17.77359,7.02974,1.09309,1.69813,1.87603,2.74345,5.09333,8.20642,1.62215,2.16163,6.40624,8.12239,13.3756,0.74147,0.09093,1.23718,10.46593,7.18255,12.92497,2.2166,10.79517,9.14223,1.63053,12.53802,8.01655,1.50426,7.19816,15.7113,1.25386,1.55956,4.07783,7.70165,10.58794,1.71429,14.58303,1.03069,1.68733,6.34371,15.84756,9.28728,17.56412,9.44934,12.26996,1.43177,2.4333,2.40506,7.30737,13.33448,2.2097,7.28717,6.20836,2.17381,1.60796,2.82225,1.22018,10.18022,1.92418,1.51843,0.96691,0.58237,5.47518,3.85288,0.66576,6.45795,7.62255,7.66762,1.25072,12.09162,6.86641,2.89029,5.29672,2.64283,1.08105,11.79136,2.15262,4.18128,3.36531,13.57445,10.54805,2.29122,8.30386,8.36465,3.16897,1.29122,2.7376,6.82244,3.54994,0.91981,1.67266,3.23954,2.03037,0.31014,1.92576,1.73939
std,65934.48736,0.42635,0.81326,2.22504,0.86266,1.54344,0.45061,0.44271,2.10979,1.44954,1.39347,0.59338,0.92415,0.88317,1.44392,0.55045,1.34464,1.43607,0.46286,0.12868,0.86743,1.0694,2.98732,2.24158,0.41398,0.62666,2.01131,0.96545,0.42324,0.7397,2.0242,1.00628,1.78573,0.40657,0.58348,1.77108,3.16764,0.75443,0.7488,0.48667,1.58586,1.55058,2.19532,1.64993,0.67797,1.16789,1.87306,0.60036,1.7546,0.62668,0.50925,5.13806,1.5564,0.40378,1.59344,0.69624,2.24951,1.89742,1.4105,0.84371,1.71983,1.4267,1.75436,0.92227,0.59981,1.03956,0.94339,1.38423,0.80726,1.68567,2.78821,0.79785,0.70691,1.06186,0.34985,2.27357,0.78753,2.13245,0.13438,0.1804,1.23201,0.64216,0.19835,0.84155,1.44498,1.76276,0.34655,5.17341,1.76901,1.35412,0.92291,0.66527,1.70317,2.21935,0.69222,2.81395,1.11715,2.61288,1.42744,0.5034,2.74269,1.50358,3.1636,0.55455,1.0186,1.3487,1.94343,1.59155,0.37791,1.22123,0.81434,0.69326,0.94964,0.85182
min,3.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,1.51678,0.10618,-0.0,0.04104,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.06935,-0.0,-0.0,-0.0,-0.0,-0.0,0.01306,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0,0.05306,-0.0,0.6593,-0.0,1.50136,-0.0,0.42709,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.8724,-0.0,0.02237,-0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,9e-05,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.01914,-0.0,-0.0,-0.0,0.0,0.0,-0.0,-0.0
25%,57280.0,1.0,1.34615,6.57577,4.0687,8.39409,2.34097,2.37659,0.26531,8.81356,1.05033,15.39823,6.32262,3.46409,11.25602,1.90569,4.70588,3.37983,0.71949,0.19241,17.77359,6.41875,0.0,0.27448,1.75553,2.56647,4.74236,8.11437,1.49129,1.83074,5.0558,7.89615,13.09935,0.59072,0.0,0.30522,8.41039,7.06762,12.81317,2.06897,10.54256,8.88634,0.2563,12.15693,7.914,0.65879,6.83727,15.66772,0.20819,1.2766,3.97665,4.06136,10.2168,1.5996,14.58303,1.0,0.27094,5.85294,15.84756,9.16798,17.56412,9.27007,12.08748,1.0,2.23612,2.06003,7.19023,13.15175,1.95122,7.20499,3.59298,1.81535,1.31804,2.44395,1.10841,9.55184,1.62351,0.22454,0.94962,0.51793,5.13587,3.65008,0.59461,6.36225,7.18954,7.29994,1.17271,12.09162,6.34055,2.32905,4.98815,2.43202,0.17355,11.70203,1.85201,2.72122,2.92386,11.99667,10.26667,2.13904,7.604,7.86517,1.16943,1.05263,2.28261,6.51961,2.57105,0.08471,1.57097,2.7625,1.68126,0.0,1.44948,1.46341
50%,114189.0,1.0,1.63069,7.46441,4.1451,8.74236,2.4364,2.48392,1.49657,9.03186,1.31291,15.44741,6.61324,3.7984,11.96783,2.08091,4.92322,3.83227,0.84105,0.2223,17.77359,7.03937,0.33059,1.69813,1.87603,2.74345,5.09333,8.20642,1.62215,2.16163,6.53443,8.12239,13.3756,0.74147,0.0,1.23718,10.33934,7.18255,12.92497,2.2166,10.79517,9.14223,1.63053,12.53802,8.01655,1.21194,7.19816,15.7113,1.25386,1.55956,4.07783,7.70165,10.58794,1.71429,14.58303,1.0,1.68733,6.34371,15.84756,9.28728,17.56412,9.44934,12.26996,1.0,2.4333,2.40506,7.30737,13.33448,2.2097,7.28717,6.20836,2.17381,1.60796,2.82225,1.22018,10.18022,1.92418,1.51843,0.96691,0.58237,5.47518,3.85288,0.66576,6.45795,7.62255,7.66762,1.25072,12.09162,6.86641,2.89029,5.29672,2.64283,1.08105,11.79136,2.15262,4.18128,3.36531,14.03888,10.54805,2.29122,8.30386,8.36465,3.16897,1.29122,2.7376,6.82244,3.54994,0.91981,1.67266,3.23954,2.03037,0.0,1.92576,1.73939
75%,171206.0,1.0,1.63069,7.5515,4.34023,8.9248,2.4847,2.52845,1.49657,9.30233,2.10066,15.5939,7.0194,3.7984,12.71577,2.08091,5.14286,3.83227,0.84105,0.2223,18.1546,7.66652,1.09309,1.69813,1.89891,2.7791,5.33034,8.47939,1.62215,2.16163,7.70145,8.25076,14.32492,0.74147,0.0,1.23718,12.76246,7.34477,13.04965,2.23749,11.0221,9.41516,1.63053,12.67463,8.13559,2.00572,7.41788,15.87156,1.25386,1.55956,4.15366,7.70165,10.83954,1.73502,15.31291,1.0,1.68733,6.3844,16.47085,9.46899,18.4375,9.73384,12.9166,2.0,2.43665,2.40506,7.55221,13.55932,2.24359,7.82301,6.20836,2.17381,1.60796,2.82225,1.22018,10.43359,1.92418,1.51843,0.9901,0.58237,5.47518,3.85288,0.66576,6.669,7.71084,8.00612,1.30167,15.69721,6.93119,2.89029,5.29672,2.64283,1.08105,12.44363,2.15262,4.18128,3.36531,15.37219,10.71895,2.31017,8.64537,8.41772,3.16897,1.29122,2.7376,7.0,3.54994,0.91981,1.67266,3.23954,2.03037,0.0,1.92576,1.73939
max,228713.0,1.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,18.53392,20.0,18.71055,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,19.29605,20.0,20.0,20.0,20.0,19.84819,20.0,17.56098,20.0,20.0,20.0,20.0,20.0,12.0,19.91553,20.0,20.0,20.0,20.0,19.83168,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,18.84696,7.0,20.0,20.0,20.0,20.0,20.0,20.0,19.81631,12.0,20.0,20.0,15.97351,20.0,20.0,20.0,20.0,20.0,20.0,20.0,17.56098,19.84275,20.0,20.0,6.30577,8.92384,20.0,19.01631,9.07054,20.0,20.0,19.0588,20.0,20.0,20.0,20.0,18.77525,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,10.39427,20.0,20.0,19.68607,20.0,15.63161,20.0,20.0,11.0,20.0,20.0


In [7]:
df.describe(include='object')

Unnamed: 0,v3,v22,v24,v30,v31,v47,v52,v56,v66,v71,v74,v75,v79,v91,v107,v110,v112,v113,v125
count,114321,114321,114321,114321,114321,114321,114321,114321,114321,114321,114321,114321,114321,114321,114321,114321,114321,114321,114321
unique,3,18210,5,7,3,10,12,122,3,9,3,4,18,7,7,3,22,36,90
top,C,AGDF,E,C,A,C,J,BW,A,F,B,D,C,A,E,A,F,G,BM
freq,114041,2886,55177,92288,91804,55425,11106,18233,70353,75094,113560,75087,34561,27082,27082,55688,22053,71556,5836


In [3]:
#https://github.com/Sundar0989/WOE-and-IV/blob/master/WOE_IV.ipynb
import pandas.core.algorithms as algos
from pandas import Series
import scipy.stats.stats as stats
import re
import traceback
import string

max_bin = 20
force_bin = 3

# define a binning function
def mono_bin(Y, X, n = max_bin):
    
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]
    r = 0
    while np.abs(r) < 1:
        try:
            d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.qcut(notmiss.X, n)})
            d2 = d1.groupby('Bucket', as_index=True)
            r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
            n = n - 1 
        except Exception as e:
            n = n - 1

    if len(d2) == 1:
        n = force_bin         
        bins = algos.quantile(notmiss.X, np.linspace(0, 1, n))
        if len(np.unique(bins)) == 2:
            bins = np.insert(bins, 0, 1)
            bins[1] = bins[1]-(bins[1]/2)
        d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.cut(notmiss.X, np.unique(bins),include_lowest=True)}) 
        d2 = d1.groupby('Bucket', as_index=True)
    
    d3 = pd.DataFrame({},index=[])
    d3["MIN_VALUE"] = d2.min().X
    d3["MAX_VALUE"] = d2.max().X
    d3["COUNT"] = d2.count().Y
    d3["EVENT"] = d2.sum().Y
    d3["NONEVENT"] = d2.count().Y - d2.sum().Y
    d3=d3.reset_index(drop=True)
    
    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE':np.nan},index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4,ignore_index=True)
    
    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]       
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    
    return(d3)

def char_bin(Y, X):
        
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]    
    df2 = notmiss.groupby('X',as_index=True)
    
    d3 = pd.DataFrame({},index=[])
    d3["COUNT"] = df2.count().Y
    d3["MIN_VALUE"] = df2.sum().Y.index
    d3["MAX_VALUE"] = d3["MIN_VALUE"]
    d3["EVENT"] = df2.sum().Y
    d3["NONEVENT"] = df2.count().Y - df2.sum().Y
    
    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE':np.nan},index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4,ignore_index=True)
    
    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]      
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    d3 = d3.reset_index(drop=True)
    
    return(d3)

def data_vars(df1, target):
    
    stack = traceback.extract_stack()
    filename, lineno, function_name, code = stack[-2]
    vars_name = re.compile(r'\((.*?)\).*$').search(code).groups()[0]
    final = (re.findall(r"[\w']+", vars_name))[-1]
    
    x = df1.dtypes.index
    count = -1
    
    for i in x:
        if i.upper() not in (final.upper()):
            if np.issubdtype(df1[i], np.number) and len(Series.unique(df1[i])) > 2:
                conv = mono_bin(target, df1[i])
                conv["VAR_NAME"] = i
                count = count + 1
            else:
                conv = char_bin(target, df1[i])
                conv["VAR_NAME"] = i            
                count = count + 1
                
            if count == 0:
                iv_df = conv
            else:
                iv_df = iv_df.append(conv,ignore_index=True)
    
    iv = pd.DataFrame({'IV':iv_df.groupby('VAR_NAME').IV.max()})
    iv = iv.reset_index()
    return(iv_df,iv)

In [7]:
pd.set_option('display.max_rows', 500)
forWOE = df[["v22","target"]].copy()

final_iv, IV = data_vars(forWOE , forWOE.target)
final_iv.sort_values("WOE",ascending=False)

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,VAR_NAME,MIN_VALUE,MAX_VALUE,COUNT,EVENT,EVENT_RATE,NONEVENT,NON_EVENT_RATE,DIST_EVENT,DIST_NON_EVENT,WOE,IV
9055,v22,JEA,JEA,36,35,0.972222,1,0.027778,0.000402,0.000037,2.396085,0.297343
14740,v22,TPX,TPX,25,24,0.960000,1,0.040000,0.000276,0.000037,2.018791,0.297343
2624,v22,AEUR,AEUR,23,22,0.956522,1,0.043478,0.000253,0.000037,1.931780,0.297343
12366,v22,PEF,PEF,23,22,0.956522,1,0.043478,0.000253,0.000037,1.931780,0.297343
16518,v22,WWN,WWN,22,21,0.954545,1,0.045455,0.000241,0.000037,1.885260,0.297343
...,...,...,...,...,...,...,...,...,...,...,...,...
731,v22,ABIM,ABIM,6,1,0.166667,5,0.833333,0.000011,0.000183,-2.768701,0.297343
4876,v22,BMV,BMV,6,1,0.166667,5,0.833333,0.000011,0.000183,-2.768701,0.297343
1112,v22,ACBD,ACBD,6,1,0.166667,5,0.833333,0.000011,0.000183,-2.768701,0.297343
12304,v22,PAZ,PAZ,7,1,0.142857,6,0.857143,0.000011,0.000220,-2.951022,0.297343


## Data Prep

In [4]:
from sklearn.preprocessing import StandardScaler
def transform_data(data):
    #OH encode
    label_encode = data.select_dtypes(include='object').columns
    normalize = data.drop(columns=["ID","target"]).select_dtypes(include='number').columns

    data_OHE = pd.get_dummies(data, columns=label_encode)

    scaler = StandardScaler()
    data_OHE[normalize] = scaler.fit_transform(data_OHE[normalize])
 
    return data_OHE

In [5]:
df['v22'] = df['v22'].astype('category')
df2 = transform_data(df)
df3 = df2.merge(final_iv[["MIN_VALUE","WOE"]], how='left', left_on="v22",right_on="MIN_VALUE")
preModel_data = df3.drop(columns=["v22","MIN_VALUE"]).copy()

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


NameError: name 'final_iv' is not defined

In [10]:
preModel_data.head()

Unnamed: 0,ID,target,v1,v2,v4,v5,v6,v7,v8,v9,...,v125_R,v125_S,v125_T,v125_U,v125_V,v125_W,v125_X,v125_Y,v125_Z,WOE
0,3,1,-0.362671,0.5676624,-0.259746,-0.535879,0.3614557,1.56529,-0.7032151,0.667897,...,0,0,0,0,0,0,0,0,0,0.0
1,4,1,2.730298e-16,7.983532e-16,0.0,0.290849,9.85525e-16,0.0,0.3815862,0.0,...,0,0,0,0,0,0,0,0,0,0.023433
2,5,1,-0.8445118,-0.9682275,0.3082,-2.213376,3.424656,3.263175,-0.7000377,2.507568,...,0,0,0,0,0,0,0,0,0,0.0
3,6,1,-1.024604,0.3776793,0.093701,1.86926,-0.7516472,-1.121205,-0.6278492,-0.045768,...,0,0,0,0,0,0,0,0,0,0.307074
4,8,1,2.730298e-16,7.983532e-16,0.0,0.0,9.85525e-16,0.0,-1.052455e-16,0.0,...,0,0,0,0,0,0,0,0,1,1.238633


## Model prep

In [9]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score #https://scikit-learn.org/stable/modules/model_evaluation.html

In [11]:
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

In [10]:
n_estimators= list(range(80, 110, 10))
max_features = list(range(5, 50, 5))
min_samples_split = list(range(500, 701, 100))
min_samples_leaf = [10, 20]
print(f'n_estimator_grid_search:{n_estimators}')
print(f'max_features_grid_search:{max_features}')
print(f'min_samples_split_grid_search:{min_samples_split}')
print(f'min_samples_leaf_grid_search:{min_samples_leaf}')


param_dist = {'n_estimators': n_estimators,
              'max_features': max_features,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf}

scoring = {  'Accuracy':'accuracy'
            , 'Log Loss':'neg_log_loss'}

n_estimator_grid_search:[80, 90, 100]
max_features_grid_search:[5, 10, 15, 20, 25, 30, 35, 40, 45]
min_samples_split_grid_search:[500, 600, 700]
min_samples_leaf_grid_search:[10, 20]


In [15]:
X = df.copy().drop(columns=["ID","target"]).select_dtypes(include=['number'])
print("The shape of X is: ", X.shape)

y = df.loc[:,"target"].copy()
print("The shape of y is: ", y.shape)

The shape of X is:  (114321, 112)
The shape of y is:  (114321,)


## Random Forest
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import pickle

In [12]:
clf = RandomForestClassifier(max_depth=2, random_state=0, n_jobs=-1)
clf.fit(X, y)

NameError: name 'X' is not defined

Grid search for random forest

In [11]:
%%time
#%%script false --no-raise-error

n_iter_search = 2
rf_random_search = RandomizedSearchCV(clf, param_distributions=param_dist, scoring=scoring, cv = cv, random_state=42,
                                   n_iter=n_iter_search, refit='Accuracy')
rf_random_search.fit(X, y)

filename = 'rf_random_search.p'
pickle.dump(rf_random_search, open(filename, 'wb'))

Wall time: 56 s


In [12]:
rf_random_search = pd.read_pickle('rf_random_search.p')
pd.DataFrame(rf_random_search.cv_results_)




Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_features,params,split0_test_Accuracy,...,mean_test_Log Loss,std_test_Log Loss,rank_test_Log Loss,split0_train_Log Loss,split1_train_Log Loss,split2_train_Log Loss,split3_train_Log Loss,split4_train_Log Loss,mean_train_Log Loss,std_train_Log Loss
0,4.877302,0.84036,0.376586,0.000893,100,600,20,45,"{'n_estimators': 100, 'min_samples_split': 600...",0.761207,...,-0.511646,0.000566,1,-0.510321,-0.510189,-0.510482,-0.510396,-0.510773,-0.510432,0.000196
1,3.229324,0.051449,0.377685,0.004225,90,500,10,35,"{'n_estimators': 90, 'min_samples_split': 500,...",0.761207,...,-0.51481,0.000694,2,-0.513723,-0.513504,-0.513954,-0.513783,-0.513657,-0.513724,0.000148


## XGBoost

In [6]:
from xgboost import XGBClassifier
#https://xgboost.readthedocs.io/en/latest/build.html

  import pandas.util.testing as tm


In [18]:
# A parameter grid for XGBoost
params = {
        'learning_rate': [0.005, 0.01, 0.02, 0.05, 0.1],
        'n_estimators': [100,200,400,600,800,1000],
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

xgb = XGBClassifier(objective='binary:logistic', early_stopping_rounds=10, tree_method='hist',
                    silent=True, nthread=-1)

In [19]:
%%time
#%%script false --no-raise-error
n_iter_search = 2
xgb_random_search = RandomizedSearchCV(xgb, param_distributions=params, scoring=scoring, cv = cv, random_state=42,
                                   n_iter=n_iter_search, refit='Accuracy')
xgb_random_search.fit(X, y)

#filename = 'xgb_random_search.p'
#pickle.dump(xgb_random_search, open(filename, 'wb'))`

Wall time: 5min 36s


In [16]:
xgb_random_search = pd.read_pickle('xgb_random_search.p')
pd.DataFrame(xgb_random_search.cv_results_)



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_n_estimators,param_min_child_weight,param_max_depth,param_learning_rate,param_gamma,...,mean_test_Log Loss,std_test_Log Loss,rank_test_Log Loss,split0_train_Log Loss,split1_train_Log Loss,split2_train_Log Loss,split3_train_Log Loss,split4_train_Log Loss,mean_train_Log Loss,std_train_Log Loss
0,78.695948,0.513996,0.563534,0.012532,0.8,1000,5,5,0.1,2,...,-0.499054,0.001442,2,-0.37165,-0.371244,-0.370991,-0.371692,-0.370583,-0.371232,0.000416
1,37.084821,0.255249,0.416493,0.004925,1.0,800,10,3,0.005,1,...,-0.496882,0.000619,1,-0.493681,-0.493714,-0.493701,-0.49341,-0.49411,-0.493723,0.000224


## SVC
This never finishes. Need to reduce features for it to do anything.

In [69]:
from sklearn.svm import SVC
#https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

In [32]:
sampled = preModel_data.sample(n=500, replace=False, random_state=1)


X = sampled.copy().drop(columns=["ID","target"]).select_dtypes(include=['number'])
print("The shape of X is: ", X.shape)

y = sampled.loc[:,"target"].copy()
print("The shape of y is: ", y.shape)

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

The shape of X is:  (500, 477)
The shape of y is:  (500,)


In [33]:
param_grid = {'C':[1,10,100,1000],'gamma':[1,0.1,0.001,0.0001], 'kernel':['linear','rbf']}

In [35]:
%%time
#%%script false --no-raise-error
n_iter_search = 2
svc_random_search = RandomizedSearchCV(SVC(probability=True), param_distributions=param_grid, scoring=scoring, cv = 2, random_state=42,
                                   n_iter=n_iter_search, refit='Accuracy')
svc_random_search.fit(X, y)

filename = 'svc_random_search.p'
pickle.dump(svc_random_search, open(filename, 'wb'))

Wall time: 893 ms


In [37]:
pd.DataFrame(svc_random_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kernel,param_gamma,param_C,params,split0_test_Accuracy,split1_test_Accuracy,...,std_train_Accuracy,split0_test_Log Loss,split1_test_Log Loss,mean_test_Log Loss,std_test_Log Loss,rank_test_Log Loss,split0_train_Log Loss,split1_train_Log Loss,mean_train_Log Loss,std_train_Log Loss
0,0.105361,0.0001226664,0.029006,0.001,rbf,0.001,1000,"{'kernel': 'rbf', 'gamma': 0.001, 'C': 1000}",0.708,0.636,...,0.0,-0.53173,-0.53294,-0.532335,0.000605,2,-0.34814,-0.437948,-0.393044,0.044904
1,0.078018,4.768372e-07,0.029008,0.0,rbf,0.0001,10,"{'kernel': 'rbf', 'gamma': 0.0001, 'C': 10}",0.768,0.768,...,0.002,-0.533451,-0.529176,-0.531314,0.002137,1,-0.499605,-0.504951,-0.502278,0.002673


## Linear SVC

In [22]:
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import log_loss, accuracy_score 
import pickle
#https://scikit-learn.org/stable/modules/model_evaluation.html
#https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html

In [17]:
preModel_data.head()

Unnamed: 0,ID,target,v1,v2,v4,v5,v6,v7,v8,v9,...,v125_R,v125_S,v125_T,v125_U,v125_V,v125_W,v125_X,v125_Y,v125_Z,WOE
0,3,1,-0.362671,0.5676624,-0.259746,-0.535879,0.3614557,1.56529,-0.7032151,0.667897,...,0,0,0,0,0,0,0,0,0,0.0
1,4,1,2.730298e-16,7.983532e-16,0.0,0.290849,9.85525e-16,0.0,0.3815862,0.0,...,0,0,0,0,0,0,0,0,0,0.023433
2,5,1,-0.8445118,-0.9682275,0.3082,-2.213376,3.424656,3.263175,-0.7000377,2.507568,...,0,0,0,0,0,0,0,0,0,0.0
3,6,1,-1.024604,0.3776793,0.093701,1.86926,-0.7516472,-1.121205,-0.6278492,-0.045768,...,0,0,0,0,0,0,0,0,0,0.307074
4,8,1,2.730298e-16,7.983532e-16,0.0,0.0,9.85525e-16,0.0,-1.052455e-16,0.0,...,0,0,0,0,0,0,0,0,1,1.238633


In [18]:
sampled = preModel_data.sample(frac=0.1, replace=False, random_state=1)


X = sampled.copy().drop(columns=["ID","target"]).select_dtypes(include=['number'])
print("The shape of X is: ", X.shape)

y = sampled.loc[:,"target"].copy()
print("The shape of y is: ", y.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

The shape of X is:  (11432, 477)
The shape of y is:  (11432,)


In [13]:
param_grid = {'C':[1,10,100,1000],'gamma':[1,0.1,0.001,0.0001]}

In [79]:

scoring = {  'Log Loss':'neg_log_loss'
           }

In [19]:
n_iter_search = 2
lsvc_random_search = RandomizedSearchCV(SVC(kernel='linear',probability=True), param_distributions=param_grid, random_state=42,
                                   n_iter=n_iter_search)
lsvc_random_search.fit(X_train, y_train)

filename = 'lsvc_random_search.p'
pickle.dump(lsvc_random_search, open(filename, 'wb'))



KeyboardInterrupt: 

In [2]:
lsvc_random_search = pd.read_pickle('lsvc_random_search.p')

In [3]:
lsvc_random_search.results

AttributeError: 'RandomizedSearchCV' object has no attribute 'results'

In [20]:
param_grid = {'C':[1,10],'penalty':['l1', 'l2']}

In [30]:
scoring

{'Accuracy': 'accuracy', 'Log Loss': 'neg_log_loss'}

In [31]:
n_iter_search = 2
lsvc_random_search_2 = RandomizedSearchCV(LinearSVC(), param_distributions=param_grid, random_state=42, scoring= 'neg_log_loss',
                                   n_iter=n_iter_search,cv=2)
lsvc_random_search_2.fit(X_train, y_train)

filename = 'lsvc_random_search_2.p'
pickle.dump(lsvc_random_search_2, open(filename, 'wb'))



AttributeError: 'LinearSVC' object has no attribute 'predict_proba'

In [26]:
pd.DataFrame(lsvc_random_search_2.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_penalty,param_C,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,mean_train_score,std_train_score
0,1.465384,0.037277,0.012004,0.001000524,l2,1,"{'penalty': 'l2', 'C': 1}",0.759269,0.759206,0.759237,3.1e-05,1,0.805693,0.805744,0.805719,2.5e-05
1,1.591785,0.059439,0.011003,3.576279e-07,l2,10,"{'penalty': 'l2', 'C': 10}",0.694517,0.724471,0.709492,0.014977,2,0.735962,0.762402,0.749182,0.01322
