In [20]:
%matplotlib inline
import pandas as pd
from matplotlib import pyplot as plt

import numpy as np
import seaborn as sns
import missingno as msno
#import joypy
import re
#from IPython.display import display, HTML
#import ipywidgets as widgets # for later

import gc

gc.enable()

sns.set(style="darkgrid", color_codes=True)
pd.options.display.float_format = '{:.3f}'.format

In [2]:
train = pd.read_csv('train.csv')
test  = pd.read_csv('test.csv')

In [3]:
# My own function - It is less efficient

def Create_OHE(dataframe,variables,limit):
    df = dataframe.copy()
    for variable in variables:
        if len(pd.unique(df[variable])) < limit and len(pd.unique(df[variable])) > 2:
            columnsv = [variable + '_ohe' + str(value) for value in pd.unique(df[variable]).tolist()]
            dummy = pd.get_dummies(df[variable])
            dummy.columns = columnsv
            df = df.drop(variable,axis=1).copy().join(dummy)
            #print(variable,columnsv)
            
        else:
            pass
    return df

In [4]:
def MetaDataTypes(df,types_var=['cat','bin','target']):
    # Classifying the variables in the data
    variables = []
    vartype = {}
    for variable in df.columns:
        for types in types_var:
            ty = "None"
            if df[variable].dtype == int:
                tybin = "ordinal"
            elif df[variable].dtype == float:
                tybin = "continuous"
            match = re.search('^.*'+types+'.*$',variable)
            if match:
                ty = types
                if re.search('^.*bin.*$',variable):
                    tybin='binary'
                if re.search('^.*cat.*$',variable):
                    tybin='categorical'
                if 'target' in variable:
                    tybin = 'binary'
                break

            
        variables.append([variable,ty,tybin])
        
    variablesdf = pd.DataFrame(variables,columns=['name','type','bin'])
    
    for i in ['ordinal','continuous','binary','categorical']:
        vartype[i]=variablesdf.name[(variablesdf.bin==i)]

    # Creating dataframe containing variables
    return variablesdf,vartype

In [89]:
types_var = ['ind','reg','car', 'calc','target','id']
variablesdf,vartype = MetaDataTypes(train,types_var)

In [90]:
del(vartype['binary'][1]) #Deleting target variable

In [6]:
def MICC_CARDINALITY_TRANSFORM(dataframe,target, variables, k=1, f=1,heuristic = False, drop=True):
    
    def lb_card(n,k,f):
        lb = 1/(1+np.exp(-(n-k)/f))
        return lb
    if type(variables) == str:
        variables = [variables]
    for variable in variables:
        
        df = dataframe.copy()
        g = pd.DataFrame(df.groupby(variable).count().iloc[:,0]).reset_index()
        p = pd.DataFrame(df.groupby(variable).count().iloc[:,0]).reset_index()

        if heuristic == True:
            k = df[target][df[target]>0].count()/2
            f = df[target].count()/10000*5
        posterior = pd.DataFrame(df[df[target]>0]
                                 .groupby(variable).count().iloc[:,0]).reset_index().iloc[:,-1]/g.iloc[:,-1]
        prior = df[df[target]>0].count().iloc[0]/df.count().iloc[0]

        variable_tf = lb_card(g.iloc[:,-1],k,f)*posterior + (1-lb_card(g.iloc[:,-1],k,f))*prior

        g.iloc[:,-1] = variable_tf.values
        g['prior'] = prior
        g['posterior'] = posterior

        df[variable+'_micc'] = df.merge(g,on=variable,how='left').iloc[:,-1]
        
        if drop==True:
            df.drop(variable,axis=1,inplace=True)
    
    return df

def FREQUENCY_CARDINALITY(dataframe,variables,drop=True):
    
    if type(variables) == str:
        variables = [variables]
        
    for variable in variables:
        df = dataframe.copy()
        g = pd.DataFrame(df.groupby(variable).count().iloc[:,0]).reset_index()
        g.iloc[:,-1] = g.iloc[:,-1]/df.count().iloc[0]
        g.iloc[:,-1] = g.sort_values(by=g.columns[-1]).cumsum().iloc[:,-1]
        df[variable+'_freq'] = df.merge(g,on=variable,how='left').iloc[:,-1]
        
        if drop==True:
            df.drop(variable,axis=1,inplace=True)
    return df


def CAT_TRANSFORM(dataframe,dataframe_prep,variables, limitinf=20,limitsup=200):
# limitinf = 20 # the number to consider high cardinality. Variables below will be set with one-hot encoding
# limitsup = 50 # the number to consider very-high cardinality. Variables above will be set with one-hot encoding
    df = dataframe.copy()
    df_prep = dataframe_prep.copy()

    for variable in variables:

        if len(pd.unique(df[variable])) < limitinf and len(pd.unique(df[variable])) > 2:
            pass

        elif len(pd.unique(df[variable])) < limitsup and len(pd.unique(df[variable])) >= limitinf:
            df_prep = FREQUENCY_CARDINALITY(df_prep,variable)
            print('freq',variable)

        elif len(pd.unique(df[variable])) >= limitsup:
            df_prep = MICC_CARDINALITY_TRANSFORM(df_prep,'target',variable,heuristic=True)
            print('micci',variable)
    
    return df_prep
 

In [101]:
def FILL_MISSING(df, missing_value = np.NaN):
    ''' Fills the missing values with their modes'''
    
    if missing_value != np.NaN:
        df.replace(missing_value,np.NaN,inplace=True)
    col = df.columns
    for i in col:
        if df[i].isnull().sum()>0:
            df[i].fillna(df[i].mode()[0],inplace=True)

In [118]:
train_prep = train.copy()
test_prep  = test.copy()
train_prep = train.drop(variablesdf.name[variablesdf['type']=='calc'],axis=1)
train_prep = train_prep.drop('target',axis=1)
test_prep = test.drop(variablesdf.name[variablesdf['type']=='calc'],axis=1)

In [119]:
types_var = ['ind','reg','car', 'calc','target','id']
variablesdf,vartype = MetaDataTypes(train_prep,types_var)

FILL_MISSING(train_prep,-1)
FILL_MISSING(test_prep,-1)

In [121]:
def OHE(df1,df2,column):
    cat_col = column
    #cat_col = df.select_dtypes(include =['category']).columns
    len_df1 = df1.shape[0]
    
    df = pd.concat([df1,df2],ignore_index=True)
    c2,c3 = [],{}
    
    print('Categorical feature',len(column))
    for c in cat_col:
        if df[c].nunique()>2 :
            c2.append(c)
            c3[c] = 'ohe_'+c
    
    df = pd.get_dummies(df, prefix=c3, columns=c2,drop_first=True)

    df1 = df.loc[:len_df1-1]
    df2 = df.loc[len_df1:]
    print('Train',df1.shape)
    print('Test',df2.shape)
    return df1,df2


In [122]:
def outlier(df,columns):
    for i in columns:
        quartile_1,quartile_3 = np.percentile(df[i],[25,75])
        quartile_f,quartile_l = np.percentile(df[i],[1,99])
        IQR = quartile_3-quartile_1
        lower_bound = quartile_1 - (1.5*IQR)
        upper_bound = quartile_3 + (1.5*IQR)
        print(lower_bound,upper_bound)
        print(quartile_f,quartile_l)
        
        df[i].loc[df[i] < lower_bound] = quartile_f
        df[i].loc[df[i] > upper_bound] = quartile_l
        


In [123]:
num_col =['ps_reg_03', 'ps_car_12', 'ps_car_13', 'ps_car_14']
outlier(train_prep,num_col)
outlier(test_prep,num_col)

0.08484029175 1.54909582495
0.4183300133 1.8521946442
0.190569415 0.525658351
0.3155946768 0.5656854249
0.31788087655 1.25917611615
0.448300509774 1.61761689551


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


0.2891566485 0.4608812941
0.2887905816 0.5234500931
0.08156262285 1.55455860645
0.4190763654 1.8521946442
0.190569415 0.525658351
0.3155946768 0.5656854249
0.318887063862 1.25847823656
0.4485283558 1.61723992314
0.2891566485 0.4608812941
0.2887905816 0.5234500931


In [125]:
train_prep.shape
tot_cat_col = ['target', 'ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03', 
               'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 
               'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 
               'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin', 'ps_ind_17_bin', 
               'ps_ind_18_bin', 'ps_reg_01', 'ps_reg_02', 'ps_car_01_cat', 'ps_car_02_cat', 
               'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 
               'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat', 'ps_car_11', 'ps_car_15']
len(tot_cat_col)

34

In [149]:
import itertools
c = list(itertools.chain(vartype['categorical'].values,vartype['binary'].values))

In [154]:
vartype

{'binary': 6     ps_ind_06_bin
 7     ps_ind_07_bin
 8     ps_ind_08_bin
 9     ps_ind_09_bin
 10    ps_ind_10_bin
 11    ps_ind_11_bin
 12    ps_ind_12_bin
 13    ps_ind_13_bin
 16    ps_ind_16_bin
 17    ps_ind_17_bin
 18    ps_ind_18_bin
 Name: name, dtype: object, 'categorical': 2     ps_ind_02_cat
 4     ps_ind_04_cat
 5     ps_ind_05_cat
 22    ps_car_01_cat
 23    ps_car_02_cat
 24    ps_car_03_cat
 25    ps_car_04_cat
 26    ps_car_05_cat
 27    ps_car_06_cat
 28    ps_car_07_cat
 29    ps_car_08_cat
 30    ps_car_09_cat
 31    ps_car_10_cat
 32    ps_car_11_cat
 Name: name, dtype: object, 'continuous': 19    ps_reg_01
 20    ps_reg_02
 21    ps_reg_03
 34    ps_car_12
 35    ps_car_13
 36    ps_car_14
 37    ps_car_15
 Name: name, dtype: object, 'ordinal': 0            id
 1     ps_ind_01
 3     ps_ind_03
 14    ps_ind_14
 15    ps_ind_15
 33    ps_car_11
 Name: name, dtype: object}

In [109]:
train1,test1 = OHE(train_prep,test_prep,vartype['categorical'])

Categorical feature 14
Train (595212, 185)
Test (892816, 185)


In [110]:
train1,test1 = OHE(train1,test1,vartype['binary'])

Categorical feature 11
Train (595212, 185)
Test (892816, 185)


In [60]:
train_prep = Create_OHE(train_prep,vartype['categorical'].values,120)
test_prep = Create_OHE(test_prep,vartype['categorical'].values,120)

In [61]:
print(train_prep.shape)
print(test_prep.shape)

(595212, 193)
(892816, 193)


In [None]:
train_prep = CAT_TRANSFORM(train, train_prep,vartype['categorical']) 

train_prep['ps_car_11_cat'] = train['ps_car_11_cat']
key_cat = train_prep[['ps_car_11_cat','ps_car_11_cat_freq']]
key_cat = key_cat[key_cat[['ps_car_11_cat','ps_car_11_cat_freq']].duplicated()==False]
train_prep.drop(['ps_car_11_cat'],axis=1,inplace=True)

test_prep = test_prep.merge(right=key_cat,on=['ps_car_11_cat']).drop('ps_car_11_cat',axis=1)

test_prep = test_prep[train_prep.drop(['target'],axis=1).columns.values] # Adjust the columns order
train_prep.drop(['target'],axis=1).columns == test_prep.columns #Check to see if all are equal

## Model Fitting

In [None]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
