# Data Preparation

This notebook prepares the data for the models. It defines helper functions for encoding categorical variables that can be reused in the other notebooks, and then fits a first baseline model on the prepared data.

In [1]:
%matplotlib inline
import pandas as pd
from matplotlib import pyplot as plt

import numpy as np
import seaborn as sns

#import joypy
import re
#from IPython.display import display, HTML
#import ipywidgets as widgets # for later

import gc

# Make sure automatic garbage collection is switched on.
gc.enable()

# Plot styling: seaborn dark grid, with seaborn's short colour codes enabled.
sns.set(style="darkgrid", color_codes=True)
# Display floats with three decimals in DataFrame output.
pd.options.display.float_format = '{:.3f}'.format

In [2]:
# Load the raw train and test sets from CSV files in the working directory.
train = pd.read_csv('train.csv')
test  = pd.read_csv('test.csv')

In [3]:
# My own function - It is less efficient

def Create_OHE(dataframe,variables,limit):
    """One-hot encode the low-cardinality columns of a dataframe.

    A column is encoded only when its number of distinct values is strictly
    greater than 2 and strictly smaller than ``limit``; every other column is
    left untouched. Each encoded column is replaced by indicator columns named
    ``<variable>_ohe<value>``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input data. It is not modified; a copy is returned.
    variables : iterable of str
        Names of the candidate columns.
    limit : int
        Exclusive upper bound on the number of distinct values.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataframe`` with the selected columns replaced by their
        indicator columns (appended at the end).
    """
    df = dataframe.copy()
    for variable in variables:
        n_levels = len(pd.unique(df[variable]))
        if not (2 < n_levels < limit):
            continue

        dummy = pd.get_dummies(df[variable])
        # get_dummies orders its columns by sorted level, not by order of
        # appearance, and skips missing values. The names are therefore built
        # from the dummy columns themselves so that every label stays attached
        # to the indicator it describes.
        dummy.columns = [variable + '_ohe' + str(value) for value in dummy.columns.tolist()]
        df = df.drop(variable,axis=1).join(dummy)
    return df

In [4]:
def MetaDataTypes(df,types_var=['cat','bin','target']):
    """Classify every column of ``df`` by name tag and by measurement level.

    For each column the function records:

    * ``type`` - the first entry of ``types_var`` (each entry is used as a
      regular expression) found in the column name, or ``"None"``.
    * ``bin`` - the measurement level. It starts from the dtype
      (integer -> ``"ordinal"``, float -> ``"continuous"``, anything else ->
      ``"None"``). When the column matched one of ``types_var`` it is then
      refined from the name: ``bin`` -> ``"binary"``, ``cat`` ->
      ``"categorical"``, ``target`` -> ``"binary"`` (later rules win).

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose columns are classified.
    types_var : list of str
        Name tags to look for, in priority order.

    Returns
    -------
    variablesdf : pandas.DataFrame
        One row per column with the fields ``name``, ``type`` and ``bin``.
    vartype : dict
        Maps ``'ordinal'``, ``'continuous'``, ``'binary'`` and
        ``'categorical'`` to the Series of column names with that level.
    """
    variables = []
    for variable in df.columns:
        # Derive the default level from the dtype once per column. The dtype
        # helpers avoid the platform-dependent ``dtype == int`` comparison and
        # the explicit fallback keeps a non-numeric column from inheriting the
        # level of the previous column.
        dtype = df[variable].dtype
        if pd.api.types.is_integer_dtype(dtype):
            tybin = "ordinal"
        elif pd.api.types.is_float_dtype(dtype):
            tybin = "continuous"
        else:
            tybin = "None"

        # Defined before the loop so that an empty ``types_var`` is valid.
        ty = "None"
        for types in types_var:
            if re.search('^.*'+types+'.*$',variable):
                ty = types
                # The name-based refinement is only applied to tagged columns.
                if 'bin' in variable:
                    tybin = 'binary'
                if 'cat' in variable:
                    tybin = 'categorical'
                if 'target' in variable:
                    tybin = 'binary'
                break

        variables.append([variable,ty,tybin])

    variablesdf = pd.DataFrame(variables,columns=['name','type','bin'])

    vartype = {}
    for i in ['ordinal','continuous','binary','categorical']:
        vartype[i] = variablesdf['name'][variablesdf['bin']==i]

    return variablesdf,vartype

In [5]:
# Name tags to look for in the column names, in priority order.
types_var = ['ind','reg','car', 'calc','target','id']
# Classify the training columns; `vartype` groups column names by level.
variablesdf,vartype = MetaDataTypes(train,types_var)

### Treating high cardinality categorical data

High-cardinality categorical variables are encoded with an empirical Bayesian estimator of the target rate, following the paper at 'https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf'.

In [6]:
def MICC_CARDINALITY_TRANSFORM(dataframe,target, variables, k=1, f=1,heuristic = False, drop=True):
    """Encode categorical columns with a shrunken target rate.

    For every level of a variable the encoded value is

        lambda(n) * posterior + (1 - lambda(n)) * prior

    where ``n`` is the number of rows with that level, ``posterior`` is the
    share of those rows with ``target > 0``, ``prior`` is the share of all rows
    with ``target > 0`` and ``lambda(n) = 1 / (1 + exp(-(n - k) / f))``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input data. It is not modified; a copy is returned.
    target : str
        Name of the target column.
    variables : str or iterable of str
        Column(s) to encode.
    k, f : float
        Location and scale of the logistic weight ``lambda``.
    heuristic : bool
        When true, ``k`` and ``f`` are replaced by values derived from the
        data: half the number of positive rows, and the non-null target count
        divided by 10000 and multiplied by 5.
    drop : bool
        When true, each encoded source column is removed from the result.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataframe`` with one ``<variable>_micc`` column per variable.
    """
    def lb_card(n,k,f):
        return 1/(1+np.exp(-(n-k)/f))

    if type(variables) == str:
        variables = [variables]

    # Copy once, outside the loop, so that the encodings of all variables
    # accumulate in the same frame (and an empty `variables` still returns).
    df = dataframe.copy()
    is_positive = df[target] > 0

    if heuristic == True:
        k = is_positive.sum()/2
        f = df[target].count()/10000*5
    prior = is_positive.sum()/len(df)

    for variable in variables:
        # Both statistics come from the same groupby, so they are aligned by
        # level even when a level has no positive row.
        grouped = is_positive.groupby(df[variable])
        counts = grouped.size()
        posterior = grouped.sum()/counts

        weight = lb_card(counts,k,f)
        estimate = weight*posterior + (1-weight)*prior

        # Look the blended estimate up by level; this keeps the row index of
        # `df` intact whatever that index is.
        df[variable+'_micc'] = df[variable].map(estimate)

        if drop==True:
            df.drop(variable,axis=1,inplace=True)

    return df

In [7]:
def FREQUENCY_CARDINALITY(dataframe,variables,drop=True):
    """Encode categorical columns by cumulative relative frequency.

    For each variable the levels are ordered from rarest to most frequent and
    every level is replaced by the cumulative share of rows up to and
    including that level (so the most frequent level is encoded as 1.0).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input data. It is not modified; a copy is returned.
    variables : str or iterable of str
        Column(s) to encode.
    drop : bool
        When true, each encoded source column is removed from the result.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataframe`` with one ``<variable>_freq`` column per variable.
    """
    if type(variables) == str:
        variables = [variables]

    # Copy once, outside the loop, so that the encodings of all variables
    # accumulate in the same frame (and an empty `variables` still returns).
    df = dataframe.copy()
    n_rows = len(df)

    for variable in variables:
        share = df.groupby(variable).size()/n_rows
        # The cumulative sum stays indexed by level, so each level keeps its
        # own value. A stable sort makes the order of tied levels reproducible.
        cumulative = share.sort_values(kind='mergesort').cumsum()
        # Look the value up by level; this keeps the row index of `df` intact.
        df[variable+'_freq'] = df[variable].map(cumulative)

        if drop==True:
            df.drop(variable,axis=1,inplace=True)
    return df

In [8]:
# Work on copies so the raw frames stay available.
train_prep = train.copy()
test_prep  = test.copy()
# One-hot encode the categorical columns with more than 2 and fewer than 19
# distinct values, separately for train and test.
# NOTE(review): CAT_TRANSFORM's default lower threshold is 20, so a column with
# exactly 19 levels would be handled by neither step - confirm 19 is intended.
# NOTE(review): train and test are encoded independently; a level present in
# only one of them would give the two frames different columns - verify.
train_prep = Create_OHE(train_prep,vartype['categorical'].values,19)
test_prep = Create_OHE(test_prep,vartype['categorical'].values,19)

In [9]:
def CAT_TRANSFORM(dataframe,dataframe_prep,variables, limitinf=20,limitsup=200, target='target'):
    """Encode the high-cardinality categorical columns of ``dataframe_prep``.

    The number of distinct values of each variable is measured on
    ``dataframe`` and decides what happens to it in ``dataframe_prep``:

    * fewer than ``limitinf`` values: left untouched here (low cardinality);
    * at least ``limitinf`` and fewer than ``limitsup``: replaced by
      ``FREQUENCY_CARDINALITY``;
    * at least ``limitsup``: replaced by ``MICC_CARDINALITY_TRANSFORM`` with
      its heuristic parameters.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that still holds the original categorical columns.
    dataframe_prep : pandas.DataFrame
        Frame to transform. It is not modified; a copy is returned.
    variables : iterable of str
        Names of the categorical columns to consider.
    limitinf : int
        Cardinality from which a variable is frequency-encoded.
    limitsup : int
        Cardinality from which a variable is target-encoded instead.
    target : str
        Name of the target column passed to the target encoding.

    Returns
    -------
    pandas.DataFrame
        Transformed copy of ``dataframe_prep``.
    """
    df_prep = dataframe_prep.copy()

    for variable in variables:
        # `dataframe` is only read, so it does not need to be copied, and the
        # cardinality is computed once per variable.
        n_levels = len(pd.unique(dataframe[variable]))

        if 2 < n_levels < limitinf:
            continue

        if limitinf <= n_levels < limitsup:
            df_prep = FREQUENCY_CARDINALITY(df_prep,variable)
            print('freq',variable)

        elif n_levels >= limitsup:
            df_prep = MICC_CARDINALITY_TRANSFORM(df_prep,target,variable,heuristic=True)
            print('micci',variable)

    return df_prep
 

In [10]:
# Encode the remaining high-cardinality categorical columns of the training
# frame; cardinalities are measured on the raw `train` frame.
train_prep = CAT_TRANSFORM(train, train_prep,vartype['categorical']) 

freq ps_car_11_cat


In [11]:
# Lookup table from each raw `ps_car_11_cat` level to the frequency encoding
# learned on the training set (one row per distinct pair). The raw column is
# attached to a temporary frame, so `train_prep` itself is left as it was.
key_cat = (
    train_prep[['ps_car_11_cat_freq']]
    .assign(ps_car_11_cat=train['ps_car_11_cat'])
    [['ps_car_11_cat','ps_car_11_cat_freq']]
    .drop_duplicates()
)

In [12]:
# Apply the training-set frequency encoding to the test set.
# Each level must map to exactly one encoded value, otherwise the merge would
# duplicate test rows.
assert key_cat['ps_car_11_cat'].is_unique, "key_cat has several encodings for one ps_car_11_cat level"
# A left merge keeps every test row, in order. A level that never occurred in
# the training set gets NaN instead of having its rows silently dropped, which
# would leave ids missing from the submission.
test_prep = test_prep.merge(right=key_cat,on=['ps_car_11_cat'],how='left').drop('ps_car_11_cat',axis=1)

In [13]:
# Select the test columns in the same order as the training features
# (all training columns except `target`).
test_prep = test_prep[train_prep.drop(['target'],axis=1).columns.values] # Adjust the columns order
# Element-wise comparison of the two column indexes, displayed as a boolean
# array. After the selection above the two indexes hold the same names.
train_prep.drop(['target'],axis=1).columns == test_prep.columns #Check to see if all are equal

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True], dtype=bool)

## Model Fitting

In [28]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split

# Hold out 25% of the training data to score the pipeline TPOT selects.
# The split is seeded with the same seed as TPOT, so the hold-out score can be
# reproduced on a re-run.
# To smoke-test TPOT quickly, slice both arrays to their first 10000 rows
# before splitting.
# NOTE(review): the `id` column is still among the features - confirm that is
# intended.
X_train, X_test, y_train, y_test = train_test_split(np.array(train_prep.drop(['target'],axis=1)),
                                                    np.array(train_prep.target),
                                                    train_size=0.75, test_size=0.25,
                                                    random_state=42)

# Search over the 'TPOT light' configuration, scored by ROC AUC with 3-fold
# cross-validation; max_time_mins=60 puts a time budget on the search.
tpot = TPOTClassifier(generations=5, population_size=20,config_dict='TPOT light',
                      verbosity=2,scoring= 'roc_auc',n_jobs=-1,random_state=42,max_time_mins=60,cv=3)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
# Save the selected pipeline as a Python script.
tpot.export('tpot_pipeline_porto.py')

                                                                                 


79.11827201666667 minutes have elapsed. TPOT will close down.
TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: LogisticRegression(input_matrix, C=25.0, dual=False, penalty=l1)
0.627500486634


True

In [16]:
from sklearn.linear_model import LogisticRegression
# Score on the training set was:0.6300687794243328
# The solver is named explicitly: the L1 penalty is not supported by the
# lbfgs solver that newer scikit-learn releases use by default, while
# liblinear (the former default) supports it.
exported_pipeline = LogisticRegression(C=25., dual=False, penalty="l1", solver="liblinear")
# Feature matrix (every prepared column except the target) and target vector.
X = np.array(train_prep.drop(['target'],axis=1))
y = np.array(train_prep.target)

In [17]:
from sklearn.model_selection import cross_val_score
# 3-fold cross-validated ROC AUC of the pipeline on the full training data.
cvscore = cross_val_score(exported_pipeline,X,y,scoring='roc_auc',cv=3)

In [18]:
# Average ROC AUC over the cross-validation folds.
cvscore.mean()

0.62819730029799248

In [19]:
# Refit on the full training data, then score the test set.
exported_pipeline.fit(X,y)
id_test = test_prep['id'].values
# The model was fitted on a plain array, so the test features are passed as a
# plain array as well (same columns, same order). Column 1 of predict_proba
# is the probability of the positive class.
y_pred = exported_pipeline.predict_proba(test_prep.values)[:,1]

In [20]:
# Assemble the submission (test id and predicted probability) and write it to
# CSV without the row index.
sub = pd.DataFrame({'id': id_test, 'target': y_pred}, columns=['id', 'target'])
sub.to_csv('submission1_logistictpot1.csv', index=False)

In [183]:
# Number of positions where the submission ids equal the ids in `test_prep`;
# it should equal the number of test rows.
sum(id_test == test_prep.id.values)

892816