# Datathon Interbank Internacional 2019
Panuccio Abraham Alan

# Preparacion de datos

## Librerias y configuraciones iniciales

In [None]:
import numpy as np 
import pandas as pd
import copy
from datetime import timedelta
from datetime import datetime
import pandas.core.algorithms as algos
from pandas import Series
import scipy.stats.stats as stats
import string
import functools
from importlib import reload
import functools
from sklearn import preprocessing
import joblib
from google.colab import drive
from importlib.machinery import SourceFileLoader
from sklearn.preprocessing import KBinsDiscretizer  
from sklearn import model_selection
from sklearn.metrics import roc_auc_score

from pandas import read_csv
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from lightgbm import LGBMClassifier
from scipy.optimize import differential_evolution
from lightgbm import LGBMRegressor
from scipy.optimize import differential_evolution
from sklearn.model_selection import train_test_split

#import custom_feature_selection as cfs
#import custom_constants as cc

import importlib.util
import sys

# Folder inside the mounted Google Drive that holds every competition file.
BASE_PATH="/content/gdrive/My Drive/Competencias DM/interbank-internacional-2019/"

# Mount Google Drive so that BASE_PATH can be read from this runtime.
drive.mount('/content/gdrive/')


def _load_module_from_path(module_name, file_path):
  """Import the Python source file at ``file_path`` under ``module_name``.

  Uses the importlib spec API instead of the deprecated
  ``SourceFileLoader.load_module()``.

  Args:
    module_name: name to register the module under in ``sys.modules``.
    file_path: path of the ``.py`` file to execute.

  Returns:
    The loaded module object.

  Raises:
    ImportError: if no import spec can be built for ``file_path``.
  """
  spec = importlib.util.spec_from_file_location(module_name, file_path)
  if spec is None or spec.loader is None:
    raise ImportError("Cannot load module " + module_name + " from " + file_path)
  module = importlib.util.module_from_spec(spec)
  # Register before executing, as load_module() did, so the module can be
  # found by name while its own body runs.
  sys.modules[module_name] = module
  try:
    spec.loader.exec_module(module)
  except Exception:
    # Do not leave a half-initialised module behind.
    sys.modules.pop(module_name, None)
    raise
  return module


# Project helper modules stored next to the data files.
cff = _load_module_from_path("custom_features_functions", BASE_PATH+"custom_features_functions.py")
cc = _load_module_from_path("custom_constants", BASE_PATH+"custom_constants.py")

## Lectura de Datos

Se leerán las bases básicas, que solo tienen registros a nivel de usuario. Solo se leerá la información de campaña que dependa del tiempo. Queda como mejora pendiente la incorporación de más información temporal.

In [None]:
def _read_base(file_name):
  """Read one competition file located under BASE_PATH into a DataFrame."""
  return pd.read_csv(BASE_PATH+file_name)


# Training and testing bases
train = _read_base("ib_base_inicial_train.csv")
X_test = _read_base("ib_base_inicial_test.csv")

# Complementary bases
sunat = _read_base("ib_base_sunat.csv")
reniec = _read_base("ib_base_reniec.csv")
vehicular = _read_base("ib_base_vehicular.csv")
digital = _read_base("ib_base_digital.csv")

# The largest bases are stored as zip files
rcc = _read_base("ib_base_rcc.zip")
campanias = _read_base("ib_base_campanias.zip")

In [None]:
# The test base has no outcome columns; add them as zero placeholders so it
# carries the same columns as train.
for placeholder in ("codtarget", "margen"):
  X_test[placeholder] = 0

# Binary target: 1 when the margin is strictly positive, 0 otherwise.
train["codtarget"] = train["margen"].gt(0).astype(int)

## Fuentes de Train y Test
Comienzo analizando las fuentes de datos de train y test, para entender de qué se tratan y qué variables puedo agregar.

In [None]:
# Preview the first five rows of the training base.
train.head(5)

In [None]:
# Number of non-null id_persona values in the training base for each codmes.
train.groupby(["codmes"])[["id_persona"]].count()

### Correlacion de variables vs target
Si analizamos la correlación de las variables de train contra el target, vemos que id_persona es la variable más correlacionada positivamente con el target. Esto puede deberse a que sea un autoincremental; entonces, lo que nos estaría diciendo es que son clientes más "antiguos": la variable que entiendo que está influyendo no es el id, sino la antigüedad del cliente como tal.

In [None]:
# Correlation of every column of train with codtarget, highest value first.
train.corr()[["codtarget"]].sort_values(by="codtarget", ascending=False)


In [None]:
# Analyse how the event probability is distributed over N bins built from
# id_persona, to see how the highest ids differ from the lowest ones.
bins = []
for i in range(20):
  # Each candidate bin count is evaluated on its own copy of train.
  train2 = copy.copy(train)
  # cff.scale_feature is defined outside this file; the groupby below relies
  # on it having added an "SCL_id_persona" column to train2.
  cff.scale_feature(train2, ["id_persona"], i, preprocessing.MaxAbsScaler())
  # Keep only the bin with the highest mean target for this value of i.
  rr = train2.groupby(["SCL_id_persona"])[["codtarget"]].mean().sort_values(["codtarget"], ascending=False).head(1)
  bins.append({"bins":str(i), "P":round(list(rr["codtarget"])[0], 4)})

# Choose the bin count whose best bin has the highest conversion probability.
bins = pd.DataFrame(bins)
bins = bins.sort_values(by=["P"], ascending=False).head(1)

# Apply that binning to both train and test.
cff.scale_feature(train, ["id_persona"], int(bins["bins"].iloc[0]), preprocessing.MaxAbsScaler())
cff.scale_feature(X_test, ["id_persona"], int(bins["bins"].iloc[0]), preprocessing.MaxAbsScaler())

# Check whether the correlation improves once the variable's dimensionality
# has been reduced drastically (which should also limit overfitting).
train.corr()[["codtarget"]].sort_values(by="codtarget", ascending=False)

In [None]:
def _ratio_x10(numerator, denominator):
  """Return round(10 * numerator / denominator).

  When the denominator is 0 the result is 0 for a zero numerator and 100 for
  a positive one; any other case falls through to the plain division.
  """
  if denominator == 0:
    if numerator == 0:
      return 0
    if numerator > 0:
      return 100
  return round(10*numerator/denominator)


# Build ratios between the financial variables with the highest correlation,
# on the intuition that a quotient may explain the variance better.
for base in [train, X_test]:
  base["c_linea_ingreso"] = [
      _ratio_x10(linea, ingreso)
      for linea, ingreso in zip(base["linea_ofrecida"], base["ingreso_neto"])
  ]
  base["c_ingreso_cem"] = [
      _ratio_x10(ingreso, cem)
      for ingreso, cem in zip(base["ingreso_neto"], base["cem"])
  ]

train.corr()[["codtarget"]].sort_values(by="codtarget", ascending=False)

## Base SUNAT - Actividades Económicas

In [None]:
# This source is not used for now: it does not say which month the snapshot
# belongs to and some customers have more than one record, so it could turn
# out to be a false predictor.
# The table below counts activ_econo records per person, largest first.
sunat.groupby(["id_persona"])[["activ_econo"]].count().sort_values(by="activ_econo", ascending=False)

## Base Reniec - Variables Sociodemográficas

In [None]:
# Preview the first five rows of the RENIEC base.
reniec.head(5)

### Correlacion de variables vs target
Si analizamos la correlación de las variables de RENIEC contra el target, vemos que soc_var1 es la variable más correlacionada positivamente con el target. No tenemos muchos detalles sobre qué es cada variable como para sacar conclusiones.

In [None]:
# Attach the RENIEC variables to train and rank every column by the absolute
# value of its correlation with the target.
reneic_tg = pd.merge(train, reniec, how="left", on=["id_persona"])
dt_corr = reneic_tg.corr()[["codtarget"]].abs()
dt_corr.sort_values(by="codtarget", ascending=False)

In [None]:
# Same ranking by absolute correlation, using only the columns of train.
dt_corr = train.corr()[["codtarget"]].abs()
dt_corr.sort_values(by="codtarget", ascending=False)

In [None]:
# Release the merged train + RENIEC frame built for the correlation table.
del reneic_tg

## Base Vehicular - Datos sobre el vehículo que tiene y variables relacionadas

In [None]:
# Preview the first five rows of the vehicle base.
vehicular.head(5)

### Desnormalizacion de datos y tabla resumen

In [None]:
# The final dataset needs a single row per person.

# Start with per-person totals of each vehicle variable (sum, mean, min, max).
# Named aggregation is used because passing a dict of new-name -> function to
# SeriesGroupBy.agg was deprecated and raises SpecificationError in pandas >= 1.0.
var1_ag = vehicular.groupby(["id_persona"], as_index=False)["veh_var1"].agg(
    sum_var1="sum", avg_var1="mean", min_var1="min", max_var1="max")
var2_ag = vehicular.groupby(["id_persona"], as_index=False)["veh_var2"].agg(
    sum_var2="sum", avg_var2="mean", min_var2="min", max_var2="max")
var1_ag.head(5)

In [None]:
# Number of distinct non-null values in the marca column.
vehicular[["marca"]].drop_duplicates().count()

### Categoría propia de automóviles

In [None]:
# To break the data down by car brand, the brands are grouped into 25
# categories according to the value of variables 1 and 2, and separate
# features are then computed for each category.
# Median of each variable per brand (brands without a value get 0).
marcas_var1_ag = vehicular.groupby(["marca"], as_index=False)["veh_var1"].median().fillna(0)
marcas_var2_ag = vehicular.groupby(["marca"], as_index=False)["veh_var2"].median().fillna(0)
#marcas_var1_ag.head(5)

# Bin the per-brand medians with a 25-bin, uniform-width ordinal discretizer.
# cff.binning_feature is defined outside this file; the positional renames
# below rely on it appending exactly one column to each frame.
cff.binning_feature(marcas_var1_ag, ["veh_var1"], 1, KBinsDiscretizer(n_bins=25, encode='ordinal', strategy='uniform'))
cff.binning_feature(marcas_var2_ag, ["veh_var2"], 1, KBinsDiscretizer(n_bins=25, encode='ordinal', strategy='uniform'))

# Rename the columns and store the category as a string so it can be used
# as part of a column name further down.
marcas_var1_ag.columns = ["marca","veh_var1","cat_var1"]
marcas_var2_ag.columns = ["marca","veh_var2","cat_var2"]
marcas_var1_ag["cat_var1"] = marcas_var1_ag["cat_var1"].astype("str")
marcas_var2_ag["cat_var2"] = marcas_var2_ag["cat_var2"].astype("str")

# Add the new categories to the original dataset.
vehicular_cat = vehicular.merge(marcas_var1_ag[["marca","cat_var1"]], how="left", on="marca")
vehicular_cat = vehicular_cat.merge(marcas_var2_ag[["marca","cat_var2"]], how="left", on="marca")
vehicular_cat.head(10)

# One row per person; for each variable add one column per category holding
# the sum and one holding the mean of that variable within the category.
dt_vehiculos = pd.DataFrame({"id_persona":vehicular_cat["id_persona"].drop_duplicates()})
for i in [1,2]:
  # Person x category table of sums (0 where the person has no such category).
  vehicular_end = vehicular_cat.groupby(["id_persona", "cat_var"+str(i)])["veh_var"+str(i)].sum().unstack(level=1, fill_value=0).astype("float32")
  vehicular_end.columns = ["sum_v"+str(i)+"_cat"+c for c in vehicular_end.columns]
  dt_vehiculos = dt_vehiculos.merge(vehicular_end, how="left", on="id_persona")
  del vehicular_end
  
  # Same table with the mean instead of the sum.
  vehicular_end = vehicular_cat.groupby(["id_persona", "cat_var"+str(i)])["veh_var"+str(i)].mean().unstack(level=1, fill_value=0).astype("float32")
  vehicular_end.columns = ["avg_v"+str(i)+"_cat"+c for c in vehicular_end.columns]
  dt_vehiculos = dt_vehiculos.merge(vehicular_end, how="left", on="id_persona")
  del vehicular_end

print(dt_vehiculos.shape)
dt_vehiculos.head(10)

### Correlacion de variables vs target

In [None]:
# Correlation of the vehicle features with the target: the highest categories
# (those with the largest values of the variables) correlate the most.
dt_vehiculos_tg = pd.merge(train[["codmes","id_persona","codtarget"]], dt_vehiculos, on=["id_persona"])
dt_cors = dt_vehiculos_tg.corr()[["codtarget"]].abs()
dt_cors.drop(["codtarget","id_persona"], axis=0).sort_values(by="codtarget", ascending=False).head(10)

In [None]:
# Score the vehicle features against the conversion target.
# cff.score_features is defined outside this file; the code below reads the
# 'VAR_NAME' and 'IV' columns of its second return value.
# NOTE(review): the frame passed in still contains id_persona and codtarget —
# confirm the helper does not score the target column itself.
print("     Feature Selection [ ", end = '')
final_iv, scored_fe = cff.score_features(dt_vehiculos_tg.drop(["codmes"],axis=1), dt_vehiculos_tg.codtarget)
scored_fe   = scored_fe.rename(columns={'VAR_NAME':'index'})
# Keep the names of the 10 features with the highest IV.
selected_fe = scored_fe.sort_values(['IV'],ascending=0).head(10)
selected_fe = selected_fe.set_index('index').index
print(" ]")
del dt_vehiculos_tg

# The key column is always needed for the later joins.
col_index = pd.Index(["id_persona"]) 
col_index = col_index.append(selected_fe)
# De-duplicate; going through a set does not preserve the column order.
col_index = list(set(col_index))
col_index

# Keep only the selected columns plus the key column.
dt_vehiculos = dt_vehiculos[col_index]
dt_vehiculos.head(5)

## Base Digital
Esta fuente es diaria. Para empezar, la unificaremos en una fuente mensual y después, en base a su correlación con la variable target, realizaremos un análisis por ventanas de tiempo y de tendencia.

In [None]:
# Size and first five rows of the daily digital base.
print(digital.shape)
digital.head(5)

In [None]:
# Build a monthly table: the total of every column per person and month,
# plus the number of daily records in that month.
# codday is read as a YYYYMMDD value; month is the first day of its month.
digital["month"] = list(map(lambda datee: datetime(int(str(datee)[:4]), int(str(datee)[4:6]), 1), digital["codday"]))
digital_mes = digital.groupby(["id_persona","month"], as_index=False).sum()
# The summed codday is meaningless, so it is replaced by a count of records.
digital_mes.drop(["codday"], axis=1, inplace=True)
# NOTE(review): this counts non-null codday rows; it equals the number of
# distinct days only if the source has one row per person and day — confirm.
digital_mes["codday"] = digital.groupby(["id_persona","month"], as_index=False)["codday"].count()["codday"]

# Month as a YYYYMM integer.
digital_mes["month_int"] = digital_mes['month'].dt.strftime("%Y%m").astype(int)

print(digital_mes.shape)
digital_mes.head(5)

In [None]:
# Number of person rows available in the monthly digital table for each month.
digital_mes.groupby(["month_int"])["id_persona"].count()

### Variables historicas por ventanas de tiempo

In [None]:
columns_to_ignore = ["month", "month_int", "id_persona"]

# Work on a copy sorted chronologically within each person, so that shift(k)
# below reads the row k positions *back* in time. The original call discarded
# the result of sort_values (and asked for descending order, which would have
# pulled future months into the current row had it been applied).
data_temp = copy.copy(digital_mes).sort_values(by=["id_persona","month"])

# Leftover progress counters; kept in case other cells still read them.
q_columns = len(data_temp.drop(columns_to_ignore, axis=1).columns)
q_col = 0

# The column list is fixed up front because the loop adds columns to data_temp.
feature_columns = list(data_temp.drop(columns_to_ignore, axis=1).columns)

for c in feature_columns:
    print("haciendo", c)

    # Bring the values of the previous row and of two rows back (per person)
    # onto the current row.
    data_temp[c + "_M1"] = data_temp.groupby(['id_persona'], as_index=False)[c].shift(1)
    data_temp[c + "_M2"] = data_temp.groupby(['id_persona'], as_index=False)[c].shift(2)

    # Sum of the current value and the two previous ones (NaN while the
    # person has fewer than three rows of history).
    data_temp["sm_"+c] = data_temp[c] + data_temp[c + "_M1"] + data_temp[c + "_M2"]

    # Average of the same three values, as computed by cff.promedio.
    data_temp["avg_"+c] = [
        cff.promedio([t, t1, t2])
        for t2, t1, t in zip(data_temp[c + "_M2"], data_temp[c + "_M1"], data_temp[c])
    ]

    # Normalise the three columns before computing the trend; the helper is
    # expected to add the "SCL_"-prefixed columns read below.
    cff.scale_feature(data_temp, [c + "_M2", c + "_M1", c], 100, preprocessing.MaxAbsScaler())

    # Trend: mean of (current, previous) against mean of (previous, two back).
    data_temp["td_"+c] = [
        cff.tendencia((t+t1)/2, (t1+t2)/2)
        for t2, t1, t in zip(data_temp["SCL_"+c + "_M2"], data_temp["SCL_"+c + "_M1"], data_temp["SCL_"+c])
    ]

    # Remove the auxiliary columns created for this variable.
    data_temp = data_temp.drop([c + "_M1", c + "_M2"], axis=1)
    data_temp = data_temp.drop(data_temp.filter(like='SCL_').columns, axis=1)

# data_temp now holds every original and derived column. The original code
# appended the whole growing frame on each iteration and de-duplicated the
# columns after a concat, which yields this same frame at a much higher
# memory cost.
digital_hist = data_temp

# Safety net kept from the original: drop any column whose name still
# contains one of the auxiliary markers.
for marker in ("M3", "M2", "M1", "SCL_"):
    digital_hist = digital_hist.drop(digital_hist.filter(like=marker).columns, axis=1)

# fillna returns a new frame; the original discarded it, leaving the NaNs.
digital_hist = digital_hist.fillna(0)
digital_hist.head(5)


In [None]:
# Correlation of the historical digital features with the target, to see
# which of them turned out to be the most correlated.
X_train2 = copy.copy(train)
X_train2["month"] = [
    datetime(int(str(codmes)[:4]), int(str(codmes)[4:6]), 1)
    for codmes in X_train2["codmes"]
]

dt_result_tg = pd.merge(X_train2[["month","id_persona","codtarget"]], digital_hist, on=["id_persona", "month"])
dt_cors = dt_result_tg.corr()[["codtarget"]].abs()
dt_cors.drop(["codtarget","id_persona","month_int"], axis=0).sort_values(by="codtarget", ascending=False).head(5)

In [None]:
# Score the historical digital features against the conversion target.
# cff.score_features is defined outside this file; the code below reads the
# 'VAR_NAME' and 'IV' columns of its second return value.
# NOTE(review): dt_result_tg still contains month, id_persona and codtarget —
# confirm the helper does not score the target column itself.
print("     Feature Selection [ ", end = '')
final_iv, scored_fe = cff.score_features(dt_result_tg, dt_result_tg.codtarget)
scored_fe   = scored_fe.rename(columns={'VAR_NAME':'index'})
# Keep the names of the 30 features with the highest IV.
selected_fe = scored_fe.sort_values(['IV'],ascending=0).head(30)
selected_fe = selected_fe.set_index('index').index
print(" ]")

# The key columns are always needed for the later joins.
col_index = pd.Index(["month","id_persona"]) 
col_index = col_index.append(selected_fe)
# De-duplicate; going through a set does not preserve the column order.
col_index = list(set(col_index))
col_index

# Keep only the selected columns plus the key columns.
digital_hist = digital_hist[col_index]
del dt_result_tg
digital_hist.head(5)

## Base RCC - Deuda otros bancos

In [None]:
# Read the RCC base again (the data-loading cell already read this file);
# presumably so this section can be re-run from an unmodified frame.
rcc = pd.read_csv(BASE_PATH+"ib_base_rcc.zip")
#rcc.groupby(["codmes"])[["id_persona"]].count()
# Number of records per product.
rcc.groupby(["producto"])[["id_persona"]].count()

In [None]:
# Median balance per bank, largest first.
cod_bancos = rcc.groupby(["cod_banco"], as_index=False)[["mto_saldo"]].median()
cod_bancos = cod_bancos.sort_values(by=["mto_saldo"], ascending=False)
# Bin the medians with a 200-bin, uniform-width ordinal discretizer.
# cff.binning_feature is defined outside this file; the next line relies on
# it having added a "BN_mto_saldo" column.
cff.binning_feature(cod_bancos, ["mto_saldo"], 1, KBinsDiscretizer(n_bins=200, encode='ordinal', strategy='uniform'))
# Keep bins 0, 1 and 2 as they are and collapse every other value into 3.
cod_bancos["BN_mto_saldo"] = list(map(lambda b: 0 if b==0 else 1 if b==1 else 2 if b ==2 else 3, cod_bancos["BN_mto_saldo"]))
# Attach the bank group to every RCC record.
rcc = rcc.merge(cod_bancos[["cod_banco","BN_mto_saldo"]], on=["cod_banco"], how="left")
#rcc.head(5)
# Number of records per bank group.
rcc.groupby(["BN_mto_saldo"])[["id_persona"]].count()

In [None]:
# Maps every normalised RCC product name to its product group.
keys = {
    "AVANCE": "OTROS",
    "CARTAS_DE_CREDITO_OTORGADAS": "OTROS",
    "CARTAS_FIANZAS_OTORGADAS": "OTROS",
    "CREDITOS_CASTIGOS": "PRESTAMOS_COMERCIALES",
    "CREDITO_CASTIGADOS_SIENDO_AMORTIZADOS": "PRESTAMOS_COMERCIALES",
    "CREDITO_INMOBILIARIOS": "PRESTAMOS_COMERCIALES",
    "DERIVADOS_ME_--_FORWARDS": "OTROS",
    "DERIVADOS_ME_--_OPCIONES": "OTROS",
    "DESCUENTOS": "OTROS",
    "FACTORING": "PRESTAMOS_COMERCIALES",
    "FINANC_COMEX": "PRESTAMOS_COMERCIALES",
    "HIPOTECARIO_MIVIVIENDA": "PRESTAMOS",
    "HIPOTECARIO_REGULAR": "PRESTAMOS",
    "LEASING": "PRESTAMOS_COMERCIALES",
    "LINEA_TOTAL_TC": "TARJETAS",
    "OTROS_CREDITOS": "PRESTAMOS",
    "PRESTAMOS_COMERCIALES": "PRESTAMOS_COMERCIALES",
    "PRESTAMO_PERSONAL": "PRESTAMOS",
    "REFINANCIADOS": "PRESTAMOS",
    "RESTO_RD": "OTROS",
    "RESTO_RI": "OTROS",
    "SOBREGIRO": "OTROS",
    "TARJETAS_COMPRAS": "TARJETAS",
    "TARJETAS_EFECTIVO": "TARJETAS",
    "TARJETAS_OTROS_CONCEPTOS": "TARJETAS",
    "TARJETAS_SIN_DEFINIR": "TARJETAS",
    "TARJETA_EMP_COMPRA": "TARJETA_EMP",
    "TARJETA_EMP_EFECTIVO": "TARJETA_EMP",
    "TARJETA_EMP_OTROS_CONCEPTOS": "TARJETA_EMP",
    "TARJETA_EMP_SIN_DEFINIR": "TARJETA_EMP",
    "VEHICULAR": "PRESTAMOS",
}

def normalizar_producto_rcc(text):
  """Return the product group of an RCC product name.

  The name is converted to str, stripped of non-ASCII characters, its spaces
  are turned into underscores and it is upper-cased before the lookup.

  Raises:
    KeyError: if the normalised name is not a key of ``keys``.
  """
  ascii_text = str(text).encode("ascii", "ignore").decode("ascii", "ignore")
  normalized = ascii_text.replace(" ","_").upper()
  return keys[normalized]

In [None]:
# Add the product group of every RCC record as a new column, then show the
# number of records per original product.
rcc["producto2"] = [normalizar_producto_rcc(producto) for producto in rcc["producto"]]
rcc.groupby(["producto"])[["id_persona"]].count()

## Base de Campanias

In [None]:
# Size and first five rows of the campaigns base.
print(campanias.shape)
campanias.head(5)

In [None]:
# Number of campaign records (non-null id_persona) for each codmes.
campanias.groupby(["codmes"])[["id_persona"]].count()

In [None]:
def normalizar_producto(text):
  """Collapse a campaign product name into a coarse product label.

  The name is converted to str, stripped of non-ASCII characters (so an
  accented word loses the accented letter: "Préstamo" becomes "PRSTAMO"),
  its spaces are turned into underscores and it is upper-cased. The label is
  then chosen by the first rule that matches:

  1. contains "SEGURO"                       -> "SEGURO"
  2. contains "PRSTAMO", "PRESTAMO" or "CRDITO" -> "PRESTAMO"
  3. contains "_TC" or "TC_", and not "ADQUISICIN_TC" -> "OTROS_TC"
  4. otherwise the first two underscore-separated words (or the only one).

  Args:
    text: product name; any value accepted by str().

  Returns:
    The normalised label as a str.
  """
  text = str(text).encode("ascii", "ignore").decode("ascii", "ignore").replace(" ","_").upper()
  if "SEGURO" in text:
    return "SEGURO"
  # No accented spelling is tested: the text is pure ASCII at this point, so
  # such a comparison could never match.
  if any(token in text for token in ("PRSTAMO", "PRESTAMO", "CRDITO")):
    return "PRESTAMO"
  if ("_TC" in text or "TC_" in text) and "ADQUISICIN_TC" not in text:
    return "OTROS_TC"
  return "_".join(text.split("_")[:2])

def normalizar_canal(text):
  """Collapse a campaign channel name into one of the known channel labels.

  The name is converted to str, stripped of non-ASCII characters, its spaces
  and parentheses are turned into underscores and it is upper-cased. The
  known labels are then tried in a fixed order and the first one contained
  in the cleaned name is returned.

  Returns:
    The matching label, or None when no label is contained in the name.
  """
  cleaned = str(text).encode("ascii", "ignore").decode("ascii", "ignore")
  for separator in (" ", "(", ")"):
    cleaned = cleaned.replace(separator, "_")
  cleaned = cleaned.upper()

  # Order matters: the first label found wins.
  canales = (
      "ABP",
      "ATM",
      "BANCA_TELEFONICA",
      "BOLSA",
      "CALL_EXTERNO",
      "CARTERA",
      "EBP",
      "EXPRESS",
      "FFVV",
      "GT",
      "RED_DE_TIENDAS",
      "TELEVENTAS",
      "TLV",
      "NAN",
  )
  return next((canal for canal in canales if canal in cleaned), None)

In [None]:
# Replace product and channel by their normalised labels and build a combined
# "<product>_<channel>" key.
campanias["producto"] = [normalizar_producto(producto) for producto in campanias["producto"]]
campanias["canal_asignado"] = [normalizar_canal(canal) for canal in campanias["canal_asignado"]]
campanias["prod_canal"] = [
    str(producto)+"_"+str(canal)
    for producto, canal in zip(campanias["producto"], campanias["canal_asignado"])
]
campanias.head(5)

## Creación del Target de predicción

Se opta por construir un target binario para establecer quiénes son clientes rentables y, por tanto, a quiénes conviene hacerles campaña para atraerlos.

In [None]:
# X_train, y_train and X_test are plain module-level names, so no `global`
# declarations are used here: at module level they have no effect, and
# declaring X_test global after it has already been assigned is a SyntaxError
# when the notebook is exported and run as a single script.

# Labels: one row per (id_persona, codmes), indexed by prediction_id
# ("<id_persona>_<codmes>"); target is 1 when the margin is strictly positive.
y_train = train[['codmes', 'id_persona', 'margen']].copy()
y_train["prediction_id"] = y_train["id_persona"].astype(str) + "_" + y_train["codmes"].astype(str)
y_train["target"] = (y_train["margen"] > 0).astype(int)
y_train = y_train.set_index("prediction_id")

# Features: train without the outcome columns, with the same prediction_id.
X_train = train.drop(["codtarget", "margen"], axis=1)
X_train["prediction_id"] = X_train["id_persona"].astype(str) + "_" + X_train["codmes"].astype(str)
X_test["prediction_id"] = X_test["id_persona"].astype(str) + "_" + X_test["codmes"].astype(str)
del train

## Consolidación de Bases

Se unen todas las bases por id_persona.

In [None]:
# Reshape the complementary bases to one row per person.
# SUNAT: one column per economic activity holding the summed meses_alta.
sunat = sunat.groupby(["id_persona", "activ_econo"]).meses_alta.sum().unstack(level=1, fill_value=0).astype("int32")
# Vehicles: one column per brand holding the summed veh_var1 / veh_var2.
# NOTE(review): in the cells visible here vehicular1 and vehicular2 are only
# renamed and later deleted, never merged into X_train / X_test.
vehicular1 = vehicular.groupby(["id_persona", "marca"]).veh_var1.sum().unstack(level=1, fill_value=0).astype("float32")
vehicular2 = vehicular.groupby(["id_persona", "marca"]).veh_var2.sum().unstack(level=1, fill_value=0).astype("float32")
# RENIEC: indexed by person, every column cast to float32.
reniec = reniec.set_index("id_persona").astype("float32")
del vehicular

In [None]:
# Tag the brand columns with the variable they were computed from.
for pivot, suffix in ((vehicular1, "_v1"), (vehicular2, "_v2")):
  pivot.columns = [marca + suffix for marca in pivot.columns]

In [None]:
# Preview the first ten rows of the training features.
X_train.head(10)

In [None]:
def _consolidar_bases(base):
  """Left-join the digital, vehicle, RENIEC and SUNAT tables onto ``base``.

  A temporary "month" column (first day of the row's codmes) is added to
  ``base`` to join the monthly digital history and is dropped from the
  merged result, which is returned.
  """
  base["month"] = [
      datetime(int(str(codmes)[:4]), int(str(codmes)[4:6]), 1)
      for codmes in base["codmes"]
  ]
  fuentes = (
      (digital_hist, ["id_persona","month"]),
      (dt_vehiculos, ["id_persona"]),
      (reniec, ["id_persona"]),
      (sunat, ["id_persona"]),
  )
  for fuente, claves in fuentes:
    base = base.merge(fuente, how="left", on=claves)
  base.drop(["month"], axis=1, inplace=True)
  return base


X_train = _consolidar_bases(X_train)
X_test = _consolidar_bases(X_test)

print(X_train.shape)
X_train.head(5)

In [None]:
# Release the per-person tables now that X_train and X_test have been built.
del vehicular1, vehicular2, reniec, sunat

## Variables historicas de campañas

In [None]:
def _conteo_campanias(columna):
  """Count campaign rows per (codmes, id_persona) for each value of ``columna``.

  Returns a frame indexed and sorted by codmes, with id_persona and one
  int32 count column per value of ``columna``.
  """
  conteo = campanias.groupby(["codmes", "id_persona", columna]).size()
  tabla = conteo.unstack(level=2, fill_value=0).reset_index()
  return tabla.set_index("codmes").sort_index().astype("int32")


camp_canal = _conteo_campanias("canal_asignado")
camp_prod = _conteo_campanias("producto")
del campanias
import gc
gc.collect()

In [None]:
# Force a garbage-collection pass to reclaim the memory of deleted frames.
import gc
gc.collect()

In [None]:
# Window of campaign months (inclusive codmes labels) used for each target month.
meses = {
    201901: slice(201801, 201810),
    201902: slice(201802, 201811),
    201903: slice(201803, 201812),
    201904: slice(201804, 201901),
    201905: slice(201805, 201902),
    201906: slice(201806, 201903),
    201907: slice(201807, 201904)
}
tw="10M"


def _resumen_campanias(tabla, ventana, mes, metodo, prefijo):
  """Aggregate ``tabla`` per person over ``ventana`` for target month ``mes``.

  ``metodo`` is the name of the groupby aggregation to apply. The result is
  indexed by (id_persona, codmes), cast to float32, and every column name
  is prefixed with ``prefijo``.
  """
  resumen = getattr(tabla.loc[ventana].groupby("id_persona"), metodo)()
  resumen["codmes"] = mes
  resumen = resumen.reset_index().set_index(["id_persona", "codmes"]).astype("float32")
  resumen.columns = [prefijo+c for c in resumen.columns]
  return resumen


# Product tables first, then channel tables (whose columns get an extra "c");
# within each, sum, average and maximum.
tablas = ((camp_prod, ""), (camp_canal, "c"))
agregaciones = (("sum", "sum"), ("avg", "mean"), ("max", "max"))

complementos = []
for mes, ventana in meses.items():
    print("*"*10, mes, "*"*10)

    piezas = [
        _resumen_campanias(tabla, ventana, mes, metodo, tw+"_"+etiqueta+"_"+marca)
        for tabla, marca in tablas
        for etiqueta, metodo in agregaciones
    ]
    res = pd.concat(piezas, axis=1)

    complementos.append(res)

gc.collect()
print("contatenando complementos "+str(tw))
complementos = pd.concat(complementos)
gc.collect()
print("X_train join")
X_train = X_train.reset_index().join(complementos, on=["id_persona", "codmes"]).set_index("prediction_id")
gc.collect()
print("X_test join")
X_test = X_test.reset_index().join(complementos, on=["id_persona", "codmes"]).set_index("prediction_id")
gc.collect()

del camp_canal, camp_prod, complementos,res
gc.collect()

X_train.head()

## Renombrado de Variables con nombre no ascii

El algoritmo que usamos no se lleva bien con cadenas de texto con caracteres especiales, las renombramos.

In [None]:
def emprolijar_dataset():
  """Tidy the global X_train and X_test so both expose the same clean columns.

  Steps, applied to the module-level X_train and X_test:

  1. Columns whose name contains a non-ASCII character are renamed to
     "non_ascii_<i>" (the same name in both frames).
  2. Columns that are entirely NaN are dropped from each frame.
  3. Both frames are reduced to the columns they have in common, in the
     order those columns appear in X_train.
  4. Remaining NaN values are replaced by 0.

  Returns:
    None. X_train and X_test are rebound as module globals.
  """
  global X_train
  global y_train
  global X_test
  global y_test

  non_ascii = [s for s in X_train.columns if any(ord(ch) >= 128 for ch in s)]
  for i, c in enumerate(non_ascii):
      nuevo = "non_ascii_" + str(i)
      X_train[nuevo] = X_train[c]
      X_train = X_train.drop(c, axis= 1)
      # A column missing from X_test is discarded by the intersection below,
      # so it must not abort the renaming with a KeyError.
      if c in X_test.columns:
          X_test[nuevo] = X_test[c]
          X_test = X_test.drop(c, axis= 1)

  X_train=X_train.dropna(axis=1,how='all')
  X_test=X_test.dropna(axis=1,how='all')

  # Use a list in X_train order: indexing a DataFrame with a set is rejected
  # by recent pandas versions and gave an arbitrary column order before.
  en_test = set(X_test.columns)
  comunes = [c for c in X_train.columns if c in en_test]

  X_test = X_test[comunes]
  X_train = X_train[comunes]

  # fillna returns a new frame; the result has to be assigned to take effect.
  X_train = X_train.fillna(0)
  X_test = X_test.fillna(0)

## Variables historicas de RCC

In [None]:
# Every table below is a module-level name read by the agregar_rcc_* functions;
# at module level no `global` declaration is needed for that.
_RCC_KEYS = ["codmes", "id_persona"]


def _rcc_por_categoria(categoria, valor=None, metodo="size", dtype="int32"):
  """Pivot the RCC base per (codmes, id_persona) with one column per ``categoria`` value.

  With ``valor`` left as None the cells hold row counts; otherwise they hold
  the ``metodo`` aggregation of column ``valor``. Missing combinations are 0.
  The result is indexed and sorted by codmes and cast to ``dtype``.
  """
  grupos = rcc.groupby(_RCC_KEYS + [categoria])
  if valor is not None:
    grupos = grupos[valor]
  tabla = getattr(grupos, metodo)().unstack(level=2, fill_value=0).reset_index()
  return tabla.set_index("codmes").sort_index().astype(dtype)


def _rcc_total(valor=None, metodo="size", dtype="int32"):
  """Aggregate the RCC base per (codmes, id_persona) without any breakdown.

  Same aggregation rules as _rcc_por_categoria; the result is indexed and
  sorted by (codmes, id_persona) and cast to ``dtype``.
  """
  grupos = rcc.groupby(_RCC_KEYS)
  if valor is not None:
    grupos = grupos[valor]
  tabla = getattr(grupos, metodo)().reset_index()
  return tabla.set_index(_RCC_KEYS).sort_index().astype(dtype)


# Number of records
rcc_prod = _rcc_por_categoria("producto")
rcc_prod2 = _rcc_por_categoria("producto2")
rcc_tot = _rcc_total()
rcc_bco = _rcc_por_categoria("BN_mto_saldo")

# Summed balance
rcc_sdo_prod = _rcc_por_categoria("producto", "mto_saldo", "sum", float)
rcc_sdo_prod2 = _rcc_por_categoria("producto2", "mto_saldo", "sum", float)
rcc_sdo_tot = _rcc_total("mto_saldo", "sum", float)
rcc_sdo_bco = _rcc_por_categoria("BN_mto_saldo", "mto_saldo", "sum", float)
rcc_sdo_clasif = _rcc_por_categoria("clasif", "mto_saldo", "sum", float)
rcc_sdo_mora = _rcc_por_categoria("rango_mora", "mto_saldo", "sum", float)

# Maximum rango_mora
rcc_mora_prod = _rcc_por_categoria("producto", "rango_mora", "max", float)
rcc_mora_prod2 = _rcc_por_categoria("producto2", "rango_mora", "max", float)
rcc_mora_tot = _rcc_total("rango_mora", "max", float)
rcc_mora_bco = _rcc_por_categoria("BN_mto_saldo", "rango_mora", "max", float)

gc.collect()

### 3 Meses

In [None]:
def agregar_rcc_3m(FEATURE_SELECTION):
  """Join 3-month RCC aggregates onto the global X_train and X_test.

  For every target month in ``meses`` the six RCC tables (record counts and
  balances: by product, in total and by bank group) are sliced to that
  month's window of codmes labels, and their per-person sum and mean become
  new "3M_RCC_..." columns. The columns are joined on (id_persona, codmes)
  and both datasets are finally passed through emprolijar_dataset().

  Args:
    FEATURE_SELECTION: when falsy, every new column is kept. When truthy,
      the new columns are scored with cff.score_features against
      y_train["target"] and only the 50 with the highest IV are kept, next
      to the columns X_train already had.

  Returns:
    None. X_train and X_test are rebound as module globals.
  """
  global X_train
  global y_train
  global X_test
  global y_test

  # Target month -> window of RCC months (both ends inclusive).
  meses = {
      201901: slice(201807, 201809),
      201902: slice(201808, 201810),
      201903: slice(201809, 201811),
      201904: slice(201810, 201812),
      201905: slice(201811, 201901),
      201906: slice(201812, 201902),
      201907: slice(201901, 201903)
  }
  tw="3M_RCC_"
  dts = {
      "sz_prod": rcc_prod,
      "sz_tot": rcc_tot,
      "sz_bco": rcc_bco,
      "sdo_prod": rcc_sdo_prod,
      "sdo_tot": rcc_sdo_tot,
      "sdo_bco": rcc_sdo_bco,
  }
  claves = ["id_persona", "codmes"]

  def _agregado(tabla, ventana, mes, metodo, prefijo):
    # Per-person aggregation of one table over one window, keyed by
    # (id_persona, codmes) with prefixed column names.
    res = getattr(tabla.loc[ventana].groupby("id_persona"), metodo)()
    res["codmes"] = mes
    res = res.reset_index().set_index(claves).astype("float32")
    res.columns = [prefijo+str(c) for c in res.columns]
    return res

  def _colgar(base):
    # Join the new columns onto a dataset, keeping prediction_id as its index.
    return base.reset_index().join(complementos, on=claves).set_index("prediction_id")

  print("agregar_rcc_3m() - INICIO")

  complementos = []
  for mes, ventana in meses.items():
      print(" "*5 + "Mes: "+str(mes))
      reses = []
      for dd, tabla in dts.items():
        reses.append(_agregado(tabla, ventana, mes, "sum", tw+"_sum_"+dd+"_"))
        reses.append(_agregado(tabla, ventana, mes, "mean", tw+"_avg_"+dd+"_"))
      complementos.append(pd.concat(reses, axis=1))

  complementos = pd.concat(complementos)
  gc.collect()

  if not(FEATURE_SELECTION):
    # An "index" column may be left over from an earlier reset_index();
    # errors="ignore" drops it only when present instead of hiding every
    # possible exception behind a bare except.
    X_train = _colgar(X_train).drop(columns=["index"], errors="ignore")
    X_test = _colgar(X_test).drop(columns=["index"], errors="ignore")
    gc.collect()

  else:
    cols_to_hang = X_train.columns
    cols_to_rmv = [ x for x in X_train.columns if 'codmes' not in x and 'id_persona' not in x ]
    X_train = _colgar(X_train)
    # Only the key columns and the new RCC columns are scored. fillna returns
    # a new frame, so its result is assigned (it used to be discarded).
    X_train_fe = X_train.drop(cols_to_rmv, axis=1).fillna(0)
    gc.collect()

    # Scoring features focused in the conversion target
    print("     Feature Selection [ ", end = '')
    final_iv, scored_fe = cff.score_features(X_train_fe, y_train["target"])
    scored_fe   = scored_fe.rename(columns={'VAR_NAME':'index'})
    selected_fe = scored_fe.sort_values(['IV'],ascending=False).head(50)
    selected_fe = selected_fe.set_index('index').index
    print(" ]")

    # Key columns, selected features and the pre-existing columns, without
    # duplicates and in a reproducible order.
    col_index = list(dict.fromkeys(["codmes", "id_persona", *selected_fe, *cols_to_hang]))

    # At the end we keep just the selected columns and the index columns
    X_train = X_train[col_index].drop(columns=["index"], errors="ignore")
    X_test = _colgar(X_test)[col_index].drop(columns=["index"], errors="ignore")

  gc.collect()

  emprolijar_dataset()

  print("agregar_rcc_3m() - FIN")

### 10 Meses

In [None]:
def agregar_rcc_10m(FEATURE_SELECTION):
  """Add 10-month RCC aggregates to the global X_train / X_test frames.

  For each target month, every RCC table is sliced with ``.loc`` to that
  month's 10-month window, grouped by ``id_persona`` and summarised with
  sum, mean, max and size. The aggregates, keyed by (id_persona, codmes),
  are joined onto X_train and X_test.
  NOTE(review): the ``.loc`` slicing assumes each RCC table is indexed by a
  sorted ``codmes`` -- confirm where those tables are built.

  Args:
    FEATURE_SELECTION: when falsy, every aggregate column is kept. When
      truthy, the new columns are scored with ``cff.score_features`` against
      ``y_train["target"]`` and only the 200 with the highest IV are kept,
      together with every column X_train already had.

  Side effects:
    Rebinds the globals X_train and X_test, prints progress and ends by
    calling ``emprolijar_dataset()``.
  """
  global X_train
  global X_test

  # target month -> window of history months used to describe it
  meses = {
      201901: slice(201712, 201809),
      201902: slice(201801, 201810),
      201903: slice(201802, 201811),
      201904: slice(201803, 201812),
      201905: slice(201804, 201901),
      201906: slice(201805, 201902),
      201907: slice(201806, 201903)
  }
  tw = "10M_RCC_"
  dts = {
      "sz_prod": rcc_prod,
      "sz_prod2": rcc_prod2,
      "sz_tot": rcc_tot,
      "sz_bco": rcc_bco,
      "sdo_prod": rcc_sdo_prod,
      "sdo_prod2": rcc_sdo_prod2,
      "sdo_tot": rcc_sdo_tot,
      "sdo_bco": rcc_sdo_bco,
      #"sdo_clasif": rcc_sdo_clasif,
      #"sdo_mora": rcc_sdo_mora,
      "mora_prod": rcc_mora_prod,
      "mora_prod2": rcc_mora_prod2,
      "mora_tot": rcc_mora_tot,
      "mora_bco": rcc_mora_bco,
  }
  # (column-name tag, groupby method) in the order the columns are emitted
  agregados = (("_sum_", "sum"), ("_avg_", "mean"), ("_max_", "max"), ("_dis_", "size"))
  claves = ["id_persona", "codmes"]

  def _unir(df):
    # Join the aggregates and restore prediction_id as the index
    return df.reset_index().join(complementos, on=claves).set_index("prediction_id")

  complementos = []
  print("agregar_rcc_10m() - INICIO")
  for mes, ventana in meses.items():
    print(" "*5 + "Mes: "+str(mes))
    reses = []
    for dd, tabla in dts.items():
      agrupado = tabla.loc[ventana].groupby("id_persona")
      for etiqueta, metodo in agregados:
        # pd.concat(axis=1) also turns the Series returned by size() into a one-column frame
        res = pd.concat([getattr(agrupado, metodo)()], axis=1)
        res["codmes"] = mes
        res = res.reset_index().set_index(claves).astype("float32")
        res.columns = [tw+etiqueta+dd+"_"+str(c) for c in res.columns]
        reses.append(res)
    complementos.append(pd.concat(reses, axis=1))

  gc.collect()
  complementos = pd.concat(complementos)
  # errors="ignore" only tolerates a missing "index" column; other errors surface
  X_train = X_train.drop(columns=["index"], errors="ignore")

  if not FEATURE_SELECTION:
    X_train = _unir(X_train).drop(columns=["index"], errors="ignore")
    X_test = _unir(X_test).drop(columns=["index"], errors="ignore")
    gc.collect()

  else:
    cols_to_hang = X_train.columns
    cols_to_rmv = [x for x in cols_to_hang if 'codmes' not in x and 'id_persona' not in x]
    X_train = _unir(X_train)
    # fillna returns a new frame: keep it, otherwise the scorer still sees the NaNs
    X_train_fe = X_train.drop(cols_to_rmv, axis=1).fillna(0)
    gc.collect()

    # Scoring features focused in the conversion target
    print("     Feature Selection [ ", end = '')
    final_iv, scored_fe = cff.score_features(X_train_fe, y_train["target"])
    scored_fe = scored_fe.rename(columns={'VAR_NAME':'index'})
    selected_fe = scored_fe.sort_values(['IV'], ascending=False).head(200).set_index('index').index
    print(" ]")

    # Previous columns + join keys + selected features, de-duplicated while
    # keeping a stable order (a set would reorder columns between runs)
    col_index = list(dict.fromkeys([*cols_to_hang, "codmes", "id_persona", *selected_fe]))

    X_train = X_train[col_index].drop(columns=["index"], errors="ignore")
    X_test = _unir(X_test)[col_index].drop(columns=["index"], errors="ignore")

  gc.collect()

  emprolijar_dataset()

  print("agregar_rcc_10m() - FIN")

## Comparacion de limite con otros bancos

In [None]:
# Keep only the RCC rows for credit-card limits and credit-card usage at other banks.
# .copy() makes rcc_tarj an independent frame, so the column assignment below
# does not write into a filtered slice of rcc (pandas SettingWithCopyWarning).
rcc_tarj = rcc[rcc.producto.isin(["TARJETAS COMPRAS","TARJETAS EFECTIVO","LINEA TOTAL TC"])].copy()

# Merge purchases and cash advances into a single product label
rcc_tarj["producto"] = ["TARJETAS COMPRAS" if "TARJETAS EFECTIVO" in b else b for b in rcc_tarj["producto"]]


def _pivot_tarjetas(agregado):
  """Turn a (codmes, id_persona, producto) aggregate into one float column per product, indexed by codmes."""
  return agregado.unstack(level=2, fill_value=0).reset_index().set_index("codmes").sort_index().astype(float)


def _uso_limite(uso, lim):
  """Usage of the card limit: round(10*uso/lim); 100 when there is usage but no limit; 0 when only a limit; -1 otherwise."""
  if uso > 0 and lim > 0:
    return round(10*(uso/lim))
  if uso > 0 and lim == 0:
    return 100
  if lim > 0:
    return 0
  return -1


# Per client, month and product: total and maximum balance across all banks
_saldo_tarj = rcc_tarj.groupby(["codmes", "id_persona", "producto"])["mto_saldo"]
rcc_sum_prod = _pivot_tarjetas(_saldo_tarj.sum())
rcc_max_prod = _pivot_tarjetas(_saldo_tarj.max())

# Usage of the limit held at other banks, for each client
rcc_max_prod["uso_lim"] = [_uso_limite(uso, lim) for uso, lim in zip(rcc_max_prod["TARJETAS COMPRAS"], rcc_max_prod["LINEA TOTAL TC"])]
rcc_sum_prod["uso_lim"] = [_uso_limite(uso, lim) for uso, lim in zip(rcc_sum_prod["TARJETAS COMPRAS"], rcc_sum_prod["LINEA TOTAL TC"])]

rcc_max_prod.head(5)

In [None]:
# Highest "LINEA TOTAL TC" balance per client and month, one column per BN_mto_saldo group
rcc_lim_tarj = rcc_tarj.loc[rcc_tarj["producto"].isin(["LINEA TOTAL TC"])]
rcc_lim_max_bco = (
    rcc_lim_tarj
    .groupby(["codmes", "id_persona", "BN_mto_saldo"])["mto_saldo"]
    .max()
    .unstack(level=2, fill_value=0)
    .reset_index()
    .set_index("codmes")
    .sort_index()
    .astype(int)
)
# Positional rename: id_persona followed by the four group columns
rcc_lim_max_bco.columns = ["id_persona"] + ["lim_bco" + str(k) for k in range(4)]
del rcc_lim_tarj
rcc_lim_max_bco.head(5)

In [None]:
# Highest "TARJETAS COMPRAS" balance per client and month, one column per BN_mto_saldo group
rcc_cons_tarj = rcc_tarj.loc[rcc_tarj["producto"].isin(["TARJETAS COMPRAS"])]
rcc_cons_max_bco = (
    rcc_cons_tarj
    .groupby(["codmes", "id_persona", "BN_mto_saldo"])["mto_saldo"]
    .max()
    .unstack(level=2, fill_value=0)
    .reset_index()
    .set_index("codmes")
    .sort_index()
    .astype(int)
)
# Positional rename: id_persona followed by the four group columns
rcc_cons_max_bco.columns = ["id_persona"] + ["cons_bco" + str(k) for k in range(4)]
# The filtered card frames are not needed past this point
del rcc_cons_tarj
del rcc_tarj
rcc_cons_max_bco.head(5)

In [None]:
def linea_vs_otros_bancos(meses, tw, precision):
  """Compare the offered credit line with the client's card data at other banks.

  For each target month in ``meses`` the card tables (rcc_sum_prod,
  rcc_max_prod, rcc_lim_max_bco) are sliced to the month's window, grouped by
  ``id_persona`` and summarised with max and mean. The aggregates are joined
  onto the global X_train / X_test. Every joined column whose name contains
  'LINEA TOTAL TC' or 'TARJETAS COMPRAS' is then replaced by
  ``round(precision * cff.tendencia(linea_ofrecida, column))``.

  Args:
    meses: dict mapping target codmes -> slice of history months.
    tw: prefix for the new column names; also used to recognise them.
    precision: multiplier applied to ``cff.tendencia`` before rounding.

  Side effects:
    Rebinds the globals X_train and X_test.

  Note:
    Rows with no RCC match are filled with 0 in the joined columns, as the
    original comment intended. The previous ``fillna(0)`` calls discarded
    their result, so NaNs used to reach ``cff.tendencia``.
    NOTE(review): confirm ``cff.tendencia`` accepts 0 as second argument.
  """
  global X_train
  global X_test

  dts = {
      "sum_prod": rcc_sum_prod,
      "mx_prod": rcc_max_prod,
      "lim_mx_bco": rcc_lim_max_bco,
      #"cons_mx_bco": rcc_cons_max_bco,
  }
  claves = ["id_persona", "codmes"]

  complementos = []
  for mes, ventana in meses.items():
    reses = []
    for dd, tabla in dts.items():
      agrupado = tabla.loc[ventana].groupby("id_persona")
      for etiqueta, res in (("_max_", agrupado.max()), ("_avg_", agrupado.mean())):
        res["codmes"] = mes
        res = res.reset_index().set_index(claves).astype("float32")
        res.columns = [tw+etiqueta+dd+"_"+str(c) for c in res.columns]
        reses.append(res)
    complementos.append(pd.concat(reses, axis=1))

  gc.collect()
  complementos = pd.concat(complementos)
  nuevas = list(complementos.columns)

  def _unir(df):
    # Join the aggregates, restore the index and zero-fill only the new columns
    df = df.reset_index().join(complementos, on=claves).set_index("prediction_id")
    df = df.drop(columns=["index"], errors="ignore")
    df[nuevas] = df[nuevas].fillna(0)
    return df

  def _cociente(df, c):
    # Scaled, rounded relation between the offered line and column c
    return [round(precision*cff.tendencia(ofre, actual)) for ofre, actual in zip(df["linea_ofrecida"], df[c])]

  X_train = _unir(X_train)
  X_test = _unir(X_test)
  gc.collect()

  # Columns to turn into ratios; the ratio is only built when test has the column too
  objetivo = [c for c in X_train.columns
              if tw in c and ('LINEA TOTAL TC' in c or 'TARJETAS COMPRAS' in c)]
  en_test = [c for c in objetivo if c in X_test.columns]

  for c in en_test:
    X_train["linea_ofrecida_"+c] = _cociente(X_train, c)
    X_test["linea_ofrecida_"+c] = _cociente(X_test, c)

  # Drop the raw aggregates in one pass so only the ratios remain
  X_train = X_train.drop(objetivo, axis=1)
  X_test = X_test.drop(en_test, axis=1)

def comparar_linea_otros_bancos(precision):
  """Run linea_vs_otros_bancos for the 3-month and the 10-month windows.

  Args:
    precision: forwarded unchanged to ``linea_vs_otros_bancos``.

  Ends by calling ``emprolijar_dataset()``; prints a start and an end marker.
  """
  global X_train, y_train, X_test, y_test

  print("comparar_linea_otros_bancos() - INICIO")

  # Target months and the last history month of each window (shared by both windows)
  objetivos = [201901, 201902, 201903, 201904, 201905, 201906, 201907]
  fines = [201809, 201810, 201811, 201812, 201901, 201902, 201903]

  # (column prefix, first history month per target month).
  # A 1-month variant ("1M_RCCTARJ_", first month == last month) is disabled.
  ventanas = (
      ("3M_RCCTARJ_", [201807, 201808, 201809, 201810, 201811, 201812, 201901]),
      ("10M_RCCTARJ_", [201712, 201801, 201802, 201803, 201804, 201805, 201806]),
  )

  for tw, inicios in ventanas:
    meses = {mes: slice(ini, fin) for mes, ini, fin in zip(objetivos, inicios, fines)}
    linea_vs_otros_bancos(meses, tw, precision)

  emprolijar_dataset()

  print("comparar_linea_otros_bancos() - FIN")

## Tendencia historica

In [None]:
def var_tend_hist():
  """Add a ``td_<name>`` trend column for every RCC feature that has both windows.

  A feature qualifies when its ``3M_RCC...`` and ``10M_RCC...`` columns exist
  in both X_train and X_test. For each one, both columns are scaled with
  ``cff.scale_feature`` (MaxAbsScaler), ``cff.tendencia(scaled 3M, scaled 10M)``
  is stored as ``td_<name>``, and the ``SCL_`` helper columns are removed.

  X_train and X_test are modified in place.

  Fixes over the previous version:
    * The helper columns are dropped in place. Before, the result of ``drop``
      was bound to the loop variable only, so the ``SCL_`` columns stayed.
    * Columns are matched by prefix. Substring matching also picked up
      ``linea_ofrecida_3M_RCC...`` columns and then asked for ``10M_...`` /
      ``3M_...`` names that no earlier step produces.
    * Features are processed in sorted order, so the new columns come out in
      the same order on every run.
  """
  def _sufijos(frame, prefijo):
    # Names of the columns starting with <prefijo>RCC, with the window prefix removed
    return {x[len(prefijo):] for x in frame.columns if x.startswith(prefijo + "RCC")}

  cols_to_apply = (_sufijos(X_train, "3M_") & _sufijos(X_train, "10M_")
                   & _sufijos(X_test, "3M_") & _sufijos(X_test, "10M_"))

  for c in sorted(cols_to_apply):
    for data_temp in (X_train, X_test):
      # Normalise both windows before computing the trend
      cff.scale_feature(data_temp, ["10M_"+c , "3M_"+c ], 200, preprocessing.MaxAbsScaler())

      # Trend of the scaled 3-month value against the scaled 10-month value
      data_temp["td_"+c] = [cff.tendencia(t1, t2) for t2, t1 in zip(data_temp["SCL_10M_"+c], data_temp["SCL_3M_"+c])]

      # Remove the scaled helper columns from the shared frame itself
      data_temp.drop(columns=list(data_temp.filter(like='SCL_').columns), inplace=True)


# LGBMClassifier

In [None]:
def aplicar_LGBMClassfier(indice, kseeds, ENTRENAR_TODOS_MESES, k_folds):
  """Train a stratified K-fold LGBMClassifier and save its stacking outputs.

  Each fold fits on the training part of the global X_train (without codmes /
  id_persona) and predicts its validation part and X_test. Out-of-fold train
  probabilities and fold-averaged test probabilities are written to
  ``BASE_PATH + <indice>_ib_train_stacking.csv`` / ``..._ib_test_stacking.csv``.

  Args:
    indice: label used as file-name prefix and in the log line.
    kseeds: accepted for signature compatibility; not used. The seed is fixed
      to 20191125.
    ENTRENAR_TODOS_MESES: must be truthy; the K-fold path is the only one left.
    k_folds: number of StratifiedKFold splits.

  Returns:
    Tuple ``(optimization['fun'], importances)``. The first item is the
    minimised objective, i.e. the negative of the best share of ``margen``
    captured by a probability threshold. The second is a frame with the mean
    normalised feature importance, sorted descending.

  Raises:
    ValueError: if ENTRENAR_TODOS_MESES is falsy. No model would be trained;
      this used to fail later inside ``pd.concat`` with an unrelated message.

  Side effects:
    Rebinds the globals Xt, yt, wt, Xv, yv, wv, learner and flag.
  """
  global Xt, yt, wt, Xv, yv, wv, learner, flag

  if not ENTRENAR_TODOS_MESES:
    raise ValueError("aplicar_LGBMClassfier: ENTRENAR_TODOS_MESES must be True; "
                     "the per-month training path is disabled, so no model would be trained")

  flag = 1

  def evaluar_ganancia(y_true, y_pred):
    # The eval sets are passed as (train, valid). The flag alternates between
    # the train weights and the validation weights on successive calls.
    global flag
    if flag == 1:
      optimization2 = sum(y_pred * wt)
      flag = 0
    else:
      optimization2 = sum(y_pred * wv)
      flag = 1

    return 'GAIN', optimization2, True

  drop_cols = ["codmes","id_persona"]
  s = 20191125
  fi = []
  test_probs = []
  train_probs2 = []
  X_test_modelo = X_test.drop(drop_cols, axis=1)

  folds = model_selection.StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=s)
  for i, (train_idx, valid_idx) in enumerate(folds.split(X_train, y_train["target"]), start=1):
    # Split the train & validation datasets with the cv fold ids
    Xt = X_train.iloc[train_idx].drop(drop_cols, axis=1)
    Xt = Xt.reset_index().set_index(["prediction_id"])
    yt = y_train.loc[Xt.index, "target"]
    wt = y_train.loc[Xt.index, "margen"]

    # The global Xv keeps codmes / id_persona; the model sees Xv_modelo
    Xv = X_train.iloc[valid_idx].reset_index().set_index(["prediction_id"])
    yv = y_train.loc[Xv.index, "target"]
    wv = y_train.loc[Xv.index, "margen"]
    Xv_modelo = Xv.drop(drop_cols, axis=1)

    learner = LGBMClassifier(n_estimators=10000, random_state=s, objective="binary", metric=['GAIN'])
    learner.fit(Xt, yt, early_stopping_rounds=10, eval_set=[(Xt, yt), (Xv_modelo, yv)], verbose=0, eval_metric=evaluar_ganancia)

    test_probs.append(pd.Series(learner.predict_proba(X_test_modelo)[:, -1], index=X_test.index, name="fold_" + str(i)))
    train_probs2.append(pd.Series(learner.predict_proba(Xv_modelo)[:, -1], index=Xv.index, name="fold_" + str(i)))

    fi.append(pd.Series(learner.feature_importances_ / learner.feature_importances_.sum(), index=Xt.columns))

  # Test: mean across folds. Train: out-of-fold predictions stacked row-wise.
  test_probs = pd.concat(test_probs, axis=1).mean(axis=1)
  train_probs2 = pd.concat(train_probs2)
  fi = pd.concat(fi, axis=1).mean(axis=1)

  print(fi.sort_values(ascending=False).head(20).to_frame())

  # Threshold on the out-of-fold probabilities that captures the largest share of margen
  res = y_train.join(train_probs2.rename("probs"))
  optimization = differential_evolution(lambda c: -((res.probs > c[0]) * res.margen / res.margen.sum()).sum(), [(0, 1)], tol=0.00001, seed=19911125)
  print("aplicar_LGBMClassfier - "+str(indice)+" - K-Fold Lift: "+str(optimization['fun']))

  train_probs2.to_csv(BASE_PATH+str(indice)+"_ib_train_stacking.csv", header=True)
  test_probs.to_csv(BASE_PATH+str(indice)+"_ib_test_stacking.csv", header=True)

  return optimization['fun'], fi.sort_values(ascending=False).to_frame()

In [None]:
def limpiar_datasets_rcc():
  """Remove the intermediate RCC tables from module globals and run the GC.

  The previous version used ``del name`` inside the function without a
  ``global`` declaration. Python then treats each name as a local variable,
  so the call raised UnboundLocalError and freed nothing. Names are now
  removed from ``globals()``. A name that is already gone is skipped, so the
  cell can be re-run safely.
  """
  nombres = (
      "rcc_prod", "rcc_tot", "rcc_sdo_prod", "rcc_sdo_tot",
      "rcc_mora_prod", "rcc_mora_tot", "rcc_prod2", "rcc_bco",
      "rcc_sdo_prod2", "rcc_sdo_bco", "rcc_mora_prod2", "rcc_mora_bco",
  )
  espacio = globals()
  for nombre in nombres:
    espacio.pop(nombre, None)

  gc.collect()


# LGBMRegressor Target

In [None]:
def aplicar_LGBMRegressor_target(indice, X_train, X_test):
  """Fit an LGBMRegressor on ``target`` with leave-one-month-out validation.

  For every distinct ``codmes`` in X_train, a model is trained on the other
  months and predicts the held-out month and X_test. A threshold on the
  out-of-fold predictions is then chosen with ``differential_evolution`` to
  maximise the captured share of ``margen``.

  Args:
    indice: label used as file-name prefix and in the log line.
    X_train: training features, including codmes and id_persona.
    X_test: test features with the same columns.

  Files written, all under BASE_PATH:
    ``<indice>_ib_train_stacking.csv`` (out-of-fold predictions),
    ``<indice>_ib_test_stacking.csv`` (month-averaged test predictions),
    ``benchmark1.csv`` (thresholded test classes).
    The two stacking files used to be written to the working directory,
    where ``stacking()`` -- which reads from BASE_PATH -- could not find them.

  Side effects:
    Rebinds the globals Xt, yt, wt, Xv, yv, wv, learner and flag.
  """
  global Xt, yt, wt, Xv, yv, wv, learner, flag
  flag = 1

  def evaluar_ganancia(y_true, y_pred):
    # The eval sets are passed as (train, valid). The flag alternates between
    # the train weights and the validation weights on successive calls.
    global flag
    if flag == 1:
      optimization2 = sum(y_pred * wt)
      flag = 0
    else:
      optimization2 = sum(y_pred * wv)
      flag = 1

    return 'GAIN', optimization2, True

  drop_cols = ["codmes","id_persona"]
  fi = []
  test_probs = []
  train_probs = []
  X_test_modelo = X_test.drop(drop_cols, axis=1)

  for mes in X_train.codmes.unique():
    # Train on every month except `mes`
    Xt = X_train[X_train.codmes != mes]
    yt = y_train.loc[Xt.index, "target"]
    Xt = Xt.drop(drop_cols, axis=1)
    wt = y_train.loc[Xt.index, "margen"]

    # Validate on `mes`; the global Xv keeps codmes / id_persona
    Xv = X_train[X_train.codmes == mes]
    yv = y_train.loc[Xv.index, "target"]
    wv = y_train.loc[Xv.index, "margen"]
    Xv_modelo = Xv.drop(drop_cols, axis=1)

    learner = LGBMRegressor(n_estimators=10000, random_state=19911125, metric=['GAIN'])
    learner.fit(Xt, yt, early_stopping_rounds=10, eval_set=[(Xt, yt), (Xv_modelo, yv)], verbose=200, eval_metric=evaluar_ganancia)

    test_probs.append(pd.Series(learner.predict(X_test_modelo), index=X_test.index, name="fold_" + str(mes)))
    train_probs.append(pd.Series(learner.predict(Xv_modelo), index=Xv.index, name="probs"))
    fi.append(pd.Series(learner.feature_importances_ / learner.feature_importances_.sum(), index=Xt.columns))

  test_probs = pd.concat(test_probs, axis=1).mean(axis=1)
  train_probs = pd.concat(train_probs)
  fi = pd.concat(fi, axis=1).mean(axis=1)

  print(fi.sort_values(ascending=False).head(20).to_frame())

  res = y_train.join(train_probs.rename("probs"))
  optimization = differential_evolution(lambda c: -((res.probs > c[0]) * res.margen / res.margen.sum()).sum(), [(0, 1)], tol=0.00001, seed=19911125)
  print("aplicar_LGBMRegressor_target - "+str(indice)+" - Lift: "+str(optimization['fun']))

  train_probs.to_csv(BASE_PATH+str(indice)+"_ib_train_stacking.csv", header=True)

  # Classes for the test set using the threshold found on the out-of-fold predictions
  test_preds = (test_probs > optimization["x"][0]).astype(int)
  test_preds.index.name="prediction_id"
  test_preds.name="class"
  test_preds.to_csv(BASE_PATH+"benchmark1.csv", header=True)
  test_probs.to_csv(BASE_PATH+str(indice)+"_ib_test_stacking.csv", header=True)

# Deep Learning Classifier

In [None]:
global cn_cols
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from keras.constraints import maxnorm
from keras.optimizers import SGD
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings("ignore")

def deep_learning_classifier(indice, k_epochs, k_folds, lr2):
  """Train a small dense Keras classifier with stratified K-fold CV.

  Each fold fills NaNs with 0, standardises with a StandardScaler fitted on
  the fold's training part, fits the network and predicts the validation
  part and X_test. It then prints the margin lift on the training part and,
  at the end, on the out-of-fold predictions.

  Args:
    indice: label used as file-name prefix for the stacking CSVs.
    k_epochs: epochs per fold.
    k_folds: number of StratifiedKFold splits.
    lr2: learning rate for the Adam optimizer.

  Files written under BASE_PATH:
    ``<indice>_ib_test_stacking.csv`` and ``<indice>_ib_train_stacking.csv``.
    They used to go to the working directory, where ``stacking()`` -- which
    reads from BASE_PATH -- could not find them.

  Fix: X_test now gets the same NaN -> 0 fill as the train and validation
  parts. Before, NaNs in X_test went straight into the scaler and the network.

  Side effects:
    Rebinds the globals Xt, yt, Xv, yv and cn_cols; seeds numpy and tensorflow.
  """
  global Xt, yt, Xv, yv, cn_cols

  # Custom AUC metric. The name auc_roc is also matched in the variable filter below.
  def auc_roc(y_true, y_pred):
    # any tensorflow metric
    value, update_op = tf.contrib.metrics.streaming_auc(y_pred, y_true)

    # find all variables created for this metric
    metric_vars = [i for i in tf.local_variables() if 'auc_roc' in i.name.split('/')[1]]

    # Add metric variables to GLOBAL_VARIABLES collection.
    # They will be initialized for new session.
    for v in metric_vars:
      tf.add_to_collection(tf.GraphKeys.GLOBAL_VARIABLES, v)

    # force to update metric values
    with tf.control_dependencies([update_op]):
      value = tf.identity(value)
      return value

  # Network builder handed to KerasClassifier
  def wider_model():
    global cn_cols

    model = Sequential()
    model.add(Dense(32, input_dim=cn_cols, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(16, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.Adam(lr=lr2), metrics=['accuracy',auc_roc])
    return model

  from numpy.random import seed
  from tensorflow import set_random_seed
  seed(19911125)
  set_random_seed(19911125)

  drop_cols = ["codmes","id_persona"]
  cn_cols = len(X_train.drop(drop_cols, axis=1).columns)
  # Same NaN handling as the train / validation parts
  X_test_modelo = X_test.drop(drop_cols, axis=1).fillna(0)

  train_probs = []
  test_probs = []
  folds = model_selection.StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=19911125)
  for i, (train_idx, valid_idx) in enumerate(folds.split(X_train, y_train["target"]), start=1):
    print("Fold: "+str(i))

    # Split the train & validation datasets with the cv fold ids
    Xt = X_train.iloc[train_idx].drop(drop_cols, axis=1).fillna(0)
    yt = y_train.loc[Xt.index, "target"].fillna(0)

    Xv = X_train.iloc[valid_idx].drop(drop_cols, axis=1).fillna(0)
    yv = y_train.loc[Xv.index, "target"].fillna(0)

    # Standardise with statistics from the training part only
    scaler = StandardScaler().fit(Xt)
    Xt2 = scaler.transform(Xt)
    Xv2 = scaler.transform(Xv)
    X_test2 = scaler.transform(X_test_modelo)

    kr = KerasClassifier(build_fn=wider_model, batch_size=128, verbose=0)
    kr.fit(Xt2, yt, epochs=k_epochs, verbose=0, validation_data=(Xv2, yv))

    # Lift on the training part of this fold
    res = y_train.loc[Xt.index].copy()
    res["probs"] = kr.predict_proba(Xt2)[:,-1]
    optimization = differential_evolution(lambda c: -((res.probs > c[0]) * res.margen / res.margen.sum()).sum(), [(0, 1)], tol=0.0001, seed=19911125)
    print("Lift: "+str(optimization['fun']))
    del res

    train_probs.append(pd.Series(kr.predict_proba(Xv2)[:,-1], index=Xv.index, name="fold_"+str(i)))
    test_probs.append(pd.Series(kr.predict_proba(X_test2)[:,-1], index=X_test.index, name="fold_"+str(i)))

  # Validation predictions are stacked row-wise; test predictions are averaged across folds
  test_probs = pd.concat(test_probs, axis=1).mean(axis=1)
  train_probs = pd.concat(train_probs)

  # Lift on the out-of-fold predictions
  res = copy.copy(y_train)
  res["probs"] = train_probs
  optimization = differential_evolution(lambda c: -((res.probs > c[0]) * res.margen / res.margen.sum()).sum(), [(0, 1)], tol=0.0001, seed=19911125)
  print("Lift: "+str(optimization['fun']))

  test_probs.to_csv(BASE_PATH+str(indice)+"_ib_test_stacking.csv", header=True)
  train_probs.to_csv(BASE_PATH+str(indice)+"_ib_train_stacking.csv", header=True)
 

# Stacking

In [None]:
def stacking(indice, BINING, nbins):
  """Append a saved model's predictions to the global X_train / X_test.

  Reads ``BASE_PATH + <indice>_ib_train_stacking.csv`` and
  ``..._ib_test_stacking.csv``, names their columns ``prediction_id`` and
  ``<indice>_prob``, and left-merges them on ``prediction_id``.

  Args:
    indice: model label. It is converted with ``str`` for both the file name
      and the column name (the column name used to require a str).
    BINING: when truthy, ``cff.binning_feature`` is applied to the
      probability column of each file (uniform KBinsDiscretizer with
      ``nbins`` bins) and the raw ``<indice>_prob`` column is dropped after
      the merge.
    nbins: number of bins for the discretizer.

  Side effects:
    Rebinds the globals X_train and X_test, leaving them indexed by
    prediction_id.
  """
  global X_train
  global X_test

  col_prob = str(indice)+"_prob"

  def _leer(sufijo):
    # Load one stacking file as a two-column frame (prediction_id, <indice>_prob)
    st = pd.read_csv(BASE_PATH+str(indice)+sufijo)
    st = st.drop(columns=["Unnamed: 0"], errors="ignore")
    st.columns = ["prediction_id", col_prob]
    return st.fillna(0)

  def _reindexar(df):
    # Put prediction_id back as the index and drop the helper column reset_index may add
    df = df.reset_index().set_index(["prediction_id"])
    if "index" in df.columns:
      return df.drop(["index"], axis=1)
    print("Column doesn't exists")
    return df

  # Predictions of the classification algorithm
  X_train_st = _leer("_ib_train_stacking.csv")
  X_test_st = _leer("_ib_test_stacking.csv")

  if BINING:
    # Bin the probability in both train and test
    cff.binning_feature(X_train_st, [col_prob], 1, KBinsDiscretizer(n_bins=nbins, encode='ordinal', strategy='uniform'))
    cff.binning_feature(X_test_st, [col_prob], 1, KBinsDiscretizer(n_bins=nbins, encode='ordinal', strategy='uniform'))

  # Add the predictions to the datasets (stacking)
  X_train = X_train.merge(X_train_st, how="left", on=["prediction_id"])
  X_test = X_test.merge(X_test_st, how="left", on=["prediction_id"])

  if BINING:
    # Drop the raw probability column
    X_train = X_train.drop([col_prob], axis=1)
    X_test = X_test.drop([col_prob], axis=1)

  X_test = X_test.reset_index().set_index(["prediction_id"])
  X_train = _reindexar(X_train)

  if "index" in X_test.columns:
    X_test = X_test.drop(["index"], axis=1)
  else:
    print("Column doesn't exists")


# LGBM Regressor

In [None]:
def aplicar_LGBMRegressor(indice, kseeds, ENTRENAR_TODOS_MESES, k_folds, params):
  """Train a seed x K-fold LGBMRegressor ensemble on ``margen``.

  For every seed, a StratifiedKFold (stratified on ``target``) is run over the
  global X_train. Each fold predicts its validation part and X_test.
  Predictions are averaged across folds and seeds and written under BASE_PATH.

  Args:
    indice: str label used as file-name prefix.
    kseeds: seeds; one full K-fold round is run per seed.
    ENTRENAR_TODOS_MESES: must be truthy; the K-fold path is the only one left.
    k_folds: number of StratifiedKFold splits.
    params: optional dict of extra LGBMRegressor keyword arguments. Before,
      this argument only silenced the prints and never reached the model.
      ``None`` keeps the default model and enables the prints.

  Returns:
    Tuple ``(-optimization['fun'], importances)``. The first item is the best
    share of ``margen`` captured by a threshold on the averaged out-of-fold
    predictions. The second is a frame with the mean normalised feature
    importance, sorted descending.

  Raises:
    ValueError: if ENTRENAR_TODOS_MESES is falsy or kseeds is empty. No model
      would be trained; this used to fail later inside ``pd.concat``.

  Files written under BASE_PATH:
    ``<indice>_benchmark_regressor_stacking.csv``,
    ``<indice>_ib_test_stacking.csv`` and ``<indice>_ib_train_stacking.csv``.

  Side effects:
    Rebinds the globals Xt, yt, wt, Xv, yv, wv, learner and flag.
  """
  global Xt, yt, wt, Xv, yv, wv, learner, flag

  if not ENTRENAR_TODOS_MESES or len(kseeds) == 0:
    raise ValueError("aplicar_LGBMRegressor: ENTRENAR_TODOS_MESES must be True and kseeds non-empty; "
                     "otherwise no model would be trained")

  flag = 1

  def evaluar_ganancia(y_true, y_pred):
    # The eval sets are passed as (train, valid). The flag alternates between
    # the train weights and the validation weights on successive calls.
    global flag
    if flag == 1:
      optimization2 = (sum([ 1 if v > 0 else 0 for v in y_pred ] * wt) / wt.sum()).sum()
      flag = 0
    else:
      optimization2 = (sum([ 1 if v > 0 else 0 for v in y_pred ] * wv) / wv.sum()).sum()
      flag = 1

    return 'GAIN', optimization2, True

  extra = dict(params) if params is not None else {}
  drop_cols = ["codmes","id_persona"]
  fi = []
  test_probs = []
  train_probs = []
  X_test_modelo = X_test.drop(drop_cols, axis=1)

  for s in kseeds:
    print("Seed: "+str(s))
    folds = model_selection.StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=s)
    for i, (train_idx, valid_idx) in enumerate(folds.split(X_train, y_train["target"]), start=1):
      # Split the train & validation datasets with the cv fold ids
      Xt = X_train.iloc[train_idx].drop(drop_cols, axis=1)
      yt = y_train.loc[Xt.index, "margen"]
      wt = y_train.loc[Xt.index, "margen"]

      # The global Xv keeps codmes / id_persona; the model sees Xv_modelo
      Xv = X_train.iloc[valid_idx]
      yv = y_train.loc[Xv.index, "margen"]
      wv = y_train.loc[Xv.index, "margen"]
      Xv_modelo = Xv.drop(drop_cols, axis=1)

      learner = LGBMRegressor(n_estimators=10000, random_state=s, metric=['GAIN'], **extra)
      learner.fit(Xt, yt, early_stopping_rounds=20, eval_set=[(Xt, yt), (Xv_modelo, yv)], verbose=0, eval_metric=evaluar_ganancia)

      train_probs.append(pd.Series(learner.predict(Xv_modelo), index=Xv.index, name="fold_tot"+str(i)))
      test_probs.append(pd.Series(learner.predict(X_test_modelo), index=X_test.index, name="fold_tot"+str(i)))

      fi.append(pd.Series(learner.feature_importances_ / learner.feature_importances_.sum(), index=Xt.columns))

  # Final output: row-wise mean over every fold / seed column
  train_probs = pd.concat(train_probs, axis=1).mean(axis=1)
  test_probs = pd.concat(test_probs, axis=1).mean(axis=1)
  fi = pd.concat(fi, axis=1).mean(axis=1)

  res = y_train.join(train_probs.rename("probs"))
  # Vectorised threshold objective, same form as in aplicar_LGBMClassfier
  optimization = differential_evolution(lambda c: -((res.probs > c[0]) * res.margen / res.margen.sum()).sum(), [(-1, 1)], tol=0.0001, seed=19911125)

  # Lift of the fixed rule "predicted margen > 0"
  lift = -((res.probs > 0) * res.margen / res.margen.sum()).sum()

  if params is None:
    print("Lift: "+str(optimization['fun']))
    print("Lift: "+str(lift))
    print(fi.sort_values(ascending=False).head(20).to_frame())

  #test_preds = (test_probs > optimization['x'][0]).astype(int)
  test_preds = (test_probs > 0).astype(int)
  test_preds.index.name="prediction_id"
  test_preds.name="class"
  test_preds.to_csv(BASE_PATH+indice+"_benchmark_regressor_stacking.csv", header=True)

  test_probs.index.name="prediction_id"
  # Name the probabilities series (the old code renamed test_preds by mistake)
  test_probs.name="prob"
  test_probs.to_csv(BASE_PATH+indice+"_ib_test_stacking.csv", header=True)

  train_probs.index.name="prediction_id"
  train_probs.name="prob"
  train_probs.to_csv(BASE_PATH+indice+"_ib_train_stacking.csv", header=True)

  return -1 * (optimization['fun']), fi.sort_values(ascending=False).to_frame()


# Pruebo combinaciones de modelos, stacking y variables agregadas

In [None]:
# Keep a backup copy of the base datasets under the *_base names
X_train_base = copy.copy(X_train)
X_test_base = copy.copy(X_test)

In [None]:
# Start again from the base backup
X_train = copy.copy(X_train_base)
X_test = copy.copy(X_test_base)

# Add the 10-month RCC historical variables (FEATURE_SELECTION=True)
agregar_rcc_10m(True)
#agregar_rcc_10m(False)

# Build a model with these variables (disabled)
#aplicar_LGBMClassfier("10m")

# Stack that model's output onto the base dataset (disabled)
#stacking("10m", True, 100)

X_train.head(5)

In [None]:
# Add the 3-month RCC historical variables (FEATURE_SELECTION=True)
agregar_rcc_3m(True)
#agregar_rcc_3m(False)

# Build a model with these variables (disabled)
#aplicar_LGBMClassfier("3m")

# Stack that model's output onto the base dataset (disabled)
#stacking("3m", True, 100)

X_train.head(5)

In [None]:
# Back up the current X_train / X_test under the *_3m_10m names
X_train_3m_10m = copy.copy(X_train)
X_test_3m_10m = copy.copy(X_test)

In [None]:
# Restore the datasets from the *_3m_10m backup
X_train = copy.copy(X_train_3m_10m)
X_test = copy.copy(X_test_3m_10m)

# Add the comparison of the offered line against other banks (precision=10)
comparar_linea_otros_bancos(10)

# Build a model with these variables (7 folds)
aplicar_LGBMClassfier("om", [20191125, 19770804, 19870605, 19721010, 19761126], True, 7)

# Stack the "om" model's binned output (1000 bins) onto the current dataset
stacking("om", True, 1000)

X_train.head(5)

In [None]:
# Back up the current X_train / X_test under the *_om names
X_train_om = copy.copy(X_train)
X_test_om = copy.copy(X_test)

In [None]:
#X_train = copy.copy(X_train_om)
#X_test = copy.copy(X_test_om)
#var_tend_hist()
#X_train.head(5)

In [None]:
# Run the LGBM regressor under the label "lgbmreg_base" and keep both returned
# values: the gain and the feature-importance frame used for selection below.
# NOTE(review): presumably trains on the global X_train / X_test - confirm.
gain, fi = aplicar_LGBMRegressor("lgbmreg_base", [20191125], True, 7, None)

In [None]:
# Keep a backup of the datasets as they stand before feature selection
X_train_fe2 = copy.copy(X_train)
X_test_fe2 = copy.copy(X_test)

# Flatten the feature-importance frame into two named columns. The check makes
# the cell safe to re-run: a second reset_index() would add a third column and
# the two-name column assignment would then fail with a length mismatch.
if list(fi.columns) != ["var", "importance"]:
  fi = fi.reset_index()
  fi.columns = ["var", "importance"]

In [None]:
# Restore the datasets saved before feature selection
X_train = copy.copy(X_train_fe2)
X_test = copy.copy(X_test_fe2)

# Keep only the variables whose importance exceeds the threshold
dt = fi[fi["importance"] > 0.005]

# Always retain the identifier columns. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0, so build the extra rows and concatenate them.
id_rows = pd.DataFrame({"var": ["id_persona", "codmes"], "importance": [1, 1]})
dt = pd.concat([dt, id_rows], ignore_index=True)

# If an identifier already passed the threshold, keep a single entry so the
# column is not selected twice below
dt = dt.drop_duplicates(subset="var").reset_index(drop=True)

print(dt.shape)
print(fi.shape)

# Reduce both datasets to the selected columns
X_train = X_train[dt["var"]]
X_test = X_test[dt["var"]]

In [None]:
# Re-run the LGBM regressor under the label "lgbmreg_base_fe", after the
# feature-selection cell above, keeping its gain and feature importances
gain_fe, fi_fe = aplicar_LGBMRegressor("lgbmreg_base_fe", [20191125], True, 7, None)

# Tuning Hiperparametros

In [None]:
from hyperopt import hp, tpe
from hyperopt.fmin import fmin

def objective(params):
  """Hyperopt objective: run the LGBM regressor with `params` and return the negated gain.

  Args:
    params: dict sampled from the search space, with the keys 'max_depth',
      'gamma' and 'colsample_bytree'.

  Returns:
    The gain multiplied by -1, since hyperopt's fmin minimizes its objective.
  """
  # max_depth is cast to int; the other two are passed as strings formatted
  # to three decimals, exactly as before.
  params2 = {
      'max_depth': int(params['max_depth']),
      'gamma': "{:.3f}".format(params['gamma']),
      'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
  }

  # Use the same five-argument call shape as the other aplicar_LGBMRegressor
  # calls in this notebook and unpack its two return values. The previous call
  # omitted the list argument and negated the returned pair as a whole.
  # NOTE(review): [20191125] mirrors those other calls - confirm it is the
  # intended value for tuning runs.
  gain, _ = aplicar_LGBMRegressor("lgbmreg", [20191125], False, 10, params2)

  print("Gain: "+str(gain)+" Params:{"+str(params2)+"}")

  return (-1 * gain)

# Hyperopt search space for the three parameters read by `objective`
space = {
    # quniform with q=4: a uniform draw on [6, 40] rounded to a multiple of 4
    'max_depth': hp.quniform('max_depth', 6, 40, 4),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.4, 0.9),
    'gamma': hp.uniform('gamma', 0.2, 0.4)
}

# Search is disabled; uncomment to run 300 TPE evaluations of `objective`
#best = fmin(fn=objective,space=space,algo=tpe.suggest,max_evals=300)
#print("Best: "+str(best))

In [None]:
# Best
#params2 = {
    #'max_depth': int(best['max_depth']),
    #'gamma': "{:.3f}".format(best['gamma']),
    #'colsample_bytree': '{:.3f}'.format(best['colsample_bytree']),
#}
#gain = aplicar_LGBMRegressor("lgbmreg_best", True, 10, params2)
#print("Gain: "+str(gain))

In [None]:
#Gain: 1.1823120941202543 Params:{{'max_depth': 7, 'gamma': '0.277', 'colsample_bytree': '0.574'}}
#gain = aplicar_LGBMRegressor("lgbmreg_2do", True, 10, {'max_depth': 7, 'gamma': '0.277', 'colsample_bytree': '0.574'})
#print("Gain: "+str(gain))

In [None]:
#Gain: 1.1807729124945683 Params:{{'max_depth': 6, 'gamma': '0.230', 'colsample_bytree': '0.570'}}
#gain = aplicar_LGBMRegressor("lgbmreg_1er", True, 10, {'max_depth': 6, 'gamma': '0.230', 'colsample_bytree': '0.570'})
#print("Gain: "+str(gain))