In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import lightgbm as lgb

from category_encoders import TargetEncoder


In [None]:
# --- Run configuration -------------------------------------------------------
# Identifier of the upstream run; later cells read '../Exp/{input}/HT.csv'.
# NOTE(review): the name shadows the built-in `input`; it is kept because other cells read it.
input = 'HT004-FE002'
# Second dash-separated token of the identifier: names the folder holding train/test.
fe = input.split("-")[1]
# Identifier of this experiment; it names the output folder.
experimento = 'EN-006'

# --- Load datasets -----------------------------------------------------------
# All relative paths below are resolved against this working directory.
# NOTE(review): absolute, machine-specific path.
os.chdir("C:/Users/vyago/Desktop/Yago/Competencia/ypf")
train, test = (
    pd.read_csv(f"../Exp/{fe}/{nombre}.csv") for nombre in ("train", "test")
)
# The model is trained on the square root of the target column.
y = np.sqrt(train["delta_WHP"])


In [None]:
# Target-encode the three categorical columns. The encoder is fitted on the
# training frame and the (square-rooted) target, then applied to both frames.
# NOTE(review): the training rows are encoded with statistics that include their
# own target values — confirm this is intended.
enc = TargetEncoder(
    cols=['PAD_HIJO', 'HIJO', 'PADRE'],
    min_samples_leaf=20,
    smoothing=10,
)
enc.fit(train, y)

x_train, x_test = enc.transform(train), enc.transform(test)

In [None]:
# INTERACTIONS BETWEEN THE (ENCODED) CATEGORICAL VARIABLES

var_cat = ['PAD_HIJO', 'HIJO', 'PADRE']

# Every ordered pair of distinct columns, in nested-loop order.
# Both (a, b) and (b, a) are produced, so each product appears twice under two names.
pares_ordenados = [
    (primera, segunda)
    for primera in var_cat
    for segunda in var_cat
    if primera != segunda
]

for feature_1, feature_2 in pares_ordenados:
    nombre_interaccion = f'{feature_1}-{feature_2}'
    # Same product column is added to train and to test.
    x_train[nombre_interaccion] = x_train[feature_1] * x_train[feature_2]
    x_test[nombre_interaccion] = x_test[feature_1] * x_test[feature_2]


In [None]:
# Keep only the numeric columns of the training features.
x_train = x_train.select_dtypes(include="number")

In [None]:
# Remove the target and the row identifier from the training features.
x_train = x_train.drop(columns=["delta_WHP", "ID_FILA"])

# Give test exactly the training columns (same set, same order), cast to float32.
x_test = x_test[x_train.columns].astype("float32")

In [None]:
# LightGBM training set: encoded features with the square-rooted target as label.
train_data = lgb.Dataset(data=x_train, label=y)


In [None]:
# Read the output log of the experiment's Bayesian optimisation and order it
# by ascending "loss" (lowest loss first).
log_bo = pd.read_csv(f'../Exp/{input}/HT.csv', sep=",").sort_values("loss")


In [None]:
# Show the "params" column of the optimisation log (rows already sorted by loss).
log_bo.loc[:, "params"]


In [None]:
# LightGBM parameters used for every model of the ensemble.
params = {
    'feature_fraction': 0.31293008809690764,
    'learning_rate': 0.07988247330134716,
    'min_data_in_leaf': 610,
    'num_leaves': 718,
    'boosting_type': 'gbdt',
    'subsample': 1.0,
    'max_bin': 256,
    'objective': 'regression',
    'feature_pre_filter': False,
    'metric': 'rmse',        # evaluation metric
    'num_iterations': 7240,  # number of boosting rounds
    'max_depth': -1,
}

## STACKED GENERALIZATION MODEL

El modelo final será un stacking (ensamble) de modelos LightGBM, entrenados con los mismos hiperparámetros pero con diferentes semillas; la predicción final es el promedio de las predicciones de todos los modelos.

In [None]:
def generador_numeros(cantidad):
    """Return a list of ``cantidad`` integer seeds.

    The seed at 0-based position ``num`` is ``1 + num + 1234 * num``, which gives
    the increasing sequence 1, 1236, 2471, ...  A ``cantidad`` of zero (or a
    negative one) yields an empty list.
    """
    return [1 + num + 1234 * num for num in range(cantidad)]


# One seed per model of the ensemble.
semillas = generador_numeros(100)

In [None]:
# Display the generated list of seeds.
semillas

In [None]:
# Train one LightGBM model per seed and collect its predictions on the test set.
# The label is sqrt(delta_WHP), so predictions are squared to return to the
# original scale of the target.
#
# Predictions are gathered in a dict and converted to a DataFrame once: adding
# ~100 columns one by one to an existing frame fragments it and makes pandas
# emit a PerformanceWarning.
predicciones_por_semilla = {}

for semilla in semillas:
    params['seed'] = semilla  # the only parameter that changes between models
    # `modelo` stays bound to the last trained model; later cells use it.
    modelo = lgb.train(params, train_data)
    predicciones_por_semilla[f'modelo_seed_{semilla}'] = np.square(modelo.predict(x_test))

# One column per seed, in the order the seeds were trained.
predicciones = pd.DataFrame(predicciones_por_semilla)

# Final ensemble prediction: row-wise mean over all per-seed predictions.
predicciones["mean"] = predicciones.mean(axis=1)

In [None]:
# Display the per-seed predictions together with their row-wise "mean" column.
predicciones

In [None]:

# Submission frame: the test row identifier next to the ensemble ("mean")
# prediction; the two pieces are joined column-wise on their index.
prediccion = pd.concat([test[["ID_FILA"]], predicciones["mean"]], axis=1)


In [None]:
# Feature importances of the last trained model. The method must be called:
# without the parentheses the cell only shows the bound-method repr.
modelo.feature_importance()

In [None]:
# Create the experiment folder when it does not exist yet. `exist_ok=True`
# replaces the separate isdir() check, so there is no window between the check
# and the creation.
os.makedirs(f'../Exp/{experimento}', exist_ok=True)

# Write the predictions without header row and without the index column.
prediccion.to_csv(f"../Exp/{experimento}/prediccion.csv", header=False, index=False)

In [None]:
# FEATURE IMPORTANCE

def plotImp(model, X, num=20, fig_size=(40, 20)):
    """Show a horizontal bar chart with the most important features of a model.

    Parameters
    ----------
    model : object exposing ``feature_importance()`` (e.g. a LightGBM Booster).
        Its importance values are paired positionally with ``X.columns``.
    X : pandas.DataFrame
        Only ``X.columns`` is used, as the feature names.
    num : int, default 20
        Number of features to draw, taken from the highest importance down.
    fig_size : tuple, default (40, 20)
        Figure size passed to ``plt.figure``.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.

    Notes
    -----
    ``sns.set(font_scale=5)`` changes the global seaborn theme, so plots drawn
    after this call are affected too.
    """
    feature_imp = pd.DataFrame({'Value': model.feature_importance(), 'Feature': X.columns})
    top_features = feature_imp.sort_values(by="Value", ascending=False).iloc[:num]

    plt.figure(figsize=fig_size)
    sns.set(font_scale=5)
    sns.barplot(x="Value", y="Feature", data=top_features)
    # The values come from the single model passed in; nothing is averaged here.
    plt.title('LightGBM feature importance (single model)')
    plt.tight_layout()

    plt.show()

# `modelo` is the model trained with the last seed of the loop.
plotImp(modelo, x_train)

In [None]:
"""max_bin 
learning_rate    
num_iterations    
num_leaves        
min_data_in_leaf  
feature_fraction  
semilla  """         