# 2 ETAPA: Random Forest y Redes Neuronales: Problema multietiqueta

El objetivo de esta sección es el entrenamiento de modelos basados en árboles de decisión y redes neuronales (usando tanto la librería scikit-learn como tensorflow) para abordar el problema multietiqueta.

Para la ejecución de este notebook, es importante la instalación de los siguientes módulos:

* pip install scikit-multilearn
* pip install imblearn
* pip install tensorflow

En este notebook se encuentra la ejecución de los modelos random forest y redes neuronales, tomándolos de la librería scikit-learn. Para cada modelo, se realiza un cross-validation manual y se obtiene el mejor modelo.

## Importación de librerías

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
import scipy.stats as stats
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedShuffleSplit, ShuffleSplit, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, RobustScaler
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score, plot_confusion_matrix, confusion_matrix, log_loss, recall_score, precision_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from skmultilearn.model_selection import iterative_train_test_split
from sklearn.svm import SVC
from sklearn.metrics._plot.confusion_matrix import ConfusionMatrixDisplay
import seaborn as sn
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.model_selection import IterativeStratification

In [None]:
# Load the three input tables from CSV files in the current working directory:
# training features, scored training targets and test features (read in that order).
train_features, train_target, test_features = (
    pd.read_csv(csv_name)
    for csv_name in ('train_features.csv', 'train_targets_scored.csv', 'test_features.csv')
)

In [None]:
# Integer-encode the cp_time and cp_dose columns of the training features.
# cp_type is deliberately left as text: the next step filters rows on
# cp_type == 'trt_cp'.
# NOTE: Series.map yields NaN for values missing from the dict, so running this
# cell a second time on already-encoded data would turn both columns into NaN.
cp_time_codes = {24: 1, 48: 2, 72: 3}
cp_dose_codes = {'D1': 0, 'D2': 1}
train_features['cp_time'] = train_features['cp_time'].map(cp_time_codes)
train_features['cp_dose'] = train_features['cp_dose'].map(cp_dose_codes)

In [None]:
# Place features and targets side by side (column-wise concatenation), then keep
# only the rows whose cp_type equals 'trt_cp' in both the training and test tables.
data_train = pd.concat((train_features, train_target), axis='columns')
train = data_train.loc[data_train['cp_type'].eq('trt_cp')]
evaluar = test_features.loc[test_features['cp_type'].eq('trt_cp')]

In [None]:
# Data selection by column position on the filtered tables.
# X -> columns 2..875 of `train`, y -> columns 877 onwards; column 876 is skipped by
# both slices (presumably the identifier column of the targets table — TODO confirm).
# The row index is renumbered from 0 so X and y can be addressed positionally.
X = train.reset_index(drop=True).iloc[:, 2:876]
y = train.reset_index(drop=True).iloc[:, 877:]
# NOTE(review): x_eval starts at column 4 whereas X starts at column 2; if both tables
# share the same leading columns, x_eval has two fewer feature columns than X —
# confirm before predicting with models trained on X.
x_eval = evaluar.iloc[:, 4:]

In [None]:
# Keep only the target columns whose column sum is strictly greater than 100.
# `higher` holds their names in the original column order; `ytarget` is the reduced table.
label_totals = y.sum(axis=0)
higher = label_totals.index[label_totals > 100].tolist()
ytarget = y.loc[:, higher]

## Split iterativo

In [None]:
# Hold out 20% of the samples with scikit-multilearn's iterative train/test split.
# The four arrays are unpacked in the order the call returns them
# (X_train, y_train, X_test, y_test), which differs from sklearn's train_test_split.
X_train, y_train, X_test, y_test = iterative_train_test_split(
    np.asarray(X),
    np.asarray(ytarget),
    test_size=0.2,
)

In [None]:
# Number of splits shared by every cross-validation splitter defined here.
nfolds = 5

# Shuffle-based stratified splitter; this is the one consumed by the training loops
# of this notebook (cv_s.split(X_train, y_train)).
cv_s = StratifiedShuffleSplit(n_splits=nfolds)

# Alternative splitters kept available for experimentation.
cv_k = KFold(n_splits=nfolds)
cv = StratifiedKFold(shuffle=False, n_splits=nfolds)
cv_i = IterativeStratification(n_splits=nfolds)

## Modelamiento

In [None]:
def loga_loss(y_test, y_pred, eps=1e-15):
    """Return the element-wise binary log loss used as the Kaggle competition metric.

    The loss of one (sample, label) cell is
    ``-(y * log(p) + (1 - y) * log(1 - p))``. Predictions are clipped to
    ``[eps, 1 - eps]`` so the logarithm is always finite; the true labels are
    left untouched.

    Parameters
    ----------
    y_test : array-like of shape (n_samples, n_labels)
        Ground-truth binary indicators.
    y_pred : array-like of shape (n_samples, n_labels)
        Predicted activation probabilities.
    eps : float, default=1e-15
        Clipping bound applied to the predictions.

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_labels)
        Loss of every individual cell; callers average it (``np.mean``) to
        obtain a single score.

    Raises
    ------
    ValueError
        If ``y_test`` is not 2-D or the two inputs do not have the same shape.
    """
    y_true = np.asarray(y_test, dtype=float)
    proba = np.asarray(y_pred, dtype=float)

    if y_true.ndim != 2:
        raise ValueError(f'y_test must be 2-D, got {y_true.ndim} dimension(s)')
    # Checked explicitly because NumPy broadcasting would otherwise silently
    # accept some mismatching shapes and return a meaningless result.
    if proba.shape != y_true.shape:
        raise ValueError(
            f'y_pred has shape {proba.shape} but y_test has shape {y_true.shape}'
        )

    proba = np.clip(proba, eps, 1 - eps)
    return -(y_true * np.log(proba) + (1 - y_true) * np.log(1 - proba))

In [None]:
def test_model(xtrain, ytrain, xtest, ytest, model):
    """Refit each candidate model on the training data and print its test log loss.

    Every estimator in ``model`` is fitted on ``(xtrain, ytrain)``, asked for
    probabilities on ``xtest`` and scored with ``loga_loss`` against ``ytest``;
    the mean loss is printed, one line per model (numbered from 1).

    xtrain --> Training dataset

    ytrain --> Training target variables

    xtest --> Test dataset

    ytest --> Test target variables, shape (n_samples, n_labels)

    model --> List with the models generated in the cross-validation step

    Returns None; the estimators in ``model`` are refitted in place.
    """
    for position, estimator in enumerate(model, start=1):

        # Training (the object returned by fit is the one used for prediction)
        fitted = estimator.fit(xtrain, ytrain)

        # Predictions and metrics
        proba = np.asarray(fitted.predict_proba(xtest))
        if proba.ndim == 3:
            # One (n_samples, n_classes) array per label, stacked as
            # (n_labels, n_samples, n_classes): keep column 1 (probability of the
            # positive class) and transpose to (n_samples, n_labels).
            proba = proba[:, :, 1].T
        # A 2-D output is assumed to already be (n_samples, n_labels).

        losses = loga_loss(ytest, proba, eps=1e-15)
        print(f'Log_loss for model {position} --> {round(np.mean(losses),3)}')

In [None]:
def test(xtest, ytest, model):
    """Print the mean test log loss of each already-fitted candidate model.

    Every estimator in ``model`` is asked for probabilities on ``xtest`` and scored
    with ``loga_loss`` against ``ytest``; the mean loss is printed, one line per
    model (numbered from 1). No estimator is refitted.

    xtest --> Test dataset

    ytest --> Test target variables, shape (n_samples, n_labels)

    model --> List with the models generated in the cross-validation step

    Returns None.
    """
    for position, estimator in enumerate(model, start=1):

        # Predictions and metrics
        proba = np.asarray(estimator.predict_proba(xtest))
        if proba.ndim == 3:
            # One (n_samples, n_classes) array per label, stacked as
            # (n_labels, n_samples, n_classes): keep column 1 (probability of the
            # positive class) and transpose to (n_samples, n_labels).
            proba = proba[:, :, 1].T
        # A 2-D output is assumed to already be (n_samples, n_labels); indexing it
        # with [:, :, 1] would raise IndexError, hence the branch above.

        losses = loga_loss(ytest, proba, eps=1e-15)
        print(f'Log_loss for model {position} --> {round(np.mean(losses),3)}')

In [None]:
def global_multiLabel_confusion_matrix(y_test_g,y_est_g):
    """Accumulate a label-by-label confusion matrix over all samples.

    An entry is considered active when it is greater than 0. For every sample:

    * no active true label: the prediction row is added to ``Temp``;
    * active true labels but no active prediction: nothing is counted;
    * otherwise ``CM[r, e]`` is incremented for each active true label ``r`` and
      each active predicted label ``e`` such that ``e == r`` (label correctly
      predicted) or ``e`` is not a true label of the sample (spurious prediction,
      charged to every true label of that sample). Pairs where ``e`` is a
      *different* true label of the same sample are not counted.

    Parameters
    ----------
    y_test_g : numpy.ndarray of shape (n_samples, n_class)
        Ground-truth label indicators.
    y_est_g : numpy.ndarray of shape (n_samples, n_class)
        Estimated label indicators.

    Returns
    -------
    CM : numpy.ndarray of shape (n_class, n_class)
        Accumulated counts; rows are true labels, columns are predicted labels.
    Temp : numpy.ndarray of shape (1, n_class)
        Sum of the prediction rows of the samples that have no active true label.
    """
    n_samples, n_class = y_test_g.shape
    CM = np.zeros((n_class,n_class))
    Temp = np.zeros((1,n_class))

    def acum_CM(y_test,y_est,CM,Temp):
        # Accumulate the contribution of a single sample (CM is updated in place).
        ind_real = np.asarray(y_test > 0).nonzero()[0]
        ind_est = np.asarray(y_est > 0).nonzero()[0]
        if ind_real.size == 0:
            # In case in the ground truth not even one class is active
            Temp = Temp + y_est
        elif ind_est.size != 0:
            # Every (true label, predicted label) combination of this sample.
            true_idx = np.repeat(ind_real, ind_est.size)
            pred_idx = np.tile(ind_est, ind_real.size)
            # Drop the pairs whose predicted label is another true label of the
            # sample; keep exact hits and predictions outside the true label set.
            keep = (true_idx == pred_idx) | ~np.isin(pred_idx, ind_real)
            # The kept pairs are unique, so the fancy-indexed increment is exact.
            CM[true_idx[keep], pred_idx[keep]] += 1
        return CM, Temp

    for i in range(n_samples):
        CM,Temp = acum_CM(y_test_g[i,:],y_est_g[i,:],CM,Temp)

    return CM,Temp

## 1. Random forest classifier

### 1.1. Entrenamiento

In [None]:
# Manual grid search for the random forest: every (n_estimators, max_depth) pair is
# trained and scored on each split produced by cv_s, and the fold model with the
# lowest mean validation log loss is kept as the representative of that pair.
parameters_rf = {'n_estimators':[20,40,60,80,100],'max_depth':[2,4,6,8,10,12]}
values = list(parameters_rf.values())
# modelo_rf -> best fitted pipeline of each parameter combination
# ml_rf / log_rf -> every fitted pipeline and its mean validation loss, in training order
modelo_rf, ml_rf, log_rf = [],[],[]
# NOTE(review): cv_s is a StratifiedShuffleSplit fed with a 2-D multilabel target;
# confirm this stratification is the intended one (cv_i, the IterativeStratification
# splitter, is defined in this notebook but never used).
for n_trees in parameters_rf['n_estimators']:
    for depth in parameters_rf['max_depth']:
        print('PARAMETER COMBINATION')
        print('n_estimator, max_depth:', (n_trees, depth))
        fold_models, fold_losses = [], []
        for (fn, (train_ind, val_ind)) in enumerate(cv_s.split(X_train, y_train)):
            X_tr, X_val = np.array(X_train[train_ind]), np.array(X_train[val_ind])
            y_tr, y_val = np.array(y_train[train_ind]), np.array(y_train[val_ind])

            # A label with no positive sample in this fold would be a single-class
            # output; force one positive (on the fold's private copy of the targets)
            # so every output exposes a probability column for class 1.
            check_for_empty_cols = np.where(y_tr.sum(axis = 0) == 0)[0]
            if len(check_for_empty_cols):
                y_tr[0,check_for_empty_cols] = 1

            # A fresh pipeline per fold: refitting one shared object would make every
            # stored "model" point to the same, last-fitted estimator.
            clf_rf = RandomForestClassifier(criterion='entropy', class_weight='balanced',
                                            n_estimators=n_trees, max_depth=depth, n_jobs=-1)
            scaler = RobustScaler()
            clf_pipe = Pipeline(steps=[('scaler', scaler), ('clf_rf', clf_rf)])

            # Training
            clf_pipe.fit(X_tr, y_tr)

            # predict_proba yields one (n_samples, n_classes) array per label; keep the
            # positive-class column and arrange the result as (n_samples, n_labels).
            y_pred = clf_pipe.predict_proba(X_val)
            log_val = np.array(y_pred)[:,:,1].T

            log_v = loga_loss(y_val, log_val, eps=1e-15)
            print(f'fold {fn+1} --> log_loss: {round(np.mean(log_v),3)}')
            print('--------------------------')
            fold_models.append(clf_pipe)
            fold_losses.append(np.mean(log_v))
        ml_rf.extend(fold_models)
        log_rf.extend(fold_losses)
        # Best model of THIS combination only: the fold with the smallest loss.
        modelo_rf.append(fold_models[int(np.argmin(fold_losses))])
        print('-------------------------------------------')

### 1.2. Prueba de los modelos

In [None]:
test(X_test, y_test, modelo_rf)

## 2. Multi-layer Perceptron classifier (Neural network sklearn)

### 2.1. Entrenamiento

In [None]:
# Manual grid search for the multi-layer perceptron: every
# (activation, hidden_layer_sizes, learning_rate_init) triple is trained and scored on
# each split produced by cv_s, and the fold model with the lowest mean validation
# log loss is kept as the representative of that triple.
parameters_mlp = {'activation':['logistic', 'tanh', 'relu'],'hidden_layer_sizes':[900,1000,1500,1700,2000],
            'learning_rate_init':[0.001,0.003,0.005]}
values_mlp = list(parameters_mlp.values())
# modelo_mlp -> best fitted pipeline of each parameter combination
# ml_mlp / log_mlp -> every fitted pipeline and its mean validation loss, in training order
modelo_mlp, ml_mlp, log_mlp = [],[],[]
# NOTE(review): cv_s is a StratifiedShuffleSplit fed with a 2-D multilabel target;
# confirm this stratification is the intended one (cv_i, the IterativeStratification
# splitter, is defined in this notebook but never used).
for activation in parameters_mlp['activation']:
    for width in parameters_mlp['hidden_layer_sizes']:
        for learn_rate in parameters_mlp['learning_rate_init']:
            print('PARAM_G COMB')
            print('activation, layer, learn_rate:', (activation, width, learn_rate))
            fold_models, fold_losses = [], []
            for (fn, (train_ind, val_ind)) in enumerate(cv_s.split(X_train, y_train)):
                X_tr, X_val = np.array(X_train[train_ind]), np.array(X_train[val_ind])
                y_tr, y_val = np.array(y_train[train_ind]), np.array(y_train[val_ind])

                # Labels with no positive sample in this fold get one forced positive
                # (on the fold's private copy of the targets).
                check_for_empty_cols = np.where(y_tr.sum(axis = 0) == 0)[0]
                if len(check_for_empty_cols):
                    y_tr[0,check_for_empty_cols] = 1

                # A fresh pipeline per fold: refitting one shared object would make
                # every stored "model" point to the same, last-fitted estimator.
                # Two hidden layers: `width` units, then one unit per target label.
                clf_mlp = MLPClassifier(learning_rate='adaptive', random_state=42, activation=activation,
                            hidden_layer_sizes=(width,y_train.shape[1]), learning_rate_init=learn_rate, max_iter=500)
                scaler = RobustScaler()
                # The step keeps its original name 'clf_rf' although it holds the MLP.
                mlp_pipe = Pipeline(steps=[('scaler', scaler), ('clf_rf', clf_mlp)])

                # Training
                mlp_pipe.fit(X_tr, y_tr)

                # The probability matrix is passed to loga_loss as returned, i.e. it
                # is used as an (n_samples, n_labels) array without reshaping.
                y_pred = mlp_pipe.predict_proba(X_val)

                log_v = loga_loss(y_val, y_pred, eps=1e-15)
                print(f'fold {fn+1} --> log_loss: {round(np.mean(log_v),3)}')
                print('--------------------------')
                fold_models.append(mlp_pipe)
                fold_losses.append(np.mean(log_v))
            ml_mlp.extend(fold_models)
            log_mlp.extend(fold_losses)
            # Best model of THIS combination only: the fold with the smallest loss.
            modelo_mlp.append(fold_models[int(np.argmin(fold_losses))])
            print('-------------------------------------------')

### 2.2. Prueba de los modelos

In [None]:
test(X_test, y_test, modelo_mlp)