In [None]:
import pandas as pd
from qgrid import show_grid
import seaborn as sns
from pandas_profiling import ProfileReport
from funpymodeling.exploratory import freq_tbl, status, profiling_num, cat_vars, num_vars

In [None]:
import pickle

# Load the pre-built dataset from a local pickle file into `data`.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only use it on files from a trusted source.
with open('data/d_eph5.pickle', 'rb') as handle:
    data = pickle.load(handle)

In [None]:
# Quick overview of the loaded dataset via funpymodeling's `status` helper
# (presumably a per-column summary of missing values, zeros and dtypes — confirm in its docs).
status(data)

In [None]:
# Separate the target column ('ingreso_15k_si') from the predictor columns.
data_y = data['ingreso_15k_si']
data_x = data.drop(columns='ingreso_15k_si')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.
# A fixed random_state makes the partition reproducible across kernel restarts;
# without it every run trains and evaluates on different rows, so the metrics
# below change from run to run even though the forest itself is seeded.
x_train, x_test, y_train, y_test = train_test_split(
    data_x, data_y, test_size=0.2, random_state=99
)

### 2) Creación del modelo predictivo

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest made of 1000 decision trees, built with a fixed seed (99).
rf = RandomForestClassifier(random_state=99, n_estimators=1000)

In [None]:
# Train the forest on the training split only; the test split stays untouched.
rf.fit(x_train, y_train)

### 3) Predicción de la clase y score

In [None]:
# Predicted class label for every training row (shown as the cell output).
rf.predict(x_train)

In [None]:
# Per-class predicted probabilities for the training rows (one column per class).
pred_probs=rf.predict_proba(x_train)

In [None]:
# Display the probability array computed by predict_proba.
pred_probs

In [None]:
# Keep only column 1 of the probability array as the training score.
# NOTE(review): presumably the positive ("si") class — confirm against rf.classes_.
y_prob_tr=pred_probs[:,1]

In [None]:
# Display the column-1 scores extracted above.
y_prob_tr

### 4) Matriz de confusión

In [None]:
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2
# (replacement: ConfusionMatrixDisplay.from_estimator) — confirm the pinned scikit-learn version.
from sklearn.metrics import plot_confusion_matrix # caution: it assumes 0.5 as the cutoff

Vamos a usar directamente la función que grafica, porque el gráfico ya incluye la matriz de confusión original.

4.a) Valores absolutos:

In [None]:
# Font scale is a global seaborn setting: it affects every later plot too.
sns.set(font_scale=1.2)

# Confusion matrix on the training split, shown as absolute counts.
plot_confusion_matrix(
    rf, x_train, y_train,
    cmap='Blues',
    display_labels=['no', 'si'],
)

4.b) Normalizando por fila:

In [None]:
# Same confusion matrix on the training split, normalized over the true (row) labels.
plot_confusion_matrix(
    rf, x_train, y_train,
    normalize='true',
    cmap='Blues',
    display_labels=['no', 'si'],
)

#### 4.c) `plot_confusion_matrix` no permite variar el punto de corte, entonces armamos la matriz a mano con `pd.crosstab`:

In [None]:
import seaborn as sns

# Global seaborn font scale (affects later plots as well).
sns.set(font_scale=1.5)

# Class predictions from the model's own predict() on the training split.
y_pred1 = rf.predict(x_train)

# Hand-built confusion matrix: rows are the actual class, columns the predicted
# class, and each row is normalized to sum to 1.
conf_mat1 = pd.crosstab(
    index=y_train,
    columns=y_pred1,
    rownames=['Actual'],
    colnames=['Pred'],
    normalize='index',
)

sns.heatmap(conf_mat1, annot=True, cmap='Blues', fmt='g')

### 5) Seteo del punto de corte

In [None]:
# Display the training scores again before choosing a cutoff.
y_prob_tr

In [None]:
import numpy as np

In [None]:
# Mean of the training scores.
# NOTE(review): presumably the reference for the 0.27 cutoff used in the next cells — confirm.
y_prob_tr.mean()

In [None]:
# Label each training row "si" when its score is above 0.27, otherwise "no".
np.where(y_prob_tr > 0.27, "si", "no")

In [None]:
# Binary predictions with the custom cutoff: 1 when the score exceeds 0.27, else 0.
y_pred2 = (y_prob_tr > 0.27).astype(int)

In [None]:
# Display the 0/1 predictions obtained with the custom cutoff.
y_pred2

### 6) Matriz de confusión (punto de corte custom)


In [None]:
import seaborn as sns

# Same row-normalized confusion matrix as before, but the columns now come from
# the custom-cutoff predictions (y_pred2) instead of rf.predict().
conf_mat2 = pd.crosstab(
    index=y_train,
    columns=y_pred2,
    rownames=['Actual'],
    colnames=['Pred'],
    normalize='index',
)

sns.heatmap(conf_mat2, annot=True, cmap='Blues', fmt='g')

Comparando antes y después:

In [None]:
import matplotlib.pyplot as plt

sns.set(font_scale=1)  # font scale is a global seaborn setting

# One row, two panels: default predictions on the left, custom cutoff on the right.
fig, ax = plt.subplots(1, 2)

sns.heatmap(conf_mat1, annot=True, cmap='Blues', fmt='g', ax=ax[0])
sns.heatmap(conf_mat2, annot=True, cmap='Blues', fmt='g', ax=ax[1])

# Title each panel so the figure can be read on its own; without titles the two
# heatmaps cannot be told apart.
ax[0].set_title('Cutoff 0.5 (rf.predict)')
ax[1].set_title('Cutoff 0.27')

fig.tight_layout()  # keep the colorbar and labels of one panel from overlapping the other
fig.show(warn=False)

### 7) Análisis de los distintos puntos de corte

In [None]:
from yellowbrick.classifier.threshold import discrimination_threshold

# Yellowbrick's discrimination-threshold visualizer, run on the training split.
# NOTE(review): this helper presumably refits `rf` (1000 trees) over several
# shuffled trials, so it may be slow — confirm against the yellowbrick docs.
dis_res=discrimination_threshold(rf, x_train, y_train)
dis_res

### 8) Curva ROC

Graficamos para training y para test

In [None]:
from sklearn.metrics import plot_roc_curve
import matplotlib.pyplot as plt

# NOTE(review): plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2
# (replacement: RocCurveDisplay.from_estimator) — confirm the pinned scikit-learn version.

# Draw the training ROC first, then overlay the test ROC on the same axes.
# Explicit `name=` labels keep the two curves distinguishable in the legend;
# by default both would be labelled with the estimator's class name.
tr_disp = plot_roc_curve(rf, x_train, y_train, name='RF (train)')
ts_disp = plot_roc_curve(rf, x_test, y_test, name='RF (test)', ax=tr_disp.ax_)
ts_disp.figure_.suptitle("ROC curve comparison")

plt.show()

In [None]:
import pickle

# Save the trained model to disk.
filename = 'rf.pkl'
# The context manager guarantees the file is flushed and closed before it is read back.
with open(filename, 'wb') as model_file:
    pickle.dump(rf, model_file)  # rf = our model

# Load it back to use it at some later point. Having the dump and the load side
# by side obviously serves no purpose beyond the demonstration ;)
# NOTE: pickle.load can execute arbitrary code; only load pickles you trust.
with open(filename, 'rb') as model_file:
    rf_loaded = pickle.load(model_file)

In [None]:
import pandas as pd
from yellowbrick.datasets import load_spam

# Load the spam dataset as a (features, target) pair.
x_data, y_data = load_spam()

# Attach the target as an extra column and write everything to a single CSV.
# NOTE(review): to_csv is called without index=False, so the row index is also
# written as the first column — confirm that downstream readers expect it.
x_data['is_spam'] = y_data
x_data.to_csv('data/spam_data.csv')