# Seminarios de Procesos Gaussianos

### Grupo de procesamiento de la información visual (VIP) 

<div style="text-align: right"> Miguel López Pérez </div>

# Clasificación

In [1]:
import gpflow
import matplotlib.pyplot as plt
import numpy as np
import sklearn.datasets
import sklearn.ensemble
import sklearn.metrics
import tensorflow as tf
from sklearn import datasets
from sklearn import model_selection

%matplotlib inline

In [None]:
def gridParams():
    """Build the fixed 50x50 evaluation grid used for plotting.

    Returns
    -------
    mins, maxs : list of float
        Lower and upper bound of the grid along each of the two axes.
    xx, yy : ndarray, shape (50, 50)
        Mesh-grid coordinate matrices.
    Xplot : ndarray, shape (2500, 2)
        The grid points, one (x, y) pair per row.
    """
    nGrid = 50
    mins = [-0.75, -1.5]
    maxs = [2.5, 1.25]
    # One evenly spaced axis per input dimension, bounds included.
    axis_values = [np.linspace(lo, hi, nGrid) for lo, hi in zip(mins, maxs)]
    xx, yy = np.meshgrid(*axis_values)
    # Flatten both coordinate matrices and pair them up row by row.
    Xplot = np.array([xx.ravel(), yy.ravel()]).T
    return mins, maxs, xx, yy, Xplot

def plot(m, ax, Xtrain, Ytrain):
    """Draw the training points and the model's 0.5 decision contour on ``ax``.

    Parameters
    ----------
    m : object
        Model exposing ``predict_y(X)``; element 0 of its result is evaluated
        on the plotting grid and contoured at the 0.5 level.
    ax : matplotlib axes
        Axes to draw on.
    Xtrain : ndarray
        Training inputs; columns 0 and 1 are used as the plot coordinates.
    Ytrain : ndarray
        Training labels, read from column 0; points labelled 1 are drawn in
        blue and points labelled 0 in red.
    """
    # Only the mesh and the flattened grid points are needed here.
    _, _, xx, yy, Xplot = gridParams()
    p = m.predict_y(Xplot)[0]

    # Markers only ('o'): the colour is given once through `color`, instead of
    # naming black in the format string ('ko') and then overriding it, which
    # makes matplotlib warn about a redundantly defined colour.
    positive = Ytrain[:, 0] == 1
    negative = Ytrain[:, 0] == 0
    ax.plot(Xtrain[positive, 0], Xtrain[positive, 1], 'o', color='b', mew=2)
    ax.plot(Xtrain[negative, 0], Xtrain[negative, 1], 'o', color='r', mew=2)

    if hasattr(m, 'feature') and hasattr(m.feature, 'Z'):
        # The model carries a `feature.Z` array (presumably the inducing
        # inputs of a sparse model -- confirm): overlay it as small black dots
        # and report how many rows it has.
        Z = m.feature.Z.read_value()
        ax.plot(Z[:, 0], Z[:, 1], 'ko', mew=0, ms=4)
        ax.set_title('m={}'.format(Z.shape[0]))
    else:
        ax.set_title('full')

    # Decision boundary: the 0.5 level set of the predictions over the grid.
    ax.contour(xx, yy, p.reshape(*xx.shape), [0.5], colors='k', linewidths=1.8, zorder=100)

## Ejemplo de juguete

In [None]:
# Toy binary problem: 200 noisy "two moons" points with a fixed seed.
X, y = sklearn.datasets.make_moons(n_samples=200, noise=0.2, random_state=110)
# Labels as a column vector; the cells below index them as y[:, 0].
y = y.reshape(-1, 1)

plt.figure()
# Markers only ('o'): the colour comes solely from `color`, rather than naming
# black in the format string ('ko') and overriding it, which makes matplotlib
# warn about a redundantly defined colour.
for label, colour in ((0, 'r'), (1, 'b')):
    members = y[:, 0] == label
    plt.plot(X[members, 0], X[members, 1], 'o', mew=2, color=colour)
plt.show()

In [None]:
# Variational GP classifier on the toy data: RBF kernel over the 2 input
# dimensions, Bernoulli likelihood for the 0/1 labels.
m = gpflow.models.VGP(X, y,
                      kern=gpflow.kernels.RBF(2),
                      likelihood=gpflow.likelihoods.Bernoulli())
# The iteration cap is passed to minimize(), the same way the breast-cancer and
# iris cells of this notebook do it.
gpflow.train.ScipyOptimizer().minimize(m, maxiter=200)

models = [m]

# One panel per model. squeeze=False makes `axes` a 2-D array whatever
# len(models) is, so every model is drawn on its own Axes.
fig, axes = plt.subplots(1, len(models), sharex=True, sharey=True, squeeze=False)
for ax, model in zip(axes[0], models):
    plot(model, ax, X, y)
    ax.set_yticks([])
    ax.set_xticks([])

In [None]:
# Same toy data as before, regenerated so this cell can be run on its own.
X, y = sklearn.datasets.make_moons(n_samples=200, noise=0.2, random_state=110)
y = y.reshape(-1, 1)
m = gpflow.models.VGP(X, y,
                      kern=gpflow.kernels.RBF(2),
                      likelihood=gpflow.likelihoods.Bernoulli())

# The iteration cap is passed to minimize(), the same way the breast-cancer and
# iris cells of this notebook do it.
gpflow.train.ScipyOptimizer().minimize(m, maxiter=200)
# Overwrite the kernel lengthscale AFTER training, so the plot below uses 0.1
# rather than the optimised value (presumably to show how a short lengthscale
# changes the decision boundary -- confirm the intent).
m.kern.lengthscales = 0.1

models = [m]

# One panel per model. squeeze=False makes `axes` a 2-D array whatever
# len(models) is, so every model is drawn on its own Axes.
fig, axes = plt.subplots(1, len(models), sharex=True, sharey=True, squeeze=False)
for ax, model in zip(axes[0], models):
    plot(model, ax, X, y)
    ax.set_yticks([])
    ax.set_xticks([])

**Pregunta:** ¿Qué tipo de frontera obtendríamos con el kernel Matern12?

## Ejemplo de clasificación binaria con breast cancer

Para clasificación binaria deberemos elegir como likelihood la *Bernoulli*.

In [None]:
# Load the breast-cancer data and hold out 40% of it for testing.
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
print('El tamaño de este dataset es', X.shape)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.4, random_state=100)

# Variational GP classifier: one RBF kernel over all input columns, Bernoulli
# likelihood, labels passed as a column vector.
n_features = X_train.shape[1]
m = gpflow.models.VGP(
    X_train,
    y_train.reshape(-1, 1),
    kern=gpflow.kernels.RBF(n_features),
    likelihood=gpflow.likelihoods.Bernoulli(),
)
gpflow.train.ScipyOptimizer().minimize(m, maxiter=300)
print('La log-verosimilitud del modelo es', m.compute_log_likelihood())

In [None]:
# Element 0 of predict_y's result holds the model's predictions for X_test.
prob_gp = m.predict_y(X_test)[0]
# Threshold each prediction at 0.5 to obtain hard 0/1 labels.
pred_gp = [1 if row > 0.5 else 0 for row in prob_gp]
acc_gp = sklearn.metrics.accuracy_score(y_test, pred_gp)

In [None]:
# Random-forest baseline, fitted and scored on the same train/test split.
rf_model = sklearn.ensemble.RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    random_state=12,
)
# fit() returns the estimator itself, so prediction can be chained onto it.
pred_rf = rf_model.fit(X_train, y_train).predict(X_test)
acc_rf = sklearn.metrics.accuracy_score(y_test, pred_rf)

In [None]:
# Report the random-forest and GP test accuracies side by side.
print(
    'El accuracy del random forest es ',
    acc_rf,
    'mientras que en el modelo GP es ',
    acc_gp,
)

**Pregunta:** ¿Alguna sugerencia para mejorar el modelo GP?

##  Ejemplo de clasificación multiclase con iris 

Para clasificación multiclase deberemos elegir como likelihood la *Multiclass*.

In [2]:
# Load iris and hold out 40% of it for testing.
X, y = sklearn.datasets.load_iris(return_X_y=True)
print('El tamaño de este dataset es', X.shape)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.4, random_state=100)

# Multiclass variational GP: ARD RBF kernel (one lengthscale per input
# column), 3-class likelihood and 3 latent functions.
n_features = X_train.shape[1]
m = gpflow.models.VGP(
    X_train,
    y_train.reshape(-1, 1),
    kern=gpflow.kernels.RBF(n_features, ARD=True),
    likelihood=gpflow.likelihoods.MultiClass(3),
    num_latent=3,
)
gpflow.train.ScipyOptimizer().minimize(m, maxiter=300)
print('La log-verosimilitud del modelo es', m.compute_log_likelihood())

El tamaño de este dataset es (150, 4)
INFO:tensorflow:Optimization terminated with:
  Message: b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
  Objective function value: 37.068741
  Number of iterations: 300
  Number of functions evaluations: 334


INFO:tensorflow:Optimization terminated with:
  Message: b'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
  Objective function value: 37.068741
  Number of iterations: 300
  Number of functions evaluations: 334


La log-verosimilitud del modelo es -34.46370962162432


In [3]:
# Element 0 of predict_y's result holds the model's predictions for X_test.
class_scores = m.predict_y(X_test)[0]
# Predicted class = index of the largest entry along axis 1 of each row.
pred_gp = np.argmax(class_scores, axis=1)
acc_gp = sklearn.metrics.accuracy_score(y_test, pred_gp)

In [None]:
# Random-forest baseline, fitted and scored on the same train/test split.
rf_model = sklearn.ensemble.RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    random_state=12,
)
# fit() returns the estimator itself, so prediction can be chained onto it.
pred_rf = rf_model.fit(X_train, y_train).predict(X_test)
acc_rf = sklearn.metrics.accuracy_score(y_test, pred_rf)

In [None]:
# Report the random-forest and GP test accuracies side by side.
print(
    'El accuracy del random forest es ',
    acc_rf,
    'mientras que en el modelo GP es ',
    acc_gp,
)

Importancia de las variables en iris

In [None]:
# Summary table of the kernel lengthscale parameter; show the first entry of
# its 'value' column as the cell output.
lengthscale_table = m.kern.lengthscales.as_pandas_table()
lengthscale_table['value'][0]

Podemos ver que la segunda variable es la que más información nos da sobre la clase del iris. Aun así, todas son relevantes: en un kernel ARD, una longitud de escala pequeña indica una variable muy relevante, mientras que una longitud de escala muy grande nos indicaría que esa variable apenas afecta a la clasificación.