# Tutorial numpy y matplotlib

El objetivo del presente tutorial es ilustrar los conceptos básicos de las librerías NumPy y Matplotlib, usadas en ciencia de datos para el cómputo numérico y la visualización de datos, respectivamente.

Comenzaremos con NumPy, una librería para realizar computación científica en Python.

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import matplotlib.pyplot as plt
import time
import itertools


# Three common ways to obtain an ndarray: from a Python list, from a
# half-open integer range, and as a zero-filled array of a given shape.
a = np.array([1, 2, 3])
b = np.arange(1, 10)        # 1, 2, ..., 9 (the stop value is excluded)
c = np.zeros(shape=(2, 4))  # 2 rows x 4 columns of 0.0

print('Construyendo con listas\n', a)
print('Usando arange\n', b)
print('Usando zeros\n', c)

Algunas funciones típicas de NumPy:

In [None]:
# Reshaping keeps the same 16 values and only changes how they are indexed.
a = np.arange(1, 17)
b = a.reshape(2, 8)
c = a.reshape(4, 4)

print(c, '\n')
# All rows, column index 1: the second column of the 4x4 array.
print(c[:, 1])

In [None]:
# A nested Python list and the 2-D ndarray built from it.
lista_de_python = [
    [1, 2],
    [3, 4],
]
array_de_np = np.array(lista_de_python)

# One item per line: the list, the array, and the array's shape.
print(lista_de_python, array_de_np, array_de_np.shape, sep='\n')

In [None]:
# 128 consecutive integers stored as a single column.
a = np.arange(128).reshape((128, 1))
# -1 asks NumPy to infer that axis from the total size: 128 / (4 * 8 * 2) = 2.
b = a.reshape((-1, 4, 8, 2))
c = a.reshape((1, 1, 128))

print(a.shape, b.shape, c.shape, sep='\n')

¡Ahora juguemos con matplotlib!

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline

# 1000 integer sample positions; 500 of them span one full sine period,
# so the plot shows two complete oscillations.
x = np.arange(1000)
y = np.sin(2 * np.pi * x / 500.)

fig = plt.figure(figsize=(10, 8), facecolor='white')

# 'r^' draws red upward-pointing triangles with no connecting line.
plt.plot(x, y, 'r^', label=':)')
plt.title('Movimiento oscilatorio')
plt.xlabel('Tiempo [s]')
plt.ylabel('Posicion [m]')
plt.legend()

# Problema: graficar la distribución empírica del lanzamiento de dados (actividad 1)

In [None]:
def simulate_dice(throws, digit):
    """Count how often `digit` comes up in `throws` rolls of a six-sided die.

    Each roll is drawn individually with `np.random.randint(1, 7)` inside a
    Python-level loop, i.e. one random-number call per throw.

    Args:
        throws: Number of rolls to simulate.
        digit: Face to count; must be one of 1..6.

    Returns:
        The number of rolls that equal `digit`, as a Python int.

    Raises:
        AssertionError: If `digit` is not in `range(1, 7)`.
    """
    assert digit in range(1, 7)
    # Contribute a 1 for every roll that matches; summing them is the count.
    return sum(1 for _ in range(throws) if np.random.randint(1, 7) == digit)

In [None]:
# Run 100 independent experiments of 6000 throws each with the loop-based
# simulator, counting how often a 3 comes up, then plot the distribution of
# those counts. The reported time covers both the simulation and the plot.
t = time.time()
simulated_counts = [simulate_dice(6000, 3) for _ in range(100)]

fig = plt.figure(figsize=(10, 8), facecolor='white')
plt.hist(simulated_counts, bins=20)
plt.show()

print(f'It took {time.time() - t:.2f} seconds')

## Un poco lento, ¿no?

In [None]:
def simulate_dice_2(throws, digit):
    """Count how often `digit` comes up in `throws` rolls of a six-sided die.

    Vectorised version: all rolls are drawn with a single
    `np.random.randint` call and compared against `digit` at once.

    Args:
        throws: Number of rolls to simulate.
        digit: Face to count. It is not validated here; a value outside
            1..6 simply never matches and yields 0.

    Returns:
        The number of rolls equal to `digit`, as a NumPy integer scalar.
    """
    samples = np.random.randint(1, 7, size=throws)
    # Summing a boolean mask counts its True entries. This avoids the
    # `np.int` alias, which was removed in NumPy 1.24 and raised
    # AttributeError here.
    return (samples == digit).sum()

In [None]:
# Same experiment as with the loop-based simulator (100 runs of 6000 throws,
# counting 3s), now using the vectorised implementation. The reported time
# covers both the simulation and the plot.
t = time.time()
simulated_counts = [simulate_dice_2(6000, 3) for _ in range(100)]

fig = plt.figure(figsize=(10, 8), facecolor='white')
plt.hist(simulated_counts, bins=20)
plt.show()

print(f'It took {time.time() - t:.2f} seconds')

## Bastante mejor ;)

# ¡Resolvamos un problema de clasificación!

In [None]:
def create_dataset(n_per_class):
    """Build a 1-D, two-class toy dataset with `n_per_class` samples per class.

    Class 0 samples come from a Gaussian with mean -1.0 and standard
    deviation 2.0; class 1 samples come from a standard gamma distribution
    with shape parameter 3.

    Args:
        n_per_class: Number of samples to draw for each class.

    Returns:
        A tuple `(x, y)` of arrays of length `2 * n_per_class`: `x` holds the
        class-0 samples followed by the class-1 samples, and `y` holds the
        matching labels (0.0 for the first half, 1.0 for the second).
    """
    mean_1, sigma_1 = -1.0, 2.0

    # The Gaussian draw happens before the gamma draw, so results under a
    # fixed seed depend on this order.
    gaussian_samples = mean_1 + sigma_1 * np.random.randn(n_per_class)
    gamma_samples = np.random.standard_gamma(3, size=n_per_class)
    x = np.concatenate([gaussian_samples, gamma_samples])

    # Labels: start from all zeros and mark the second half as class 1.
    y = np.zeros(2 * n_per_class)
    y[n_per_class:] = 1.0
    return x, y

# Draw 2000 samples per class and show the features followed by the labels.
x, y = create_dataset(n_per_class=2000)
print(x, y, sep='\n')

In [None]:
# Split the feature values by their true label.
x1, x2 = x[y == 0], x[y == 1]

fig = plt.figure(figsize=(10, 8), facecolor='white')

# density=True normalises each histogram to unit area, and the low alpha
# keeps the region where both classes overlap visible.
plt.hist(x1, bins=30, density=True, alpha=0.3, color='blue', label='Class 1')
plt.hist(x2, bins=30, density=True, alpha=0.3, color='red', label='Class 2')

plt.title('Dataset histogram per class')
plt.xlabel('X value')
plt.ylabel('Density')
plt.legend()
plt.show()

## Supongamos que nuestro clasificador consiste en un umbral simple que separa ambas clases

In [None]:
def classifier(x, threshold):
    """Classify samples with a single threshold.

    Args:
        x: Array (or array-like, e.g. a list) of feature values.
        threshold: Decision boundary.

    Returns:
        An integer array with the same shape as `x`: 1 where the value is
        strictly greater than `threshold`, 0 otherwise.
    """
    # np.asarray lets plain Python sequences be compared element-wise too.
    # The builtin `int` replaces the `np.int` alias, which was removed in
    # NumPy 1.24 and raised AttributeError here.
    return (np.asarray(x) > threshold).astype(int)

In [None]:
# Classify every sample with a fixed threshold of 1.0.
predictions = classifier(x, threshold=1.0)

# Boolean masks select the samples whose prediction agrees / disagrees
# with the true label.
correctly_classified = x[predictions == y]
misclassified_samples = x[predictions != y]

print(
    f'{len(correctly_classified)} correctly classified samples '
    f'and {len(misclassified_samples)} misclassified ones'
)

## Construyamos la matriz de confusión asociada al clasificador, i.e. calculemos los verdaderos positivos, verdaderos negativos, falsos positivos y falsos negativos

### Elijamos la clase 2 como "positiva"

In [None]:
def confusion_matrix(predictions, labels):
    """Count the four confusion-matrix outcomes of a binary classifier.

    Label 1 is treated as the positive class and label 0 as the negative
    class; samples whose label is neither 0 nor 1 are not counted.

    Args:
        predictions: Array (or array-like) of predicted labels.
        labels: Array (or array-like) of true labels, same shape.

    Returns:
        A tuple `(VP, VN, FP, FN)` of NumPy integer scalars: true positives,
        true negatives, false positives and false negatives.
    """
    # Accept plain Python sequences as well as ndarrays.
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    correct = predictions == labels
    wrong = predictions != labels
    positive = labels == 1
    negative = labels == 0

    # Summing a boolean mask counts its True entries. This avoids the
    # `np.int` alias, which was removed in NumPy 1.24 and raised
    # AttributeError here.
    VP = (correct & positive).sum()
    VN = (correct & negative).sum()
    FP = (wrong & negative).sum()
    FN = (wrong & positive).sum()
    return VP, VN, FP, FN

# Counts for the current predictions, in the order: true positives,
# true negatives, false positives, false negatives.
VP, VN, FP, FN = confusion_matrix(predictions=predictions, labels=y)
print(VP, VN, FP, FN)

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat map.

    Args:
        cm: 2-D confusion matrix (ndarray or nested list). Rows are drawn
            along the "True label" axis and columns along the
            "Predicted label" axis.
        classes: Sequence of class names used as tick labels on both axes.
        normalize: If True, divide every row by its sum before printing and
            plotting, so each row shows fractions instead of counts.
        title: Title of the plot.
        cmap: Matplotlib colormap used for the heat map.
    """
    # Accept nested lists as well as ndarrays.
    cm = np.asarray(cm)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig = plt.figure(figsize=(10, 8))
    fig.set_facecolor('white')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as integers; anything else (normalised
    # values, or float counts passed with normalize=False) uses two decimals,
    # because the 'd' format code rejects floats.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells darker than the midpoint of the value range get white text so the
    # annotation stays readable against the colormap.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # tight_layout only accounts for artists that already exist, so it runs
    # after the axis labels have been added.
    plt.tight_layout()

In [None]:
# Row 0 holds the samples whose true label is 0 (VN, FP) and row 1 those
# whose true label is 1 (FN, VP); columns follow the predicted label.
# NOTE: this rebinds the name `cm`, which an earlier cell bound to the
# `matplotlib.cm` module.
cm = np.array([VN, FP, FN, VP]).reshape(2, 2)
plot_confusion_matrix(cm, classes=['Class 1', 'Class 2'])

## Por último, construyamos una curva ROC variando el umbral

In [None]:
fprs = []
tprs = []

# Sweep 100 evenly spaced thresholds over [-6, 10] and record one
# (false positive rate, true positive rate) pair per threshold.
for threshold in np.linspace(-6, 10, num=100):
    predictions = classifier(x, threshold)
    VP, VN, FP, FN = confusion_matrix(predictions, y)
    # FPR: fraction of actual negatives predicted positive.
    fpr = FP / (FP + VN)
    # TPR: fraction of actual positives predicted positive.
    tpr = VP / (VP + FN)
    fprs.append(fpr)
    tprs.append(tpr)

In [None]:
fig = plt.figure(figsize=(10, 8), facecolor='white')

# One star marker per threshold: false positive rate on the x axis,
# true positive rate on the y axis.
plt.plot(fprs, tprs, '*')
plt.ylabel('True positive rate')
plt.xlabel('False positive rate')
plt.title('ROC curve')