# Naive Bayes 

In [9]:
import pandas as pd
import numpy as np

## Carga de datos

### Iris Plant

In [10]:
data = pd.read_csv('data/iris.data', header=None).values

clm_label = 4 # Obtenemos el indice de la etiqueta
clm_chrs = (1, data.shape[1]) if clm_label == 0 else (0, data.shape[1]-1) # Obtener las demás columnas

data[:10]

array([[5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
       [4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
       [4.7, 3.2, 1.3, 0.2, 'Iris-setosa'],
       [4.6, 3.1, 1.5, 0.2, 'Iris-setosa'],
       [5.0, 3.6, 1.4, 0.2, 'Iris-setosa'],
       [5.4, 3.9, 1.7, 0.4, 'Iris-setosa'],
       [4.6, 3.4, 1.4, 0.3, 'Iris-setosa'],
       [5.0, 3.4, 1.5, 0.2, 'Iris-setosa'],
       [4.4, 2.9, 1.4, 0.2, 'Iris-setosa'],
       [4.9, 3.1, 1.5, 0.1, 'Iris-setosa']], dtype=object)

### Breast Cancer

In [13]:
data = pd.read_csv('data/Breast_cancer_data.csv', header=None).values
clm_label = 5 # Columna de la etiqueta
clm_chrs = (1, data.shape[1]) if clm_label == 0 else (0, data.shape[1]-1) # Obtener las demás columnas
data[:10]

array([['mean_radius', 'mean_texture', 'mean_perimeter', 'mean_area',
        'mean_smoothness', 'diagnosis'],
       ['17.99', '10.38', '122.8', '1001.0', '0.1184', '0'],
       ['20.57', '17.77', '132.9', '1326.0', '0.08474', '0'],
       ['19.69', '21.25', '130.0', '1203.0', '0.1096', '0'],
       ['11.42', '20.38', '77.58', '386.1', '0.1425', '0'],
       ['20.29', '14.34', '135.1', '1297.0', '0.1003', '0'],
       ['12.45', '15.7', '82.57', '477.1', '0.1278', '0'],
       ['18.25', '19.98', '119.6', '1040.0', '0.09463', '0'],
       ['13.71', '20.83', '90.2', '577.9', '0.1189', '0'],
       ['13.0', '21.82', '87.5', '519.8', '0.1273', '0']], dtype=object)

### Wine

In [3]:
data = pd.read_csv('data/wine.data', header=None).values
clm_label = 0
clm_chrs = (1, data.shape[1]) if clm_label == 0 else (0, data.shape[1]-1) # Obtener las demás columnas
data[:5]

array([[1.000e+00, 1.423e+01, 1.710e+00, 2.430e+00, 1.560e+01, 1.270e+02,
        2.800e+00, 3.060e+00, 2.800e-01, 2.290e+00, 5.640e+00, 1.040e+00,
        3.920e+00, 1.065e+03],
       [1.000e+00, 1.320e+01, 1.780e+00, 2.140e+00, 1.120e+01, 1.000e+02,
        2.650e+00, 2.760e+00, 2.600e-01, 1.280e+00, 4.380e+00, 1.050e+00,
        3.400e+00, 1.050e+03],
       [1.000e+00, 1.316e+01, 2.360e+00, 2.670e+00, 1.860e+01, 1.010e+02,
        2.800e+00, 3.240e+00, 3.000e-01, 2.810e+00, 5.680e+00, 1.030e+00,
        3.170e+00, 1.185e+03],
       [1.000e+00, 1.437e+01, 1.950e+00, 2.500e+00, 1.680e+01, 1.130e+02,
        3.850e+00, 3.490e+00, 2.400e-01, 2.180e+00, 7.800e+00, 8.600e-01,
        3.450e+00, 1.480e+03],
       [1.000e+00, 1.324e+01, 2.590e+00, 2.870e+00, 2.100e+01, 1.180e+02,
        2.800e+00, 2.690e+00, 3.900e-01, 1.820e+00, 4.320e+00, 1.040e+00,
        2.930e+00, 7.350e+02]])

## Fórmula

$X = (x_0, x_1, x_2, ..., x_n)$ Donde $X$ son las características

$Y \in \lbrace 0,1 \rbrace$ Donde $Y$ son las clases

### Teorema de Bayes
$P(Y|X) = \frac{P(X = x_0, x_1, x_2 | Y = y)P(Y = y)}{P(X)}$

##### Calcular $P(Y = y)$ para todas las clases - prior

In [11]:
def calculate_prior(ds):
    total = sum(map(lambda arr: arr.shape[0], ds))
    return [arr.shape[0]/total for arr in ds]

### Calcular la distribución normal
**Nota:** Hay documentación que proporciona diferentes maneras de realizar el calculo, pero se toma en cuenta la que aparece en las diapositivas presentadas por el profesor.

In [12]:
def calculate_normal_distribution(feat_val, mean, std):
    # p_x_given_y = (1 / (np.sqrt(2 * np.pi) * std)) * np.exp(-((feat_val - mean)**2 / (2 * std**2)))
    return (1/np.sqrt(2*np.pi*std))*np.exp(-.5*((feat_val-mean)/std)**2)

### Calcular $P(X=x_1|Y=y),P(X=x_2|Y=y), …, P(X=x_n|Y=y) * P(Y=y)$ para todos y encontrar el máximo

In [13]:
def naive_bayes(ds, c_set):
    # Calcular probabilidad a priori
    prior = calculate_prior(ds)

    measures = []
    for cl in ds:
        m = []
        for i in range(clm_chrs[0], clm_chrs[1]):
            m.append({'mean': cl[:, i].mean(), 'std': cl[:, i].std()})
        measures.append(m)

    accuracies = []
    for i, cl in enumerate(c_set):
        accuracy = 0
        for e in cl:
            likelihood = [1] * len(c_set)
            for im, m in enumerate(measures):
                for j, ch in enumerate(e[clm_chrs[0]:clm_chrs[1]]):
                    likelihood[im] *= (1/np.sqrt(2*np.pi*m[j]['std']))*np.exp(-.5*((ch-m[j]['mean'])/m[j]['std'])**2)
                likelihood[im] *= prior[im]
            if np.argmax(likelihood) == i:
                accuracy += 1
        accuracies.append(accuracy)

    return np.array(accuracies)

### Subdividir el modelo

In [15]:
def get_subarrs_from_dataset(array):
    l = []
    c = 0
    for i, e in enumerate(array):
        if e[clm_label] != array[c][clm_label]:
            l.append(data[c:i])
            c = i
    l.append(data[c:i+1])

    return np.array(l)

### Entrenamiento y prueba

In [16]:
from functools import reduce
from sklearn.model_selection import train_test_split

subarrs = get_subarrs_from_dataset(data)

train, test = [], []
for arr in subarrs:
    tr, te = train_test_split(arr, test_size=.2, random_state=50)
    train.append(tr)
    test.append(te)
train = np.array(train)
test = np.array(test)
del subarrs, arr, tr, te

# .:Resultados:.
# Entrenamiento
acc_train = naive_bayes(train, train)
# print('Resultados con el subconjunto de entrenamiento')
# for t in range(train.shape[0]):
#     print(f'· Clase \'{train[t][0][clm_label]}\': {accuracy[t]}/{train[t].shape[0]}')
# print(f'Exactitud: {100*sum(accuracy)/sum(map(lambda e: e.shape[0], train)):.2f}%')

# Prueba
acc_test = naive_bayes(train, test)
# print('\nResultados con el subjuntunto de prueba')
# for t in range(test.shape[0]):
#     print(f'· Clase \'{test[t][0][clm_label]}\': {accuracy[t]}/{train[t].shape[0]}')
# print(f'Exactitud: {100*sum(accuracy)/sum(map(lambda e: e.shape[0], test)):.2f}%')
# pass

### Muestra de resultados

In [17]:
def show_results(ds, accuracy, clm_label, ds_name):
    print('\nResultados con el subjuntunto de', ds_name)
    for t in range(ds.shape[0]):
        print(f'- Clase \'{ds[t][0][clm_label]}\': {accuracy[t]}/{train[t].shape[0]}')
    print(f'Exactitud: {100*sum(accuracy)/sum(map(lambda e: e.shape[0], ds)):.2f}%')

In [18]:
show_results(train, acc_train, clm_label, 'entrenamiento')
show_results(test, acc_test, clm_label, 'prueba')


Resultados con el subjuntunto de entrenamiento
- Clase 'Iris-setosa': 40/40
- Clase 'Iris-versicolor': 36/40
- Clase 'Iris-virginica': 37/40
Exactitud: 94.17%

Resultados con el subjuntunto de prueba
- Clase 'Iris-setosa': 10/40
- Clase 'Iris-versicolor': 10/40
- Clase 'Iris-virginica': 9/40
Exactitud: 96.67%
