# Machine Learning (Summer 2017)

## Homework 2:


- Implement Naive Bayes model (remember about smoothing). 
- Find a reasonably interesting but not to complicated dataset for which you will be able to use this model to perform binary classification. Do the latter.
- Produce the confussion matrix, calculate accuracy, precission, recall
- Check how your model does against its version from sklearn and logistic regression from sklearn. 

### Importing libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB

### Implementing Naive Bayes

In [2]:
class MultinomialNaiveBayes:
    
    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit(self, X, y):
        self.classes = np.unique(y)
        self.features = [np.unique(X[:,column]) for column in range(X.shape[1])]
        grouped = [[x for x, t in zip(X, y) if t == c] for c in self.classes]
        self.class_log_prior = [np.log(len(group)/X.shape[0]) for group in grouped]
        self.log_probs = self.feature_log_probs(grouped)
        return self
    
    def log_prob(self, amount, total, unique):
        return np.log((amount + self.alpha)/np.sum(total + (unique * self.alpha)))
    
    def feature_log_probs(self, grouped):
        group_counts = [[np.unique(column, return_counts=True) for column in np.array(group).T] for group in grouped]
        counts_dicts = [[{k:v for k, v in zip(count[0],count[1])} for count in group] for group in group_counts]
        return [[[self.log_prob(d.get(key, 0), sum(d.values()),feature.shape[0]) for key in feature] for feature, d in zip(self.features, c)] for c in counts_dicts]
    
    def get_log_probs(self, row):
        inidices = [np.where(keys == key)[0][0] for keys, key in zip(self.features, row)]
        return [[f_probs[index] for f_probs, index in zip(c_probs, inidices)] for c_probs in self.log_probs]
    
    def predict_log_prob(self, X):
        return [np.array(self.get_log_probs(x)).sum(axis=1) + self.class_log_prior for x in X]
    
    def predict(self, X):
        return np.argmax(self.predict_log_prob(X), axis=1)

### Importing data set

Source: [Car Evaluation Data Set ](https://archive.ics.uci.edu/ml/datasets/car+evaluation)

**Class values:**   
unacc, acc, good, vgood

**Attributes:**
- buying:   vhigh, high, med, low.
- maint:    vhigh, high, med, low.
- doors:    2, 3, 4, 5more.
- persons:  2, 4, more.
- lug_boot: small, med, big.
- safety:   low, med, high.

In [3]:
DATASET_DIR = '~/Downloads/'

attributes = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
cars_dataset = pd.read_csv(DATASET_DIR+'car.data.txt')
cars_dataset.columns = attributes + ['class']

In [4]:
cars_dataset.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


### Convert dataset

In [5]:
def extract_glossary(dataset):
    unique_values_per_column = [np.unique(column) for column in dataset.T]
    glossary = [{key:index for index, key in enumerate(column)} for column in unique_values_per_column]
    return glossary

def convert_entry(entry, glossary):
    converted = [keys[feature] for feature, keys in zip(entry, glossary)]
    return np.array(converted)

def convert_dataset(dataset):
    glossary = extract_glossary(dataset)
    converted = [convert_entry(entry, glossary) for entry in dataset]
    return glossary, np.array(converted)

Now, using methods above we can convert the whole dataset with `int` values insted of `string`. This will make the whole thing easier. 

In [6]:
glossary, dataset = convert_dataset(np.array(cars_dataset))

Below we can compare the head of converted data set using printed glossary to the original data set.

In [7]:
_ = [print(attribute, keys) for attribute, keys in zip(attributes + ['class'], glossary)]

buying {'high': 0, 'low': 1, 'med': 2, 'vhigh': 3}
maint {'high': 0, 'low': 1, 'med': 2, 'vhigh': 3}
doors {'2': 0, '3': 1, '4': 2, '5more': 3}
persons {'2': 0, '4': 1, 'more': 2}
lug_boot {'big': 0, 'med': 1, 'small': 2}
safety {'high': 0, 'low': 1, 'med': 2}
class {'acc': 0, 'good': 1, 'unacc': 2, 'vgood': 3}


print(dataset[:5])

### Calculate results

In [8]:
def naive_bayes(X, y, train_ratio=0.8, alpha=1.0):
    nb = MultinomialNaiveBayes(alpha)
    train_X, train_y = X[:int(X.shape[0] * train_ratio)], y[:int(y.shape[0] * train_ratio)]
    nb.fit(train_X, train_y)
    predict_X, predict_y = X[int(X.shape[0] * train_ratio):], y[int(y.shape[0] * train_ratio):]
    return nb.predict(predict_X), predict_y

In [9]:
X = dataset[:,0:-1]
y = dataset[:,-1]

result, target = naive_bayes(X, y)

### Evaluation

Now lets evaluate the results taking `acc` class into account.

In [10]:
classes = glossary[-1]
t_class_name = "acc"
t_class = classes[t_class_name]

__Confusion matrix__

In [11]:
def get_confusion_matrix(results, target, classes):
    if len(results) != len(target):
        return None
    matrix = np.zeros(shape=(len(classes),len(classes)))
    for r, t in zip(results, target):
        matrix[r][t] += 1
    return matrix

In [12]:
cm = get_confusion_matrix(result, target, classes)
print("Confusion matrix:\n", classes, "\n", cm)

Confusion matrix:
 {'acc': 0, 'good': 1, 'unacc': 2, 'vgood': 3} 
 [[  38.   46.    2.   39.]
 [   0.    0.    0.    0.]
 [  25.    0.  196.    0.]
 [   0.    0.    0.    0.]]


__Table of confusion__

In [13]:
def get_table_of_confusion(confusion_matrix, t_class):
    table = np.zeros(shape=(2,2))
    for i_r, result in enumerate(confusion_matrix):
        for i_t, target in enumerate(confusion_matrix[i_r]):
            if i_r == t_class:
                if i_t == t_class:
                    table[0][0] += confusion_matrix[i_r][i_t]
                else:
                    table[0][1] += confusion_matrix[i_r][i_t]
            else:
                if i_t == t_class:
                    table[1][0] += confusion_matrix[i_r][i_t]
                else:
                    table[1][1] += confusion_matrix[i_r][i_t]
    return table

In [14]:
toc = get_table_of_confusion(cm, t_class)
print("table of confusion:\n", [t_class_name, "non-" + t_class_name], "\n", toc)

table of confusion:
 ['acc', 'non-acc'] 
 [[  38.   87.]
 [  25.  196.]]


__Accuracy__

In [15]:
def get_accuracy(table_of_confusion):
    return (table_of_confusion[0][0]+table_of_confusion[1][1])/sum(sum(table_of_confusion))

In [16]:
accuracy = get_accuracy(toc)
print("Accuracy for class", '"' + t_class_name + '"' ,":", accuracy)

Accuracy for class "acc" : 0.676300578035


__Precision__

In [17]:
def get_precision(table_of_confusion):
    return (table_of_confusion[0][0])/(table_of_confusion[0][0]+toc[0][1])

In [18]:
precision = get_precision(toc)
print("Precision for class", '"' + t_class_name + '"' ,":", precision)

Precision for class "acc" : 0.304


__Recall__

In [19]:
def get_recall(table_of_confusion):
    return (table_of_confusion[0][0])/(table_of_confusion[0][0]+table_of_confusion[1][0])

In [20]:
recall = get_recall(toc)
print("Recall for class", '"' + t_class_name + '"' ,":", recall)

Recall for class "acc" : 0.603174603175


### Comparsion with sklearn implementation

In [21]:
def sklearn_nb(X, y, train_ratio=0.8, alpha=1.0):
    nb = MultinomialNB()
    train_X, train_y = X[:int(X.shape[0] * train_ratio)], y[:int(y.shape[0] * train_ratio)]
    nb.fit(train_X, train_y)
    predict_X, predict_y = X[int(X.shape[0] * train_ratio):], y[int(y.shape[0] * train_ratio):]
    return nb.predict(predict_X), predict_y

In [22]:
sk_result, sk_target = sklearn_nb(X, y)

In [23]:
sk_cm = get_confusion_matrix(sk_result, sk_target, classes)
print("Confusion matrix:\n", classes, "\n", sk_cm)

Confusion matrix:
 {'acc': 0, 'good': 1, 'unacc': 2, 'vgood': 3} 
 [[   0.    0.    0.    6.]
 [   0.    0.    0.    0.]
 [  63.   46.  198.   33.]
 [   0.    0.    0.    0.]]


In [24]:
sk_toc = get_table_of_confusion(sk_cm, t_class)
print("table of confusion:\n", [t_class_name, "non-" + t_class_name], "\n", sk_toc)

table of confusion:
 ['acc', 'non-acc'] 
 [[   0.    6.]
 [  63.  277.]]


In [25]:
sk_accuracy = get_accuracy(sk_toc)
print("Accuracy for class", '"' + t_class_name + '"' ,":", sk_accuracy)

Accuracy for class "acc" : 0.800578034682


In [26]:
sk_precision = get_precision(sk_toc)
print("Precision for class", '"' + t_class_name + '"' ,":", sk_precision)

Precision for class "acc" : 0.0


In [27]:
sk_recall = get_recall(sk_toc)
print("Recall for class", '"' + t_class_name + '"' ,":", sk_recall)

Recall for class "acc" : 0.0
