# Machine Learning (Summer 2017)

## Homework 2:


- Implement Naive Bayes model (remember about smoothing). 
- Find a reasonably interesting but not to complicated dataset for which you will be able to use this model to perform binary classification. Do the latter.
- Produce the confussion matrix, calculate accuracy, precission, recall
- Check how your model does against its version from sklearn and logistic regression from sklearn. 

### Importing libraries

In [1]:
import pandas as pd
import numpy as np

### Implementing Naive Bayes

In [2]:
class MultinomialNaiveBayes:
    
    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit(self, X, y):
        self.classes = np.unique(y)
        self.features = [np.unique(X[:,column]) for column in range(X.shape[1])]
        grouped = [[x for x, t in zip(X, y) if t == c] for c in self.classes]
        self.class_log_prior = [np.log(len(group)/X.shape[0]) for group in grouped]
        self.log_probs = self.feature_log_probs(grouped)
        return self
    
    def log_prob(self, amount, total, unique):
        return np.log((amount + self.alpha)/np.sum(total + (unique * self.alpha)))
    
    def feature_log_probs(self, grouped):
        group_counts = [[np.unique(column, return_counts=True) for column in np.array(group).T] for group in grouped]
        counts_dicts = [[{k:v for k, v in zip(count[0],count[1])} for count in group] for group in group_counts]
        return [[[self.log_prob(d.get(key, 0), sum(d.values()),feature.shape[0]) for key in feature] for feature, d in zip(self.features, c)] for c in counts_dicts]
    
    def get_log_probs(self, row):
        inidices = [np.where(keys == key)[0][0] for keys, key in zip(self.features, row)]
        return [[f_probs[index] for f_probs, index in zip(c_probs, inidices)] for c_probs in self.log_probs]
    
    def predict_log_prob(self, X):
        return [np.array(self.get_log_probs(x)).sum(axis=1) + self.class_log_prior for x in X]
    
    def predict(self, X):
        return np.argmax(self.predict_log_prob(X), axis=1)

### Importing data set

Source: [Car Evaluation Data Set ](https://archive.ics.uci.edu/ml/datasets/car+evaluation)

**Class values:**   
unacc, acc, good, vgood

**Attributes:**
- buying:   vhigh, high, med, low.
- maint:    vhigh, high, med, low.
- doors:    2, 3, 4, 5more.
- persons:  2, 4, more.
- lug_boot: small, med, big.
- safety:   low, med, high.

In [3]:
DATASET_DIR = '~/Downloads/'

attributes = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
cars_dataset = pd.read_csv(DATASET_DIR+'car.data.txt')
cars_dataset.columns = attributes + ['class']

In [4]:
cars_dataset.tail()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
1722,low,low,5more,more,med,med,good
1723,low,low,5more,more,med,high,vgood
1724,low,low,5more,more,big,low,unacc
1725,low,low,5more,more,big,med,good
1726,low,low,5more,more,big,high,vgood


### Calculate results

In [85]:
def naive_bayes(X, y, train_ratio=0.8, alpha=1.0):
    nb = MultinomialNaiveBayes(alpha)
    train_X, train_y = X[:int(X.shape[0] * train_ratio)], y[:int(y.shape[0] * train_ratio)]
    nb.fit(train_X, train_y)
    predict_X, predict_y = X[int(X.shape[0] * train_ratio):], y[int(y.shape[0] * train_ratio):]
    return nb.predict(predict_X), [np.where(nb.classes == y)[0][0] for y in predict_y], nb.classes

In [141]:
def confusion_matrix(results, target, classes):
    if len(results) != len(target):
        return None
    matrix = np.zeros(shape=(len(classes),len(classes)))
    for r, t in zip(results, target):
        matrix[r][t] += 1
    return matrix

def table_of_confusion(confusion_matrix, t_class):
    table = np.zeros(shape=(2,2))
    for i_r, result in enumerate(confusion_matrix):
        for i_t, target in enumerate(confusion_matrix[i_r]):
            if i_r == t_class:
                if i_t == t_class:
                    table[0][0] += confusion_matrix[i_r][i_t]
                else:
                    table[0][1] += confusion_matrix[i_r][i_t]
            else:
                if i_t == t_class:
                    table[1][0] += confusion_matrix[i_r][i_t]
                else:
                    table[1][1] += confusion_matrix[i_r][i_t]
    return table

def accuracy(table_of_confusion):
    return (table_of_confusion[0][0]+table_of_confusion[1][1])/sum(sum(table_of_confusion))

def precision(table_of_confusion):
    return (table_of_confusion[0][0])/(table_of_confusion[0][0]+table_of_confusion[0][1])

def recall(table_of_confusion):
    return (table_of_confusion[0][0])/(table_of_confusion[0][0]+table_of_confusion[1][0])

In [142]:
X = np.array(cars_dataset.iloc[:,0:-1])
y = cars_dataset.iloc[:,-1]

result, target, classes = naive_bayes(X, y, 0.8)
confusion_matrix = confusion_matrix(result, target, classes)
t_class_name = "acc"
t_class = np.where(classes == t_class_name)[0][0]
table_of_confusion = table_of_confusion(confusion_matrix, t_class)
print("confussion matrix:\n", classes, "\n", confusion_matrix)
print("table of confusion:\n", [t_class_name, "non-" + t_class_name], "\n",table_of_confusion)
print("accuracy for class", '"' + t_class_name + '"' ,":", accuracy(table_of_confusion))
print("precision for class", '"' + t_class_name + '"' ,":", precision(table_of_confusion))
print("recall for class", '"' + t_class_name + '"' ,":", recall(table_of_confusion))

confussion matrix:
 ['acc' 'good' 'unacc' 'vgood'] 
 [[  38.   46.    2.   39.]
 [   0.    0.    0.    0.]
 [  25.    0.  196.    0.]
 [   0.    0.    0.    0.]]
table of confusion:
 ['acc', 'non-acc'] 
 [[  38.   87.]
 [  25.  196.]]
accuracy for class "acc" : 0.676300578035
precision for class "acc" : 0.304
recall for class "acc" : 0.603174603175
