# Machine Learning (Summer 2017)

## Homework 2:


- Implement Naive Bayes model (remember about smoothing). 
- Find a reasonably interesting but not to complicated dataset for which you will be able to use this model to perform binary classification. Do the latter.
- Produce the confussion matrix, calculate accuracy, precission, recall
- Check how your model does against its version from sklearn and logistic regression from sklearn. 

### Importing libraries

In [422]:
import pandas as pd
import numpy as np
np.set_printoptions(precision=6)

### Importing data set

Source: [Car Evaluation Data Set ](https://archive.ics.uci.edu/ml/datasets/car+evaluation)

**Class values:**   
unacc, acc, good, vgood

**Attributes:**
- buying:   vhigh, high, med, low.
- maint:    vhigh, high, med, low.
- doors:    2, 3, 4, 5more.
- persons:  2, 4, more.
- lug_boot: small, med, big.
- safety:   low, med, high.

In [423]:
DATASET_DIR = '~/Downloads/'

attributes = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
cars_dataset = pd.read_csv(DATASET_DIR+'car.data.txt')
cars_dataset.columns = attributes + ['class']

In [424]:
cars_dataset.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc


### Implementing Naive Bayes

In [546]:
class MultinomialNaiveBayes:
    
    def __init__(self, alpha=1.0):
        self.alpha = alpha

    def fit(self, X, y):
        self.classes = np.unique(y)
        self.features = [np.unique(X[:,column]) for column in range(X.shape[1])]
        grouped = [[x for x, t in zip(X, y) if t == c] for c in self.classes]
        self.class_log_prior = [np.log(len(group)/X.shape[0]) for group in grouped]
        self.log_probs = self.feature_log_probs(grouped)
        return self
    
    def log_prob(self, amount, total, unique):
        return np.log((amount + self.alpha)/np.sum(total + (unique * self.alpha)))
    
    def feature_log_probs(self, grouped):
        group_counts = [[np.unique(column, return_counts=True) for column in np.array(group).T] for group in grouped]
        counts_dicts = [[{k:v for k, v in zip(count[0],count[1])} for count in group] for group in group_counts]
        return [[[self.log_prob(d.get(key, 0), sum(d.values()),feature.shape[0]) for key in feature] for feature, d in zip(self.features, c)] for c in counts_dicts]
    
    def get_log_probs(self, row):
        inidices = [np.where(keys == key)[0][0] for keys, key in zip(self.features, row)]
        return [[f_probs[index] for f_probs, index in zip(c_probs, inidices)] for c_probs in self.log_probs]
    
    def predict_log_prob(self, X):
        return [np.array(self.get_log_probs(x)).sum(axis=1) + self.class_log_prior for x in X]
    
    def predict(self, X):
        return np.argmax(self.predict_log_prob(X), axis=1)

In [547]:
def naive_bayes(X, y, train_ratio=0.8, alpha=1.0):
    nb = MultinomialNaiveBayes(alpha)
    train_X, train_y = X[:int(X.shape[0] * train_ratio)], y[:int(y.shape[0] * train_ratio)]
    nb.fit(train_X, train_y)
    predict_X, predict_y = X[int(X.shape[0] * train_ratio):], y[int(y.shape[0] * train_ratio):]
    return nb.predict(predict_X), [np.where(nb.classes == y)[0][0] for y in predict_y]

In [548]:
X = np.array(cars_dataset.iloc[:,0:-1])
y = cars_dataset.iloc[:,-1]

result, expected = naive_bayes(X, y, 0.8)
paired = [(r, e) for r, e in zip(result, expected)]
unique = np.unique([pair[0] == pair[1] for pair in paired], return_counts=True)
[print(c, r) for c, r in zip(unique[0], unique[1])]
print("accuracy: ", unique[1][1]/sum(unique[1]))

False 112
True 234
accuracy:  0.676300578035
