# Ensemble methods. Exercises


In this section we have only two exercise:

1. Find the best three classifier in the stacking method using the classifiers from scikit-learn package.

2. Build arcing arc-x4 method. 

In [1]:
%store -r data_set
%store -r labels
%store -r test_data_set
%store -r test_labels
%store -r unique_labels

In [8]:
test_data_set.shape

(20, 4)

## Exercise 1: Find the best three classifier in the stacking method

Please use the following classifiers:

* Linear regression,
* Nearest Neighbors,
* Linear SVM,
* Decision Tree,
* Naive Bayes,
* QDA.

In [64]:
import numpy as np
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [94]:
def build_classifiers():
    # save each label and model in a dictionary
    clf_dict = {}
    
    lr = LinearRegression()
    lr.fit(data_set, labels)
    clf_dict['lr'] = lr
    
    neigh = KNeighborsClassifier(n_neighbors=3)
    neigh.fit(data_set, labels)
    clf_dict['neigh'] = neigh
    
    svc = SVC(kernel='linear') 
    svc.fit(data_set, labels)
    clf_dict['svc'] = svc
    
    dt = DecisionTreeClassifier(random_state=0)
    dt.fit(data_set, labels)
    clf_dict['dt'] = dt
    
    nb = GaussianNB()
    nb.fit(data_set, labels)
    clf_dict['nb'] = nb
    
    scores = {}
    for clf in clf_dict:
        model = clf_dict[clf]
        scores[clf] = model.score(data_set, labels)
       
    # 3 best scores
    highest = dict(sorted(scores.items(), key=lambda item: item[1])[:3])
    print('3 highest scores are: ', highest)
    
    classifiers = []
    for clf in clf_dict:
        if clf in highest:
            classifiers.append(clf_dict[clf])
    
    return classifiers

In [99]:
def build_stacked_classifier(classifiers):
    output = []
    test_set = []
    for clf in classifiers:
        y_pred = clf.predict(data_set)
        output.append(y_pred)
        test_set.append(clf.predict(test_data_set))
            
    output = np.array(output).reshape((130,3))
    
    # stacked classifier part:
    stacked_classifier = QuadraticDiscriminantAnalysis()
    stacked_classifier.fit(output.reshape((130,3)), labels.reshape((130,)))
    
    test_set = np.array(test_set).reshape((len(test_set[0]),3))
    
    predicted = stacked_classifier.predict(test_set)
    
    return predicted

In [100]:
classifiers = build_classifiers()
predicted = build_stacked_classifier(classifiers)
accuracy = accuracy_score(test_labels, predicted)
print(accuracy)

3 highest scores are:  {'lr': 0.9287417709377138, 'nb': 0.9692307692307692, 'neigh': 0.9769230769230769}
0.9


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


## Exercise 2: 

Use the boosting method and change the code to fullfilt the following requirements:

* the weights should be calculated as:
$w_{n}^{(t+1)}=\frac{1+ I(y_{n}\neq h_{t}(x_{n})}{\sum_{i=1}^{N}1+I(y_{n}\neq h_{t}(x_{n})}$,
* the prediction is done with a voting method.

In [27]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# prepare data set

def generate_data(sample_number, feature_number, label_number):
    data_set = np.random.random_sample((sample_number, feature_number))
    labels = np.random.choice(label_number, sample_number)
    return data_set, labels

labels = 2
dimension = 2
test_set_size = 1000
train_set_size = 5000
train_set, train_labels = generate_data(train_set_size, dimension, labels)
test_set, test_labels = generate_data(test_set_size, dimension, labels)

# init weights
number_of_iterations = 10
weights = np.ones((test_set_size,)) / test_set_size

def train_model(classifier, weights):
    return classifier.fit(X=test_set, y=test_labels, sample_weight=weights)

def calculate_error(model):
    predicted = model.predict(test_set)
    I=calculate_accuracy_vector(predicted, test_labels)
    Z=np.sum(I)
    return (1+Z)/1.0

Fill the two functions below:

In [28]:
def set_new_weights(model, count_missclf):
    # prediction 
    y_pred = model.predict(test_set)
    
    weights = []    
    for i in range(test_set_size):
        if y_pred[i] != test_labels[i]:
            # increment counter if it was missclassified
            count_missclf[i] += 1
        
        # numerator
        num = 1 + count_missclf[i]
        
        # denominator
        denum = 0
        for i in range(test_set_size):
            # sum the same thing many times. seems wrong but I didn't find a well-explained formula
            denum += 1 + count_missclf[i]
        
        weight = num/denum
        weights.append(weight)
    
    return weights, count_missclf

Train the classifier with the code below:

In [44]:
classifier = DecisionTreeClassifier(max_depth=1, random_state=1)
classifier.fit(X=train_set, y=train_labels)
alphas = []
classifiers = []
count_missclf = [0]*test_set_size
for iteration in range(number_of_iterations):
    model = train_model(classifier, weights)
    weights, count_missclf = set_new_weights(model, count_missclf)
    classifiers.append(model)

print(weights)

[0.0009184423218221896, 0.0011019283746556473, 0.0012853470437017994, 0.0011017260374586854, 0.0011017260374586854, 0.0011015237745548008, 0.0011013215859030838, 0.0011013215859030838, 0.0011011194714626537, 0.0011011194714626537, 0.0011011194714626537, 0.0011009174311926607, 0.0009174311926605505, 0.001100715465052284, 0.0011005135730007337, 0.0011005135730007337, 0.0011005135730007337, 0.000917094644167278, 0.0011005135730007337, 0.0011003117549972493, 0.0011003117549972493, 0.0009169264624977076, 0.0011001100110011, 0.0010999083409715857, 0.0010997067448680353, 0.0010997067448680353, 0.0010997067448680353, 0.0010997067448680353, 0.0010995052226498076, 0.0010993037742762918, 0.001099102399706906, 0.001098901098901099, 0.0009157509157509158, 0.001098901098901099, 0.001098901098901099, 0.001098901098901099, 0.0009157509157509158, 0.0010986998718183483, 0.0010984987184181618, 0.001098297638660077, 0.0010980966325036604, 0.0010978956999085087, 0.0010978956999085087, 0.0010978956999085087

Set the validation data set:

In [145]:
validate_x, validate_label = generate_data(5, dimension, labels)

why does train_model function use  the test dataset? in prediction should we use test as well? what about validation dataset?

Fill the prediction code:

In [146]:
def get_prediction(x):  
    prediction = []
    
    for i in range(len(x)):
        
        voting = []
        preds = []
        
        for clf in classifiers:
            # predict with the model
            y_pred = clf.predict(x[i].reshape(1, -1))
            # append to vector of predictions for this x[i]
            preds.append(y_pred[0])
            
            # if the prediction is right
            if y_pred[0] == validate_label[i]:
                # true to the voting array
                voting.append(1) 
            else:
                # else, false
                voting.append(0)
                
        # the prediction is the argmax of the voting array
        prediction.append(preds[np.argmax(voting)])
        
    return prediction

Test it:

In [147]:
prediction = get_prediction(validate_x)

print(prediction)

[0, 1, 1, 0, 0]
