In [27]:
import numpy as np
from sklearn.datasets import load_iris
# Load the iris dataset that ships with scikit-learn.
dataset = load_iris()
# Feature matrix (one row per sample) and the matching class labels.
X = dataset.data
Y = dataset.target
# print (dataset.DESCR)
# Dump both arrays to plain-text files in the current working directory.
np.savetxt("X.txt", X)
np.savetxt("Y.txt", Y)

In [28]:
# Turn every continuous feature into a binary one: 1 when the value is at or
# above that feature's mean over all samples, 0 otherwise.
n_samples, n_features = X.shape
attribute_means = np.mean(X, axis=0)
# One mean per feature column is expected.
assert attribute_means.shape == (n_features,)
X_d = (X >= attribute_means).astype('int')
# Keep a plain-text copy of the discretised matrix next to the raw data.
np.savetxt("X_d.txt", X_d)

In [29]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`. The old
# location is only tried on installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Fixed seed so the same split is produced on every run.
random_state = 14
# Split the discretised features and labels; no test_size is passed, so the
# library's default split proportion applies.
X_train, X_test, Y_train, Y_test = train_test_split(X_d, Y, random_state=random_state)
# Note: these print the shape tuples of the label arrays, e.g. "(112,)".
print("There are {} training samples".format(Y_train.shape))
print("There are {} testing samples".format(Y_test.shape))

There are (112,) training samples
There are (38,) testing samples


In [30]:
from collections import defaultdict
from operator import itemgetter


def train_feature_value(X, y_true, feature, current_value):
    """Find the majority class among samples where `feature` equals `current_value`.

    Parameters
    ----------
    X : iterable of indexable samples
        Each sample is indexed with `feature` to read the feature's value.
    y_true : iterable
        Class label for each sample, in the same order as `X`.
    feature : index
        Position of the feature to inspect in every sample.
    current_value : object
        The feature value to select samples by.

    Returns
    -------
    (most_frequent_class, error)
        The label that occurs most often among the selected samples, and the
        number of selected samples that carry any other label. Ties go to the
        label that was encountered first.

    Raises
    ------
    IndexError
        If no sample has `current_value` for `feature`.
    """
    # Tally labels over the samples that match the requested feature value.
    label_tally = defaultdict(int)
    for row, label in zip(X, y_true):
        if row[feature] != current_value:
            continue
        label_tally[label] += 1

    # Stable descending sort by count: the first entry is the majority label.
    ranked = sorted(label_tally.items(), key=itemgetter(1), reverse=True)
    most_frequent_class = ranked[0][0]

    # Every matching sample outside the majority class would be misclassified.
    error = sum(
        count
        for label, count in label_tally.items()
        if label != most_frequent_class
    )
    return most_frequent_class, error


def train(X, y_true, feature):
    """Build a one-feature rule: map each value of `feature` to its majority class.

    Parameters
    ----------
    X : 2-D numpy array
        Samples in rows, features in columns.
    y_true : iterable
        Class label for each row of `X`.
    feature : int
        Column index of the feature to build the rule on; must satisfy
        0 <= feature < number of columns (checked with an assert).

    Returns
    -------
    (predictors, total_error)
        `predictors` maps every distinct value found in the column to the class
        chosen by `train_feature_value`; `total_error` is the sum of the
        per-value error counts.
    """
    _, column_count = X.shape
    assert 0 <= feature < column_count

    predictors = dict()
    total_error = 0
    # One majority-class lookup per distinct value observed in the column.
    for value in set(X[:, feature]):
        majority_class, value_error = train_feature_value(X, y_true, feature, value)
        predictors[value] = majority_class
        total_error += value_error
    return predictors, total_error

In [47]:
# Train a one-feature rule for every column of the training matrix.
all_predictors = {
    feature_index: train(X_train, Y_train, feature_index)
    for feature_index in range(X_train.shape[1])
}
# Keep just the total error of each feature's rule.
errors = {
    feature_index: feature_error
    for feature_index, (_, feature_error) in all_predictors.items()
}
# Ascending sort by error: the first entry is the feature with the fewest mistakes.
best_variable, best_error = sorted(errors.items(), key=itemgetter(1))[0]
print("The best model is based on variable {0} and has error {1:.2f}".format(best_variable, best_error))

# Choose the best model
model = dict(variable=best_variable,
             predictor=all_predictors[best_variable][0])
print(model)
print(errors)

The best model is based on variable 2 and has error 37.00
{'predictor': {0: 0, 1: 2}, 'variable': 2}
{0: 41, 1: 58, 2: 37, 3: 37}


In [38]:
def predict(X_test, model):
    """Predict a class for every sample using a one-feature lookup model.

    Parameters
    ----------
    X_test : iterable of indexable samples
        Samples to classify.
    model : dict
        Must hold 'variable' (index of the feature to read) and 'predictor'
        (mapping from feature value to predicted class).

    Returns
    -------
    numpy.ndarray
        Predicted class for each sample, in input order.

    Raises
    ------
    KeyError
        If a sample's feature value is not a key of the predictor mapping.
    """
    feature_index = model['variable']
    value_to_class = model['predictor']
    predictions = []
    for row in X_test:
        # Values are cast to int before the lookup.
        predictions.append(value_to_class[int(row[feature_index])])
    return np.array(predictions)

In [41]:
# Apply the selected one-feature model to the held-out test features.
y_predicted = predict(X_test, model)
print(y_predicted)

[0 0 0 2 2 2 0 2 0 2 2 0 2 2 0 2 0 2 2 2 0 0 0 2 0 2 0 2 2 0 0 0 2 0 2 0 2
 2]


In [44]:
# Share of test samples whose predicted class equals the true class, in percent.
accuracy = np.mean(y_predicted == Y_test) * 100
print("The test accuracy is {:.1f}%".format(accuracy))

The test accuracy is 65.8%


In [46]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the test split. In the recorded run the
# chosen predictor only ever outputs classes 0 and 2, so class 1 scores zero.
print(classification_report(Y_test, y_predicted))

             precision    recall  f1-score   support

          0       0.94      1.00      0.97        17
          1       0.00      0.00      0.00        13
          2       0.40      1.00      0.57         8

avg / total       0.51      0.66      0.55        38



  'precision', 'predicted', average, warn_for)
