In [1]:
from sklearn.datasets import load_iris
import numpy as np

In [32]:
# load load_iris dataset from scikit-learn library, display description and information
#   sepal: 萼片
#   petal: 花瓣
dataset = load_iris()
iris_data = dataset.data
iris_target = dataset.target
print (dataset.DESCR)
data_instances, data_attributes = iris_data.shape
print ("The data contains {0} instances, for each has {1} attributes.".format(data_instances, data_attributes))
print ("5 data samples in data: ")
for index in range(5):
    print (iris_data[index])
target_instances = iris_target.shape[0]
print ("The data contains {0} instances.".format(target_instances))
print ("In target: 0 - Iris Sentosa, 1 - Iris Cersicolour, 2 - Iris Virginica")
print ("5 data samples in target: ")
for index in range(5):
    print (iris_target[index])

Iris Plants Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20  0.76     0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :Date: July, 1988

This is a copy of UCI ML iris d

In [21]:
# calculate the means of each attributes for all samples
# get feature set based on means: 
#    find out the attributes greater than mean value (set to 1), and ignore the others (set to 0)
attribute_means = iris_data.mean(axis=0)
assert attribute_means.shape == (data_attributes,)
print ("Means for each attribute: ")
print (attribute_means)
iris_data_d = np.array(iris_data >= attribute_means, dtype='int')
print ("5 sample feature set: ")
for index in range(5):
    print (iris_data_d[index])

Means for each attributes: 
[5.84333333 3.054      3.75866667 1.19866667]
5 sample feature set: 
[0 1 0 0]
[0 0 0 0]
[0 1 0 0]
[0 1 0 0]
[0 1 0 0]


In [31]:
# split feature set and target set into training and test set
from sklearn.cross_validation import train_test_split
random_state = 14
iris_data_train, iris_data_test, iris_target_train, iris_target_test = train_test_split(iris_data_d, iris_target, random_state = random_state)
print("There are {0} training target samples.".format(iris_target_train.shape[0]))
print("There are {0} testing target samples.".format(iris_target_test.shape[0]))

There are 112 training target samples.
There are 38 testing target samples.


In [49]:
# 
from collections import defaultdict
from operator import itemgetter
# this function aims to:
#    find the most frequent class of a certain feature
#    find the error
def train_feature_value(data, target, feature, value):
    # Create a simple dictionary to count how frequency they give certain predictions
    class_counts = defaultdict(int)
    for x, y in zip(data, target): # combine the data and target
        if x[feature] == value:
            class_counts[y] += 1
    # Now get the best one by sorting (highest first) and choosing the first item
    sorted_class_counts = sorted(class_counts.items(), key=itemgetter(1), reverse=True)
    most_frequent_class = sorted_class_counts[0][0] # most frequent class name(0, 1 or 2)
    # The error is the number of samples that do not belong to the most frequent class but have the same feature value.
    data_attributes = data.shape[1] # 4 attributes
    error = sum([class_count for class_value, class_count in class_counts.items() if class_value != most_frequent_class])
    return most_frequent_class, error

# this function aims to:
#    find the feature values and their cooresponding most-frequent class
#    find the total error for this certain feature
def train(data, target, feature): # data: data set; target: target_set; feature: feature number
    data_samples, data_features = data.shape
    assert 0 <= feature < data_features
    values = set(data[:, feature]) # get unique values for a certain feature
    predictors = dict()
    errors = []
    for current_value in values:
        most_frequent_class, error = train_feature_value(data, target, feature, current_value)
        predictors[current_value] = most_frequent_class
        errors.append(error)
    # Compute the total error of using this feature to classify on
    total_error = sum(errors)
    return predictors, total_error
predictors, total_error = train(iris_data, iris_target, 0)

print("Example: find the predictors of feature sepal length(0)")
predictors_array = predictors.items()
for predictor in predictors_array:
    print ("sepal length of {0} cm should be in {1} class.".format(predictor[0], predictor[1]))
print("with total value {0}%".format(total_error))

Example: find the predictors of feature sepal length(0)
sepal length of 4.7 cm should be in 0 class.
sepal length of 5.5 cm should be in 1 class.
sepal length of 6.3 cm should be in 2 class.
sepal length of 5.0 cm should be in 0 class.
sepal length of 4.9 cm should be in 0 class.
sepal length of 5.1 cm should be in 0 class.
sepal length of 4.6 cm should be in 0 class.
sepal length of 5.4 cm should be in 0 class.
sepal length of 4.4 cm should be in 0 class.
sepal length of 4.8 cm should be in 0 class.
sepal length of 5.8 cm should be in 1 class.
sepal length of 7.0 cm should be in 1 class.
sepal length of 7.1 cm should be in 2 class.
sepal length of 4.5 cm should be in 0 class.
sepal length of 5.9 cm should be in 1 class.
sepal length of 5.6 cm should be in 1 class.
sepal length of 6.9 cm should be in 2 class.
sepal length of 6.6 cm should be in 1 class.
sepal length of 6.5 cm should be in 2 class.
sepal length of 6.4 cm should be in 2 class.
sepal length of 6.0 cm should be in 1 class.

In [53]:
# Compute all of the predictors
all_predictors = {variable: train(iris_data_train, iris_target_train, variable) for variable in range(iris_data_train.shape[1])}
errors = {variable: error for variable, (mapping, error) in all_predictors.items()}
# Now choose the best and save that as "model"
# Sort by error
best_variable, best_error = sorted(errors.items(), key=itemgetter(1))[0]
print("The best model is based on variable {0} and has error {1:.2f}%".format(best_variable, best_error))

# Choose the bset model
model = {'variable': best_variable,
         'predictor': all_predictors[best_variable][0]}
print(model)

{0: ({0: 0, 1: 2}, 41), 1: ({0: 1, 1: 0}, 58), 2: ({0: 0, 1: 2}, 37), 3: ({0: 0, 1: 2}, 37)}
The best model is based on variable 2 and has error 37.00%
{'variable': 2, 'predictor': {0: 0, 1: 2}}


In [62]:
def predict(data_test, model): # 38 samples in data test set
    variable = model['variable']
    predictor = model['predictor']
    target_predicted = np.array([predictor[int(sample[variable])] for sample in data_test])
    return target_predicted
iris_target_predicted = predict(iris_data_test, model)
print (iris_target_predicted)

[0 0 0 2 2 2 0 2 0 2 2 0 2 2 0 2 0 2 2 2 0 0 0 2 0 2 0 2 2 0 0 0 2 0 2 0 2
 2]


In [63]:
# Compute the accuracy by taking the mean of the amounts that y_predicted is equal to y_test
accuracy = np.mean(iris_target_predicted == iris_target_test) * 100
print("The test accuracy is {:.1f}%".format(accuracy))

The test accuracy is 65.8%


In [64]:
from sklearn.metrics import classification_report
print(classification_report(iris_target_test, iris_target_predicted))

             precision    recall  f1-score   support

          0       0.94      1.00      0.97        17
          1       0.00      0.00      0.00        13
          2       0.40      1.00      0.57         8

avg / total       0.51      0.66      0.55        38



  'precision', 'predicted', average, warn_for)
