In [9]:
import pandas as pd

training_data = [
    ['Green', 3, 'Mango'],
    ['Yellow', 3, 'Mango'],
    ['Red', 1, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon'],
]


# column labels
# this are only used to print the tree
header = ['color', 'diameter', 'label']

def unique_vals(rows, col):
    """find the unique values for a column in a  dataset."""
    return set([row[col] for row in rows])

def class_counts(rows):
    """Counts the number of each type of example in a dataset"""
    counts = {}  # a dictonary of label -> count
    for row in rows:
        # in our dataset format, the label is always the last column
        label = row[-1]
        if label not in counts:
            counts[label] = 0
        counts[label] += 1
    return counts

#demo :
# class_counts(training_data)

def is_numeric(value):
    """Test if a value is numeric"""
    return isinstance(value, int) or isinstance(value, float)

#######
#demo
# is_numeric(7)
# is_numeric("Red")

class Question:
    def __init__(self, column, value):
        self.column = column
        self.value = value
      
    
    def __repr__(self):
        # this is just a helper method to print
        # the question in a readable format
        condition = "=="
        if is_numeric(self.value):
            condition = ">="
        return "Is %s %s %s?" % (
        	header[self.column], condition, str(self.value))
    
    
    def partition(rows, question):
        """partitions a dataset
        
        For each row in the dataset, check if it matches the question. if so, add it to 
        the 'true rows', otherwise, add it to the 'false rows'.
        """
        true_row, false_rows = [], []
        for row in rows:
            if question.match(row):
                true_rows.append(row)
            else:
                false_rows.append(row)
        return true_rows, false_rows
    
    
    def gini(rows):
        """calculates the Gini impurity for a list of rows."""
        counts = class_counts(rows)
        impurity = 1
        for lbl in counts:
            prob_of_lbl = counts[lbl] / float(len(rows))
            impurity -= prob_of_lbl**2
        return impurity
    
    def info_gain(left, right, current_uncertainity):
        """infomation gain.
        
        The uncertainity of the starting node,, minus the weighted impurity
        of the two child nodes.
        """
        
        p = float(len(left)) / (len(left) + len(right))
        return current_uncertainity - p* gini(left) - (1-p)*gini(right)
    
    def find_best_split(rows):
        """find the best question to ask by iterating over feature/ value and calculating the information gain.
        """
        best_gain = 0 # keep track of the best info gain
        best_question - None # keep train of the feature/ value that produced it
        current_uncertainity = gini(rows)
        n_features = len(rows[0]) -1 # number of columns
        
        for col in range(n_features): # for each feature
            values = set([row[col] for row in rows]) # unique values in the column
            
            for val in values: # for each value
                question = Question(col, val)
                
                #try spliting the dataset
                true_rows, false_rows = partition(rows, question)
                

class leaf:
    """A leaf node classifies data.
    
    this holds a dictonary of class (eg "mango") number of times it appears 
    in the row form the terining data that rach this leaf.
    """
    def __init__(self, rows):
        self.predictions = class_counts(rows)

        
class Decision_Node:
    """A decison Node aasks a question.
    This holds a reference to the questioon, and to the two childs nodes."""
    
    
    def __init__(self,
                question,
                true_branch,
                false_branch):
        self.question = question
        self.true_branch  = true_branch
        self.false_branch = false_branch
        
    def build_tree(rows):
        """Builds the tree"""
        gain, question  = find_best_split(rows)
        if gain ==0:
            return Leaf(rows)
        
        true_rows, false_rows = partition(rows, question)
        true_branch = build_tree(false_rows)
        
        return Decision_Node(question, true_branch, false_branch)
    
    def print_tree(node, spacing=""):
        """world's most elegant tree printing function """
        
        # Base case: we've reached a leaf
        if isinstance(node, leaf):
            print(spacing + "Predict", node.prediction)
            return
        
        print(spacing + str(node.question))
        print(spacing + '--> True:')
        print_tree(nde.true_branch, spacing + " ")
        
    def classify(row, node):
        if isinstance(node, leaf):
            return node.predictions
        
        if node.question.match(row):
            return classify(row, node.true_branch)
        else:
            return classify(row, node.false_branch)
        
        
    def print_leaf(counts):
        """print the predictions of a leaf."""
        total = sum(counts.values())*10
        probs = {}
        for lnl in counts.key():
            probs[lbl] = str(int(counts[lbl] / total*100)) + "%"
        return probs
    
    
    
    
    
    
    
    
if __name__ == '__main__':
    my_tree = build_tree(training_data)
    print_tree(my_tree)
    
    #evaluate
    training_data = [
    ['Green', 3, 'Mango'],
    ['Yellow', 3, 'Mango'],
    ['Red', 1, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon'],
]
    for row in testing_data:
        print("Actual: %s.Predicted: %s" %
             (row[-1], print_leaf(classify(row, my_tree))))
            

df = pd.DataFrame(training_data)
df

NameError: name 'build_tree' is not defined