In [1]:
from __future__ import print_function 
import numpy as np 
import pandas as pd

In [2]:
class TreeNode(object):
    """A single node of the decision tree.

    Parameters
    ----------
    ids : sequence of int, optional
        Indices of the training samples that reach this node.
    children : list of TreeNode, optional
        Child nodes. When omitted, every node gets its own fresh empty list
        (a literal ``[]`` default would be shared between all nodes).
    entropy : float, optional
        Entropy of the labels at this node; filled in by the tree builder.
    depth : int, optional
        Distance from the root node.
    """

    def __init__(self, ids = None, children = None, entropy = 0, depth = 0):
        self.ids = ids           # indices of the data in this node
        self.entropy = entropy   # entropy, will be filled in later
        self.depth = depth       # distance to the root node
        self.split_attribute = None # attribute chosen for the split, if non-leaf
        # Never share one list object between nodes.
        self.children = [] if children is None else children
        self.order = None       # order of the split_attribute values in children
        self.label = None       # label of the node if it is a leaf

    def set_properties(self, split_attribute, order):
        """Record the attribute used to split this node and the value order.

        ``order[k]`` is the attribute value that leads to ``children[k]``.
        """
        self.split_attribute = split_attribute
        self.order = order

    def set_label(self, label):
        """Store the label predicted by this node."""
        self.label = label

In [3]:
def entropy(freq):
    """Return the Shannon entropy (natural log) of a vector of counts.

    Parameters
    ----------
    freq : array-like of numbers
        Occurrence counts per class. Lists, tuples and numpy arrays are all
        accepted.

    Returns
    -------
    float
        ``-sum(p * log(p))`` over the non-zero entries, where ``p`` is each
        count divided by the total. Empty or all-zero input yields 0.
    """
    counts = np.asarray(freq, dtype=float)
    # Drop zero counts because the logarithm is undefined there.
    counts = counts[counts != 0]
    if counts.size == 0:
        return 0.0
    prob = counts / counts.sum()
    return -np.sum(prob * np.log(prob))

In [4]:
class DecisionTreeID3(object):
    """ID3 decision tree classifier for categorical features.

    Training samples are addressed by their row *position* throughout, so the
    tree works with any DataFrame/Series index (0-based, 1-based, labels...).

    Parameters
    ----------
    max_depth : int
        Nodes at this depth are never split.
    min_samples_split : int
        A candidate split is rejected when any of its children would hold
        fewer samples than this.
    min_gain : float
        Minimum information gain required to accept a split. Nodes whose
        entropy is already below this value become leaves.
    """

    def __init__(self, max_depth= 10, min_samples_split = 2, min_gain = 1e-4):
        self.root = None
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.Ntrain = 0
        self.min_gain = min_gain

    def fit(self, data, target):
        """Grow the tree from the training data.

        :param data: DataFrame with one categorical column per attribute
        :param target: Series of labels, aligned row by row with ``data``
        :raises ValueError: if ``data`` has no rows
        """
        # len() counts rows; DataFrame.count() would skip NaN cells and needs
        # a positional lookup on a label-indexed Series.
        self.Ntrain = len(data)
        if self.Ntrain == 0:
            raise ValueError("cannot fit a decision tree on empty data")
        self.data = data                # training features
        self.attributes = list(data)    # column names
        self.target = target            # training labels
        self.labels = target.unique()

        ids = list(range(self.Ntrain))  # row positions 0 .. Ntrain-1
        self.root = TreeNode(ids = ids, entropy = self._entropy(ids), depth = 0)
        queue = [self.root]             # nodes still to process (used as a stack)
        while queue:
            node = queue.pop()
            # Only try to split nodes that are above the depth limit and still
            # impure; a node with entropy below min_gain can never yield a
            # split with gain >= min_gain.
            if node.depth < self.max_depth and node.entropy >= self.min_gain:
                node.children = self._split(node)
                if not node.children: # no acceptable split: leaf node
                    self._set_label(node)
                queue += node.children
            else:
                self._set_label(node)

    def _entropy(self, ids):
        """Entropy of the labels of the samples at row positions ``ids``."""
        if len(ids) == 0: return 0
        freq = np.array(self.target.iloc[list(ids)].value_counts())
        return entropy(freq)

    def _majority_label(self, ids):
        """Most frequent label among the samples at row positions ``ids``."""
        return self.target.iloc[list(ids)].mode().iloc[0]

    def _set_label(self, node):
        """Label a leaf node by majority vote over its samples."""
        node.set_label(self._majority_label(node.ids))

    def _split(self, node):
        """Find the attribute with the highest information gain for ``node``.

        Records the chosen attribute and its value order on the node (both
        ``None`` when no split is acceptable) and returns the list of child
        nodes, which is empty when the node should stay a leaf.
        """
        ids = list(node.ids)
        ids_arr = np.asarray(ids, dtype=int)
        best_gain = 0
        best_splits = []
        best_attribute = None
        order = None
        for i, att in enumerate(self.attributes):
            column = self.data.iloc[ids, i]
            values = column.unique().tolist()
            if len(values) == 1: continue # splitting on a constant gains nothing
            # One group of row positions per distinct attribute value.
            splits = [ids_arr[(column == val).values].tolist() for val in values]
            # don't split if a child would have too few points
            if min(map(len, splits)) < self.min_samples_split: continue
            # information gain = parent entropy - weighted child entropy
            HxS = 0
            for split in splits:
                HxS += len(split)*self._entropy(split)/len(ids)
            gain = node.entropy - HxS
            if gain < self.min_gain: continue # skip attributes with small gain
            if gain > best_gain:
                best_gain = gain
                best_splits = splits
                best_attribute = att
                order = values
        node.set_properties(best_attribute, order)
        child_nodes = [TreeNode(ids = split,
                     entropy = self._entropy(split), depth = node.depth + 1) for split in best_splits]
        return child_nodes

    def predict(self, new_data):
        """
        :param new_data: a new dataframe, each row is a datapoint
        :return: predicted labels for each row

        When a row carries an attribute value that the reached node never saw
        during training, the majority label of that node's training samples
        is returned for the row.
        """
        npoints = len(new_data)
        labels = [None]*npoints
        for n in range(npoints):
            x = new_data.iloc[n, :] # one point
            # start from the root and descend until a leaf is reached
            node = self.root
            while node.children:
                value = x[node.split_attribute]
                if value not in node.order:
                    break # unseen category: stop at this internal node
                node = node.children[node.order.index(value)]
            if node.children:
                labels[n] = self._majority_label(node.ids)
            else:
                labels[n] = node.label

        return labels

In [5]:
if __name__ == "__main__":
    # Load the weather table; its first CSV column becomes the row index.
    df = pd.read_csv('weather.csv', index_col=0, parse_dates=True)
    # Every column but the last is a feature; the last column is the label.
    X, y = df.iloc[:, :-1], df.iloc[:, -1]
    # Train a shallow tree and show its predictions on the training rows.
    tree = DecisionTreeID3(max_depth=3, min_samples_split=2)
    tree.fit(X, y)
    print(tree.predict(X))

['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no']


In [55]:
# Show the feature table: every column of the CSV except the last one.
print(X)

     outlook temperature humidity    wind
id                                       
1      sunny         hot     high    weak
2      sunny         hot     high  strong
3   overcast         hot     high    weak
4      rainy        mild     high    weak
5      rainy        cool   normal    weak
6      rainy        cool   normal  strong
7   overcast        cool   normal  strong
8      sunny        mild     high    weak
9      sunny        cool   normal    weak
10     rainy        mild   normal    weak
11     sunny        mild   normal  strong
12  overcast        mild     high  strong
13  overcast         hot   normal    weak
14     rainy        mild     high  strong


In [56]:
# Show the label column: the last column of the CSV.
print(y)

id
1      no
2      no
3     yes
4     yes
5     yes
6      no
7     yes
8      no
9     yes
10    yes
11    yes
12    yes
13    yes
14     no
Name: play, dtype: object


In [23]:
# The labels as a plain Python list (the index is dropped).
list(y)

['no',
 'no',
 'yes',
 'yes',
 'yes',
 'no',
 'yes',
 'no',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'no']

In [30]:
# Number of non-missing values in each feature column.
X.count()

outlook        14
temperature    14
humidity       14
wind           14
dtype: int64

In [25]:
# Number of non-missing labels.
y.count()

14

In [31]:
# Iterating over a DataFrame yields its column names.
list(X)

['outlook', 'temperature', 'humidity', 'wind']

In [51]:
# Distinct label values, in order of first appearance.
y.unique()

array(['no', 'yes'], dtype=object)

In [53]:
# NOTE: `unique` exists on a Series (see y.unique()) but not on a DataFrame,
# hence the AttributeError; call it per column instead, e.g. X[col].unique().
X.unique()

AttributeError: 'DataFrame' object has no attribute 'unique'

In [38]:
# NOTE: `queue` is a local variable created inside DecisionTreeID3.fit, not an
# attribute of the class, so it cannot be read from here (AttributeError).
print(DecisionTreeID3.queue)

AttributeError: type object 'DecisionTreeID3' has no attribute 'queue'

In [45]:
# NOTE: this statement was lifted out of DecisionTreeID3.fit; `ids` and `self`
# only exist inside that method, so at cell level it raises NameError.
x = TreeNode(ids = ids, entropy = self._entropy(ids), depth = 0)

NameError: name 'ids' is not defined

In [48]:
# Wrapping a value in brackets builds a one-element list, the same construct
# as `queue = [self.root]` in DecisionTreeID3.fit.
a = 100
print([a])

[100]


In [2]:
class SieuNhan:
    """Toy class used to demonstrate class attributes and class methods."""

    # Class attribute: a single value shared by the class and its instances.
    suc_manh = 50

    def __init__(self, para_ten, para_vu_khi, para_mau_sac):
        """Store the name (prefixed with "Sieu nhan"), weapon and colour."""
        self.ten = "Sieu nhan" + para_ten
        self.vu_khi, self.mau_sac = para_vu_khi, para_mau_sac

    @classmethod
    def cap_nhat_suc_manh(cls, smanh):
        """Overwrite the class-wide `suc_manh` value.

        `cls` plays the role that `self` plays in instance methods: because
        `suc_manh` is a class attribute, `cls.suc_manh` here is equivalent to
        `SieuNhan.suc_manh`.
        """
        cls.suc_manh = smanh


# Disabled demo, kept for reference. It changes the class attribute through
# the object sieu_nhan_A instead of through the class:
#
#   sieu_nhan_A = SieuNhan("Sieu nhan do", "kiem", "Do")
#   print(SieuNhan.suc_manh)
#   print(sieu_nhan_A.suc_manh)
#   sieu_nhan_A.cap_nhat_suc_manh(40)  # called on the object, not the class
#   print(SieuNhan.suc_manh)
#   print(sieu_nhan_A.suc_manh)
#
# Going through the class method like this keeps the shared value consistent.

# Build one hero and show that square brackets wrap it in a one-element list.
sieu_nhan_Black = SieuNhan(
    para_ten="Sieu nhan Đen",
    para_vu_khi="Dùi cui",
    para_mau_sac="Black",
)
tuanmanh = [sieu_nhan_Black]
print(tuanmanh)

[<__main__.SieuNhan object at 0x00000200B0301730>]


In [43]:
m = list(X)
# NOTE: enumerate() yields (position, name) tuples, so `i` is a tuple here and
# X.iloc[0, i] ends up with too many indexers (the IndexingError recorded for
# this cell). Unpack the pair instead: `for i, att in enumerate(m)`.
for i in enumerate(m):
    n = X.iloc[0, i].unique().tolist()
    print(len(n))

IndexingError: Too many indexers

In [27]:
# NOTE: a DataFrame is indexed, not called, hence the TypeError. Presumably a
# row lookup such as X.iloc[1] was intended — confirm.
X(1)

TypeError: 'DataFrame' object is not callable

In [58]:
j = range(0,14)
m = list(X)

# NOTE: `i` is still the (position, name) tuple produced by enumerate(), so
# X.iloc[j, i] raises the IndexingError recorded for this cell. Unpack the
# pair instead: `for i, att in enumerate(m)`.
for i in enumerate(m):
    n = X.iloc[j, i].unique().tolist()
    print(len(n))
    
print(m)

IndexingError: Too many indexers

In [6]:
# Row positions 0 .. n_rows-1. len(X) counts rows directly; X.count()[0]
# relied on a positional lookup in a Series indexed by column name and would
# skip rows whose first column is missing.
list(range(len(X)))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]