In [193]:
import pandas as pd
import numpy as np

## Read

In [258]:
## read train data
data = pd.read_table('training.txt', sep=' ', header=None)
# name label
data.rename(columns={0:'label'}, inplace=True)
# sepearte labels
label = data.iloc[:, 0]

# remove index: from data
data = data.iloc[:, 1:].applymap(lambda x: x[x.find(':')+1:])
data = data.astype(int)

# rename columns and merge label
data.columns = list(range(data.shape[1]))
data = data.join(label)

In [259]:
## read test data
test = pd.read_table('testing.txt', sep=' ', header=None)

# remove index: from data
test = test.applymap(lambda x: x[x.find(':')+1:])

In [260]:
# train test split
n = data.shape[0]
idx = np.random.randint(0, n, int(n/20))
val_data = data.iloc[idx,:]
train_data = data.drop(idx)

# data indexing
train_data.index = range(0, train_data.shape[0])
val_data.index = range(0, val_data.shape[0])

## Decision Tree

In [170]:
def get_gini(data):
    _, counts = np.unique(np.array(data.label), return_counts= True)
    
    return 1-np.sum(np.square(counts/data.shape[0]))

In [215]:
def data_split(data, column, feature):
    
    left = data[column][data[column]==feature].index.values
    right = data[column][data[column]!=feature].index.values
    
    return left, right

In [226]:
def get_label(data):
    label_count = data.groupby('label').label.count()
    
    return label_count.idxmax()

In [217]:
def find_best_split(data, min_leaf):
    gini_list = []
    # loop over columns except labels
    for column in data.iloc[:, :-1]:
        for feature in np.unique(data[column]):
            left_idx, right_idx = data_split(data, column, feature)

            # check if splits are smaller than min_leaf
            left_n = len(left_idx)
            right_n = len(right_idx)
            if left_n <= min_leaf and right_n <= min_leaf:
                pass
            elif left_n > min_leaf and right_n > min_leaf:
                left_ratio = left_n/n
                right_ratio = right_n/n
                gini = left_ratio*get_gini(data.loc[left_idx])+right_ratio*get_gini(data.loc[right_idx])
                gini_list.append((column, feature, gini))
    
    return min(gini_list, key= lambda x: x[2])

In [250]:
max_depth = 5
build_tree(data, 5)

{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}
{}


{(72, 1): {(56, 3): {(82, 1): {(15, 3): None}}}}

In [249]:
def build_tree(data, depth, tree=None):
    n = data.shape[0]
    if depth ==0 or n <= min_node:
        pass
    else:
        # loop over columns except labels
        column, feature, min_gini = find_best_split(data, min_leaf)
        node = (column, feature)

        # remove used column
        left_idx, right_idx = data_split(data, column, feature)
        data = data.drop(columns=column)

        #Create an empty dictionary to create tree    
        if tree is None:                    
            tree={}
            tree[node] = {}

        # recursively buld
        tree[node] = build_tree(data.loc[left_idx], depth-1, tree)
        tree[node] = build_tree(data.loc[left_idx], depth-1, tree)
        
        return tree

In [224]:
%%time
## Build Tree
max_depth = 20
threshold = 1e-4
min_node = 20
min_leaf = 5

# data copy
data_copy = train_data.copy()

## setup
depth = 0
result_list = []
search_list = [(data_copy.index.values, depth)]
base_gini_list = [get_gini(data_copy)]

# breadth first search
while search_list:
    
    node, depth = search_list.pop(0)
    base_gini = base_gini_list.pop(0)
    train = data_copy.loc[node]
    gini_list = []
    
    # case used all columns
    if train.shape[1] <=1:
        break

    n = len(node)
    # case when depth is higher than max_depth
    # case when node is too small to split
    if depth >= max_depth or n <= min_node:
        result_list.append((None, None, None))
        continue

    # loop over columns except labels
    column, feature, min_gini = find_best_split(train, min_leaf)
    
    # check if satisfies threshold
    if base_gini-min_gini < threshold:
        result_list.append((None, None, None))
        continue
    
    # remove used column
    left_idx, right_idx = data_split(train, column, feature)
    data_copy = data_copy.drop(columns=column)
    
    # append left and right node
    depth +=1
    result_list.append((column, feature, depth))
    search_list.append((left_idx, depth))
    search_list.append((right_idx, depth))
    
    # update base_gini and appends left, right base_gini
    base_gini = min_gini
    base_gini_list.append(base_gini)
    base_gini_list.append(base_gini)
    
    print((column, feature, depth))
    #print('left num: '+str(len(left_idx)))
    #print('right num: '+str(len(right_idx)))
    #print('new base gini: '+str(base_gini) + '\n')

(72, 1, 1)
(56, 3, 2)
(64, 1, 2)
(123, 3, 3)
(42, 1, 3)
(23, 3, 3)
(127, 3, 3)
(29, 2, 4)
(84, 1, 4)
(101, 1, 4)
(19, 1, 4)
(99, 3, 4)
(76, 1, 4)
(96, 3, 5)
(44, 1, 5)
(21, 3, 5)
(98, 2, 5)
(8, 3, 5)
(33, 1, 5)
(14, 3, 6)
(0, 3, 6)
(68, 2, 6)
(97, 3, 6)
(49, 2, 6)
(46, 1, 6)
(27, 3, 6)
(93, 3, 6)
(102, 3, 7)
(58, 3, 7)
(59, 3, 7)
(61, 1, 7)
(34, 1, 7)
(112, 3, 7)
(126, 2, 7)
(106, 3, 7)
(57, 2, 7)
(35, 1, 7)
(1, 2, 8)
(74, 2, 8)
(69, 1, 8)
(47, 3, 8)
(20, 2, 8)
(103, 1, 8)
(28, 3, 8)
(63, 1, 8)
(48, 1, 8)
(38, 3, 8)
(2, 2, 9)
(41, 3, 9)
(45, 2, 9)
(113, 1, 9)
(60, 1, 9)
(3, 2, 10)
(18, 3, 10)
(108, 2, 10)
(87, 2, 10)
(10, 3, 10)
(4, 2, 11)
(118, 1, 11)
(91, 3, 11)
(125, 1, 12)
(105, 3, 12)
(13, 1, 13)
(5, 1, 13)
(36, 3, 14)
(6, 1, 15)
Wall time: 58.8 s


In [261]:
## predict
# copy test set
test_set = test.copy()
train_set = data.copy()

# setup
split_list = result_list.copy()
train_set['predict'] = None
test_set['predict'] = None
train_list = [train_set.index]
test_list = [test_set.index]

while split_list:
    # get split criteria and datasets from each
    column, feature, depth = split_list.pop(0)
    
    #print(str(column)+' '+str(feature)+', '+str(depth))
    train_idx = train_list.pop(0)
    test_idx = test_list.pop(0)
    train = train_set.loc[train_idx]
    test = test_set.loc[test_idx]
    
    # certain condition not met
    if column is None:
        continue
    
    n = len(train_idx)
    if n <= min_node:
        continue
    
    train_left_idx, train_right_idx = data_split(train, column, feature)
    test_left_idx, test_right_idx = data_split(test, column, feature)
    
    # set label
    left_label = get_label(train_set.loc[train_left_idx])
    right_label = get_label(train_set.loc[train_right_idx])
    # test set labels
    test_set.loc[test_left_idx, 'predict'] = left_label
    test_set.loc[test_right_idx, 'predict'] = right_label
    # train set labels
    train_set.loc[train_left_idx, 'predict'] = left_label
    train_set.loc[train_right_idx, 'predict'] = right_label

    # put on the list
    # if len(train_left_idx) > min_leaf and len(train_right_idx) > min_leaf:
    train_list.append(train_left_idx)
    train_list.append(train_right_idx)
    test_list.append(test_left_idx)
    test_list.append(test_right_idx)
    # print('left label: '+str(left_label))
    # print('left num: '+str(len(train_left_idx)))
    # print('right label: '+str(right_label))
    # print('right num: '+str(len(train_right_idx)) +'\n')

In [262]:
print(np.unique(train_data.label, return_counts=True))
print(np.unique(test_set.predict, return_counts=True))

(array([1, 2, 3, 4], dtype=int64), array([702, 721, 727, 706], dtype=int64))
(array([1], dtype=object), array([1000], dtype=int64))


In [251]:
np.mean(test_set.label == test_set.predict)

0.37333333333333335

## sklearn

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

Wall time: 1.74 s


In [None]:
%%time
parameters = {'max_depth':[10, 20, 30], 'min_samples_split': [2, 5, 10, 30],
              'min_samples_leaf':[3, 5, 10], 'min_impurity_decrease':[1e-3, 1e-4]}
dt = DecisionTreeClassifier()
grid_dt = GridSearchCV(dt, parameters, cv=5)
grid_dt.fit(train_data.iloc[:,:-1], train_data.label)

print(grid_dt.best_estimator_)
print(grid_dt.best_score_)

In [235]:
dt = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=20,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=30,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
dt.fit(train_data.iloc[:,:-1], train_data.label)
pred = dt.predict(train_data.iloc[:,:-1])
np.mean(pred ==train_data.label)

0.7038205397826849

In [233]:
with open('result.txt', 'w') as fp:
    [fp.write(str(predict)+'\n') for predict in pred]