In [1]:
import pandas as pd
import numpy as np

## Read

In [24]:
## read train data
data = pd.read_table('training.txt', sep=' ', header=None)
# name label
data.rename(columns={0:'label'}, inplace=True)
# sepearte labels
label = data.iloc[:, 0]

# remove index: from data
data = data.iloc[:, 1:]
data = data.iloc[:, 1:].applymap(lambda x: x[x.find(':')+1:])

# rename columns and merge label
data.columns = list(range(data.shape[1]))
data = data.join(label)
# label: int, other columns: str

In [25]:
## read test data
test = pd.read_table('testing.txt', sep=' ', header=None)

# remove index: from data
test = test.applymap(lambda x: x[x.find(':')+1:])

In [26]:
# train test split
n = data.shape[0]
idx = np.random.randint(0, n, int(n/20))
val_data = data.iloc[idx,:]
train_data = data.drop(idx)

## Decision Tree

In [6]:
def get_gini(data):
    n = data.shape[0]
    label_count = data.groupby('label')['label'].count()
    prob = label_count/n

    return 1-np.sum(np.square(prob))

In [138]:
def data_split(data, column, feature):
    left = data[column][data[column]==feature].index.values
    right = data[column][data[column]!=feature].index.values
    
    return left, right

In [8]:
def get_label(data):
    label_count = data.groupby('label')['label'].count()
    
    return label_count.idxmax()

In [27]:
# data indexing
train_data.index = range(0, train_data.shape[0])
val_data.index = range(0, val_data.shape[0])

In [238]:
%%time
## Build Tree
max_depth = 20
threshold = 0.005
min_node = 20
min_leaf = 10

# data copy
data_copy = train_data.copy()

## setup
depth = 0
result_list = []
search_list = [(data_copy.index.values, depth)]
base_gini_list = [get_gini(data_copy)]

# breadth first search
while search_list:
    
    node, depth = search_list.pop(0)
    base_gini = base_gini_list.pop(0)
    train = data_copy.loc[node]
    gini_list = []
    
    # case used all columns
    if train.shape[1] <=1:
        break

    # case when depth is higher than max_depth
    if depth >= max_depth:
        result_list.append((None, None, None))
        continue
    
    n = len(node)
    # case when node is too small to split
    if n <= min_node:
        result_list.append((None, None, None))
        continue

    # loop over columns except labels
    for column in train.iloc[:,:-1]:
        for feature in np.unique(train[column]):
            left_idx, right_idx = data_split(train, column, feature)

            # check if splits are smaller than min_leaf
            left_n = len(left_idx)
            right_n = len(right_idx)
            if left_n > min_leaf and right_n > min_leaf:
                left_ratio = left_n/n
                right_ratio = right_n/n
                gini = left_ratio*get_gini(train.loc[left_idx])+right_ratio*get_gini(train.loc[right_idx])
                gini_list.append((column, feature, gini))
    
    # find minimum gini
    if len(gini_list) <1:
        result_list.append((None, None, None))
        continue
    column, feature, min_gini = min(gini_list, key= lambda x: x[2])
    #print('min gini: '+ str(min_gini))
    
    # check if satisfies threshold
    if base_gini-min_gini < threshold:
        result_list.append((None, None, None))
        continue
    
    # remove used column
    left_idx, right_idx = data_split(train, column, feature)
    data_copy = data_copy.drop(columns=column)
    
    # append left and right node
    depth +=1
    result_list.append((column, feature, depth))
    search_list.append((left_idx, depth))
    search_list.append((right_idx, depth))
    
    # update base_gini and appends left, right base_gini
    base_gini = min_gini
    base_gini_list.append(base_gini)
    base_gini_list.append(base_gini)
    
    # print((column, feature, depth))
    # print('left num: '+str(len(left_idx)))
    # print('right num: '+str(len(right_idx)))
    # print('new base gini: '+str(base_gini) + '\n')

Wall time: 2min 7s


In [243]:
## predict
# copy test set
test_set = val_data.copy()
train_set = train_data.copy()

# setup
split_list = result_list.copy()
train_set['predict'] = None
test_set['predict'] = None
train_list = [train_set.index]
test_list = [test_set.index]

while split_list:
    # get split criteria and datasets from each
    column, feature, depth = split_list.pop(0)
    
    #print(str(column)+' '+str(feature)+', '+str(depth))
    train_idx = train_list.pop(0)
    test_idx = test_list.pop(0)
    train = train_set.loc[train_idx]
    test = test_set.loc[test_idx]
    
    # certain condition not met
    if column is None:
        continue
    
    n = len(train_idx)
    if n <= min_node:
        continue
    
    train_left_idx, train_right_idx = data_split(train, column, feature)
    test_left_idx, test_right_idx = data_split(test, column, feature)
    
    # set label
    left_label = get_label(train_set.loc[train_left_idx])
    right_label = get_label(train_set.loc[train_right_idx])
    # test set labels
    test_set.loc[test_left_idx, 'predict'] = left_label
    test_set.loc[test_right_idx, 'predict'] = right_label
    # train set labels
    train_set.loc[train_left_idx, 'predict'] = left_label
    train_set.loc[train_right_idx, 'predict'] = right_label

    # put on the list
    if len(train_left_idx) > min_leaf and len(train_right_idx) > min_leaf:
        train_list.append(train_left_idx)
        train_list.append(train_right_idx)
        test_list.append(test_left_idx)
        test_list.append(test_right_idx)
        # print('left label: '+str(left_label))
        # print('left num: '+str(len(train_left_idx)))
        # print('right label: '+str(right_label))
        # print('right num: '+str(len(train_right_idx)) +'\n')

In [244]:
np.unique(test_set.predict, return_counts=True)

(array([1, 2, 3, 4], dtype=int64), array([81, 17, 16, 36], dtype=int64))

In [245]:
np.mean(train_set.label == train_set.predict)

0.4640477025605051