In [1]:
import pandas as pd
import numpy as np

## Read

In [2]:
## read train data
# Space-separated file with no header row; column 0 carries the class label.
raw_train = pd.read_table('training.txt', sep=' ', header=None)

# separate the labels, kept as a Series named 'label'
label = raw_train.iloc[:, 0].rename('label')

# Skip the first two columns, then keep only the text after the first ':' in
# every remaining cell (a cell without ':' is kept whole).
# NOTE(review): two leading columns are dropped here (the label and the column
# right after it), while the test cell keeps every column of its file. Confirm
# the second column is not a real feature; otherwise the train and test column
# numbers are shifted by one.
data = raw_train.iloc[:, 2:].applymap(lambda cell: cell[cell.find(':') + 1:])

# renumber the feature columns from 0 and put the label back as the last column
data.columns = list(range(data.shape[1]))
data = data.join(label)
# label: int, other columns: str

In [132]:
## read test data
# Space-separated file with no header row. Every cell is reduced to the text
# after its first ':' (a cell without ':' is kept whole); cells are expected to
# be strings.
test = pd.read_table('testing.txt', sep=' ', header=None).applymap(
    lambda cell: cell[cell.find(':') + 1:]
)

## Decision Tree

In [None]:
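'''
1. Compute the entropy of the data set.
2. For every attribute/feature:
       1. Calculate the entropy for each categorical value.
       2. Take the average information entropy for the current attribute.
       3. Calculate the gain for the current attribute.
3. Pick the attribute with the highest gain.
4. Repeat until we get the desired tree.

Note: the cells below score candidate splits with the Gini index (get_gini)
rather than entropy, and pick the candidate with the lowest value.
'''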

In [83]:
# create Tree class
class Tree:
    """Node of a binary tree.

    All fields are plain public attributes that callers assign after
    construction; a fresh node has no children, no parent, no data and no rule.
    """

    def __init__(self):
        # Child nodes; None until assigned.
        self.left = None
        self.right = None
        # Payload attached to this node; None until assigned.
        self.data = None
        # Parent node; None marks a node without a parent (see get_depth).
        self.root = None
        # Depth reported by a node that has no parent.
        self.depth = 0
        # Split rule attached to this node; None until assigned.
        self.rule = None

    def get_depth(self):
        """Return this node's depth.

        A node without a parent reports its own ``depth`` attribute; any other
        node reports its parent's depth plus one.
        """
        # Identity check on purpose: `== None` goes through the parent's
        # __eq__, which could wrongly report "no parent".
        if self.root is None:
            return self.depth
        return self.root.get_depth() + 1

In [52]:
def get_gini(data, column, feature):
    """Weighted Gini index of splitting ``data`` on ``column == feature``.

    The rows are divided into those whose ``column`` value equals ``feature``
    and all the others. Each side's impurity is 1 minus the sum of its squared
    class proportions (classes come from the 'label' column), and the two
    impurities are combined weighted by each side's share of ``data.shape[0]``.

    Parameters:
        data: DataFrame containing ``column`` and a 'label' column.
        column: name of the column to split on.
        feature: value of ``column`` that defines the split.

    Returns:
        The weighted Gini index of the split.
    """
    total_rows = data.shape[0]

    # Row count per value of `column`, and a (value x label) table of counts.
    rows_per_value = data.groupby(column)[column].count()
    label_table = data.groupby([column, 'label'])[column].count().unstack()

    def impurity(label_counts, group_size):
        # 1 - sum of squared class shares; classes absent from a group do not
        # contribute to the sum.
        shares = label_counts / group_size
        return 1 - np.sum(shares ** 2)

    # Side 1: every row whose value differs from `feature`.
    size_other = rows_per_value.drop(index=feature).sum()
    gini_other = impurity(label_table.drop(index=feature).sum(), size_other)
    weight_other = size_other / total_rows

    # Side 2: the rows whose value equals `feature`.
    size_match = rows_per_value.loc[feature].sum()
    gini_match = impurity(label_table.loc[feature], size_match)
    weight_match = size_match / total_rows

    return weight_other * gini_other + weight_match * gini_match

## Root

In [97]:
%%time
# Score every (column, value) candidate split on the full training set.
train = data
n = train.shape[0]  # number of training rows at this node
gini_list = []

# The label was joined on as the last column, so every column before it is a
# feature column.
feature_columns = train.columns[:-1]
for column in feature_columns:
    candidate_values = np.unique(train[column])
    for feature in candidate_values:
        gini = get_gini(train, column, feature)
        gini_list.append((column, feature, gini))

Wall time: 2.29 s


In [98]:
# Take the candidate with the smallest weighted Gini and split the training
# rows on it: rows equal to the chosen value go left, the rest go right.
column, feature, min_gini = min(gini_list, key=lambda candidate: candidate[2])
matches_rule = data[column] == feature
left = data[matches_rule]
right = data[~matches_rule]

In [99]:
# Create the top node and record the (column, value) rule chosen for it.
root = Tree()
root.rule = (column, feature)

# Hang one child on each side; a child's `root` attribute points at its parent
# and its `data` holds the training rows routed to that side.
for side, subset in (('left', left), ('right', right)):
    child = Tree()
    child.root = root
    child.data = subset
    setattr(root, side, child)

## Left

In [111]:
%%time
# Score every (column, value) candidate split on the rows held by the root's
# left child.
train = root.left.data
n = train.shape[0]  # number of training rows at this node
gini_list = []

# The label is the last column, so every column before it is a feature column.
feature_columns = train.columns[:-1]
for column in feature_columns:
    candidate_values = np.unique(train[column])
    for feature in candidate_values:
        gini = get_gini(train, column, feature)
        gini_list.append((column, feature, gini))

Wall time: 2.01 s


In [113]:
# Take the candidate with the smallest weighted Gini and split this node's
# rows on it: rows equal to the chosen value go left, the rest go right.
column, feature, min_gini = min(gini_list, key=lambda candidate: candidate[2])
matches_rule = train[column] == feature
left = train[matches_rule]
right = train[~matches_rule]

In [117]:
# Record the (column, value) rule chosen for the root's left child.
root.left.rule = (column, feature)

# Hang one child on each side of that node; a child's `root` attribute points
# at its parent and its `data` holds the training rows routed to that side.
for side, subset in (('left', left), ('right', right)):
    child = Tree()
    child.root = root.left
    child.data = subset
    setattr(root.left, side, child)

In [124]:
# Share of each label among the rows held by the left-left node
# (count() tallies the non-null entries of `column` per label).
lln = len(root.left.left.data)
root.left.left.data.groupby('label')[column].count().div(lln)

label
1    0.065574
2    0.631148
3    0.163934
4    0.139344
Name: 55, dtype: float64

In [125]:
# Share of each label among the rows held by the left-right node
# (count() tallies the non-null entries of `column` per label).
lrn = len(root.left.right.data)
root.left.right.data.groupby('label')[column].count().div(lrn)

label
1    0.117063
2    0.277778
3    0.107143
4    0.498016
Name: 55, dtype: float64

In [126]:
# Share of each label among the rows held by the root's right node
# (count() tallies the non-null entries of `column` per label).
rn = len(root.right.data)
root.right.data.groupby('label')[column].count().div(rn)

label
1    0.280118
2    0.226622
3    0.292334
4    0.200927
Name: 55, dtype: float64

## Prediction

In [175]:
# Route the test rows through the root rule: rows equal to the rule's value go
# left, all others go right.
column, feature = root.rule
goes_left = test[column] == feature
left = test[goes_left]
right = test[~goes_left]

In [176]:
# Route the left-hand test rows through the left child's rule.
column, feature = root.left.rule
goes_left = left[column] == feature
ll = left[goes_left]
lr = left[~goes_left]

In [177]:
# Give every test row the majority training label of the leaf it was routed to.
# The labels are derived from the leaves' training rows instead of being typed
# in by hand, so they stay correct if the data or the chosen splits change.
test['label'] = None
leaf_assignments = (
    (right, root.right),
    (ll, root.left.left),
    (lr, root.left.right),
)
for routed_rows, leaf in leaf_assignments:
    # value_counts() orders labels by frequency; idxmax() returns the most
    # frequent one. int() keeps the stored value a plain Python int.
    majority_label = int(leaf.data['label'].value_counts().idxmax())
    test.loc[routed_rows.index, 'label'] = majority_label

In [178]:
# Number of test rows routed to the left subtree (both of its leaves combined).
len(ll) + len(lr)

194

## Save

In [169]:
# Write the predicted labels to result.txt, one per line. A generator is
# streamed to writelines() instead of building a throwaway list of write()
# return values.
with open('result.txt', 'w') as fp:
    fp.writelines(str(predicted) + '\n' for predicted in test.label)

In [15]:
# Spot-check: weighted Gini index of one candidate split on the full training
# set.
column = 71
feature = '1'

# Delegate to get_gini so the side weights use data's own row count rather
# than whatever value of `n` an earlier cell happened to leave behind.
gini = get_gini(data, column, feature)

# Only display the score. gini_list holds (column, feature, gini) tuples that
# are compared via x[2], so a bare float must not be appended to it.
gini

0.7290673689858093