In [131]:
import numpy as np
import pandas as pd
# import csv
# import matplotlib.cm as cm
# import matplotlib.mlab as mlab
# import matplotlib.pyplot as plt
# import scipy as sp
import scipy.io as sio
# import scipy.stats as stats
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import Imputer
from math import log2

In [132]:
SPAM = 'dist/spam_data.mat'
TRAIN_CENSUS = 'hw5_census_dist/train_data.csv'
TRAIN_TITANIC = 'hw5_titanic_dist/titanic_training.csv'

In [133]:
spam_data = sio.loadmat(SPAM)
census_tdata = pd.read_csv(TRAIN_CENSUS)
titanic_tdata = pd.read_csv(TRAIN_TITANIC)

In [134]:
"""Removed ticket and cabin features"""
titanic_tdata_filtered = titanic_tdata.drop('ticket', 1)
titanic_tdata_filtered = titanic_tdata_filtered.drop('cabin', 1)

"""Row 707 is empty in titanic_training.csv"""
titanic_tdata_filtered = titanic_tdata_filtered.dropna(how='all')

"""Map categories to binary variables [1(a)]"""
tv = DictVectorizer()
vectorized_titanic = tv.fit_transform(titanic_tdata_filtered.to_dict(orient='records'))

"""Replaced unknown values with mean of features [2(b)]"""
timp = Imputer()
imputed_titanic = timp.fit_transform(vectorized_titanic)

ttdata = pd.DataFrame(data=imputed_titanic.toarray(), columns=tv.get_feature_names(), dtype='object')
# ttlabels = ttdata['survived']
# ttdata = ttdata.drop('survived', 1)

In [239]:
"""Map categories to binary variables [1(a)]"""
cv = DictVectorizer()
vectorized_census = cv.fit_transform(census_tdata.to_dict(orient='records'))

"""Replaced unknown values with mean of features [2(b)]"""
cimp = Imputer()
imputed_census = cimp.fit_transform(vectorized_census)

ctdata = pd.DataFrame(data=imputed_census.toarray(), columns=cv.get_feature_names(), dtype='object')
# ctlabels = ctdata['label']
# ctdata = ctdata.drop('label', 1)

In [None]:
stdata = spam_data['training_data']
stlabels = spam_data['training_labels']

# Decision Tree Abstraction
## Class InternalNode
* ### State
    * Node left, right
    * split_feature [split rule]
    * split_value [split rule]
* ### Methods
    * `predict(data)`
        * given a data point, chooses left or right child based on the split rule
        * traverses starting from this node
    * `is_leaf() { return False }`

## Class LeafNode
* ### State
    * label
* ### Methods
    * `is_leaf() { return True }`

## Class DecisionTree
* ### State
    * root
* ### Methods
    * `impurity(left_label_hist, right_label_hist)`
        * calculates the entropy of a split
    * `segmenter(data, labels)`
        * finds the best split rule using impurity()
        * many different types of segmenters
    * `train(train_data, train_labels, depth_limited)`
        * grows the decision tree
        * uses segmenter to find the best splits
    * `predict(data)`
        * given a data point, traverses the tree starting at the root
    

In [221]:
class LeafNode:
    def __init__(self, label):
        self.label = label
        
    def predict(self, data):
        return self.label
    
    def is_leaf(self):
        return True
    
class InternalNode:
    def __init__(self, left, right, split_feature, split_value):
        self.left, self.right = left, right
        self.split_feature = split_feature 
        self.split_value = split_value
    
    def predict(self, data):
        if data[self.split_feature] < self.split_value:
            return self.left.predict(data)
        return self.right.predict(data)
        
    def is_leaf(self):
        return False
        
class PandasDecisionTree:
    def __init__(self, root=None):
        self.root = root
        
    def logsp(self, x, y):
        if x == 0:
            return 0
        return x * log2(x / (x + y))
        
    def impurity(self, C, D, nC, nD):
        c, d = nC - C, nD - D
        return -(self.logsp(C, D) + self.logsp(D, C) + self.logsp(c, d) + self.logsp(d, c)) / (nC + nD)
        
    def count(self, data, label):
        nC, nD = 0, 0
        for lbl in data[label]:
            if lbl == 0:
                nC += 1
            else:
                nD += 1
        return nC, nD
        
    def segmenter(self, data, label):
        """
        For splits on a feature f with a value v, 
            left will have samples with f values strictly less than v
            right will have samples with f values greater than or equal to v 
        """
        nC, nD = self.count(data, label)
        features = data.drop(label, 1).axes[1]
        
        min_entropy = float('inf')
        splitf = features[0]
#         print(len(data[splitf]))
        splitv = next(data[splitf].__iter__())
        spliti = max(len(data) // 2, 1)
        
        for f in features:
            sorted_data_by_f = data.sort_values(f)
            iterv = sorted_data_by_f[f].__iter__()
            iterl = sorted_data_by_f[label].__iter__()
            
            # keeps track of which value-label pair we're on
            v, lbl, i = next(iterv), next(iterl), 0
            
            # keeps track of class counts on LEFT
            C, D = 0, 0
            
            # don't check if all data lies on one side, since then this node should be a leaf (base case of train)
            beta_iter = sorted_data_by_f.drop_duplicates(subset=f, keep='first')[f].__iter__()
            next(beta_iter)
            
            for beta in beta_iter:
                try:
                    while v != beta:
                        if lbl == 0:
                            C += 1
                        else:
                            D += 1
                        i += 1
                        v, lbl = next(iterv), next(iterl)
                except StopIteration:
                    continue
                entropy = self.impurity(C, D, nC, nD)
#                 print(f, beta, i, C, D, entropy)
                if entropy < min_entropy:
                    min_entropy, splitf, splitv, spliti = entropy, f, beta, i
            # no need to check split on max(beta)+1 since then all elements are on one split (node should be leaf)
            
        if splitf == None:
            raise Exception('{}\n{}'.format(data, features))
            
        sorted_data = data.sort_values(splitf)
        left = sorted_data.iloc[:spliti]
        right = sorted_data.iloc[spliti:]
        
        return left, right, splitf, splitv #, spliti
        
    def train(self, train_data, label, depth_limited=float('inf')):
        def grow_tree(train_data, label, depth_limited=float('inf')):
            labels = train_data[label].unique()
            if depth_limited == 0 or len(labels) == 1:
                return LeafNode(labels[0])
            left_data, right_data, split_feature, split_value = self.segmenter(train_data, label)
            left = grow_tree(left_data, label, depth_limited - 1)
            right = grow_tree(right_data, label, depth_limited - 1)
            return InternalNode(left, right, split_feature, split_value)
        self.root = grow_tree(train_data, label, depth_limited)
        
    def predict(self, data):
        return self.root.predict(data)

In [222]:
shuffled_ttdata = ttdata.sample(frac=1)
tvnum = int(len(shuffled_ttdata) // 10)
vsttdata = shuffled_ttdata[:tvnum]
tsttdata = shuffled_ttdata[tvnum:]

tlabel = 'survived'
tdt = PandasDecisionTree()
tdt.train(tsttdata, tlabel)

correct = 0
for _, vsample in vsttdata.iterrows():
    if tdt.predict(vsample) == vsample[tlabel]:
        correct += 1
print(correct / len(vsttdata), correct, len(vsttdata))

0.8282828282828283 82 99


In [None]:
shuffled_ctdata = ctdata.sample(frac=1)
cvnum = int(len(shuffled_ctdata) // 10)
vsctdata = shuffled_ctdata[:cvnum]
tsctdata = shuffled_ctdata[cvnum:]

clabel = 'label'
cdt = PandasDecisionTree()
cdt.train(tsctdata, clabel)

correct = 0
for _, vsample in vsctdata.iterrows():
    if cdt.predict(vsample) == vsample[clabel]:
        correct += 1
print(correct / len(vsctdata), correct, len(vsctdata))

In [174]:
small_titanic = ttdata[:30]
small_tdt = PandasDecisionTree()
# small_titanic['survived'].unique()
small_tdt.train(small_titanic, 'survived')

In [177]:
p = small_tdt.predict(ttdata.iloc[345])
print(p, ttdata['survived'][345])

1.0 1.0


In [166]:
# q = []
# q.append(small_tdt.root)
# while len(q) > 0:
#     n = q.pop(0)
#     if not n.is_leaf():
#         print(n.split_feature, n.split_value, '\n')
#         q.append(n.left)
#         q.append(n.right)
#     else:
#         print('label: ', n.label, '\n')

In [151]:
k, a, b = 'survived', 'apple', 'banana'
d = [{k: 0, a: 1, b: 1},
     {k: 1, a: 1, b: 77},
     {k: 1, a: 1, b: 4},
     {k: 0, a: 1, b: 1}, 
     {k: 0, a: 1, b: 2}]

# d = [{k: 0, a: 0, b: 1},
#      {k: 1, a: 0, b: 0}]

dfd = pd.DataFrame(d)
tdt = TitanicDecisionTree()
left, right, splitf, splitv, spliti = tdt.segmenter(dfd, k)
print('\n', left, '\n')
print(right, '\n')
print(splitf, splitv, spliti)

ValueError: not enough values to unpack (expected 5, got 4)

In [192]:
d = [{k: 7, a: 0, b: 1}]

dfd = pd.DataFrame(d)
print(dfd[k][0])

print(ttdata['survived'][0])

7
0.0


In [207]:
# print(ttdata.iloc[0])
print(ttdata['survived'][0])
print(len(ttdata))
print(ttdata.shape[0])
print(ttdata.iloc[705])
print(set(ttdata['survived']), '\n')
s = set(ttdata.axes[1])
# for f in ttdata.drop('survived', 1).axes[1]:
#     print(f, type(f))
    
features = ttdata.drop('survived', 1).axes[1]
for f in features:
    print(f)
    sorted_data_by_f = ttdata.sort_values(f)
    print(sorted_data_by_f.iloc[-1])
    break;

0.0
999
999
age           30.2594
embarked            0
embarked=C          0
embarked=Q          0
embarked=S          1
fare             8.05
parch               0
pclass              3
sex=female          0
sex=male            1
sibsp               0
survived            0
Name: 705, dtype: object
{0.0, 1.0} 

age
age           80
embarked       0
embarked=C     0
embarked=Q     0
embarked=S     1
fare          30
parch          0
pclass         1
sex=female     0
sex=male       1
sibsp          0
survived       1
Name: 475, dtype: object


In [205]:
raw_data = {'first_name': ['Jason', 'Jason', 'Tina', 'Jake', 'Amy', 'Zach'],
        'last_name': ['Miller', 'Miller', 'Ali', 'Milner', 'Cooze', 'Miller'],
        'age': [42, 42, 36, 24, 73, 4],
        'preTestScore': [4, 4, 31, 2, 3, 100],
        'postTestScore': [25, 25, 57, 62, 70, 100]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
print(df)
print(df.drop_duplicates(subset='first_name', keep='first'))

  first_name last_name  age  preTestScore  postTestScore
0      Jason    Miller   42             4             25
1      Jason    Miller   42             4             25
2       Tina       Ali   36            31             57
3       Jake    Milner   24             2             62
4        Amy     Cooze   73             3             70
5       Zach    Miller    4           100            100
  first_name last_name  age  preTestScore  postTestScore
0      Jason    Miller   42             4             25
2       Tina       Ali   36            31             57
3       Jake    Milner   24             2             62
4        Amy     Cooze   73             3             70
5       Zach    Miller    4           100            100
