In [3]:


class Tree:
    def __init__(self, parent=None):
        self.parent = parent
        self.children = []
        self.label = None
        self.classCounts = None
        self.splitFeatureValue = None
        self.splitFeature = None

def dataToDistribution(data):
    '''Turn a dataset which has n possibe classification labels into a 
       probability distribution with n entries.'''
    allLabels = [label for (point,label) in data]
    numEntries = len(allLabels)
    possibleLabels = set(allLabels)
    
    dist = []
    for aLabel in possibleLabels:
        dist.append(float(allLabels.count(aLabel)) / numEntries)
    return dist

def entropy(dist):
    '''Compute the Shannon entropy of the given probability distribution.'''
    return -sum([p*math.log(p,2) for p in dist])

def splitData(data, featureIndex):
    attrValues = [point[featureIndex] for (point, label) in data]
    for aValue in set(attrValues):
        dataSubset = [(point, label) for (point, label) in data
                      if point[featureIndex] == aValue]
        yield dataSubset

def gain(data, featureIndex):
    entropyGain = entropy(dataToDistribution(data))
    for dataSubset in splitData(data,featureIndex):
        entropyGain -= entropy(dataToDistribution(dataSubset))
    return entropyGain

def homogeneous(data):
    return len(set([label for (point,label) in data])) <= 1

def majorityVote(data,node):
    labels = [label for (pt,label) in data]
    choice = max(set(labels), key=labels.count)
    node.label = choice
    return node

def buildDecisionTree(data, root, remainingFeatures):
    if homogeneous(data):
        root.label = data[0][1]
        return root
    if len(remainingFeatures) == 0:
        return majorityVote(data, root)
    bestFeature = max(remainingFeatures, key=lambda index: gain(data, index))
    if gain(data, bestFeature) == 0:
        return majorityVote(data,root)
    root.splitFeature = bestFeature
    print(str(root.splitFeature)+' ', end='')
    
    for dataSubset in splitData(data, bestFeature):
        aChild = Tree(parent=root)
        aChild.splitFeatureValue = dataSubset[0][0][bestFeature]
        root.children.append(aChild)
        buildDecisionTree(dataSubset, aChild, remainingFeatures - set([bestFeature]))
    return root

def decisionTree(data):
    return buildDecisionTree(data,Tree(),set(range(len(data[0][0]))))

def classify(tree,point):
    if tree.children == []:
        return tree.label
    else:
        matchingChildren = [child for child in tree.children
                           if child.splitFeatureValue == point[tree.splitFeature]]
        return classify(matchingChildren[0],point)

def testClassification(data, tree):
    actualLabels = [label for point, label in data]
    predictedLabels = [classify(tree, point) for point, label in data]
    correctLabels = [(1 if a == b else 0) for a,b in zip(actualLabels, predictedLabels)]
    return float(sum(correctLabels)) / len(actualLabels)



In [4]:
def BinaryDecisionTree(root, indent=''):
    if root.children == []:
        print(indent, root.splitFeatureValue, root.label, root.classCounts)
    else:
        BinaryDecisionTree(root.children[0], indent + '\t')

        if indent == '': # processing the root
            print(indent, root.splitFeature)
        else:
            print(indent, root.splitFeatureValue, root.splitFeature)
        if len(root.children) == 2:
            BinaryDecisionTree(root.children[1], indent + '\t')

import math
with open('house-votes-1984.txt', 'r') as inputFile:
    lines = inputFile.readlines()
data = [line.strip().split(',') for line in lines]
data = [(x[1:], x[0]) for x in data]
cdata = [x for x in data if '?' not in x[0]]
ndata = [x for x in data if '?' in x[0]] #直接去掉带？的数据

tree = decisionTree(cdata)
print()
BinaryDecisionTree(tree)
testClassification(cdata,tree)

3 13 14 1 9 4 12 4 11 0 2 8 7 6 5 12 9 15 10 15 11 7 4 8 6 12 5 0 10 1 9 14 2 
			 y R None
		 y 14
						 y R None
					 y 4
						 n R None
				 y 9
						 y R None
					 n 12
						 n D None
			 n 1
							 y R None
						 y 0
								 y R None
							 n 2
									 y R None
								 n 8
										 y R None
									 n 7
											 y R None
										 n 6
														 y R None
													 y 9
															 y R None
														 n 15
																 y D None
															 n 10
																 n R None
												 y 12
													 n R None
											 n 5
												 n R None
					 y 11
						 n R None
				 n 4
					 n R None
	 y 13
		 n R None
 3
			 y D None
		 y 11
					 y D None
				 y 4
								 y D None
							 y 12
									 y D None
								 n 5
											 y D None
										 y 10
												 y D None
											 n 1
														 y D None
													 y 14
															 y D None
														 n 2
															 n R None
												 n 9
													 

0.9870689655172413

In [64]:
a = [3,13,14,1,9,4,12,4,11,0,2,8,7,6,5,12,9,15,10,15,11,7,4,8,6,12,5,0,10,1,9,14,2]
print("the bestFeatue'column  is : "+str(max(set(a), key=a.count)))

the bestFeatue'column  is : 4


In [None]:
'''
剪枝叶和画出准确率与subset大小的关系图没有写
'''

In [None]:
'''
1.
In an image, the appearance and shape of the local target can be well described by the gradient or the density of the edge.

Sampling positive images
Sampling negative images
Training a Linear SVM
Performing hard-negative mining
Re-training your Linear SVM using the hard-negative samples
Evaluating your classifier on your test dataset, utilizing non-maximum suppression to ignore redundant, overlapping bounding boxes

2.
(1) sliding window algorithm
Sliding window is a sub-list that runs over an underlying collection. I.e., if you have an array like
[a b c d e f g h]
a sliding window of size 3 would run over it like

[a b c]
  [b c d]
    [c d e]
      [d e f]
        [e f g]
          [f g h]

(2) non-Maxima Suppression
I.e.
Discards all those cells where probability of object being present is <= 0.6
Then it takes the cell with largest probability among candidates for object as a prediction
Finally we discard any remaining cell with Intersection over union value >= 0.5 with the prediction cell.
'''
