In [1]:
import re
import math
import heapq

In [2]:
def getwords(doc):
    splitter = re.compile('\\W+')
    words = [s.lower() for s in re.split(splitter,doc) if len(s) > 2 and len(s) < 30]
    return dict([(word,1) for word in words])

In [3]:
getwords('I like programming')

{'like': 1, 'programming': 1}

In [4]:
class classifier:
    def __init__(self, getfeatures,filename=None):
#         classifier.__init__ = (self,getfeatures)
        self.fc = {} # 统计特征/分类组合的数量 e.g {'python': {'bad':0,'good':6},'the':{'bay':3,'good':3}}
        self.cc = {} # 统计每个分类的文档数量
        self.getfeatures = getfeatures
        self.thresholds = {}
        
        
    def incf(self, feature, category):
        self.fc.setdefault(feature,{})
        self.fc[feature].setdefault(category,0)
        self.fc[feature][category] += 1

    def incc(self, category):
        self.cc.setdefault(category,0)
        self.cc[category] += 1

    def fcount(self,feature, category):
        if feature in self.fc and category in self.fc[feature]:
            return float(self.fc[feature][category])
        return 0.0

    def catcount(self, category):
        if category in self.cc:
            return float(self.cc[category])
        return 0.0
    def totalcount(self):
        return sum(self.cc.values())

    def categories(self):
        return self.cc.keys()

    def train(self, doc, category):
        word_count = self.getfeatures(doc)
        for feature in word_count:
            self.incf(feature,category)
        self.incc(category)
        
    def fprob(self, feature, category):
        if self.catcount(category) == 0.0 or category not in self.fc[feature]: return 0
        return float(self.fc[feature][category]) / float(self.catcount(category))
    
    def weightprob(self, feature, cat, prf, weight = 1.0, ap=0.5):
        basicprob = prf(feature, cat)
        totals = sum([self.fcount(feature,c) for c in self.categories()])
        bp = ((weight * ap) + (totals*basicprob)) / (weight + totals)
        return bp
    
    def setthreshold(self, category, val):
        self.thresholds[category] = val
        
    def getthreshold(self, category):
        if category not in self.thresholds:
            return 1.0
        return self.thresholds[category]
    
    def classify(self, doc, default=None):
        probs = {}
        categories = self.categories()
        for cat in categories:
            probs[cat] = self.prob(doc,cat)
        # print(probs) 
        # best_category = max(zip(probs.keys(),probs.values()))
        
        best_two = heapq.nlargest(2,probs.items(),key=lambda x:x[1])
        print(best_two)
        if self.getthreshold(best_two[1][0]) * best_two[1][1] < best_two[0][1]:
            return best_two[0][0]
        else:
            return default
        
        
cl = classifier(getwords)

In [5]:
def sampletrain(cl):
    cl.train('Nobody owns the water.','good')
    cl.train('the quick rabbit jumps fences', 'good')
    cl.train('buy pharmaceuticals now', 'bad')
    cl.train('make quick money at the online casino','bad')
    cl.train('the quick brown fox jumps', 'good')
    
sampletrain(cl)

In [6]:
# 多次执行看数值的变化
print('money在bad分类中')
print(cl.weightprob('money','bad',cl.fprob))
print('money在good分类中')
print(cl.weightprob('money','good',cl.fprob))

money在bad分类中
0.5
money在good分类中
0.25


# 看重点

这里引入weightprob是用来判断一个特征词对于某个分类的贡献是否可信。从上面的语料库中看出，money只在bad分类中出现，而在good分类中一个都没有，这样对于新的测试集中的文本而言，带有money的文本将会全部判为bad分类，显然这是不正确的。除了扩大语料集外，我们还应该对某个词对某个分类的可信度进行测量，这就是weightprob函数的作用！

In [7]:
class naivebayes(classifier):
    def docprob(self,doc,category):
        features = self.getfeatures(doc)
        
        multi = 1
        for f in features:
            multi *= self.weightprob(f,category,self.fprob)
        return multi
    
    def prob(self, doc, category):
        category_prob = self.catcount(category) / self.totalcount()
        doc_prob = self.docprob(doc, category)
        return category_prob * doc_prob

In [8]:
n = naivebayes(getwords)
sampletrain(n)
print(n.prob('the quick rabbit jumps fences','good'))
print(n.prob('the quick rabbit jumps fences','bad'))

0.03580729166666666
0.0010416666666666667


In [9]:
n.classify('quick rabbit',default='unknown')

[('good', 0.15624999999999997), ('bad', 0.05)]


'good'

In [10]:
n.classify('quick money',default='unknown')

[('bad', 0.1), ('good', 0.09375)]


'bad'

In [11]:
n.setthreshold('bad',3)
n.classify('quick money',default='unknown')

[('bad', 0.1), ('good', 0.09375)]


'bad'

In [12]:
class fisherclassifier(classifier):
    def __init__(self, getfeatures,filename=None):
        classifier.__init__(self,getfeatures)
        self.thresholds = {}
        
    def cprob(self, feature, category):
        clf = self.fprob(feature, category)
        if clf == 0:
            return 0
        freqsum = sum([self.fprob(feature, cat) for cat in self.categories()])
        p = clf / freqsum
        return p
    
    def fisherprob(self, doc, category):
        p = 1
        features = self.getfeatures(doc)
        for feature in features:
            f_c = self.cprob(feature,category)
            if f_c:
                p *= self.cprob(feature,category)
        res = math.log(p) * (-2)
        return self.invchi2(res,len(features)*2)
    
    # 结果归一化函数
    def invchi2(self,chi,df):
        m = chi / 2.0
        sum = term = math.exp(-m)
        for i in range(1, df//2):
            term *= m/i
            sum += term
        return min(sum,1)
    
    def setthresholds(self,category,val):
        self.thresholds[category] = val
        
    def getthresholds(self,category):
        if category in self.thresholds:
            return self.thresholds[category]
        else:
            return 0
    
    def classify(self,doc,default=None):
        categories = self.categories()
        res = {}
        best = default
        max_val = 0.0
        for cat in categories:
            p = self.fisherprob(doc, cat)
            if p > self.getthresholds(cat) and p > max_val:
                best = cat
                max_val = p
        return best
            
        
        

In [13]:
f = fisherclassifier(getwords)
sampletrain(f)
print(f.cprob('quick','good'))
print(f.cprob('money','bad'))
print(f.cprob('casino','bad'))
print(f.weightprob('money','bad',f.cprob))

0.5714285714285715
1.0
1.0
0.75


In [14]:
f.fisherprob('quick','good')

0.5714285714285715

In [15]:
f.classify('the quick rabbit jumps fences')

'good'