In [63]:
import numpy as np
import pandas as pd

In [64]:
def loadDataSet():     #用高斯模型拟合文本
    """
    Create dataset
    
    Returns:
        posting list and classVec
    """
    postingList = [['my','dog','has','flea','problems','help','please'],
                  ['maybe','not','take','him','to','dog','park','stupid'],
                  ['my','dalmation','is','so','cute','I','love','him'],
                  ['stop','posting','stupid','worthless','grabage'],
                  ['mr','licks','ate','my','steak','how','to','stop','him'],
                  ['quit','buying','worthless','dog','food','stupid']]
    classVec = np.array([0,1,0,1,0,1]) # 1 is absive,0 not
    
    return postingList,classVec

In [65]:
postingList,classVec = loadDataSet()
postingList

[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
 ['stop', 'posting', 'stupid', 'worthless', 'grabage'],
 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]

In [66]:
def create_Vocabulary(postingList):
    
    Voc_list =set([])
    for words in postingList:
        Voc_list = Voc_list | set(words)
        
    return list(Voc_list)
    

In [67]:
Vocabulary = create_Vocabulary(postingList)

In [68]:
def words2Vec(Vocabulary,postingList):
    m,n = len(postingList),len(Vocabulary)
    Words_Vec = np.zeros((m,n))
    for i in range(m):
        for word in postingList[i]:
            try:
                index = Vocabulary.index(word)
                Words_Vec[i,index] = 1
            except:
                print('这个{}不在我们的词集中'.format(word))
                exit(0)
    return Words_Vec

In [69]:
Words_Vec = words2Vec(Vocabulary,postingList)
Words_Vec

array([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0.,
        0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1.,
        0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1.],
       [0., 0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [None]:
def Model(X,y,X_test,Vocabulary,gamma):
    
    uniuqe_y = np.unique(y)
    K = len(uniuqe_y)
    predict_ = []
    # 计算先验概率
    P_1 =( y.sum()+gamma) /(y.shape[0]+K*gamma)
    P_0 = 1 - P_1
    P_y = {0:P_0,1:P_1}
    test_x = words2Vec(Vocabulary,X_test)
    
    # 计算条件概率
    for label in uniuqe_y:
        pri_P_y = P_y[label]
        Index = np.where(y==label)[0]
        split_X = X[Index]
        Sj = 2
        cond_fenmu = split_X.shape[0] + (Sj * gamma)
        
        sum_feature = np.sum(test_x == split_X,axis=0) + gamma
        cond_prob = np.product(sum_feature / cond_fenmu) 
        
        houyan = pri_P_y * cond_prob
        predict_.append([label,houyan])
    print(predict_)
    predict = sorted(predict_,key=lambda z:z[1],reverse=True)[0][0]
    return predict
        

In [70]:
from sklearn.model_selection import train_test_split
Words_Vec_train,Words_Vec_test,y_train,y_test = train_test_split(Words_Vec,y,test_size = 0.2,random_state=0)

In [71]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(Words_Vec, y)

GaussianNB(priors=None)

In [72]:
clf.score(Words_Vec_train, y_train)

1.0

In [73]:
clf.score(Words_Vec_test, y_test)

1.0