In [2]:
import numpy as np
def kappa(confusion_matrix):
    N = np.sum(confusion_matrix)
    sum_po = 0
    sum_pe = 0
    for i in range(len(confusion_matrix)):
        sum_po += confusion_matrix[i][i]
        i_row = np.sum(confusion_matrix[i,:])
        i_col = np.sum(confusion_matrix[:,i])
        sum_pe += i_col * i_row
    po = sum_po/N
    pe = sum_pe/(N*N)
    kia = (po-pe)/(1-pe)
    return kia

In [15]:
import numpy as np
def loadDataSet():
    dataSet = [['my','dog','has','flea','problems','help','please'],
                  ['maybe','not','take','him','to','dog','park','stupid'],
                  ['my','dalmation','is','so','cute','I','love','him'],
                  ['stop','posting','stupid','worthless','garbage'],
                  ['mr','licks','ate','my','steak','how','to','stop','him'],
                  ['quit','buying','worthless','dog','food','stupid']]
    classVec = [0,1,0,1,0,1]
    return dataSet,classVec
def createVocabList(dataSet):
    vacabSet = set([])
    for document in dataSet:
        vacabSet = vacabSet | set(document)
    vacabList = list(vacabSet)
    return vacabList

def setOfWords2Vec(vocabList,inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)]=1
    return returnVec

In [5]:
def trainNB(trainMatrix,classVec):
    num_docs = len(trainMatrix)
    num_words = len(trainMatrix[0])
    pAb = sum(classVec)/float(num_docs)
    p0Num = np.zeros(num_words)
    p0Denom = 0.0
    p1Num = np.zeros(num_words)
    p1Denom = 0.0
    for i in range(num_docs):
        if classVec[i]==1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p0V = p0Num/p0Denom
    p1V = p1Num/p1Denom
    return p0V,p1V,pAb


In [7]:
def classifyNB(vec2Classify,p0V,p1V,pAb):
    from functools import reduce
    p1 = reduce(lambda x,y:x*y,vec2Classify*p1V)*pAb
    p2 = reduce(lambda x,y:x*y,vec2Classify*p0V)*(1-pAb)
    if p1>p2:
        return 1
    else:
        return 0

In [12]:
def testingNB(testVec):
    dataSet,classVec = loadDataSet()
    vocabList = createVocabList(dataSet)
    trainMat = []
    for inputSet in dataSet:
        trainMat.append(setOfWords2Vec(vocabList,inputSet))
    p0V,p1V,pAb= trainNB(np.array(trainMat),np.array(classVec))
    thisDoc = np.array(setOfWords2Vec(vocabList,testVec))
    if classifyNB(thisDoc,p0V,p1V,pAb):
        print('侮辱')
    else:
        print('非侮辱')

In [16]:
testEntry = ['love', 'my', 'dalmation']
testingNB(testEntry)
# 测试文本2
testEntry = ['stupid', 'garbage']
testingNB(testEntry)

非侮辱
非侮辱


In [17]:
def trainNB(trainMatrix,classVec):
    num_docs = len(trainMatrix)
    num_words = len(trainMatrix[0])
    pAb = sum(classVec)/float(num_docs)
    p0Num = np.ones(num_words)
    p0Denom = 2.0
    p1Num = np.ones(num_words)
    p1Denom = 2.0
    for i in range(num_docs):
        if classVec[i]==1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p0V = np.log(p0Num/p0Denom)
    p1V = np.log(p1Num/p1Denom)
    return p0V,p1V,pAb
def classifyNB(vec2Classify,p0V,p1V,pAb):
    p1 = sum(vec2Classify*p1V)+np.log(pAb)
    p0 = sum(vec2Classify*p0v)+np.log(1-pAb)
    if p1>p0:
        return 1
    else:
        return 0

In [19]:
def textParse(bigString):
    import re
    listOfTokens = re.split(r'\W*',bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) >2]
def spanTest():
    docList = []
    classList = []
    for i in range(1,26):
        wordList = textParse(open('email/spam/%d.txt'% i).read())
        docList.append(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt'% i).read())
        docList.append(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainSet = list(np.arange(50))
    testSet = []
    for i in range(10):
        randIndex = int(np.random.uniform(0,len(trainSet)))
        testSet.append(trainSet[randIndex])
        del trainSet[randIndex]
    trainMat = []
    trainClass = []
    for docIndex in trainSet:
        trainMat.append(setOfWords2Vec(vocabList,docList[docIndex]))
        trainClass.append(classList[docIndex])
    p0V,p1V,pSpam = trainNB(np.array(trainMat),np.array(trainClass))
    errorCount = 0
    for docIndex in testSet:
        wordVect = setOfWords2Vec(vocabList,docList[docIndex])
        if classifyNB(wordVect,p0V,p1V,pSpam)!=classList[docIndex]:
            errorCount += 1
            print('classification error',docList[docIndex])
    print('the error rate is :',float(errorCount)/len(testSet))
    return float(errorCount)/len(testSet)
        

In [27]:
import numpy as np
import pandas as pd
import random
def loadDataSet():
    from sklearn import datasets
    iris_data = datasets.load_iris()
    dataSets = pd.DataFrame(iris_data.data)
    dataSets.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
    dataSets['类别']=iris_data.target
    return dataSets

In [23]:
def randSplit(dataSet,rate):
    l = list(dataSet.index)
    random.shuffle(l)
    dataSet.index = l
    m = dataSet.shape[0]
    m_train = int(rate*m)
    train_data = dataSet.loc[range(m_train),:]
    test_data = dataSet.loc[range(m_train,m),:]
    train_data.index = range(len(train_data))
    test_data.index = range(len(test_data))
    return train_data,test_data

In [25]:
def gnb_classify(train_data,test_data):
    labels = list(set(train_data.iloc[:,-1]))
    mean_list = []
    var_list = []
    for label in labels:
        curr_label_data = train_data.loc[train_data.iloc[:,-1]==label,:]
        m = curr_label_data.iloc[:,:-1].mean()
        v = np.sum((curr_label_data.iloc[:,:-1]-m)**2)/curr_label_data.shape[0]
        mean_list.append(m)
        var_list.append(v)
    mean_df = pd.DataFrame(mean_list,index=labels)
    var_df = pd.DataFrame(var_list,index=labels)
    result = []
    for j in range(test_data.shape[0]):
        curr_test = test_data.iloc[j,:-1].tolist()
        predict_prob = np.exp(-1*(curr_test-mean_df)**2/(2*var_df))/(np.sqrt(2*np.pi*var_df))
        prob = 1
        for k in range(test_data.shape[1]-1):
            prob *= predict_prob.iloc[:,k]
        cla = prob.index[np.argmax(prob.values)]
        result.append(cla)
    test_data['predict']=result
    acc = (test_data.iloc[:,-1]==test_data.iloc[:,-2]).mean()
    print('预测准确率：',acc)
    return test_data

In [28]:
dataSet = loadDataSet()
for i in range(10):
    train_data, test_data = randSplit(dataSet, 0.8)
    test_result = gnb_classify(train_data, test_data)

预测准确率： 0.8666666666666667
预测准确率： 0.9666666666666667
预测准确率： 0.9333333333333333
预测准确率： 0.9666666666666667
预测准确率： 0.9666666666666667
预测准确率： 0.9
预测准确率： 1.0
预测准确率： 0.9333333333333333
预测准确率： 0.8666666666666667
预测准确率： 0.9333333333333333


In [None]:
def gbn_classify_by_sklearn(train_data,test_data):
    from sklearn.naive_bayes import GaussianNB
    from sklearn.metrics import accuracy_score
    gnb_clf = GaussianNB()
    gnb_clf.fit(train_data.iloc[:,:-1],train_data.iloc[:,-1])
    predict_class = gnb_clf.predict(test_data.iloc[:,-1])
    test_data['predict']=list(predict_class)
    acc = accuracy_score(test_data.iloc[:,-1])