In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import random

# Load the data set
def loadDataSet(fileName='./testSet.txt'):
    """Read a whitespace-delimited, two-feature data set from a text file.

    Every non-blank line must contain two float features followed by an
    integer class label. A constant 1.0 is prepended to each sample so that
    the first weight of a linear model acts as the intercept.

    Args:
        fileName: Path of the file to read. Defaults to './testSet.txt',
            the path this function previously hard-coded.

    Returns:
        A tuple (dataMat, labelMat): a list of [1.0, x1, x2] rows and a
        parallel list of int labels.

    Raises:
        OSError: If the file cannot be opened.
        ValueError / IndexError: If a non-blank line is malformed.
    """
    dataMat = []
    labelMat = []
    # The context manager closes the file even when a line fails to parse.
    with open(fileName) as fr:
        for line in fr:
            lineData = line.strip().split()
            if not lineData:
                continue  # tolerate blank lines, e.g. a trailing newline
            dataMat.append([1.0, float(lineData[0]), float(lineData[1])])
            labelMat.append(int(lineData[2]))
    return dataMat, labelMat


# Sigmoid (logistic) function
def sigmoid(inX):
    """Evaluate the logistic function 1 / (1 + e^(-inX)).

    Works element-wise on NumPy arrays and matrices as well as on plain
    numeric scalars; the result has the same shape as the input.
    """
    denominator = 1 + np.exp(-inX)
    return 1.0 / denominator

In [2]:
# Batch gradient ascent
def gradAscent(dataMatIn, classLabels, maxCycles, alpha=0.001):
    """Fit logistic-regression weights with full-batch gradient ascent.

    Args:
        dataMatIn: 2-D array-like of samples, one row per sample.
        classLabels: Sequence of labels, one per row of dataMatIn.
        maxCycles: Number of full-batch update steps to perform.
        alpha: Step size. Defaults to 0.001, the value that used to be
            hard-coded inside the function.

    Returns:
        The weights as an (n, 1) column, where n is the number of columns of
        dataMatIn. After at least one update this is an np.matrix; with
        maxCycles == 0 the initial ndarray of ones is returned unchanged.
    """
    # np.asmatrix is used instead of np.mat, which NumPy 2.0 removed.
    dataMatrix = np.asmatrix(dataMatIn)
    # Labels arrive as a flat sequence; transpose the row vector into a column.
    labelsMatrix = np.asmatrix(classLabels).transpose()
    n = np.shape(dataMatrix)[1]

    W = np.ones((n, 1))
    for _ in range(maxCycles):
        h = sigmoid(dataMatrix * W)   # (m, n) * (n, 1) -> (m, 1) predictions
        error = labelsMatrix - h      # (m, 1) residuals
        W = W + alpha * dataMatrix.transpose() * error   # (n, m) * (m, 1)

    return W

# Improved stochastic gradient ascent
def stocGradAscent1(dataMatrixIn, classLabels, numIter=150):
    """Fit logistic-regression weights with stochastic gradient ascent.

    Each of the numIter passes visits every sample exactly once, in a random
    order, updating the weights after each sample.

    Args:
        dataMatrixIn: 2-D array-like of samples, one row per sample.
        classLabels: Indexable sequence of labels, one per sample.
        numIter: Number of passes over the data.

    Returns:
        The weights as an (n, 1) np.matrix, n being the number of features.
    """
    dataMatrix = np.array(dataMatrixIn)
    m, n = np.shape(dataMatrix)
    weights = np.ones(n)   # initialize to all ones
    for j in range(numIter):
        dataIndex = list(range(m))   # samples not yet visited in this pass
        for i in range(m):
            # The step size shrinks as j and i grow, but the constant term
            # keeps it from ever reaching 0.
            alpha = 4.0 / (1.0 + j + i) + 0.01
            # Draw a position in the list of remaining samples and translate
            # it to the real sample index. The previous code used the
            # position itself as the sample index, so the bookkeeping in
            # dataIndex had no effect on which rows were actually used.
            pick = random.randrange(len(dataIndex))
            sampleIndex = dataIndex[pick]
            h = sigmoid(np.dot(dataMatrix[sampleIndex], weights))
            error = classLabels[sampleIndex] - h
            weights = weights + alpha * error * dataMatrix[sampleIndex]
            del dataIndex[pick]
    # np.asmatrix is used instead of np.mat, which NumPy 2.0 removed.
    return np.asmatrix(weights.reshape(n, 1))


In [3]:
def plotBestFit(weights, dataMat, labelMat):
    """Scatter both classes and overlay the fitted linear decision boundary.

    Args:
        weights: The three coefficients (w0, w1, w2) in any layout that
            flattens to length 3, e.g. a (3, 1) matrix or a 1-D array.
        dataMat: 2-D array-like whose columns 1 and 2 hold the plotted
            features (column 0 is not drawn).
        labelMat: Labels, one per row of dataMat. Rows labelled 1 are drawn
            as red squares; every other row is drawn in green.
    """
    dataArr = np.array(dataMat)
    n = np.shape(dataArr)[0]
    # Only the first n labels are consulted, one per data row.
    labels = np.asarray(labelMat).ravel()[:n]
    isClassOne = labels == 1

    # Flatten the weights so the boundary formula works on plain scalars no
    # matter whether a matrix column or a 1-D array was passed in.
    w = np.asarray(weights, dtype=float).ravel()
    x = np.arange(-4.0, 4.0, 0.1)
    # Boundary where w0 + w1 * x1 + w2 * x2 == 0, solved for x2.
    y = (-w[0] - w[1] * x) / w[2]

    fig, ax = plt.subplots()
    ax.scatter(dataArr[isClassOne, 1], dataArr[isClassOne, 2],
               s=30, c='red', marker='s')
    ax.scatter(dataArr[~isClassOne, 1], dataArr[~isClassOne, 2],
               s=30, c='green')
    ax.plot(x, y)
    ax.set_xlabel('X1')
    ax.set_ylabel('X2')
    plt.show()

In [4]:
# Disabled demo: the statements below sit inside a string literal, so they are
# not executed; the notebook only echoes the string back as the cell's value.
'''
dataSet, labels = loadDataSet()
weights = stocGradAscent1(dataSet, labels, 200)
plotBestFit(weights, dataSet, labels)
'''

'\ndataSet, labels = loadDataSet()\nweights = stocGradAscent1(dataSet, labels, 200)\nplotBestFit(weights, dataSet, labels)\n'

In [5]:
# Prediction
def classifyVector(inX, weights):
    """Classify one sample by thresholding its predicted probability.

    Returns 1.0 when sigmoid(sum(inX * weights)) is strictly greater than
    0.5, and 0.0 otherwise.
    """
    score = sum(inX * weights)
    return 1.0 if sigmoid(score) > 0.5 else 0.0
    

In [6]:
# Train on the training set, then evaluate on the test set
def colicTest():
    """Train on horseColicTraining.txt and score on horseColicTest.txt.

    Both files are read as tab-separated text in which the first 21 columns
    are float features and the 22nd column is the class label. Blank lines
    are ignored.

    Returns:
        The fraction of test rows that were misclassified, as a float. The
        same value is also printed.
    """
    numFeatures = 21

    def parseLine(line):
        """Split one tab-separated row into (float features, raw label text)."""
        fields = line.strip().split('\t')
        features = [float(fields[i]) for i in range(numFeatures)]
        return features, fields[numFeatures]

    # Open both files up front so a missing test file is reported before the
    # training run starts, and close both as soon as they have been read.
    with open('horseColicTraining.txt') as trainFile, \
            open('horseColicTest.txt') as testFile:
        trainLines = [line for line in trainFile if line.strip()]
        testLines = [line for line in testFile if line.strip()]

    trainingSet = []
    trainingLabels = []
    for line in trainLines:
        features, label = parseLine(line)
        trainingSet.append(features)
        trainingLabels.append(float(label))

    # Start training
    weights = stocGradAscent1(trainingSet, trainingLabels, 400)

    errorCount = 0.0
    numTestVec = 0.0
    for line in testLines:
        numTestVec += 1.0
        features, label = parseLine(line)
        # int(float(...)) accepts labels written either as "1" or "1.000000".
        if int(classifyVector(np.array(features), weights)) != int(float(label)):
            errorCount += 1.0
    errorRate = errorCount / float(numTestVec)
    print("the error rate is:%f" % errorRate)
    return errorRate


# Average the error rate over several runs
def multiTest():
    """Call colicTest ten times and print the mean of the returned error rates."""
    testTimes = 10
    errorRates = [colicTest() for _ in range(testTimes)]
    print("the average error rate is:%f" % (sum(errorRates) / float(testTimes)))
    
    
# Run the repeated evaluation; every round retrains the model and prints its
# own error rate before the average is printed.
multiTest()



the error rate is:0.268657
the error rate is:0.417910
the error rate is:0.522388
the error rate is:0.298507
the error rate is:0.373134
the error rate is:0.343284
the error rate is:0.238806
the error rate is:0.417910
the error rate is:0.358209
the error rate is:0.388060
the average error rate is:0.362687
