# 从疝气病症预测病马的死亡率

1. 收集数据：给定数据文件。
2. 准备数据：用python解析文本文件并填充缺失值。
3. 分析数据：可视化并观察数据。
4. 训练算法：使用优化算法，找到最佳的系数。
5. 测试算法：为了量化回归的效果，需要观察错误率。根据错误率决定是否回退到训练阶段，通过改变迭代的次数和步长等参数来得到更好的回归系数。
6. 使用算法：实现一个简单的命令行程序来收集马的症状并输出预测结果。

In [1]:
import numpy as np

In [2]:
# sigmoid函数
def sigmoid(inX):
    return 1.0 / (1+ np.exp(-inX))

In [3]:
# 改进的随机梯度上升算法
def stocGradAscent1(dataMatrix, classLabels, numIter = 150):
    import random
    m, n = np.shape(dataMatrix)
    weights = np.ones(n)
    for j in range(numIter):
        dataIndex = range(m)
        for i in range(m):
            # alpha每次迭代的时候都会调整，会缓解数据波动或者高频波动
            alpha = 4 / (1.0 + j + i) + 0.01
            # 随机选取更新，将减少周期性的波动
            randIndex = int(random.uniform(0, len(dataIndex)))
            h = sigmoid(sum(dataMatrix[randIndex] * weights))
            error = classLabels[randIndex] - h
            weights = weights + alpha * error * dataMatrix[randIndex]
            del(list(dataIndex)[randIndex])
    return weights

## Logistic回归分类函数

In [4]:
def classifyVector(inX, weights):
    prob = sigmoid(sum(inX * weights))
    if prob > 0.5:
        return 1.0
    else:
        return 0.0

In [5]:
def colicTest():
    # 创建数据集
    frTrain = open('horseColicTraining.txt')
    frTest = open('horseColicTest.txt')
    trainingSet = []; trainingLabels = []
    # 训练算法
    for line in frTrain.readlines():
        currLine = line.strip().split('\t')
        lineArr = []
        # 21个特征
        for i in range(21):
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        # 最后一列为分类
        trainingLabels.append(float(currLine[21]))
    # 计算参数，迭代500次
    trainWeights = stocGradAscent1(np.array(trainingSet), trainingLabels, 500)
    # 测试算法，观察错误率
    errorCount = 0; numTestVec = 0.0
    for line in frTest.readlines():
        numTestVec += 1.0
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        if int(classifyVector(np.array(lineArr), trainWeights)) != int(currLine[21]):
            errorCount += 1
    errorRate = float(errorCount) / numTestVec
    print('the error rate of this test is: %f' % errorRate)
    return errorRate

In [6]:
# 调用函数colicTest() 10次并求结果平均值
def multiTest():
    numTests = 10; errorSum = 0.0
    for k in range(numTests):
        errorSum += colicTest()
    print('after %d iterations the average error rate is: %f'
         % (numTests, errorSum/ float(numTests)))

In [7]:
multiTest()

  This is separate from the ipykernel package so we can avoid doing imports until


the error rate of this test is: 0.358209
the error rate of this test is: 0.268657
the error rate of this test is: 0.268657
the error rate of this test is: 0.298507
the error rate of this test is: 0.298507
the error rate of this test is: 0.283582
the error rate of this test is: 0.358209
the error rate of this test is: 0.402985
the error rate of this test is: 0.358209
the error rate of this test is: 0.238806
after 10 iterations the average error rate is: 0.313433
