# In [22]:  (Jupyter notebook cell prompt, kept as a comment so the file parses as Python)
from numpy import *
import operator
from os import listdir   #列出给定目录的文件名

def createDataSet():
    """Return a tiny hard-coded sample set for trying out the classifier.

    Returns:
        A tuple ``(group, labels)``: ``group`` is a 4x2 array of points and
        ``labels`` is the list of class names, one per row of ``group``.
    """
    # Each record pairs a 2-D point with the class it belongs to.
    samples = (
        ([1.0, 1.1], 'A'),
        ([1.0, 1.0], 'A'),
        ([0, 0], 'B'),
        ([0, 0.1], 'B'),
    )
    group = array([point for point, _ in samples])
    labels = [tag for _, tag in samples]
    return group, labels

## kNN分类方法
def classify(unknownX, dataSet, labels, k):
    """Classify ``unknownX`` by majority vote among its k nearest samples.

    Distances are Euclidean. When several classes share the highest vote
    count, the class whose member is closest to ``unknownX`` wins, because
    votes are recorded in order of increasing distance and the sort below
    is stable.

    Args:
        unknownX: Feature vector to classify (1-D, or 2-D with a single row).
        dataSet: 2-D array of known samples, one sample per row.
        labels: Sequence of class labels, one per row of ``dataSet``.
        k: Number of nearest neighbours that vote. Values larger than the
            number of samples are clamped to the number of samples.

    Returns:
        The label that occurs most often among the k nearest samples.

    Raises:
        ValueError: If ``dataSet`` has no rows or ``k`` is smaller than 1.
    """
    dataSet = asarray(dataSet)
    dataSetSize = dataSet.shape[0]
    if dataSetSize == 0:
        raise ValueError('dataSet must contain at least one sample')
    if k < 1:
        raise ValueError('k must be a positive integer, got %r' % (k,))
    # There cannot be more voters than samples.
    if k > dataSetSize:
        k = dataSetSize

    # Broadcasting subtracts unknownX from every row; no explicit tiling needed.
    diffMat = asarray(unknownX) - dataSet
    distance = (diffMat ** 2).sum(axis=1) ** 0.5

    # Indices of the samples ordered from nearest to farthest.
    sortDistanceIndicies = distance.argsort()

    # Tally the labels of the k nearest samples.
    classCount = {}
    for nearestIndex in sortDistanceIndicies[:k]:
        validLabel = labels[nearestIndex]
        classCount[validLabel] = classCount.get(validLabel, 0) + 1

    sortClassCount = sorted(classCount.items(),
                            key=operator.itemgetter(1), reverse=True)
    return sortClassCount[0][0]

## txt文件转为矩阵
def file2matrix(filename):
    """Load a tab-separated sample file into a feature matrix and label list.

    Every non-blank line is split on tab characters. The first three fields
    become one row of the feature matrix and the last field is parsed as the
    integer class label. Blank lines are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, labelVector)``: ``returnMat`` is an ``(n, 3)``
        float array and ``labelVector`` is a list of ``n`` ints, where ``n``
        is the number of non-blank lines.

    Raises:
        ValueError: If a feature field is not a number or the last field is
            not an integer.
    """
    # The context manager closes the file even if parsing fails later.
    with open(filename) as fr:
        stripped = [line.strip() for line in fr]
    # Blank lines (typically a trailing one) carry no sample.
    rows = [row for row in stripped if row]

    returnMat = zeros((len(rows), 3))
    labelVector = []
    for index, row in enumerate(rows):
        listFromLine = row.split('\t')
        returnMat[index, :] = [float(field) for field in listFromLine[0:3]]
        labelVector.append(int(listFromLine[-1]))
    return returnMat, labelVector

##特征值归一化 
#将所有数字特征值转化为0到1之间
#2018.09.26
#edited by:Qingping Zheng
    
def autoNorm(dataSet):
    """Rescale every feature column of ``dataSet`` to the range [0, 1].

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of 0; such a column is
    mapped to all zeros instead of NaN.

    Args:
        dataSet: 2-D array with one sample per row and one feature per column.

    Returns:
        A tuple ``(normDataSet, ranges, minVal)``: the normalised array, the
        per-column ``max - min`` (0 for constant columns) and the per-column
        minimum.
    """
    minVal = dataSet.min(0)
    maxVal = dataSet.max(0)
    ranges = maxVal - minVal
    # Dividing by 1 where the range is 0 keeps constant columns at 0 rather
    # than producing 0/0 = NaN, which would corrupt every later distance.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column shift and scale to every row.
    normDataSet = (dataSet - minVal) / safeRanges
    return normDataSet, ranges, minVal

##分类器测试
#用于测试分类的正确率

def datingClassTest(dataSetName, k, hoRatio):
    """Measure the classifier's error rate on a hold-out split of a data file.

    The file is loaded with ``file2matrix`` and normalised with ``autoNorm``.
    The first ``int(m * hoRatio)`` rows are used as test samples and the
    remaining rows as training samples. Each prediction and the final error
    rate are printed.

    Args:
        dataSetName: Path of the data file passed to ``file2matrix``.
        k: Number of neighbours passed to ``classify``.
        hoRatio: Fraction of the rows held out for testing.

    Raises:
        ValueError: If the split leaves no test rows or no training rows.
    """
    datingDataMat, datingLabels = file2matrix(dataSetName)
    normDatingDataMat, _, _ = autoNorm(datingDataMat)
    m = normDatingDataMat.shape[0]
    numTestVecs = int(m * hoRatio)  # number of rows held out for testing
    # Without this check an empty test set ends in a division by zero and an
    # empty training set fails deep inside classify().
    if not 0 < numTestVecs < m:
        raise ValueError(
            'hoRatio=%r selects %d of %d samples for testing; at least one '
            'test sample and one training sample are required'
            % (hoRatio, numTestVecs, m))

    # The training rows do not change between iterations, so slice them once.
    trainMat = normDatingDataMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]

    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify(normDatingDataMat[i, :], trainMat,
                                    trainLabels, k)
        print('No. %d. the classifier came back with: %d, the real answer is: %d'
              % (i, classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print('the total error rate is:%f' % (errorCount / float(numTestVecs)))
    
##约会对象预测函数

def classifyPerson(trainDataSetName):
    """Interactively predict how much the user will like a person.

    Prompts for three numeric features, normalises them with the minimum and
    range of the training data, classifies them with k = 3 and prints the
    matching description.

    Args:
        trainDataSetName: Path of the training data file passed to
            ``file2matrix``.

    Raises:
        ValueError: If an answer typed at a prompt is not a number.
    """
    # Descriptions indexed by (label - 1).
    # NOTE(review): assumes the labels in the training file are 1, 2 and 3 - confirm.
    resultList = ['not at all', 'in some doses', 'in large doses']
    percentTats = float(input('percentage of time spent playing video games?'))
    flyMiles = float(input('frequent flier miles earned per year?'))
    iceCream = float(input('liters of icecream consumed per year?'))
    datingDataMat, datingLabels = file2matrix(trainDataSetName)
    normMat, ranges, minVal = autoNorm(datingDataMat)
    # NOTE(review): this feature order is presumed to match the column order
    # of the training file - verify against the data.
    inArray = array([flyMiles, percentTats, iceCream])
    # Scale the query the same way the training rows were scaled.
    normArray = (inArray - minVal) / ranges
    classifierResult = classify(normArray, normMat, datingLabels, 3)
    print('you will probably like this person:', resultList[classifierResult - 1])
    
##图像转换为一个向量
def img2vector(fileName):
    """Flatten a 32x32 text image of digit characters into a 1x1024 vector.

    The first 32 lines of the file are read. The first 32 characters of each
    line (after stripping surrounding whitespace) are converted to integers
    and stored row after row. Lines beyond the 32nd, such as a trailing blank
    line, are ignored. If the file has fewer than 32 lines, the remaining
    entries stay 0.

    Args:
        fileName: Path of the text image file.

    Returns:
        A float array of shape ``(1, 1024)``.

    Raises:
        IndexError: If one of the lines read has fewer than 32 characters.
        ValueError: If a character is not a decimal digit.
    """
    side = 32
    returnVector = zeros((1, side * side))
    # The context manager closes the file as soon as the rows are read.
    with open(fileName) as fileRead:
        rows = fileRead.readlines()[:side]
    for rowIndex, row in enumerate(rows):
        row = row.strip()
        for colIndex in range(side):
            returnVector[0, side * rowIndex + colIndex] = int(row[colIndex])
    return returnVector

##测试手写数字识别算法
def _digitLabelFromFileName(fileNameStr):
    """Return the integer class encoded before the underscore, e.g. 5 for '5_162.txt'."""
    fileStr = fileNameStr.split('.')[0]  # drop the extension: '5_162'
    return int(fileStr.split('_')[0])


def handWritingClassTest(k):
    """Evaluate the kNN classifier on the handwritten-digit text images.

    Every file in ``digits/trainingDigits`` becomes one training row and
    every file in ``digits/testDigits`` is classified against those rows.
    The expected class of a file is the integer before the underscore in its
    name. Each prediction, the set sizes, the number of errors and the error
    rate are printed.

    Args:
        k: Number of neighbours passed to ``classify``.

    Raises:
        ValueError: If ``digits/testDigits`` contains no files, or a file
            name does not start with an integer class.
    """
    trainFileList = listdir('digits/trainingDigits')
    m = len(trainFileList)
    trainMat = zeros((m, 1024))
    hwLabels = []
    for i, fileNameStr in enumerate(trainFileList):
        hwLabels.append(_digitLabelFromFileName(fileNameStr))
        trainMat[i, :] = img2vector('digits/trainingDigits/%s' % fileNameStr)

    testFileList = listdir('digits/testDigits')
    mTest = len(testFileList)
    # An empty test set would otherwise end in a division by zero below.
    if mTest == 0:
        raise ValueError("no test files found in 'digits/testDigits'")

    errorCount = 0.0
    for fileNameStr in testFileList:
        classNumStr = _digitLabelFromFileName(fileNameStr)
        vectorUnderTest = img2vector('digits/testDigits/%s' % fileNameStr)
        classifierResult = classify(vectorUnderTest, trainMat, hwLabels, k)
        print('the classifier came back with: %d, the real label is: %d' % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print('\nthe number of testSet：%d, the number of trainSet: %d' % (mTest, m))
    print('\nthe total number of errors is: %d' % errorCount)
    print('\nthe total error rate is: %f' % (errorCount / float(mTest)))