# 实验一

**KNN手写体识别**
1. 获取训练数据"trainSet"及测试数据"testSet"
    - 加载训练数据集，并将训练数据集向量化
    - 获取测试数据集，并将测试数据集向量化
    - 将32*32文本转化成向量：img2vector
2. 训练 KNN 分类器，定义 KNN 分类器：myKNN（）函数。
    - 计算训练集中每个样本与 testDigit 的欧氏距离；
    - 对所得距离求和取平方根；
    - 对距离排序；
    - 按距离的顺序读取标签，计算标签次数
    - 查找出现标签数量最多的类别，作为分类结果
3. 测试训练好的 KNN 分类器：myKNN。
    - 调用 myKNN()函数，将该函数用于测试样本
    - 输出分类结果

In [3]:
# 手写数字识别

import numpy as np
import os

class DigitRecoginze():
    
    def __init__(self):
        self.label = None
        self.train_set = None
        self.test_set = None

    def img2vector(self, filename):
    	# 将像素图像转化为向量
        # 图像像素为32*32
        image_vector = np.zeros((1, 1024))
        f = open(filename, 'r')
        for i in range(32):
            line = f.readline()
            for j in range(32):
                image_vector[0, 32*i + j] = int(line[j])
        
        return image_vector

    def import_data(self,filepath):
        # 导入数据
        data_list = os.listdir(filepath)
        data_list_number = len(data_list)
        
        # 导入label数据
        return_label = np.zeros((data_list_number, 1))
        for i in range(data_list_number):
            return_label[i] = (data_list[i].strip().split('_'))[0]
        # 导入data数据
        return_data_set = np.zeros((data_list_number, 1024))
        for i in range(data_list_number):
            return_data_set[i] = self.img2vector(os.path.join(filepath, data_list[i]))

        return return_data_set, return_label

    
    def train_set_normalize(self, train_set):
        # 归一化
        data_range = np.max(train_set) - np.min(train_set)
        return (train_set - np.min(train_set)) / data_range

    def single_train(self, train_set, testcase_x, train_label, k = 5):
       
        # 计算距离
        train_set_size = train_set.shape[0]
        diff_mat = np.tile(testcase_x, (train_set_size, 1)) - train_set
        distances = (diff_mat**2).sum(axis=1)**0.5
        # print(distances)
        
        # 排序，这里排序结果表示他的排序位置
        distances_sorted = distances.argsort()

        class_result = {}

		# 找出k个点
        for i in range(k):
            now_label = int(train_label[distances_sorted[i]][0])
            # print(now_label)
            class_result[now_label] = class_result.get(now_label, 0)+ 1
        
        # 找出最近最多的点
        max_num = 0
        result_label = 0
        for single_result in class_result:
            if class_result[single_result] > max_num:
                max_num = class_result[single_result]
                result_label = single_result
        return result_label

    # 训练
    def myKNN(self, train_set_filepath, test_set_filepath, k = 5):
        # 导入数据
        train_set, train_label = self.import_data(train_set_filepath)
        train_set = self.train_set_normalize(train_set)
        test_set, test_label = self.import_data(test_set_filepath)
        test_set = self.train_set_normalize(test_set)
        
        error_number = 0
        all_number = test_set.shape[0]
        # 对于每一个测试样本进行测试
        for i in range(all_number):
            result_label = self.single_train(train_set, test_set[i,:], train_label, k)
            if result_label != int(test_label[i][0]):
                error_number = error_number + 1
        
            print("testcase %d: knn send back %d, the real class is %d" %(i, result_label, int(test_label[i][0])))
        print("error ratio = %f" %(float(error_number)/float(all_number)))

# 数据位置修改为自己的
FILE_PATH_TEST = r'HWdigits\testSet'
FILE_PATH_TRAIN = r'HWdigits\trainSet'
_dr = DigitRecoginze()
_dr.myKNN(FILE_PATH_TRAIN,FILE_PATH_TEST)

testcase 0: knn send back 0, the real class is 0
testcase 1: knn send back 0, the real class is 0
testcase 2: knn send back 0, the real class is 0
testcase 3: knn send back 0, the real class is 0
testcase 4: knn send back 0, the real class is 0
testcase 5: knn send back 0, the real class is 0
testcase 6: knn send back 0, the real class is 0
testcase 7: knn send back 0, the real class is 0
testcase 8: knn send back 0, the real class is 0
testcase 9: knn send back 0, the real class is 0
testcase 10: knn send back 0, the real class is 0
testcase 11: knn send back 0, the real class is 0
testcase 12: knn send back 0, the real class is 0
testcase 13: knn send back 0, the real class is 0
testcase 14: knn send back 0, the real class is 0
testcase 15: knn send back 0, the real class is 0
testcase 16: knn send back 0, the real class is 0
testcase 17: knn send back 0, the real class is 0
testcase 18: knn send back 0, the real class is 0
testcase 19: knn send back 0, the real class is 0
testcase 2

testcase 171: knn send back 3, the real class is 3
testcase 172: knn send back 3, the real class is 3
testcase 173: knn send back 3, the real class is 3
testcase 174: knn send back 3, the real class is 3
testcase 175: knn send back 3, the real class is 3
testcase 176: knn send back 3, the real class is 3
testcase 177: knn send back 3, the real class is 3
testcase 178: knn send back 3, the real class is 3
testcase 179: knn send back 3, the real class is 3
testcase 180: knn send back 3, the real class is 3
testcase 181: knn send back 3, the real class is 3
testcase 182: knn send back 8, the real class is 3
testcase 183: knn send back 2, the real class is 3
testcase 184: knn send back 3, the real class is 3
testcase 185: knn send back 3, the real class is 3
testcase 186: knn send back 3, the real class is 3
testcase 187: knn send back 3, the real class is 3
testcase 188: knn send back 3, the real class is 3
testcase 189: knn send back 8, the real class is 3
testcase 190: knn send back 3, 

testcase 342: knn send back 6, the real class is 6
testcase 343: knn send back 6, the real class is 6
testcase 344: knn send back 6, the real class is 6
testcase 345: knn send back 6, the real class is 6
testcase 346: knn send back 6, the real class is 6
testcase 347: knn send back 6, the real class is 6
testcase 348: knn send back 6, the real class is 6
testcase 349: knn send back 6, the real class is 6
testcase 350: knn send back 6, the real class is 6
testcase 351: knn send back 6, the real class is 6
testcase 352: knn send back 6, the real class is 6
testcase 353: knn send back 6, the real class is 6
testcase 354: knn send back 6, the real class is 6
testcase 355: knn send back 6, the real class is 6
testcase 356: knn send back 6, the real class is 6
testcase 357: knn send back 6, the real class is 6
testcase 358: knn send back 6, the real class is 6
testcase 359: knn send back 6, the real class is 6
testcase 360: knn send back 6, the real class is 6
testcase 361: knn send back 6, 

testcase 510: knn send back 9, the real class is 9
testcase 511: knn send back 9, the real class is 9
testcase 512: knn send back 9, the real class is 9
testcase 513: knn send back 9, the real class is 9
testcase 514: knn send back 9, the real class is 9
testcase 515: knn send back 9, the real class is 9
testcase 516: knn send back 9, the real class is 9
testcase 517: knn send back 9, the real class is 9
testcase 518: knn send back 9, the real class is 9
testcase 519: knn send back 9, the real class is 9
testcase 520: knn send back 9, the real class is 9
testcase 521: knn send back 9, the real class is 9
testcase 522: knn send back 9, the real class is 9
testcase 523: knn send back 9, the real class is 9
testcase 524: knn send back 9, the real class is 9
testcase 525: knn send back 9, the real class is 9
testcase 526: knn send back 9, the real class is 9
testcase 527: knn send back 9, the real class is 9
testcase 528: knn send back 9, the real class is 9
testcase 529: knn send back 9, 

In [None]:
#!/usr/bin/env python
# coding: utf-8
'''
Created on Sep 16, 2010
Update  on 2017-05-18
Author: Peter Harrington/羊三/小瑶
GitHub: https://github.com/apachecn/AiLearning
'''
from __future__ import print_function
from numpy import *
# 导入科学计算包numpy和运算符模块operator
import operator
from os import listdir
from collections import Counter


def createDataSet():
    """
    创建数据集和标签

     调用方式
     import kNN
     group, labels = kNN.createDataSet()
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


def classify0(inX, dataSet, labels, k):
    """
    inx[1,2,3]
    DS=[[1,2,3],[1,2,0]]
    inX: 用于分类的输入向量
    dataSet: 输入的训练样本集
    labels: 标签向量
    k: 选择最近邻居的数目
    注意: labels元素数目和dataSet行数相同；程序使用欧式距离公式.

    预测数据所在分类可在输入下列命令
    kNN.classify0([0,0], group, labels, 3)
    """

    # -----------实现 classify0() 方法的第一种方式----------------------------------------------------------------------------------------------------------------------------
    # 1. 距离计算
    dataSetSize = dataSet.shape[0]
    # tile生成和训练样本对应的矩阵，并与训练样本求差
    """
    tile: 列-3表示复制的行数， 行-1／2表示对inx的重复的次数

    In [8]: tile(inx, (3, 1))
    Out[8]:
    array([[1, 2, 3],
        [1, 2, 3],
        [1, 2, 3]])

    In [9]: tile(inx, (3, 2))
    Out[9]:
    array([[1, 2, 3, 1, 2, 3],
        [1, 2, 3, 1, 2, 3],
        [1, 2, 3, 1, 2, 3]])
    """
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    """
    欧氏距离:  点到点之间的距离
       第一行:  同一个点 到 dataSet的第一个点的距离。
       第二行:  同一个点 到 dataSet的第二个点的距离。
       ...
       第N行:  同一个点 到 dataSet的第N个点的距离。

    [[1,2,3],[1,2,3]]-[[1,2,3],[1,2,0]]
    (A1-A2)^2+(B1-B2)^2+(c1-c2)^2
    """
    # 取平方
    sqDiffMat = diffMat ** 2
    # 将矩阵的每一行相加
    sqDistances = sqDiffMat.sum(axis=1)
    # 开方
    distances = sqDistances ** 0.5
    # 根据距离排序从小到大的排序，返回对应的索引位置
    # argsort() 是将x中的元素从小到大排列，提取其对应的index（索引），然后输出到y。
    # 例如: y=array([3,0,2,1,4,5]) 则，x[3]=-1最小，所以y[0]=3;x[5]=9最大，所以y[5]=5。
    # print 'distances=', distances
    sortedDistIndicies = distances.argsort()
    # print 'distances.argsort()=', sortedDistIndicies

    # 2. 选择距离最小的k个点
    classCount = {}
    for i in range(k):
        # 找到该样本的类型
        voteIlabel = labels[sortedDistIndicies[i]]
        # 在字典中将该类型加一
        # 字典的get方法
        # 如: list.get(k,d) 其中 get相当于一条if...else...语句,参数k在字典中，字典将返回list[k];如果参数k不在字典中则返回参数d,如果K在字典中则返回k对应的value值
        # l = {5:2,3:4}
        # print l.get(3,0)返回的值是4；
        # Print l.get（1,0）返回值是0；
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # 3. 排序并返回出现最多的那个类型
    # 字典的 items() 方法，以列表返回可遍历的(键，值)元组数组。
    # 例如: dict = {'Name': 'Zara', 'Age': 7}   print "Value : %s" %  dict.items()   Value : [('Age', 7), ('Name', 'Zara')]
    # sorted 中的第2个参数 key=operator.itemgetter(1) 这个参数的意思是先比较第几个元素
    # 例如: a=[('b',2),('a',1),('c',0)]  b=sorted(a,key=operator.itemgetter(1)) >>>b=[('c',0),('a',1),('b',2)] 可以看到排序是按照后边的0,1,2进行排序的，而不是a,b,c
    # b=sorted(a,key=operator.itemgetter(0)) >>>b=[('a',1),('b',2),('c',0)] 这次比较的是前边的a,b,c而不是0,1,2
    # b=sorted(a,key=opertator.itemgetter(1,0)) >>>b=[('c',0),('a',1),('b',2)] 这个是先比较第2个元素，然后对第一个元素进行排序，形成多级排序。
    # sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    # return sortedClassCount[0][0]
    # 3.利用max函数直接返回字典中value最大的key
    maxClassCount = max(classCount, key=classCount.get)
    return maxClassCount
    
    # ------------------------------------------------------------------------------------------------------------------------------------------
    # 实现 classify0() 方法的第二种方式

    # """
    # 1. 计算距离
    
    # 欧氏距离:  点到点之间的距离
    #    第一行:  同一个点 到 dataSet的第一个点的距离。
    #    第二行:  同一个点 到 dataSet的第二个点的距离。
    #    ...
    #    第N行:  同一个点 到 dataSet的第N个点的距离。

    # [[1,2,3],[1,2,3]]-[[1,2,3],[1,2,0]]
    # (A1-A2)^2+(B1-B2)^2+(c1-c2)^2
    
    # inx - dataset 使用了numpy broadcasting，见 https://docs.scipy.org/doc/numpy-1.13.0/user/basics.broadcasting.html
    # np.sum() 函数的使用见 https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.sum.html
    # """
	#   dist = np.sum((inx - dataset)**2, axis=1)**0.5
    
    # """
    # 2. k个最近的标签
    
    # 对距离排序使用numpy中的argsort函数， 见 https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.sort.html#numpy.sort
    # 函数返回的是索引，因此取前k个索引使用[0 : k]
    # 将这k个标签存在列表k_labels中
    # """
    # k_labels = [labels[index] for index in dist.argsort()[0 : k]]
	# """
    # 3. 出现次数最多的标签即为最终类别
    
    # 使用collections.Counter可以统计各个标签的出现次数，most_common返回出现次数最多的标签tuple，例如[('lable1', 2)]，因此[0][0]可以取出标签值
	# """
    # label = Counter(k_labels).most_common(1)[0][0]
    # return label

    # ------------------------------------------------------------------------------------------------------------------------------------------


def test1():
    """
    第一个例子演示
    """
    group, labels = createDataSet()
    print(str(group))
    print(str(labels))
    print(classify0([0.1, 0.1], group, labels, 3))


# ----------------------------------------------------------------------------------------
def file2matrix(filename):
    """
    导入训练数据
    :param filename: 数据文件路径
    :return: 数据矩阵returnMat和对应的类别classLabelVector
    """
    fr = open(filename)
    # 获得文件中的数据行的行数
    numberOfLines = len(fr.readlines())
    # 生成对应的空矩阵
    # 例如: zeros(2，3)就是生成一个 2*3的矩阵，各个位置上全是 0 
    returnMat = zeros((numberOfLines, 3))  # prepare matrix to return
    classLabelVector = []  # prepare labels return
    fr = open(filename)
    index = 0
    for line in fr.readlines():
        # str.strip([chars]) --返回移除字符串头尾指定的字符生成的新字符串
        line = line.strip()
        # 以 '\t' 切割字符串
        listFromLine = line.split('\t')
        # 每列的属性数据
        returnMat[index, :] = listFromLine[0:3]
        # 每列的类别数据，就是 label 标签数据
        classLabelVector.append(int(listFromLine[-1]))
        index += 1
    # 返回数据矩阵returnMat和对应的类别classLabelVector
    return returnMat, classLabelVector


def autoNorm(dataSet):
    """
    归一化特征值，消除属性之间量级不同导致的影响
    :param dataSet: 数据集
    :return: 归一化后的数据集normDataSet,ranges和minVals即最小值与范围，并没有用到

    归一化公式: 
        Y = (X-Xmin)/(Xmax-Xmin)
        其中的 min 和 max 分别是数据集中的最小特征值和最大特征值。该函数可以自动将数字特征值转化为0到1的区间。
    """
    # 计算每种属性的最大值、最小值、范围
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    # 极差
    ranges = maxVals - minVals
    # -------第一种实现方式---start-------------------------
    normDataSet = zeros(shape(dataSet))
    m = dataSet.shape[0]
    # 生成与最小值之差组成的矩阵
    normDataSet = dataSet - tile(minVals, (m, 1))
    # 将最小值之差除以范围组成矩阵
    normDataSet = normDataSet / tile(ranges, (m, 1))  # element wise divide
    # -------第一种实现方式---end---------------------------------------------
    
    # # -------第二种实现方式---start---------------------------------------
    # norm_dataset = (dataset - minvalue) / ranges
    # # -------第二种实现方式---end---------------------------------------------
    return normDataSet, ranges, minVals


def datingClassTest():
    """
    对约会网站的测试方法
    :return: 错误数
    """
    # 设置测试数据的的一个比例（训练数据集比例=1-hoRatio）
    hoRatio = 0.1  # 测试范围,一部分测试一部分作为样本
    # 从文件中加载数据
    datingDataMat, datingLabels = file2matrix('data/2.KNN/datingTestSet2.txt')  # load data setfrom file
    # 归一化数据
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # m 表示数据的行数，即矩阵的第一维
    m = normMat.shape[0]
    # 设置测试的样本数量， numTestVecs:m表示训练样本的数量
    numTestVecs = int(m * hoRatio)
    print('numTestVecs=', numTestVecs)
    errorCount = 0.0
    for i in range(numTestVecs):
        # 对数据测试
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], 3)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]))
        if (classifierResult != datingLabels[i]): errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    print(errorCount)


def img2vector(filename):
    """
    将图像数据转换为向量
    :param filename: 图片文件 因为我们的输入数据的图片格式是 32 * 32的
    :return: 一维矩阵
    该函数将图像转换为向量: 该函数创建 1 * 1024 的NumPy数组，然后打开给定的文件，
    循环读出文件的前32行，并将每行的头32个字符值存储在NumPy数组中，最后返回数组。
    """
    returnVect = zeros((1, 1024))
    fr = open(filename)
    for i in range(32):
        lineStr = fr.readline()
        for j in range(32):
            returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect


def handwritingClassTest():
    # 1. 导入数据
    hwLabels = []
    trainingFileList = listdir('data/2.KNN/trainingDigits')  # load the training set
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    # hwLabels存储0～9对应的index位置， trainingMat存放的每个位置对应的图片向量
    for i in range(m):
        fileNameStr = trainingFileList[i]
        fileStr = fileNameStr.split('.')[0]  # take off .txt
        classNumStr = int(fileStr.split('_')[0])
        hwLabels.append(classNumStr)
        # 将 32*32的矩阵->1*1024的矩阵
        trainingMat[i, :] = img2vector('data/2.KNN/trainingDigits/%s' % fileNameStr)

    # 2. 导入测试数据
    testFileList = listdir('data/2.KNN/testDigits')  # iterate through the test set
    errorCount = 0.0
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]
        fileStr = fileNameStr.split('.')[0]  # take off .txt
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('data/2.KNN/testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, classNumStr))
        if (classifierResult != classNumStr): errorCount += 1.0
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))


if __name__ == '__main__':
    # test1()
    # datingClassTest()
    handwritingClassTest()

# 实验二

**基于全连接神经网络的泰坦尼克号数据集的生存分析**
1. 认识与探索泰坦尼克号数据集
泰坦尼克号数据集是一个 CSV 文件，我们可以创建 DataFrame 对象df 来载入数据集，并删除不需要的字段、处理遗失数据：

In [4]:
import pandas as pd
import numpy as np
seed = 7
np.random.seed(seed)
# 载入数据集
df = pd.read_csv("train.csv")
# 删除不需要的栏位
df = df.drop(["Name", "Ticket", "Cabin"], axis=1)
# 处理遗失数据
df[["Age"]] = df[["Age"]].fillna(value=df[["Age"]].mean())
df[["Fare"]]=df[["Fare"]].fillna(value=df[["Fare"]].mean())
df[["Embarked"]]=df[["Embarked"]].fillna(value=df["Embarked"].value_counts().idxmax())
print(df["Embarked"].value_counts())
print(df["Embarked"].value_counts().idxmax())
# 转换分类数据
df["Sex"] = df["Sex"].map( {"female": 1, "male":0} ).astype(int)
# Embarked 栏位的 One-hot 编码
enbarked_one_hot = pd.get_dummies(df["Embarked"],
prefix="Embarked")
df = df.drop("Embarked", axis=1)
df = df.join(enbarked_one_hot)
# 将标签的 survived 栏位移至最后
df_survived = df.pop("Survived")
df["Survived"] = df_survived
print(df.head())
df.head().to_html("Ch6_2_2.html")
# 分割成训练(80%)和测试(20%)数据集
mask = np.random.rand(len(df)) < 0.8
df_train = df[mask]
df_test = df[~mask]
print("Train:", df_train.shape)
print("Test:", df_test.shape)
# 储存处理后的数据
df_train.to_csv("titanic_train.csv", index=False)
df_test.to_csv("titanic_test.csv", index=False)

S    646
C    168
Q     77
Name: Embarked, dtype: int64
S
   PassengerId  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked_C  \
0            1       3    0  22.0      1      0   7.2500           0   
1            2       1    1  38.0      1      0  71.2833           1   
2            3       3    1  26.0      0      0   7.9250           0   
3            4       1    1  35.0      1      0  53.1000           0   
4            5       3    0  35.0      0      0   8.0500           0   

   Embarked_Q  Embarked_S  Survived  
0           0           1         0  
1           0           0         1  
2           0           1         1  
3           0           1         1  
4           0           1         0  
Train: (703, 11)
Test: (188, 11)


2. 分割成特征数据和标签数据

In [5]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
seed = 7
np.random.seed(seed)
# 载入 Titanic 的训练和测试数据集
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
dataset_train = df_train.values
dataset_test = df_test.values
#分割成特征数据和标签数据
X_train = dataset_train[:, 0:9]
Y_train = dataset_train[:, 9]
X_test = dataset_test[:, 0:9]
Y_test = dataset_test[:, 9]
X_train

array([[1, 0, 3, ..., 1, 0, 'A/5 21171'],
       [2, 1, 1, ..., 1, 0, 'PC 17599'],
       [3, 1, 3, ..., 0, 0, 'STON/O2. 3101282'],
       ...,
       [889, 0, 3, ..., 1, 2, 'W./C. 6607'],
       [890, 1, 1, ..., 0, 0, '111369'],
       [891, 0, 3, ..., 0, 0, '370376']], dtype=object)

3. 数据标准化

In [10]:
# 特征标准化
X_train -= X_train.mean(axis=0)
X_train /= X_train.std(axis=0)
X_test -= X_test.mean(axis=0)
X_test /= X_test.std(axis=0)


ValueError: invalid literal for int() with base 10: 'Braund, Mr. Owen Harris'

4. 定义模型并编译模型

In [None]:
# 定义模型
model = Sequential()
model.add(Dense(11, input_dim=X_train.shape[1],
activation="relu"))
model.add(Dense(11, activation="relu"))
model.add(Dense(1, activation="sigmoid"))
# 编译模型
model.compile(loss="binary_crossentropy", optimizer="adam",
metrics=["accuracy"])


5. 训练模型、评估模型

In [None]:
# 训练模型
print("Training ...")
model.fit(X_train, Y_train, epochs=18, batch_size=10,
verbose=0)
# 评估模型
print("\nTesting ...")
loss, accuracy = model.evaluate(X_train, Y_train, verbose=0)
print("训练数据集的准确度 = {:.2f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, Y_test, verbose=0)
print("测试数据集的准确度 = {:.2f}".format(accuracy))
#显示图表来分析模型的训练过程
import matplotlib.pyplot as plt
# 显示训练和验证损失
loss = history.history["loss"]
epochs = range(1, len(loss)+1)
val_loss = history.history["val_loss"]
plt.plot(epochs, loss, "b-", label="Training Loss")
plt.plot(epochs, val_loss, "r--", label="Validation Loss")
plt.title("Training and Validation Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.show()
# 显示训练和验证准确度
acc = history.history["accuracy"]
epochs = range(1, len(acc)+1)
val_acc = history.history["val_accuracy"]
plt.plot(epochs, acc, "b-", label="Training Acc")
plt.plot(epochs, val_acc, "r--", label="Validation Acc")
plt.title("Training and Validation Accuracy")
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

6. 存储模型

In [None]:
# 存储 Keras 模型
print("Saving Model: titanic.h5 ...")
model.save("titanic.h5")

7. 调用训练好的模型

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
seed = 7
np.random.seed(seed)
# 载入 Titanic 的测试数据集
df_test = pd.read_csv("./titanic_test.csv")
dataset_test = df_test.values
# 分割成特征数据和标签数据
X_test = dataset_test[:, 0:9]
Y_test = dataset_test[:, 9]
# 特征标准化
X_test -= X_test.mean(axis=0)
X_test /= X_test.std(axis=0)
# 建立 Keras 的 Sequential 模型
model = Sequential()
model = load_model("titanic.h5")
# 编译模型
model.compile(loss="binary_crossentropy", optimizer="adam",
metrics=["accuracy"])
#评估模型
print("\nTesting ...")
loss, accuracy = model.evaluate(X_test, Y_test, verbose=0)
print("测试数据集的准确度 = {:.2f}".format(accuracy))
# 计算分类的预测值
print("\nPredicting ...")
Y_pred = model.predict_classes(X_test)
print(Y_pred[:,0])
print(Y_test.astype(int))
# 显示混淆矩阵
tb = pd.crosstab(Y_test.astype(int), Y_pred[:,0],
rownames=["label"], colnames=["predict"])
print(tb)
tb.to_html("Ch6_2_4.html")

# 参考资料

> 实验一  
> [KNN实现手写数字识别](https://blog.51cto.com/u_12228937/3703919)  
> [第2章 k-近邻算法](https://github.com/apachecn/ailearning/blob/master/docs/ml/2.md)  
> []()

> 实验二   
> []()
> []()
> []()