# 利用Keras探索声呐物体分类数据集

## 目录

- [1. 数据集介绍](#1.-数据集介绍)
- [2. 数据加载分析](#2.-数据加载分析)
- [3. 数据处理](#3.-数据处理)

# 1. 数据集介绍

本章使用声呐数据，包括声呐在不同物体的返回。数据有60个变量，代表不同角度的返回值。目标是将石头和金属筒（矿石）分开。

所有的数据都是连续的，从0到1；输出变量中M代表矿石，R代表石头，需要转换为1和0。数据集有208条数据

**数据下载地址**: http://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data

# 2. 数据加载分析

In [52]:
import pandas as pd
import numpy as np

dataset = pd.read_csv('../data/sonar.csv', header=None)

In [53]:
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,0.02,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.018,0.0084,0.009,0.0032,R
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044,R
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078,R
3,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117,R
4,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094,R


In [54]:
dataset.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
count,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,...,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0
mean,0.029164,0.038437,0.043832,0.053892,0.075202,0.10457,0.121747,0.134799,0.178003,0.208259,...,0.016069,0.01342,0.010709,0.010941,0.00929,0.008222,0.00782,0.007949,0.007941,0.006507
std,0.022991,0.03296,0.038428,0.046528,0.055552,0.059105,0.061788,0.085152,0.118387,0.134416,...,0.012008,0.009634,0.00706,0.007301,0.007088,0.005736,0.005785,0.00647,0.006181,0.005031
min,0.0015,0.0006,0.0015,0.0058,0.0067,0.0102,0.0033,0.0055,0.0075,0.0113,...,0.0,0.0008,0.0005,0.001,0.0006,0.0004,0.0003,0.0003,0.0001,0.0006
25%,0.01335,0.01645,0.01895,0.024375,0.03805,0.067025,0.0809,0.080425,0.097025,0.111275,...,0.008425,0.007275,0.005075,0.005375,0.00415,0.0044,0.0037,0.0036,0.003675,0.0031
50%,0.0228,0.0308,0.0343,0.04405,0.0625,0.09215,0.10695,0.1121,0.15225,0.1824,...,0.0139,0.0114,0.00955,0.0093,0.0075,0.00685,0.00595,0.0058,0.0064,0.0053
75%,0.03555,0.04795,0.05795,0.0645,0.100275,0.134125,0.154,0.1696,0.233425,0.2687,...,0.020825,0.016725,0.0149,0.0145,0.0121,0.010575,0.010425,0.01035,0.010325,0.008525
max,0.1371,0.2339,0.3059,0.4264,0.401,0.3823,0.3729,0.459,0.6828,0.7106,...,0.1004,0.0709,0.039,0.0352,0.0447,0.0394,0.0355,0.044,0.0364,0.0439


In [55]:
dataset.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,208.0,0.029164,0.022991,0.0015,0.01335,0.0228,0.03555,0.1371
1,208.0,0.038437,0.03296,0.0006,0.01645,0.0308,0.04795,0.2339
2,208.0,0.043832,0.038428,0.0015,0.01895,0.0343,0.05795,0.3059
3,208.0,0.053892,0.046528,0.0058,0.024375,0.04405,0.0645,0.4264
4,208.0,0.075202,0.055552,0.0067,0.03805,0.0625,0.100275,0.401
5,208.0,0.10457,0.059105,0.0102,0.067025,0.09215,0.134125,0.3823
6,208.0,0.121747,0.061788,0.0033,0.0809,0.10695,0.154,0.3729
7,208.0,0.134799,0.085152,0.0055,0.080425,0.1121,0.1696,0.459
8,208.0,0.178003,0.118387,0.0075,0.097025,0.15225,0.233425,0.6828
9,208.0,0.208259,0.134416,0.0113,0.111275,0.1824,0.2687,0.7106


In [56]:
dataset.iloc[:,60].unique()

array(['R', 'M'], dtype=object)

In [57]:
print dataset.groupby(60).size()

60
M    111
R     97
dtype: int64


"sonar.csv"中，只有两种类别
- R: 石头
- M: 矿石

# 3. 构建一个简单的网络

In [96]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline


In [94]:
seed = 60
np.random.seed(seed)

In [80]:
X = dataset.iloc[:,0:60].values
y = dataset.iloc[:,60].values


[[0.02   0.0371 0.0428 ... 0.0084 0.009  0.0032]
 [0.0453 0.0523 0.0843 ... 0.0049 0.0052 0.0044]
 [0.0262 0.0582 0.1099 ... 0.0164 0.0095 0.0078]
 ...
 [0.0522 0.0437 0.018  ... 0.0138 0.0077 0.0031]
 [0.0303 0.0353 0.049  ... 0.0079 0.0036 0.0048]
 [0.026  0.0363 0.0136 ... 0.0036 0.0061 0.0115]]


将 y 进行 **ONE-HOT** encoder

In [70]:
encoder = LabelEncoder()
Y = encoder.fit_transform(y)

#Y = pd.get_dummies(y1).values

In [101]:
def baseline_model():
    model = Sequential()
    model.add(Dense(100, input_dim=60, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [102]:
estimator = KerasClassifier(build_fn=baseline_model, nb_epoch=150, batch_size=5, verbose=0)
kfold = StratifiedKFold(y=Y, n_folds=10, shuffle=True, random_state=seed)
scores = cross_val_score(estimator, X, Y, cv=kfold)
print 'Results %.2f %% (%.2f %%)' % (scores.mean()*100, scores.std()*100)

Results 64.98 % (8.28 %)


accuracy 为: 64.98%, 标准差为: 8.28%, 效果也不是那么的差，因为实现的代码确实很简单

# 4. 预处理数据增加性能 

预处理数据是个好习惯。神经网络喜欢输入类型的比例和分布一致，为了达到这点可以使用正则化，让数据的平均值是0，标准差是1，这样可以保留数据的分布情况。

scikit-learn的StandardScaler可以做到这点。不应该在整个数据集上直接应用正则化：应该只在测试数据上交叉验证时进行正则化处理，使正则化成为交叉验证的一环，让模型没有新数据的先验知识，防止模型发散。

scikit-learn的Pipeline可以直接做到这些。我们先定义一个 StandardScaler，然后进行验证：

In [105]:
estimaters = []
estimaters.append(('standardize', StandardScaler()))
estimaters.append(('mlp', KerasClassifier(build_fn=baseline_model, nb_epoch=150, batch_size=5, verbose=0)))
pipeline = Pipeline(estimaters)

kfold = StratifiedKFold(y=Y, n_folds=10, shuffle=True, random_state=seed)
scores = cross_val_score(pipeline, X, Y, cv=kfold)
print 'Standardized %.2f %% (%.2f %%)' % (scores.mean()*100, scores.std()*100)

Standardized 75.01 % (6.97 %)


增加了 Standardize 的确提高了 accuracy, 降低了 标准差

# 5. 调整模型的拓扑和神经元

神经网络有很多参数，例如初始化权重、激活函数、优化算法等等。我们一直没有说到调整网络的拓扑结构：扩大或缩小网络。我们试验一下：


## 4.1 缩小网络

有可能数据中有冗余：原始数据是不同角度的信号，有可能其中某些角度有相关性。我们把第一层隐层缩小一些，强行提取特征试试。

我们把之前的模型隐层的100个神经元减半到50个，这样神经网络需要挑选最重要的信息。之前的正则化有效果,也一并在这里做一下.

In [109]:
def baseline_model():
    model = Sequential()
    model.add(Dense(50, input_dim=60, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

np.random.seed(seed)
estimaters = []
estimaters.append(('standardize', StandardScaler()))
estimaters.append(('mlp', KerasClassifier(build_fn=baseline_model, nb_epoch=150, batch_size=5, verbose=0)))
pipeline = Pipeline(estimaters)

kfold = StratifiedKFold(y=Y, n_folds=10, shuffle=True, random_state=seed)
scores = cross_val_score(pipeline, X, Y, cv=kfold)
print 'Standardized %.2f %% (%.2f %%)' % (scores.mean()*100, scores.std()*100)

Standardized 74.15 % (11.23 %)


## 4.2 扩大网络
扩大网络后，神经网络更有可能提取关键特征，以非线性方式组合。

In [116]:
def baseline_model():
    model = Sequential()
    model.add(Dense(120, input_dim=60, activation='relu'))
    model.add(Dense(60, activation='relu'))
    model.add(Dense(30, activation='relu'))
    model.add(Dense(15, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

np.random.seed(seed)
# estimaters = []
# estimaters.append(('standardize', StandardScaler()))
# estimaters.append(('mlp', KerasClassifier(build_fn=baseline_model, nb_epoch=150, batch_size=20, verbose=0)))
# pipeline = Pipeline(estimaters)

estimator = KerasClassifier(build_fn=baseline_model, nb_epoch=50, batch_size=10, verbose=0)
kfold = StratifiedKFold(y=Y, n_folds=5, shuffle=True, random_state=seed)
scores = cross_val_score(estimator, X, Y, cv=kfold)
print 'Standardized %.2f %% (%.2f %%)' % (scores.mean()*100, scores.std()*100)

Standardized 61.50 % (8.02 %)
