In [1]:
import os
import pandas as pd
import pickle
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.utils import np_utils
from keras import backend as K
from keras.wrappers.scikit_learn import KerasClassifier


## 最小データ数の取得用関数
最もサンプル数の少ないデータの数を取得

In [2]:
def search_min_data_num(num_classes):
    """Return the row count of the smallest per-class training CSV.

    For each of the first ``num_classes`` entries of the module-level
    ``TRAIN_DATA_FILES`` list, reads ``./data/<name>.csv`` (treated as having
    no header row), prints ``<name>: <rows>``, and finally prints and returns
    the minimum row count.

    Args:
        num_classes: How many leading entries of ``TRAIN_DATA_FILES`` to scan.

    Returns:
        The smallest number of rows found among the scanned files.
    """
    row_counts = []

    for idx in range(num_classes):
        class_name = TRAIN_DATA_FILES[idx]
        frame = pd.read_csv("./data/" + class_name + '.csv', header=None)

        row_counts.append(len(frame))
        print(class_name, row_counts[-1], sep=': ')

    smallest = min(row_counts)
    print('\n')
    print("min_data_num:", smallest)

    return smallest

## Define

In [3]:
# Class names; each class is read from ./data/<name>.csv.
TRAIN_DATA_FILES = ['cross', 'dead', 'left', 'right', 'straight', 'threeway']
NUM_CLASSES = len(TRAIN_DATA_FILES)

# Value substituted for NaN cells when the feature table is assembled.
REPLACE_NAN = 0

# Training settings (not referenced by the cells shown here; presumably
# consumed by a later fit / plotting cell).
epochs, batch_size = 120, 100

# Row count of the smallest class file; every class is sampled down to this
# many rows when the data is loaded.
num_data_set = search_min_data_num(NUM_CLASSES)


cross: 688
dead: 4377
left: 2590
right: 1852
straight: 3952
threeway: 10048


min_data_num: 688


## チューニングパラメータ設定
辞書内の各パラメータ値の全組み合わせを GridSearchCV で試し、最もスコアの高い組み合わせを選ぶ

In [4]:
def param():
    """Build the hyper-parameter grid handed to ``GridSearchCV``.

    Returns:
        A dict mapping each ``make_model`` keyword (``mid_lay1``, ``mid_lay2``,
        ``dropout1``, ``dropout2``) to the list of candidate values to try.
        A fresh dict with fresh lists is returned on every call.
    """
    layer_widths = (300, 600, 800, 900, 1000, 1100)
    dropout_rates = (0.2, 0.3, 0.4, 0.5)

    # Each key gets its own list object so callers may mutate one entry
    # without affecting the others.
    return {
        'mid_lay1': list(layer_widths),
        'mid_lay2': list(layer_widths),
        'dropout1': list(dropout_rates),
        'dropout2': list(dropout_rates),
    }

## 学習データ数の取得用関数  
データ読み込み時、各ラベルの学習データを、最もデータ数の少ないラベルの件数に合わせてランダムにサンプリングする

In [5]:
def adjust_data_num(num_class):
    """Load one class's CSV and randomly down-sample it to ``num_data_set`` rows.

    Args:
        num_class: Index into the module-level ``TRAIN_DATA_FILES`` list
            selecting which ``./data/<name>.csv`` file to read.

    Returns:
        A DataFrame holding ``num_data_set`` randomly sampled rows of the file
        (sampling is unseeded, so the selection differs between runs).
    """
    csv_path = "./data/" + TRAIN_DATA_FILES[num_class] + '.csv'

    # header=None: the file has no column-name row, so every line is data.
    balanced = pd.read_csv(csv_path, header=None).sample(n=num_data_set)
    return balanced

## 学習データ読み込み用関数

In [11]:
def split_data():
    """Load the class-balanced data set and split it into train/validation/test.

    For every class index in ``range(NUM_CLASSES)`` the balanced sample from
    ``adjust_data_num`` is collected together with a matching block of one-hot
    label rows. Classes whose CSV file is empty are reported and skipped.
    NaN feature values are replaced by ``REPLACE_NAN``. The rows are then
    randomly split 60% / 20% / 20% (unseeded).

    Returns:
        Tuple ``(X_train, X_validation, X_test, Y_train, Y_validation, Y_test)``
        where the ``X_*`` items are DataFrames of features and the ``Y_*``
        items are one-hot label arrays with ``NUM_CLASSES`` columns.

    Raises:
        ValueError: If no class file yielded any data.
    """
    all_data_set = []
    labels = []

    for i in range(NUM_CLASSES):
        # Built here so the error message below can name the offending file
        # (the path is otherwise only known inside adjust_data_num).
        file_name = "./data/" + TRAIN_DATA_FILES[i] + '.csv'
        try:
            data_set = adjust_data_num(i)
        except pd.errors.EmptyDataError:
            # Public location of the exception; pd.io.common is a private module.
            print("ERROR: {} is empty".format(file_name))
            continue

        all_data_set.append(data_set)

        # Build one-hot label rows for this class, one per loaded sample.
        one_hot = np.zeros((len(data_set), NUM_CLASSES))
        one_hot[:, i] = 1
        labels.append(one_hot)

    if not all_data_set:
        raise ValueError("no training data could be loaded from ./data")

    X = pd.concat(all_data_set)
    # Replace NaN with REPLACE_NAN.
    X = X.fillna(REPLACE_NAN)
    Y = np.concatenate(labels, axis=0)

    # First split: 60% train vs. 40% held out.
    X_train, X_validation_and_test, Y_train, Y_validation_and_test = train_test_split(
        X, Y, train_size=0.6, test_size=0.4)
    # Second split: halve the held-out part into validation and test (20% each).
    X_validation, X_test, Y_validation, Y_test = train_test_split(
        X_validation_and_test, Y_validation_and_test, train_size=0.5, test_size=0.5)

    return X_train, X_validation, X_test, Y_train, Y_validation, Y_test

## グラフプロット用関数

In [7]:
def plt_result(epochs, history):
    """Plot training and validation accuracy per epoch and show the figure.

    Args:
        epochs: Number of epochs; the x axis runs from 1 to ``epochs``.
        history: Object whose ``history`` mapping provides the ``'acc'`` and
            ``'val_acc'`` series (one value per epoch).
    """
    epoch_axis = range(1, epochs + 1)

    # Draw the two accuracy curves in the same order as before: training first.
    for metric_key, curve_label in (('acc', "training"), ('val_acc', "validation")):
        plt.plot(epoch_axis, history.history[metric_key], label=curve_label)

    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

## ネットワーク設計
現在  
input(726) -  
<span>　</span>mid_lay1(dropout1) - mid_lay2(dropout2) -  
<span>　　</span>output(6)

In [8]:
def make_model(activation="relu", optimizer="adam", mid_lay1=100, mid_lay2=100, dropout1=0.25, dropout2=0.25, input_dim=726):
    """Build and compile the two-hidden-layer classifier network.

    Architecture: Dense(mid_lay1) -> Dropout(dropout1) -> Dense(mid_lay2) ->
    Dropout(dropout2) -> Dense(NUM_CLASSES, softmax).

    Args:
        activation: Activation used by both hidden Dense layers.
        optimizer: Optimizer passed to ``model.compile``.
        mid_lay1: Number of units in the first hidden layer.
        mid_lay2: Number of units in the second hidden layer.
        dropout1: Dropout rate applied after the first hidden layer.
        dropout2: Dropout rate applied after the second hidden layer.
        input_dim: Number of input features. Defaults to 726, the value that
            was previously hard-coded, so existing callers are unaffected.

    Returns:
        A compiled (untrained) ``Sequential`` model using categorical
        cross-entropy loss and reporting the accuracy metric.
    """
    model = Sequential()

    # First hidden layer also declares the input width.
    model.add(Dense(mid_lay1, input_dim=input_dim, activation=activation))
    model.add(Dropout(dropout1))

    model.add(Dense(mid_lay2, activation=activation))
    model.add(Dropout(dropout2))

    # One softmax output unit per class.
    model.add(Dense(NUM_CLASSES, activation="softmax"))
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# 学習実行

###### データ読み込み

In [12]:
# Load the per-class data and split it into train / validation / test sets
# (split_data performs a random 60% / 20% / 20% split).
X_train, X_validation, X_test, Y_train, Y_validation, Y_test = split_data()


###### 学習

In [16]:
# Wrap the Keras model factory so that scikit-learn's GridSearchCV can drive it.
model = KerasClassifier(build_fn=make_model, verbose=0)

param_grid = param()
grid = GridSearchCV(estimator=model, param_grid=param_grid)

# Run the grid search over every parameter combination on the training split.
grid_result = grid.fit(X_train, Y_train)

# Report the best score and the parameter combination that achieved it.
best_params = grid_result.best_params_
print(grid_result.best_score_)
print(best_params)

# Rebuild a fresh (untrained) network with the winning hyper-parameters.
mid_lay1, mid_lay2, dropout1, dropout2 = (
    best_params[key] for key in ('mid_lay1', 'mid_lay2', 'dropout1', 'dropout2')
)

model = make_model(mid_lay1=mid_lay1, mid_lay2=mid_lay2, dropout1=dropout1, dropout2=dropout2)
model.summary()

164
285
112
ERROR: ./data/right.csv is empty
3798
696
0.7665677549930991
{'dropout1': 0.25, 'dropout2': 0.25, 'mid_lay1': 200, 'mid_lay2': 200}
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_187 (Dense)            (None, 200)               145400    
_________________________________________________________________
dropout_125 (Dropout)        (None, 200)               0         
_________________________________________________________________
dense_188 (Dense)            (None, 200)               40200     
_________________________________________________________________
dropout_126 (Dropout)        (None, 200)               0         
_________________________________________________________________
dense_189 (Dense)            (None, 6)                 1206      
Total params: 186,806
Trainable params: 186,806
Non-trainable params: 0
__________________________________________________________