# 机器学习工程师纳米学位毕业项目--驾驶员状态检测

In [2]:
#下载数据

from urllib.request import urlretrieve
from os.path import isfile, isdir
from tqdm import tqdm

class DLProgress(tqdm):
    """tqdm progress bar exposing a ``hook`` callback for reporting download progress."""

    # Block count seen on the previous hook call; used to compute the increment.
    last_block = 0

    def hook(self, block_num=1, block_size=1, total_size=None):
        """Advance the bar by the amount transferred since the previous call.

        Args:
            block_num: Number of blocks transferred so far.
            block_size: Size of a single block.
            total_size: Total size of the transfer; stored as the bar's ``total``.
        """
        blocks_since_last = block_num - self.last_block
        self.total = total_size
        self.update(blocks_since_last * block_size)
        self.last_block = block_num

import os

# NOTE(review): this is a signed, time-limited URL -- its Expires parameter
# (1501760775) is a Unix timestamp in August 2017, so the link will most likely
# need to be regenerated before this cell can download anything.
DATASET_URL = 'https://storage.googleapis.com/kaggle-competitions-data/kaggle/5048/imgs.zip?GoogleAccessId=competitions-data@kaggle-161607.iam.gserviceaccount.com&Expires=1501760775&Signature=lLmT1REaZts91RV75wp1lfSPU4EVF6xgXHt5pIjbBm%2B5ZJ7ZG69WMwbNTOHVRWlpzfvt%2F9YSpEk0HWTDvOLi3ccKsfMeTBur0%2BjYIgP9Thn4THcZqEDGqsJqKJKU2hfQIiM4VL6xS20GslHM7VSW9naeg1CNxpEwOsS%2BHDVNBYcrjvFxLj%2BquCjZjsGOmqsP9MtftC5fiJHAeGDgy2cSHg6Whpu8GxjTQ%2FGnuYOm8az39pzIJAQDxwEUgeD0sDxYrmlQRbnVx8pkhRJ4OStI257yrQk5TsOLNIaisvkMuLgf2vn00wkyCH5X6VF0btvXgRCydr1T60AXNkf9xbuH2Q%3D%3D'
ARCHIVE_PATH = 'imgs.zip'

# Download the archive only when it is not already present.
if not isfile(ARCHIVE_PATH):
    # Write to a temporary name first: an interrupted download must not leave a
    # truncated 'imgs.zip' behind, because the isfile() guard above would then
    # treat it as a complete archive on the next run.
    partial_path = ARCHIVE_PATH + '.part'
    with DLProgress(unit='B', unit_scale=True, miniters=1, desc='Dataset') as pbar:
        urlretrieve(DATASET_URL, partial_path, pbar.hook)
    os.replace(partial_path, ARCHIVE_PATH)

Dataset: 4.29GB [01:14, 57.9MB/s]                               


In [None]:
#解压

import zipfile

# Extract every member of the archive into imgs/. The context manager closes
# the archive even when extraction raises part-way through.
with zipfile.ZipFile('imgs.zip', 'r') as zf:
    zf.extractall('imgs/')

In [1]:
#移动test文件夹图片

import shutil, os

# Create the wrapper directory for the test images. exist_ok=True keeps the
# cell re-runnable instead of raising FileExistsError on a second execution.
os.makedirs('imgs/Test', exist_ok=True)

In [2]:
# Move imgs/test inside imgs/Test so the images end up in imgs/Test/test.
# Skip the move when that destination already exists so the cell can be re-run;
# a missing source on a first run still raises.
if not os.path.isdir('imgs/Test/test'):
    shutil.move('imgs/test', 'imgs/Test')

'imgs/Test/test'

In [3]:
# Delete the downloaded archive. The guard keeps the cell from raising
# FileNotFoundError when it is executed again after the file is gone.
if os.path.isfile('imgs.zip'):
    os.remove('imgs.zip')

In [23]:
#图片预处理

from keras.preprocessing.image import ImageDataGenerator

from keras.applications.resnet50 import preprocess_input

IMAGE_SIZE = (224, 224)
BATCH_SIZE = 16

# The generators below feed the ImageNet-pretrained ResNet50, so apply that
# network's own preprocess_input instead of a plain 1/255 rescale; the
# pretrained weights expect inputs prepared by that function.
gen = ImageDataGenerator(preprocessing_function=preprocess_input)

# shuffle=False keeps the batches in the generator's own file order, so the
# predictions line up with train_generator.classes.
train_generator = gen.flow_from_directory('imgs/train', target_size=IMAGE_SIZE, batch_size=BATCH_SIZE, shuffle=False, class_mode='categorical')
# class_mode=None: the test generator yields images only, no labels.
test_generator = gen.flow_from_directory('imgs/Test', target_size=IMAGE_SIZE, batch_size=BATCH_SIZE, shuffle=False, class_mode=None)

Found 22424 images belonging to 10 classes.
Found 79726 images belonging to 1 classes.


In [26]:
# Spot-check the class label the generator assigned to the sample at index 20000.
train_generator.classes[20000]

8

In [24]:
#使用ResNet50模型导出特征向量

from keras.applications.resnet50 import ResNet50
import h5py
import numpy as np
from sklearn.utils import shuffle

def _num_batches(generator):
    """Return how many batches are needed to visit every sample of ``generator`` once."""
    # Ceiling division, derived from the generator instead of hard-coded counts.
    return (generator.samples + generator.batch_size - 1) // generator.batch_size


# Run the headless ImageNet ResNet50 over both generators to get feature arrays.
model = ResNet50(include_top=False, weights='imagenet')
train = model.predict_generator(train_generator, _num_batches(train_generator))
test = model.predict_generator(test_generator, _num_batches(test_generator))

# Open with an explicit "w" mode: without a mode, recent h5py opens read-only
# (so create_dataset fails), and append mode fails on re-run because the
# datasets already exist.
with h5py.File("ResNet50.h5", "w") as h:
    h.create_dataset("train", data=train)
    h.create_dataset("test", data=test)
    h.create_dataset("label", data=train_generator.classes)
    

#np.save(open('bottleneck_features_train.npy', 'w'), train)
#np.save(open('bottleneck_features_test.npy', 'w'), test)

In [12]:
# X_train, X_test and y_train are not assigned by any other cell, so load them
# here from the ResNet50.h5 feature cache to keep the notebook runnable from a
# fresh kernel.
with h5py.File("ResNet50.h5", "r") as h:
    X_train = np.array(h["train"])
    X_test = np.array(h["test"])
    y_train = np.array(h["label"])

# The features were extracted with shuffle=False, i.e. in the generator's class
# order. Shuffle features and labels together (fixed seed for reproducibility)
# so that a validation split taken from the end of the arrays is not dominated
# by the last classes.
X_train, y_train = shuffle(X_train, y_train, random_state=0)

X_train.shape

(22424, 1, 1, 2048)

In [28]:
# Display the shape of the training label array.
y_train.shape

(22424,)

In [5]:
# Convert y_train to a one-hot encoding.

from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()
# Fit on the ten class ids 0..9 so the encoded columns follow that order.
lb.fit(np.array([0,1,2,3,4,5,6,7,8,9]))
# Encode only while y_train is still a 1-D label vector; this makes the cell
# safe to re-run, since transforming already-encoded labels would fail.
if y_train.ndim == 1:
    y_train = lb.transform(y_train)



In [7]:
# Spot-check the encoded label row at index 200.
y_train[200]

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0])

In [None]:
from keras.applications.vgg16 import VGG16
from keras.models import Sequential
from keras.layers import Dropout, Flatten, Dense

# Classifier head trained directly on the cached feature vectors in X_train.
#
# The head is no longer attached to a VGG16 base: the functional Model that
# VGG16(...) returns has no add() method, its output shape has undefined
# spatial dimensions that Flatten cannot use, and the fit/predict cells pass
# precomputed features (X_train / X_test), not images.
top_model = Sequential()
top_model.add(Flatten(input_shape=X_train.shape[1:]))
top_model.add(Dense(2048, activation='relu'))
top_model.add(Dropout(0.5))
top_model.add(Dense(512, activation='relu'))
top_model.add(Dropout(0.5))
# Ten outputs with softmax: one probability per class.
top_model.add(Dense(10, activation='softmax'))

# Later cells compile, fit and predict through the name `model`.
model = top_model

In [None]:
# Compile for multi-class classification against one-hot labels.
model.compile(optimizer='adadelta',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# `epochs` is the Keras 2 name of this argument; the Keras 1 spelling
# `nb_epoch` is not accepted by current releases.
# validation_split holds out the final 20% of the rows as validation data, so
# X_train / y_train should already be shuffled when this runs.
model.fit(X_train, y_train,
          epochs=50, batch_size=128,
          validation_split=0.2)

In [9]:
# Use the model to predict on the test data.
# X_test must already be defined by an earlier cell.

y_pred = model.predict(X_test, verbose=1)
#y_pred = y_pred.clip(min=0.005, max=0.995)



In [10]:
# Spot-check the prediction row at index 50.
y_pred[50]

(79726, 10)

In [None]:
#输出预测结果

import pandas as pd

# Load the submission template; columns 1..10 are overwritten with predictions.
df = pd.read_csv("sample_submission.csv")

# Fail loudly on a row-count mismatch instead of writing a partially filled file.
if y_pred.shape[0] != len(df):
    raise ValueError(
        "y_pred has %d rows but sample_submission.csv has %d"
        % (y_pred.shape[0], len(df))
    )

# One vectorised assignment replaces the per-row Python loop.
# NOTE(review): this assumes the template rows are in the same order as the
# images that produced y_pred -- confirm against the test generator's file order.
df.iloc[:, 1:11] = y_pred

df.to_csv('sample_submission.csv', index=False)