In [0]:
# Mount Google Drive at /content/drive so files stored on Drive can be read
# through the relative 'drive/My Drive/...' paths used later (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Data Preprocessing

In [1]:
import pandas as pd
# Load the image/label table from the local working directory.
# On Colab, read it from Drive instead:
#   pd.read_csv('drive/My Drive/Colab Notebooks/Final Project/Task 2/train.csv')
# The label column is cast to str before it is handed to flow_from_dataframe
# (presumably required by class_mode="binary" -- confirm against the Keras version in use).
df = pd.read_csv('train.csv').astype({'label': str})
df.head()

Unnamed: 0,image,label
0,0000.png,0
1,0001.png,1
2,0002.png,1
3,0003.png,1
4,0004.png,1


In [5]:
# Randomly split the rows of train.csv into train:validation = 3:1.
# The fixed random_state keeps the split identical across runs.
traindf = df.sample(frac=0.75, random_state=777)
validf = df.loc[~df.index.isin(traindf.index)]   # every row not drawn for training

display(traindf.head(), validf.head())
print('各DataFrame 大小:', *(len(frame) for frame in (df, traindf, validf)))

Unnamed: 0,image,label
3437,3437.png,0
1646,1646.png,1
4035,4035.png,0
2041,2041.png,1
1920,1920.png,0


Unnamed: 0,image,label
8,0008.png,1
17,0017.png,0
19,0019.png,0
20,0020.png,0
28,0028.png,0


各DataFrame 大小: 5000 3750 1250


In [11]:
import os
from keras.preprocessing.image import ImageDataGenerator

# Project folder on Colab. For a local run, switch to the commented line below
# (the notebook's current working directory).
base_dir = "drive/My Drive/Colab Notebooks/Final Project/Task 2"
# base_dir = os.getcwd()
train_dir = os.path.join(base_dir, 'train_img')
# Validation images live in the same folder as the training images; the split
# is defined by the traindf / validf DataFrames, not by separate directories.
valid_dir = os.path.join(base_dir, 'train_img')
test_dir = os.path.join(base_dir, 'test_img')

BATCH_SIZE = 50

# Training images are augmented (rotation, shear, zoom, horizontal flip) and
# rescaled from [0, 255] to [0, 1].
train_datagen = ImageDataGenerator(
        rotation_range=15,
        shear_range=0.1,
        zoom_range=0.2,
        horizontal_flip=True,
        rescale=1./255.,
)
# The 'image' column already holds full file names (e.g. '0000.png'), so no
# extension handling is requested.
train_generator = train_datagen.flow_from_dataframe(
    dataframe=traindf,
    directory=train_dir,
    x_col='image',
    y_col='label',
    class_mode="binary",
    batch_size=BATCH_SIZE,
    target_size=(224, 224)
)

# Validation images are only rescaled, never augmented.
vaild_datagen = ImageDataGenerator(
        rescale=1./255.
)
validation_generator = vaild_datagen.flow_from_dataframe(
    dataframe=validf,
    directory=valid_dir,
    x_col='image',
    y_col='label',
    class_mode="binary",
    batch_size=BATCH_SIZE,
    target_size=(224, 224)
)

# The test set has no labels. class_mode=None makes the generator yield image
# batches only, which is what extract_features expects in order to return a
# plain feature array. shuffle=False keeps the batches aligned with
# test_generator.filenames.
test_datagen=ImageDataGenerator(rescale=1./255)
test_generator = test_datagen.flow_from_directory(
    test_dir,
    target_size=(224, 224),
    color_mode='rgb',
    shuffle=False,
    class_mode=None
)

Found 3750 validated image filenames belonging to 2 classes.
Found 1250 validated image filenames belonging to 2 classes.
Found 913 images belonging to 1 classes.


Model

In [0]:
from keras.applications import VGG16

# VGG16 with ImageNet weights and include_top=False (no classifier head on top),
# sized for the 224x224 RGB images produced by the generators.
conv_base = VGG16(
    include_top=False,
    weights='imagenet',
    input_shape=(224, 224, 3),
)

In [0]:
# Print the model summary of the convolutional base.
conv_base.summary()

In [16]:
import numpy as np

def extract_features(generator, sample_count, model=None):
    """Push batches from a generator through a feature extractor and collect the results.

    Args:
        generator: Iterable batch source exposing a ``class_mode`` attribute.
            When ``class_mode`` is None it must yield image batches only;
            otherwise it must yield ``(images, labels)`` pairs.
        sample_count: Number of samples to collect (normally ``generator.samples``).
        model: Object exposing ``predict(batch)`` and ``output_shape``.
            Defaults to the notebook-level ``conv_base``.

    Returns:
        ``features`` alone when ``generator.class_mode`` is None, otherwise a
        ``(features, labels)`` tuple. ``features`` has shape
        ``(sample_count,) + model.output_shape[1:]`` and ``labels`` has shape
        ``(sample_count,)``.
    """
    if model is None:
        model = conv_base
    has_labels = generator.class_mode is not None

    # Size the buffer from the extractor itself instead of hard-coding (7, 7, 512).
    features = np.zeros(shape=(sample_count,) + tuple(model.output_shape[1:]))
    labels = np.zeros(shape=(sample_count,))

    filled = 0
    if sample_count > 0:   # nothing to collect: do not pull a batch at all
        for batch in generator:
            if has_labels:
                inputs_batch, labels_batch = batch
            else:
                inputs_batch = batch
            features_batch = model.predict(inputs_batch)
            print(inputs_batch.shape, end=' ')

            # The last batch may be smaller than batch_size, or may overrun
            # sample_count; keep only as many rows as still fit.
            take = min(features_batch.shape[0], sample_count - filled)
            features[filled:filled + take] = features_batch[:take]
            if has_labels:
                labels[filled:filled + take] = labels_batch[:take]
            filled += take

            # Generators yield data indefinitely, so stop once every
            # requested sample has been seen.
            if filled >= sample_count:
                break
    print()

    if has_labels:
        return features, labels
    return features

# Run every split through the convolutional base once. This is the slow step of
# the notebook: each call feeds `generator.samples` images through conv_base.
# NOTE: extract_features returns a (features, labels) tuple whenever
# generator.class_mode is not None, so test_generator must be built with
# class_mode=None for test_features to be a plain array.
train_features, train_labels = extract_features(train_generator, train_generator.samples)
validation_features, validation_labels = extract_features(validation_generator, validation_generator.samples)
test_features = extract_features(test_generator, test_generator.samples)

(50, 224, 224, 3) (50, 224, 224, 3) (50, 224, 224, 3) (50, 224, 224, 3) (50, 224, 224, 3) (50, 224, 224, 3) (50, 224, 224, 3) (50, 224, 224, 3) (50, 224, 224, 3) (50, 224, 224, 3) (50, 224, 224, 3) (50, 224, 224, 3) (50, 224, 224, 3) (50, 224, 224, 3) (50, 224, 224, 3) (50, 224, 224, 3) (50, 224, 224, 3) (50, 224, 224, 3) (50, 224, 224, 3) (50, 224, 224, 3) (50, 224, 224, 3) (50, 224, 224, 3) (50, 224, 224, 3) (50, 224, 224, 3) (50, 224, 224, 3) (50, 224, 224, 3) (50, 224, 224, 3) 

KeyboardInterrupt: ignored

In [0]:
# Flatten each extracted feature map into a single 7*7*512 vector so it can be
# fed to the dense classifier.
train_features = train_features.reshape((train_generator.samples, 7*7*512))
validation_features = validation_features.reshape((validation_generator.samples, 7*7*512))
test_features = test_features.reshape((test_generator.samples, 7*7*512))

In [0]:
from keras import models
from keras import layers
from keras import optimizers
from keras import regularizers

# Dense classifier trained on the flattened conv features:
# two 512-unit ReLU layers with dropout, then one sigmoid unit for the binary label.
model = models.Sequential([
    layers.Dense(512, activation='relu', input_dim=7*7*512),
    layers.Dropout(0.2),
    layers.Dense(512, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(1, activation='sigmoid'),
])

In [0]:
from keras import optimizers

# Binary cross-entropy matches the single sigmoid output; the metric is named
# 'acc' so the history keys used for plotting are 'acc' / 'val_acc'.
model.compile(
    optimizer=optimizers.RMSprop(lr=1e-4),
    loss='binary_crossentropy',
    metrics=['acc'],
)

Training

In [0]:
# Pull a single batch from the training generator to check its shapes.
data_batch, labels_batch = next(iter(train_generator))
print('data batch shape:', data_batch.shape)
print('labels batch shape:', labels_batch.shape)

In [0]:
# Train the dense classifier on the pre-extracted features, validating on the
# held-out split after every epoch.
history = model.fit(
    train_features,
    train_labels,
    validation_data=(validation_features, validation_labels),
    batch_size=BATCH_SIZE,
    epochs=50,
)

In [0]:
# Save the trained model to 'task2_1218.h5' in the current working directory.
model.save('task2_1218.h5')

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline

# Per-epoch metrics recorded by model.fit.
acc, val_acc = history.history['acc'], history.history['val_acc']
loss, val_loss = history.history['loss'], history.history['val_loss']

epochs = range(1, len(acc)+1)

# Figure 1: accuracy (dots = training, line = validation).
fig, ax = plt.subplots()
ax.plot(epochs, acc, 'bo', label='Training acc')
ax.plot(epochs, val_acc, 'b', label='Validation acc')
ax.set_title('Training and validation accuracy')
ax.legend()

# Figure 2: loss (dots = training, line = validation).
fig, ax = plt.subplots()
ax.plot(epochs, loss, 'bo', label='Training loss')
ax.plot(epochs, val_loss, 'b', label='Validation loss')
ax.set_title('Training and validation loss')
ax.legend()

plt.show()

Test

In [0]:
def to_class(i):
    """Threshold an iterable of probabilities into a list of 0/1 class labels.

    A value maps to 1 when it is >= 0.5 and to 0 otherwise.
    """
    return [1 if prob >= 0.5 else 0 for prob in i]

In [0]:
def revise(x):
    """Return only the file name from a generator path such as 'subdir/0001.png'.

    Works with both Windows ('\\') and POSIX ('/') separators, so the same
    notebook runs locally and on Colab. A path without any separator is
    returned unchanged.
    """
    # Normalise the separator first, then keep whatever follows the last one.
    return x.replace('\\', '/').rsplit('/', 1)[-1]

In [0]:
# Predict on the flattened test features and show how many rows came back.
pred = model.predict(test_features)
print(len(pred), pred, sep='\n')

# Flatten the predictions to 1-D and threshold them into 0/1 labels.
classes = to_class(list(pred.ravel()))

# File names line up with the predictions only because the test generator
# was created with shuffle=False.
filenames = test_generator.filenames

In [0]:
# Mean of the 0/1 predictions, i.e. the fraction of test images predicted as class 1.
np.array(classes).mean()

0.4403066812705367

In [0]:
# Not tried yet: rotation, etc.
# https://medium.com/@jackycsie/%E5%B0%8F%E6%95%B8%E6%93%9A%E7%9A%84%E9%80%86%E8%A5%B2-c04fee852539
# ReduceLROnPlateau
# https://hackmd.io/@allen108108/SyCsOIkxB

In [0]:
# Build the submission table: one row per test image with its predicted 0/1 label.
# A dict of columns keeps the labels as integers instead of coercing both
# columns to strings through a single NumPy array.
output = pd.DataFrame({'image': filenames, 'label': classes})
output.head()

In [0]:
# Replace each generator path in the 'image' column with the value returned by revise().
output['image'] = output['image'].apply(revise)
output.head()

In [0]:
# Write the predictions to the Result folder on Drive as UTF-8 CSV, without the index column.
output.to_csv('drive/My Drive/Colab Notebooks/Final Project/Task 2/Result/result4.csv', index=False, encoding='utf-8')