In [0]:
import tensorflow as tf

In [0]:
import keras
import numpy as np
import pandas as pd 
from keras.datasets import cifar10
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers.core import Dense, Flatten,Activation
from keras.layers.convolutional import Conv2D
from keras.optimizers import Adam
from keras.layers.pooling import MaxPooling2D
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint, CSVLogger, EarlyStopping
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from datetime import timedelta
import time
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import Dropout, BatchNormalization
from keras import regularizers
import sys

np.random.seed(666)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [0]:
def load_weights_freeze_given_layers(model, weight_path, freeze_count):
    """Load weights into ``model`` and mark its leading layers as non-trainable.

    Args:
        model: object exposing ``load_weights(path)`` and a ``layers`` sequence.
        weight_path: path handed unchanged to ``model.load_weights``.
        freeze_count: number of layers, counted from the start of
            ``model.layers``, whose ``trainable`` attribute is set to ``False``.

    Returns:
        The same ``model`` object that was passed in.
    """
    model.load_weights(weight_path)

    frozen_prefix = model.layers[:freeze_count]
    for frozen_layer in frozen_prefix:
        frozen_layer.trainable = False

    return model
# Draw `size` distinct random positions into `data`
def getTrainingDataIndicesOfSize(data, size):
    """Return ``size`` distinct indices into ``data``, drawn with ``np.random.choice``.

    The draw uses NumPy's global random state and samples without
    replacement, so no index appears twice in the result.
    """
    all_positions = np.arange(len(data))
    return np.random.choice(all_positions, size, replace=False)
# Simple function to get a new sample from the given class
def getNewDataOfClass(original_data, class_id):
    """Return the index of one uniformly chosen sample labelled ``class_id``.

    The label of a row is its first element (``original_data[i][0]``), so
    ``original_data`` is expected to be an ``(N, 1)``-style label array; a flat
    ``(N,)`` array is accepted as well.

    The previous implementation drew random indices until one matched, which
    never terminated when the class was absent. The matching indices are now
    computed once and a single draw is made from them (same uniform
    distribution over the matching samples).

    Args:
        original_data: per-sample labels.
        class_id: label value to look for.

    Returns:
        An integer index ``i`` such that the label of row ``i`` equals ``class_id``.

    Raises:
        ValueError: if no sample carries ``class_id``.
    """
    labels = np.asarray(original_data)
    # Reduce to one label per row: element 0 along every trailing axis.
    labels = labels[(slice(None),) + (0,) * (labels.ndim - 1)]

    candidates = np.flatnonzero(labels == class_id)
    if candidates.size == 0:
        raise ValueError("no sample of class %r in the data" % (class_id,))
    return np.random.choice(candidates)
# Add new samples of each class until the sample-size-per-class target is reached
def fillLowSampleCountClassesTo(original_data, indices, target_count):
    """Extend ``indices`` so every class has at least ``target_count`` samples.

    For each class occurring in ``original_data`` the number of already
    selected samples is counted; missing samples are drawn uniformly, without
    replacement, from the not-yet-selected samples of that class and appended
    after the existing indices (classes are processed in sorted order).

    Differences from the previous implementation:
      * classes that had *no* sample in ``indices`` are filled too (they were
        silently skipped because only classes found in the selection were
        iterated);
      * a class with too few samples raises instead of looping forever;
      * the per-sample membership scan and ``np.append`` are replaced by one
        draw per class.

    Args:
        original_data: per-sample labels, ``(N, 1)`` or ``(N,)``; the label of
            a row is its first element.
        indices: 1-D integer array of already selected sample indices.
        target_count: minimum number of samples wanted per class.

    Returns:
        1-D integer array: the original ``indices`` followed by the additions.
        Classes already above ``target_count`` are left untouched.

    Raises:
        ValueError: if a class does not have enough unselected samples left.
    """
    labels = np.asarray(original_data)
    # Reduce to one label per row: element 0 along every trailing axis.
    labels = labels[(slice(None),) + (0,) * (labels.ndim - 1)]
    indices = np.asarray(indices, dtype=np.intp)

    already_selected = np.zeros(len(labels), dtype=bool)
    already_selected[indices] = True

    additions = []
    for class_id in np.unique(labels):
        is_class = labels == class_id
        samples_needed = target_count - np.count_nonzero(is_class & already_selected)
        if samples_needed <= 0:
            continue
        candidates = np.flatnonzero(is_class & ~already_selected)
        if candidates.size < samples_needed:
            raise ValueError(
                "class %r has only %d unselected samples, %d needed"
                % (class_id, candidates.size, samples_needed)
            )
        additions.append(np.random.choice(candidates, samples_needed, replace=False))

    if not additions:
        return indices
    return np.concatenate([indices] + additions)
# Remove samples of each class until the sample-size-per-class target is reached
def deleteSamplesOverPerClassLimit(original_data, indices, target_count):
    """Drop random entries of ``indices`` so no class exceeds ``target_count``.

    For every class present in the selection, the surplus
    (``count - target_count``) entries are chosen uniformly without
    replacement among that class's positions in ``indices`` and removed in a
    single ``np.delete``. The relative order of the surviving indices is
    preserved.

    The previous implementation picked random positions over the whole
    selection until it happened to hit the right class and re-allocated the
    array for every single deletion; the result distribution is the same, the
    cost is no longer quadratic.

    Args:
        original_data: per-sample labels, ``(N, 1)`` or ``(N,)``; the label of
            a row is its first element.
        indices: 1-D integer array of selected sample indices.
        target_count: maximum number of samples to keep per class.

    Returns:
        1-D integer array of the kept indices. Classes with at most
        ``target_count`` samples are left untouched.
    """
    labels = np.asarray(original_data)
    # Reduce to one label per row: element 0 along every trailing axis.
    labels = labels[(slice(None),) + (0,) * (labels.ndim - 1)]
    indices = np.asarray(indices, dtype=np.intp)

    selected_labels = labels[indices]
    positions_to_drop = []
    for class_id in np.unique(selected_labels):
        class_positions = np.flatnonzero(selected_labels == class_id)
        samples_to_delete = class_positions.size - target_count
        if samples_to_delete > 0:
            positions_to_drop.append(
                np.random.choice(class_positions, samples_to_delete, replace=False)
            )

    if not positions_to_drop:
        return indices
    return np.delete(indices, np.concatenate(positions_to_drop))
# Pick a class-balanced random subset: sample_per_class indices for each class.
#   1. draw n_classes * sample_per_class indices at random (roughly uniform over classes)
#   2. top up the classes that ended up with too few samples
#   3. trim the classes that ended up with too many
def getRandomSamplesOfEachClassOfSize(data, n_classes, sample_per_class):
    """Return indices into ``data`` after the draw / fill / trim sequence.

    Args:
        data: per-sample labels, forwarded to the helper functions.
        n_classes: number of classes; only used to size the initial draw.
        sample_per_class: per-class target handed to the fill and trim steps.
    """
    initial_draw = getTrainingDataIndicesOfSize(data, n_classes * sample_per_class)
    topped_up = fillLowSampleCountClassesTo(data, initial_draw, sample_per_class)
    return deleteSamplesOverPerClassLimit(data, topped_up, sample_per_class)

def classification_report_dataframe(report, index=0):
    """Flatten a text classification report into a one-row DataFrame.

    Every report line that consists of exactly five whitespace-separated
    fields with an integer class label first
    (``<class> <precision> <recall> <f1> <support>``) contributes three
    columns named ``'<class>precision'``, ``'<class>recall'`` and
    ``'<class>f1_score'``. Header, blank and summary lines (accuracy,
    averages) do not match that shape and are skipped, so the parser does not
    depend on how many summary lines the report ends with or on the exact
    column padding.

    This also fixes the over-indented ``return`` of the previous version,
    which made the function a syntax error.

    Args:
        report: the string produced by ``classification_report``.
        index: index label of the single row in the result.

    Returns:
        ``pandas.DataFrame`` with one row; it has no columns when the report
        contains no per-class line.
    """
    row = {}

    for line in report.split('\n'):
        fields = line.split()
        if len(fields) != 5:
            continue
        try:
            class_idx = str(int(fields[0]))
            precision, recall, f1_score = (float(value) for value in fields[1:4])
        except ValueError:
            # Not a per-class row (e.g. a textual label) - ignore it.
            continue
        row[class_idx + 'precision'] = precision
        row[class_idx + 'recall'] = recall
        row[class_idx + 'f1_score'] = f1_score

    return pd.DataFrame(data=row, index=[index])

def timeElapsed(start_time):
    """Format the time passed since ``start_time`` as a ``timedelta`` string.

    Args:
        start_time: a timestamp as returned by ``time.time()``.

    Returns:
        ``str(timedelta)`` of the elapsed time rounded to whole seconds,
        e.g. ``'0:01:05'``.
    """
    elapsed_seconds = int(round(time.time() - start_time))
    return str(timedelta(seconds=elapsed_seconds))

def generateTrainingSet(x_supervised, y_supervised, x, y):
    """Build combined train/validation arrays from a labelled and a second pool.

    Both pools are split in order (``shuffle=False``): the labelled pool 80/20,
    the second pool 99.5/0.5. In the training part the labelled split is
    repeated 6 times and the second pool's split 4 times before the two are
    concatenated; the validation parts are concatenated once, unrepeated.

    ``y`` is reshaped to a column vector. The previous version hard-coded
    ``(40000, 1)`` and therefore failed for any other pool size; ``(-1, 1)``
    yields the same result for 40000 labels and works for every other length.

    Args:
        x_supervised: inputs of the labelled pool.
        y_supervised: labels of the labelled pool; must be concatenable with
            the column-shaped ``y``.
        x: inputs of the second pool.
        y: labels (or pseudo-labels) of the second pool, any shape holding one
            value per sample.

    Returns:
        ``(x_train, x_valid, y_train, y_valid)`` as NumPy arrays.
    """
    supervised_repeats = 6
    unsupervised_repeats = 4

    y = np.reshape(y, (-1, 1))
    x_train_sup, x_valid_sup, y_train_sup, y_valid_sup = train_test_split(
        x_supervised, y_supervised, test_size=0.2, shuffle=False)
    x_train_unsup, x_valid_unsup, y_train_unsup, y_valid_unsup = train_test_split(
        x, y, test_size=0.005, shuffle=False)

    # Repeating the arrays weights the two sources inside one training array.
    x_train = np.concatenate(
        [x_train_sup] * supervised_repeats + [x_train_unsup] * unsupervised_repeats)
    y_train = np.concatenate(
        [y_train_sup] * supervised_repeats + [y_train_unsup] * unsupervised_repeats)

    x_valid = np.concatenate((x_valid_sup, x_valid_unsup))
    y_valid = np.concatenate((y_valid_sup, y_valid_unsup))

    return x_train, x_valid, y_train, y_valid

def train_gen(supervised_gen, unsupervised_gen):
    """Endlessly yield batches merged from two batch iterators.

    Each step pulls one ``(images, labels)`` pair from ``supervised_gen`` and
    then one from ``unsupervised_gen`` and yields their concatenation along
    the first axis, supervised samples first.
    """
    while True:
        labelled_x, labelled_y = next(supervised_gen)
        pseudo_x, pseudo_y = next(unsupervised_gen)

        yield (
            np.concatenate([labelled_x, pseudo_x]),
            np.concatenate([labelled_y, pseudo_y]),
        )
        
def filtered_train_gen(supervised_gen, unsupervised_gen):
    """Endlessly yield 100-sample batches of confidently labelled samples.

    A batch starts from one supervised and one unsupervised batch; only
    samples whose largest label value exceeds 0.85 are kept, and their label
    is replaced by the one-hot vector of the arg-max class. While fewer than
    100 samples have been collected, further batches are pulled from
    ``unsupervised_gen`` only.

    Fixes over the previous implementation:
      * the collection loop stopped only when the count was *exactly* 100, so
        overshooting 100 looped forever; collection now stops at 100 and the
        batch never exceeds that size;
      * one-hot labels were built without a class count, giving vectors of
        length ``label + 1``; they now always have as many entries as the
        incoming label rows;
      * no extra unsupervised batch is consumed once the batch is full;
      * NumPy arrays are yielded instead of Python lists.

    Args:
        supervised_gen: iterator of ``(images, labels)`` with 2-D labels.
        unsupervised_gen: iterator of ``(images, labels)`` with 2-D labels of
            the same width.

    Yields:
        ``(images, labels)`` arrays with 100 rows each.
    """
    threshold = 0.85
    batch_size = 100
    while True:
        sup_images, sup_labels = next(supervised_gen)
        usup_images, usup_labels = next(unsupervised_gen)

        candidate_images = np.concatenate((sup_images, usup_images))
        candidate_labels = np.concatenate((sup_labels, usup_labels))

        images = []
        labels = []
        while True:
            candidate_labels = np.asarray(candidate_labels)
            n_classes = candidate_labels.shape[1]
            for image, soft_label in zip(candidate_images, candidate_labels):
                if soft_label.max() > threshold:
                    one_hot = np.zeros(n_classes, dtype='float32')
                    one_hot[np.argmax(soft_label)] = 1.0
                    images.append(image)
                    labels.append(one_hot)
                    if len(labels) == batch_size:
                        break
            if len(labels) >= batch_size:
                break
            # Not enough confident samples yet: refill from the unlabeled stream.
            candidate_images, candidate_labels = next(unsupervised_gen)

        yield np.array(images), np.array(labels)
        
def filter_labels(x, predictions):
    """Keep the samples whose top prediction score exceeds 0.9.

    Args:
        x: samples, indexable by position, same length as ``predictions``.
        predictions: per-sample score rows.

    Returns:
        ``(images, labels)`` where ``images`` is a NumPy array of the kept
        samples and ``labels`` is a list holding the arg-max class of each
        kept sample, in the original order.
    """
    assert(len(x) == len(predictions))
    confidence_cutoff = 0.9

    confident_positions = [
        position
        for position in range(len(predictions))
        if max(predictions[position]) > confidence_cutoff
    ]
    kept_images = [x[position] for position in confident_positions]
    kept_labels = [np.argmax(predictions[position]) for position in confident_positions]

    return np.array(kept_images), kept_labels

In [0]:
def getModel():
    """Build and compile the CNN classifier for 32x32x3 inputs and 10 classes.

    Architecture: three stages of two 3x3 'same'-padded convolutions (32, 64
    and 128 filters, L2 kernel regularisation 1e-4), each convolution followed
    by ReLU. Batch normalisation follows the first convolution of every stage
    and, in the first stage only, the second one as well. Every stage ends
    with 2x2 max-pooling and dropout (0.2, 0.2, 0.3). A flatten layer and a
    10-way softmax dense layer close the network.

    Returns:
        The ``Sequential`` model compiled with categorical cross-entropy,
        Adam (lr=0.0001, decay=1e-6) and the accuracy metric.
    """
    l2_strength = 1e-4

    def conv_unit(filters, batch_norm, **conv_kwargs):
        # One convolution + ReLU, optionally followed by batch normalisation.
        unit = [
            Conv2D(filters, (3, 3), padding='same',
                   kernel_regularizer=regularizers.l2(l2_strength), **conv_kwargs),
            Activation('relu'),
        ]
        if batch_norm:
            unit.append(BatchNormalization())
        return unit

    # (filters, batch-norm after the second convolution?, dropout rate)
    stages = [(32, True, 0.2), (64, False, 0.2), (128, False, 0.3)]

    net = Sequential()
    for stage_no, (filters, second_bn, drop_rate) in enumerate(stages):
        # Only the very first layer of the network declares the input shape.
        first_kwargs = {'input_shape': (32, 32, 3)} if stage_no == 0 else {}
        stage_layers = conv_unit(filters, True, **first_kwargs)
        stage_layers += conv_unit(filters, second_bn)
        stage_layers += [MaxPooling2D(pool_size=(2, 2)), Dropout(drop_rate)]
        for layer in stage_layers:
            net.add(layer)

    net.add(Flatten())
    net.add(Dense(10, activation='softmax'))
    net.compile(loss='categorical_crossentropy',
                optimizer=Adam(lr=0.0001, decay=1e-6),
                metrics=['accuracy'])
    return net


In [0]:
# ---------------------------------------------------------------------------
# Experiment driver.
#
# For every entry of `data_splits`:
#   1. train `n_iters` supervised models on the labelled subset and store the
#      models and their test scores;
#   2. reload each supervised model, pseudo-label the unlabeled images by
#      averaging its predictions over several stored image transformations,
#      fine-tune on labelled + pseudo-labelled data and store models/scores.
# ---------------------------------------------------------------------------
n_classes = 10
n_iters = 10
data_splits = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
original_split = 0.1
# Number of pseudo-labelled images: the row count the prediction buffer is
# allocated with. NOTE(review): presumably each images/<transformation>
# directory holds exactly this many files, ordered like `x` - confirm.
unlabeled_pool_size = 40000
(X_train, Y_train), (X_test, Y_test) = cifar10.load_data()
# Ordered split (shuffle=False), so it is deterministic and is done once:
# `x_supervised`/`y_supervised` is the `original_split` fraction used with its
# true labels, `x` is the pool that gets pseudo-labelled (`y` is not used).
x, x_supervised, y, y_supervised = train_test_split(X_train, Y_train, test_size=original_split, shuffle= False)
start_time = time.time()


def _test_set_scores(model):
    """Evaluate ``model`` on the CIFAR-10 test set; return a one-row DataFrame.

    The row holds the compiled metrics followed by the per-class
    precision/recall/f1 columns. Both the metric evaluation and the class
    predictions use the test images scaled by 1/255, matching the scaling
    used during training (the class predictions were previously computed on
    the raw 0-255 pixels).
    """
    x_test_scaled = X_test / 255.0
    score = model.evaluate(x_test_scaled, to_categorical(Y_test, num_classes=n_classes), batch_size=128, verbose=0)
    score = dict(zip(model.metrics_names, score))
    print(score)
    score_df = pd.DataFrame(data=score, index=[0])
    y_pred = model.predict_classes(x_test_scaled)
    report = classification_report(Y_test, y_pred)
    report_df = classification_report_dataframe(report)
    return pd.concat([score_df, report_df], axis=1)


for split in data_splits:
    supervised_test_scores =  pd.DataFrame()
    # A fresh random train/validation split of the labelled data per data split.
    x_train, x_valid, y_train, y_valid = train_test_split(x_supervised, y_supervised, test_size=0.2, shuffle= True)

    #Run several iterations as cross-validation
    for iteration in range(n_iters):
        model = getModel()
        early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=0,restore_best_weights=True)

        datagen = ImageDataGenerator(
                rotation_range=15,
                width_shift_range=0.1,
                height_shift_range=0.1,
                horizontal_flip=True)

        datagen.fit(x_train)

        #Supervised training
        # num_classes is passed explicitly so the one-hot width never depends
        # on which labels happen to be present in a subset.
        history = model.fit_generator(datagen.flow(x_train/255.0, to_categorical(y_train, num_classes=n_classes), batch_size=128),
                                steps_per_epoch=x_train.shape[0] // 128,
                                epochs=200,
                                verbose=0,
                                callbacks=[keras.callbacks.History(),early_stopping],
                                validation_data=(x_valid / 255.0, to_categorical(y_valid, num_classes=n_classes)))

        supervised_test_scores = pd.concat([supervised_test_scores, _test_set_scores(model)])
        modelpath="models/" + str(int(split*10)) + "/" + str(iteration) + "-supervised_model.h5"
        model.save(modelpath)

    supervised_logpath = "logs/" + "test/" + str(int(original_split*5000)) + "/" + str(int(split*10)) + "/"+ "supervised.csv"
    supervised_test_scores.to_csv(supervised_logpath)
    print("Finished with data split:", split, "Time elapsed so far:", timeElapsed(start_time))

    dd_test_scores =  pd.DataFrame()
    transformations = ["flip", "rotate", "wshift", "hshift"]

    #Load supervised model, predict on unlabeled data
    for iteration in range(n_iters):
        # NOTE(review): getCompleteModel is not defined in the visible part of
        # this file (only getModel is) - confirm it exists, or that getModel
        # was intended here.
        model = getCompleteModel()
        model_path =  "models/" + str(int(split*10)) + "/" + str(iteration) + "-supervised_model.h5"
        model.load_weights(model_path)

        # Average the class probabilities over all stored transformations,
        # then take the arg-max class as pseudo-label.
        predictions = np.zeros((unlabeled_pool_size, n_classes))
        for transformation in transformations:
            path = "images/" + transformation
            basic = ImageDataGenerator(rescale=1./255)
            gen = basic.flow_from_directory(path, target_size=(32,32), class_mode=None, batch_size=64, shuffle=False)
            current_preds = model.predict_generator(gen, verbose=0)
            predictions = predictions + current_preds
        predictions = predictions / len(transformations)
        predictions = np.argmax(predictions, axis=1)
        ref = ImageDataGenerator(rescale=1./255)

        unsupervised_samples = int(split * unlabeled_pool_size)

        #Set up extended training set
        # Local slices: the module-level pool `x` is left intact, so no
        # re-split is needed for the next iteration or data split.
        x_unlabeled = x[:unsupervised_samples]
        pseudo_labels = predictions[:unsupervised_samples]

        x_train_supervised, x_valid_supervised, y_train_supervised, y_valid_supervised = train_test_split(x_supervised, y_supervised, test_size=0.2, shuffle= False)
        x_train_unsupervised, x_valid_unsupervised, y_train_unsupervised, y_valid_unsupervised = train_test_split(x_unlabeled, pseudo_labels, test_size=2/3*len(y_valid_supervised)/len(x_unlabeled), shuffle= False)

        train_samples = len(x_train_supervised) + len(x_train_unsupervised)
        validation_samples = len(x_valid_supervised) + len(x_valid_unsupervised)

        # Each combined batch is 60 labelled + 40 pseudo-labelled samples.
        supervised_gen_train = ref.flow(x_train_supervised,to_categorical(y_train_supervised, num_classes=n_classes),shuffle=True, seed=666, batch_size=60)
        unsupervised_gen_train = ref.flow(x_train_unsupervised,to_categorical(y_train_unsupervised, num_classes=n_classes),shuffle=True, seed=666, batch_size=40)

        supervised_gen_valid = ref.flow(x_valid_supervised,to_categorical(y_valid_supervised, num_classes=n_classes),shuffle=True, seed=666, batch_size=60)
        unsupervised_gen_valid = ref.flow(x_valid_unsupervised,to_categorical(y_valid_unsupervised, num_classes=n_classes),shuffle=True, seed=666, batch_size=40)

        early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=0,restore_best_weights=True)

        model.compile(loss='categorical_crossentropy',
              optimizer=Adam(lr=0.00025, decay=1e-6),
              metrics=['accuracy'])

        #Fine-tune model on extended dataset
        history = model.fit_generator(train_gen(supervised_gen_train,unsupervised_gen_train),
                                      steps_per_epoch=train_samples//100,
                                      epochs=400,
                                      verbose=0,
                                      callbacks=[keras.callbacks.History(), early_stopping],
                                      validation_data=train_gen(supervised_gen_valid,unsupervised_gen_valid),
                                      validation_steps=validation_samples//100,
                                      shuffle=True)

        dd_test_scores = pd.concat([dd_test_scores, _test_set_scores(model)])
        modelpath="models/" + str(int(split*10)) + "/" + str(iteration) + "-dd_model.h5"
        model.save(modelpath)

    dd_logpath = "logs/" + "test/" + str(int(original_split*5000)) + "/" + str(int(split*10)) + "/"+ "dd.csv"
    dd_test_scores.to_csv(dd_logpath)
    print("Finished with data split:", split, "Time elapsed so far:", timeElapsed(start_time))

# Printed once, after every data split has been processed (it used to be
# emitted in the middle of each loop pass).
print("Process over. Total time elapsed:", timeElapsed(start_time))