# 1. Base Model
---

In this notebook, we will prepare the dataset and build a base model.

## Import Libraries

In [71]:
from keras import preprocessing, Input, Model
from keras_preprocessing.image import ImageDataGenerator
from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D
from keras import regularizers, optimizers
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

import sys
sys.path.append("../src")
from destination import destination

## Create the Target Matrix
From ```tag_list.txt``` and ```data_list.txt``` in the HARRISON dataset, we create a target matrix as a DataFrame.

In [143]:
# Load the image paths and the hashtag strings; both files are read as
# single-column frames with no header row.
filename = pd.read_csv("../HARRISON/data_list.txt", header=None, names=["filename"])
hashtag = pd.read_csv("../HARRISON/tag_list.txt", header=None, names=["labels"])

# Flatten "instagram_dataset/xxx/yyy.jpg" into "xxx_yyy.jpg":
# drop the first path component and join the remaining ones with "_".
filename["filename"] = filename["filename"].map(
    lambda path: "_".join(path.split("/")[1:])
)

# Put filenames and labels side by side (rows are paired by position/index)
target = pd.concat([filename, hashtag], axis="columns")
target.head(5)

Unnamed: 0,filename,labels
0,sea_image_50.jpg,sea instapic instagram trip travel
1,sea_image_1284.jpg,sea
2,sea_image_1122.jpg,sea love
3,sea_image_1679.jpg,beach sea trip island japan
4,sea_image_1736.jpg,sun sand sea sky friend beach thailand trip ad...


In [144]:
# Build a multi-hot (0/1) indicator matrix with one column per hashtag.
# binary=True clips a hashtag repeated within one caption to 1, so every target
# is a valid binary label for the per-hashtag sigmoid / binary-crossentropy heads.
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(target["labels"])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old accessor on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    columns = vectorizer.get_feature_names_out().tolist()
else:
    columns = vectorizer.get_feature_names()

# Reuse target's index so the encodings stay row-aligned even if this cell is
# re-run after `target` has already been shuffled.
X_df = pd.DataFrame(X.toarray(), columns=columns, index=target.index)

# Combine hashtag encodings with file names. Selecting only the two base columns
# keeps the cell idempotent (no duplicated hashtag columns on a re-run).
target = pd.concat([target[["filename", "labels"]], X_df], axis=1)
target = shuffle(target, random_state=42)

num_images = target.shape[0]
num_hashtags = X_df.shape[1]

In [7]:
# Persist the target matrix (without the index column) for later use.
# NOTE(review): `destination` is a project helper from ../src; presumably it
# builds the output path from a directory and a file name — confirm.
target_csv_path = destination("../model", "target.csv")
target.to_csv(target_csv_path, index=False)

## Prepare Images

In [9]:
IMAGE_DIR = "../HARRISON/images"

# Instantiate a data generator that rescales pixel values by 1/255
datagen = ImageDataGenerator(rescale=1./255.)

# Row offsets that split the target frame into 70% train / 20% val / 10% test
num_rows = target.shape[0]
train_idx = int(num_rows * .70)
val_idx = int(num_rows * .90)

# Arguments shared by all three generators
shared_flow_kwargs = dict(
    directory=IMAGE_DIR,
    x_col="filename",
    seed=42,
    target_size=(100,100),
)

# Train and validation generators additionally yield the hashtag columns as
# raw label arrays, in shuffled batches of 32.
labeled_flow_kwargs = dict(
    shared_flow_kwargs,
    y_col=columns,
    batch_size=32,
    shuffle=True,
    class_mode="raw",
)

# Create train data generator
train_generator = datagen.flow_from_dataframe(
    dataframe=target[:train_idx],
    **labeled_flow_kwargs
)

# Create val data generator
valid_generator = datagen.flow_from_dataframe(
    dataframe=target[train_idx:val_idx],
    **labeled_flow_kwargs
)

# Create test data generator: images only, one at a time, in frame order
test_generator = datagen.flow_from_dataframe(
    dataframe=target[val_idx:],
    batch_size=1,
    shuffle=False,
    class_mode=None,
    **shared_flow_kwargs
)

Found 40168 validated image filenames.
Found 11476 validated image filenames.
Found 5739 validated image filenames.


## Build a Model

In [11]:
# Define model: two convolutional stages (conv-conv-pool-dropout), a 512-unit
# dense layer, and one single-unit sigmoid head per hashtag.
inp = Input(shape=(100,100,3))
x = Conv2D(32, (3, 3), padding='same')(inp)
x = Activation('relu')(x)
x = Conv2D(32, (3, 3))(x)
x = Activation('relu')(x)
x = MaxPooling2D(pool_size=(2, 2))(x)
x = Dropout(0.25)(x)
x = Conv2D(64, (3, 3), padding='same')(x)
x = Activation('relu')(x)
x = Conv2D(64, (3, 3))(x)
x = Activation('relu')(x)
x = MaxPooling2D(pool_size=(2, 2))(x)
x = Dropout(0.25)(x)
x = Flatten()(x)
x = Dense(512)(x)
x = Activation('relu')(x)
x = Dropout(0.5)(x)

# Multi-output model: every hashtag gets its own independent sigmoid unit
# fed by the shared feature vector `x`.
output = [Dense(1, activation='sigmoid')(x) for _ in range(num_hashtags)]

model = Model(inp, output)

# Compile model.
# Use the canonical RMSprop class: the lowercase `optimizers.rmsprop` alias is
# not available in newer Keras releases, while `RMSprop` exists in both.
model.compile(
    optimizer=optimizers.RMSprop(lr=0.0001, decay=1e-6),
    loss=["binary_crossentropy"] * num_hashtags,  # one loss per output head
    metrics=["accuracy"])

## Fitting the Model

In [12]:
def generator_wrapper(generator):
    """Adapt a (images, label_matrix) batch generator to a multi-output model.

    Each incoming ``batch_y`` is a 2-D array with one column per output head.
    The wrapper yields ``(batch_x, [column_0, column_1, ...])`` so that every
    head receives its own 1-D target array.

    The number of columns is read from each batch instead of the module-level
    ``num_hashtags``, so the wrapper does not depend on notebook state.

    Args:
        generator: iterable yielding ``(batch_x, batch_y)`` pairs, where
            ``batch_y`` supports ``.shape`` and ``[:, i]`` indexing.

    Yields:
        Tuples ``(batch_x, list_of_columns)``.
    """
    for batch_x, batch_y in generator:
        yield (batch_x, [batch_y[:, col] for col in range(batch_y.shape[1])])

# Number of full batches each generator provides per pass
# (integer division drops any trailing partial batch).
STEP_SIZE_TRAIN, STEP_SIZE_VALID, STEP_SIZE_TEST = (
    gen.n // gen.batch_size
    for gen in (train_generator, valid_generator, test_generator)
)

# Train for a single epoch; the wrappers turn each label matrix into the
# list-of-columns format expected by the multi-output model.
model.fit_generator(
    generator=generator_wrapper(train_generator),
    steps_per_epoch=STEP_SIZE_TRAIN,
    validation_data=generator_wrapper(valid_generator),
    validation_steps=STEP_SIZE_VALID,
    epochs=1,
    verbose=0,
)

 0.9993 - val_dense_1409_accuracy: 0.9993 - val_dense_1410_accuracy: 0.9990 - val_dense_1411_accuracy: 0.9988 - val_dense_1412_accuracy: 0.9983 - val_dense_1413_accuracy: 0.9998 - val_dense_1414_accuracy: 0.9907 - val_dense_1415_accuracy: 0.9904 - val_dense_1416_accuracy: 0.9993 - val_dense_1417_accuracy: 0.9965 - val_dense_1418_accuracy: 0.9988 - val_dense_1419_accuracy: 0.9976 - val_dense_1420_accuracy: 0.9983 - val_dense_1421_accuracy: 0.9994 - val_dense_1422_accuracy: 0.9959 - val_dense_1423_accuracy: 0.9942 - val_dense_1424_accuracy: 0.9973 - val_dense_1425_accuracy: 0.9975 - val_dense_1426_accuracy: 0.9997 - val_dense_1427_accuracy: 0.9955 - val_dense_1428_accuracy: 0.9990 - val_dense_1429_accuracy: 0.9995 - val_dense_1430_accuracy: 0.9969 - val_dense_1431_accuracy: 0.9983 - val_dense_1432_accuracy: 0.9987 - val_dense_1433_accuracy: 0.9968 - val_dense_1434_accuracy: 0.9987 - val_dense_1435_accuracy: 0.9976 - val_dense_1436_accuracy: 0.9937 - val_dense_1437_accuracy: 0.9813 - val_

<keras.callbacks.callbacks.History at 0x7fad8cb34f10>

In [196]:
# Write the trained model to disk.
# NOTE(review): `destination` is a project helper from ../src; presumably it
# builds the output path from a directory and a name — confirm.
model_path = destination("../model", "model")
model.save(model_path)

## Prediction

In [14]:
# Restart the test iterator before predicting, then run one prediction step
# per test batch.
test_generator.reset()
pred = model.predict_generator(
    generator=test_generator,
    steps=STEP_SIZE_TEST,
    verbose=1,
)



In [180]:
# Stack the per-hashtag outputs into a (n_test_images, n_hashtags) score matrix.
# Assumes `pred` is a list with one (n_test_images, 1) array per output head,
# as produced by the multi-output model. Sizes are taken from `pred` itself so
# this cell does not depend on num_hashtags / num_images / val_idx.
pred_reshaped = np.array(pred).reshape(len(pred), -1).transpose()

# Zscore normalization over all scores; keep hashtags scoring more than
# 3 standard deviations above the global mean.
normalized = (pred_reshaped - pred_reshaped.mean())/pred_reshaped.std()
above3std = (normalized > 3).astype(int)  # was `b > 3`: `b` is never defined

# Get lists of predicted hashtags, one row per test file
y_pred = pd.concat([
    pd.Series(test_generator.filenames, name="filename"),
    pd.Series(vectorizer.inverse_transform(above3std), name="y_pred")],
    axis=1)

# True hashtags for the same files. Filter on y_pred (not on `results`, which
# does not exist yet at this point) and build a new frame instead of mutating
# a slice of `target` in place.
y_true = (
    target.loc[target["filename"].isin(y_pred["filename"]), ["filename", "labels"]]
    .assign(y_true=lambda frame: frame["labels"].str.split())
    .drop(columns=["labels"])
)

results = pd.merge(y_true, y_pred, on="filename")
results.head()

Unnamed: 0,filename,y_true,y_pred
0,justinbieber_image_1127.jpg,"[likeme, follow, onedirection, harrystyles, ar...","[black, family, fashion, friend, girl, love, n..."
1,makeup_image_696.jpg,[makeup],"[beach, beautiful, black, bored, family, fashi..."
2,truth_image_2843.jpg,"[word, wisdom, truth, quote]","[beach, beautiful, black, family, fashion, fri..."
3,dog_image_1067.jpg,"[bff, cloud, dog, bestfriends, dogsofinstagram]","[beach, beautiful, beauty, black, bored, boyfr..."
4,family_image_2692.jpg,"[kid, boy, child, park, april, outdoors, famil...","[beach, beautiful, black, bored, family, fashi..."


## Measure Performance
Count the number of predicted hashtags that are not in `y_true`. Note that this metric is not really useful.

In [191]:
def performance(row):
    """Count the predicted hashtags of one result row that are not true labels.

    Args:
        row: object exposing ``y_pred`` (iterable of predicted hashtags) and
            ``y_true`` (container of true hashtags).

    Returns:
        int: number of entries of ``row.y_pred`` absent from ``row.y_true``.
    """
    return sum(1 for tag in row.y_pred if tag not in row.y_true)

# Per-image count of predicted hashtags missing from the true labels
err = results.apply(performance, axis="columns")


## Some of the Pictures with predicted hashtags
Only the first 10 predicted hashtags are shown with each picture.

In [227]:
## Manually Check Some of the Results...
import ipyplot

paths = list(results.filename.apply(lambda x: f"../HARRISON/images/{x}"))
labels = [tags[:10] for tags in results.y_pred]

ipyplot.plot_images(paths, labels)

## Reference
https://medium.com/@vijayabhaskar96/multi-label-image-classification-tutorial-with-keras-imagedatagenerator-cd541f8eaf24