# 1. Base Model
---

In this notebook, we will prepare the dataset and build a base model.

## Import Libraries

In [1]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras import preprocessing, Input, Model, optimizers
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Dense, Activation, Flatten, Dropout
from keras.preprocessing.image import ImageDataGenerator
from PIL import ImageFile
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import shuffle

ImageFile.LOAD_TRUNCATED_IMAGES = True

# Make the project's local helpers importable before importing from them
sys.path.append("../src")
from utils import destination

Using TensorFlow backend.


## Create the Target Matrix
From ```tag_list.txt``` and ```data_list.txt``` in the HARRISON dataset, we create a target matrix as a DataFrame.

In [2]:
DATA_LIST = "../HARRISON/data_list_oversampled.txt"
TAG_LIST = "../HARRISON/tag_list_oversampled.txt"

# One image file name per line. The names in this list are already flat
# (e.g. "sea_image_50.jpg"), so no path conversion is applied.
filename = pd.read_csv(DATA_LIST, header=None, names=["filename"])

# One line of space-separated hashtags per image. Blank lines are kept rather
# than skipped (presumably so row i still matches row i of the file list);
# they load as NaN and are replaced by an empty string.
hashtag = pd.read_csv(
    TAG_LIST,
    header=None,
    names=["labels"],
    skip_blank_lines=False,
)
hashtag = hashtag.fillna("")

# Concatenate filename and labels side by side, one row per image
target = pd.concat([filename, hashtag], axis="columns")
target.head(5)

Unnamed: 0,filename,labels
0,sea_image_50.jpg,adventure beach fly instagram instapic lake oc...
1,sea_image_1284.jpg,beach lake ocean river sea water
2,sea_image_1122.jpg,beach friendship joy lake love ocean passion r...
3,sea_image_1679.jpg,adventure america beach europe germany island ...
4,sea_image_1736.jpg,adventure beach boyfriend brother cloud cousin...


In [3]:
# Use vectorizer to generate a one-hot encoding: one binary column per
# distinct token found in the "labels" strings
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(target["labels"])

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever this installation provides. The result is
# turned into a plain list because `columns` is later passed as `y_col` to
# flow_from_dataframe.
if hasattr(vectorizer, "get_feature_names_out"):
    columns = list(vectorizer.get_feature_names_out())
else:
    columns = vectorizer.get_feature_names()
X_df = pd.DataFrame(X.toarray(), columns=columns)

# Combine hashtag encodings with file names, then shuffle the rows with a
# fixed seed so the later positional train/val/test split is reproducible
target = pd.concat([target, X_df], axis=1)
target = shuffle(target, random_state=42)

num_images = target.shape[0]
num_hashtags = X_df.shape[1]

print(f"Number of images: {num_images}")
print(f"Number of hashtags: {num_hashtags}")

Number of images: 107383
Number of hashtags: 997


## Prepare Images

In [4]:
IMAGE_DIR = "../HARRISON/images"

# Instantiate a data generator that rescales pixel values by 1/255
datagen = ImageDataGenerator(rescale=1./255.)

# Row positions that cut the shuffled frame into 70% train, 20% val, 10% test
n_rows = target.shape[0]
train_idx = int(n_rows * .70)
val_idx = int(n_rows * .90)

# Arguments every generator shares
common_flow_args = dict(
    directory=IMAGE_DIR,
    x_col="filename",
    seed=42,
    target_size=(100,100),
)

# Extra arguments for the two generators that also yield the hashtag matrix
labelled_flow_args = dict(
    y_col=columns,
    batch_size=32,
    shuffle=True,
    class_mode="raw",
)

# Create train data generator
train_generator = datagen.flow_from_dataframe(
    dataframe=target[:train_idx],
    **common_flow_args,
    **labelled_flow_args,
)

# Create val data generator
valid_generator = datagen.flow_from_dataframe(
    dataframe=target[train_idx:val_idx],
    **common_flow_args,
    **labelled_flow_args,
)

# Create test data generator: images only (class_mode=None), one image per
# batch, in frame order (shuffle=False)
test_generator = datagen.flow_from_dataframe(
    dataframe=target[val_idx:],
    batch_size=1,
    shuffle=False,
    class_mode=None,
    **common_flow_args,
)

Found 75168 validated image filenames.
Found 21476 validated image filenames.
Found 10739 validated image filenames.


## Build a Model
### 1. Multiple Output Nodes

In [5]:
# Define model
inp = Input(shape=(100,100,3))
x = Conv2D(32, (3, 3), padding='same')(inp)
x = Activation('relu')(x)
x = Conv2D(32, (3, 3))(x)
x = Activation('relu')(x)
x = MaxPooling2D(pool_size=(2, 2))(x)
x = Dropout(0.25)(x)
x = Conv2D(64, (3, 3), padding='same')(x)
x = Activation('relu')(x)
x = Conv2D(64, (3, 3))(x)
x = Activation('relu')(x)
x = MaxPooling2D(pool_size=(2, 2))(x)
x = Dropout(0.25)(x)
x = Flatten()(x)
x = Dense(512)(x)
x = Activation('relu')(x)
x = Dropout(0.5)(x)
output = []
for i in range(num_hashtags):
    output.append(Dense(1, activation='sigmoid')(x))

model = Model(inp,output)

# Compile model
model.compile(optimizers.rmsprop(
    lr=0.0001,
    decay=1e-6),
    loss = ["binary_crossentropy" for i in range(num_hashtags)],
    metrics=["accuracy"])

## Fitting the Model

In [6]:
def generator_wrapper(generator):
    """Adapt (inputs, label_matrix) batches for a model with one output per label.

    Args:
        generator: iterable yielding ``(batch_x, batch_y)`` pairs, where
            ``batch_y`` is a 2-D array with one column per label.

    Yields:
        ``(batch_x, columns)`` where ``columns`` is a list holding each column
        of ``batch_y`` as its own 1-D array, in column order.
    """
    for batch_x, batch_y in generator:
        # Take the label count from the batch itself rather than from a
        # notebook-level global, so the wrapper works for any label matrix.
        yield (batch_x, [batch_y[:, i] for i in range(batch_y.shape[1])])

# Whole batches per pass over each generator (floor division, so a trailing
# partial batch is not counted)
STEP_SIZE_TRAIN, STEP_SIZE_VALID, STEP_SIZE_TEST = (
    gen.n // gen.batch_size
    for gen in (train_generator, valid_generator, test_generator)
)

# Train for one epoch without progress output; both generators are wrapped so
# the label matrix is split into one target array per output head
history = model.fit_generator(
    generator=generator_wrapper(train_generator),
    validation_data=generator_wrapper(valid_generator),
    steps_per_epoch=STEP_SIZE_TRAIN,
    validation_steps=STEP_SIZE_VALID,
    epochs=1,
    verbose=0,
)

In [7]:
# Save model for future use under ../model
# NOTE(review): `destination` comes from the project's utils module; it
# presumably builds the output path from the directory and base name - confirm.
dest = os.path.join("..", "model")
model_path = destination(dest, "keras_model_oversampled")
model.save(model_path)

## Prediction

In [10]:
# Rewind the test generator so predictions start at its first image
test_generator.reset()

# One step per test batch; the test generator uses batch_size=1
pred = model.predict_generator(
    generator=test_generator,
    steps=STEP_SIZE_TEST,
    verbose=1,
)



In [109]:
def get_conf(row):
    """Return ``{hashtag: score}`` for the non-zero entries of one score row.

    NOTE(review): this notebook calls ``get_conf`` without defining it (the
    definition presumably lived in a cell that was since deleted), so this is a
    reconstruction based on the displayed ``y_pred`` column - confirm it
    matches the original intent.

    Args:
        row: 1-D array of scores, one per entry of ``columns``; entries that
            did not pass the threshold are expected to be 0.

    Returns:
        dict mapping hashtag name to score, ordered from highest to lowest.
    """
    hits = np.flatnonzero(row)
    ranked = hits[np.argsort(-row[hits])]
    return {columns[i]: row[i] for i in ranked}


# Zscore normalization. The predictions are rearranged to one row per test
# image and one column per hashtag; mean and std are taken over the whole
# matrix, not per hashtag.
pred_reshaped = np.array(pred).reshape((num_hashtags, num_images-val_idx)).transpose()
normalized = (pred_reshaped - pred_reshaped.mean())/pred_reshaped.std()

# Keep only scores more than 3 standard deviations above the mean
abv3std = np.where(normalized>3, normalized, 0)

# Get lists of hashtags
y_pred = pd.concat([
    pd.Series(test_generator.filenames, name="filename"),
    pd.Series([get_conf(row) for row in abv3std], name="y_pred")],
    axis=1)

# Build the results table as an independent frame: rename() returns a copy, so
# the assignments below do not write into a slice of `target`.
results = target[["filename", "labels"]].iloc[val_idx:].rename(columns={"labels": "y_true"})
results.index = y_pred.index
results["y_pred"] = y_pred["y_pred"]
results.head()

Unnamed: 0,filename,y_true,y_pred
0,school_image_3897.jpg,brazil colombia country energy fly gopro outdo...,"{'wonderful': 5.5500836, 'mother': 4.6565623, ..."
1,justinbieber_image_3861.jpg,actor arianagrande dance dancer japanese justi...,"{'blue': 7.9821415, 'purple': 6.717256, 'yello..."
2,justinbieber_image_6242.jpg,belieber beliebers bizzle kidrauhl myworld pur...,"{'mother': 6.5127997, 'wonderful': 5.7173877, ..."
3,fashion_image_457.jpg,chic fashion fashionable fashionista menswear ...,"{'wonderful': 6.444609, 'mother': 6.133388, 'p..."
4,justinbieber_image_9023.jpg,arianagrande followforfollow followtrain gainp...,"{'mother': 4.7865744, 'wonderful': 4.5713468, ..."


## Some of the Pictures with predicted hashtags
Only the 10 highest-scoring predicted hashtags are shown with each picture.

In [91]:
## Manually Check Some of the Results...
import ipyplot
from heapq import nlargest

# Relative path of every image in the results table
paths = [f"../HARRISON/images/{name}" for name in results.filename]

# For each image, its 10 highest-scoring hashtags formatted as "tag(score)"
labels = []
for scores in results["y_pred"]:
    top_tags = nlargest(10, scores, key=scores.get)
    labels.append([f"{tag}({scores[tag]:.4f})" for tag in top_tags])

ipyplot.plot_images(paths, labels)

## Reference
https://medium.com/@vijayabhaskar96/multi-label-image-classification-tutorial-with-keras-imagedatagenerator-cd541f8eaf24