#### Importing required packages to be used in the notebook

In [None]:
# Generic
import os
import time

# Data analysis and computing
import csv
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer 
import random
import json
from sklearn.metrics import roc_curve, auc

# Image Processing
import PIL
from PIL import Image

# Visualisation
# Note: Uncomment to use the notebook as python script
#### import matplotlib
#### matplotlib.use('Agg')
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from IPython.display import HTML, display

# Pytorch
import torch
import torch.nn as nn
import torch.utils.data as data_utils
import torchvision.models as models
from torch.nn.modules import CrossEntropyLoss, L1Loss
import torch.optim as optim
from torch.autograd import Variable
from torchvision import datasets, transforms

#### Declaration of constants to be used in the notebook

In [None]:
# Declarations related to data and file paths
DATA_PATH = 'datasets/'
MOVIE_POSTERS_PATH = DATA_PATH + 'posters/'
MOVIE_GENRE_FILE = DATA_PATH + 'MovieGenre.csv'
# Directory that receives the loss history, plots, model weights and prediction results
TARGET_PATH = 'output/model1/'

# Configurations related to training
# DATASET_SIZE is the total number of posters sampled (regular pool + augment pool)
DATASET_SIZE = 15000
AUGMENT_RATIO = None       # Set AUGMENT_RATIO to None to have no augmentation
# Fraction of each pool that goes to the training split; the rest is the test split
TRAINING_SET_PROPORTION = 0.80
BATCH_SIZE = 32
LEARNING_RATE = 0.0042234
# NOTE(review): exp_lr_scheduler uses WEIGHT_DECAY as the learning-rate decay factor;
# the optimizer's weight_decay is fed from L2_REGULARIZATION instead.
WEIGHT_DECAY = 0.00001
NUM_EPOCHS = 2
# Number of epochs between two learning-rate decay steps
DECAY_EPOCHS = 200
# NOTE(review): MOMENTUM is not referenced anywhere in the visible code.
MOMENTUM = 0.9
L2_REGULARIZATION = 0.01

# Image dimensions
WIDTH = 64
HEIGHT = 64
CHANNELS = 3

# GPU-related configurations
USE_GPU = torch.cuda.is_available()
# NOTE(review): only GPUS[0] is used (torch.cuda.set_device); the indices are relative
# to whatever devices CUDA_VISIBLE_DEVICES exposes — confirm the intended mapping.
GPUS = [0, 1, 2]
# NOTE(review): this is assigned after torch.cuda.is_available() has already run;
# CUDA_VISIBLE_DEVICES normally has to be set before CUDA is initialised — confirm
# that the restriction actually takes effect here.
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2,3"

In [None]:
# Make the first configured GPU the default CUDA device when CUDA is available
if USE_GPU:
    torch.cuda.set_device(GPUS[0])

In [None]:
# Define functions for image processing

# Converts a tensor / ndarray back into a PIL image
# NOTE(review): toImage is not referenced anywhere in the visible code.
toImage = transforms.ToPILImage()

# Augmentation pipeline applied to PIL images of the augment training split
# (only when AUGMENT_RATIO is not None): a random horizontal flip.
preprocessFn = transforms.Compose([
        transforms.RandomHorizontalFlip(),
])

### Dataset Inspection

#### 1. Movie Genres

Read the Movie Genre file

In [None]:
# Load every row of the genre CSV (header row included) into a list of lists.
# newline='' hands line-ending handling to the csv module, as its documentation
# recommends, so quoted fields with embedded newlines stay in one field.
with open(MOVIE_GENRE_FILE, 'r', encoding='latin1', newline='') as f:
    data = list(csv.reader(f))

Print the first few entries to inspect the structure of data

In [None]:
data[0:5]

Create a dataframe from the data, using the entries at the first index as the column headers

In [None]:
# Detach the header row from the records and normalise each title into a
# lower-case, underscore-separated column name.
columns = [title.lower().replace(' ', '_') for title in data.pop(0)]

# Every remaining row in `data` is one movie sample
movie_genre_df = pd.DataFrame(data, columns=columns)

Inspect the dataframe

In [None]:
print('movie_genre_df.shape', movie_genre_df.shape)

Clean the movie genre dataset to only contain relevant samples

In [None]:
# Remove exact duplicate rows first
movie_genre_df = movie_genre_df.drop_duplicates()

# Keep only the samples that carry both a genre and a poster reference
has_genre = movie_genre_df['genre'] != ''
has_poster = movie_genre_df['poster'] != ''

# Filter and renumber the rows from zero in one step
movie_genre_df = movie_genre_df[has_genre & has_poster].reset_index(drop=True)

In [None]:
print('movie_genre_df.shape', movie_genre_df.shape)

In [None]:
movie_genre_df.head()

Process the dataframe so that genres can be accessed easily further on 

In [None]:
# Split the pipe-separated genre string into a list of genre names
movie_genre_df['genre'] = movie_genre_df['genre'].apply(lambda x: x.split('|'))

# Expand the genre lists into one column per position. Building the frame from
# the list of lists in a single call avoids creating one pd.Series per row
# (which `apply(pd.Series)` does); missing positions are filled with ''.
tags = pd.DataFrame(movie_genre_df['genre'].tolist(), index=movie_genre_df.index).fillna('')

# Name the positional columns genre_0, genre_1, ...
tags = tags.add_prefix('genre_')

# Merge the genre dataframe back into the original dataframe
movie_genre_df = pd.concat([movie_genre_df, tags], axis=1)

Inspect the dataframe

In [None]:
print('movie_genre_df.shape', movie_genre_df.shape)

In [None]:
movie_genre_df.head()

Retrieve the distribution of unique genres across all movies

In [None]:
# Get distribution of unique values from multiple genre columns
# Remove the entry representing no genre
# Get combined counts of unique genres

# Use every expanded genre column (genre_0, genre_1, ...) instead of assuming
# that exactly three exist.
genre_columns = [c for c in movie_genre_df.columns
                 if str(c).startswith('genre_') and str(c)[len('genre_'):].isdigit()]

# errors='ignore' keeps this working when no movie has an empty genre slot
genre_distribution = movie_genre_df[genre_columns] \
            .apply(lambda s: s.value_counts()) \
            .drop('', errors='ignore') \
            .sum(axis=1) \
            .reset_index()

# Rename the columns accordingly
genre_distribution.columns = ['genre', 'count']

# Sort the dataframe to order by predominant genres
genre_distribution = genre_distribution.sort_values(by='count', ascending=False).reset_index(drop=True)

# Store the unique genres as a dictionary mapping genre name -> rank by frequency
unique_genres = {genre: rank for rank, genre in enumerate(genre_distribution['genre'].tolist())}

# `classes` is passed by keyword: it is a keyword-only parameter in current
# scikit-learn releases. Fixing the class list keeps the label column order
# identical across every fit_transform call.
multi_label_binarizer = MultiLabelBinarizer(classes=list(unique_genres))

Visualise the distribution of genres across movies

In [None]:
# One distinct colour per genre bar
genres_count = len(genre_distribution)
colors = cm.rainbow(np.linspace(0, 1, genres_count))

plot = genre_distribution.plot(kind='bar', x='genre', y='count', width=0.8, rot=0,
                               figsize=(15,6), color=colors, legend=None)

# Titles and axis captions
plot.set_title('Movie Genres', fontweight='bold')
plot.set_xlabel('Movie Genre')
plot.set_ylabel('Count of Movies')
plot.set_xticklabels(labels=genre_distribution['genre'], rotation=30)

# Write the movie count just above each bar, horizontally centred on it
rects = plot.patches
labels = [int(count) for count in genre_distribution['count'][:len(rects)]]

for rect, label in zip(rects, labels):
    bar_centre = rect.get_x() + rect.get_width()/2
    plot.text(bar_centre, rect.get_height() + 5, label, ha='center', va='bottom')

plot.get_figure() #.save_fig('')

In [None]:
print('movie_genre_df.shape', movie_genre_df.shape)

Mark the movies that need to be augmented as they belong to minority classes 

In [None]:
# Genres represented by fewer than 10000 samples form the minority set
minority_mask = genre_distribution['count'] < 10000
genres_partial = set(genre_distribution.loc[minority_mask, 'genre'].tolist())

# A movie is flagged no_augment as soon as it carries a genre outside the minority set
movie_genre_df['no_augment'] = movie_genre_df['genre'].apply(lambda x: not (set(x) <= genres_partial))

# The augment pool holds the movies whose genres are all minority genres
augment_movie_genre_df = movie_genre_df[~movie_genre_df['no_augment']]

In [None]:
print('augment_movie_genre_df.shape', augment_movie_genre_df.shape)

In [None]:
# Get distribution of unique values from multiple genre columns
# Remove the entry representing no genre
# Get combined counts of unique genres

# Use every expanded genre column (genre_0, genre_1, ...) instead of assuming
# that exactly three exist.
augment_genre_columns = [c for c in augment_movie_genre_df.columns
                         if str(c).startswith('genre_') and str(c)[len('genre_'):].isdigit()]

# errors='ignore' keeps this working when no movie in the augment pool has an
# empty genre slot (the '' row would otherwise be missing and drop() would raise).
augment_genre_distribution = augment_movie_genre_df[augment_genre_columns] \
            .apply(lambda s: s.value_counts()) \
            .drop('', errors='ignore') \
            .sum(axis=1) \
            .reset_index()

# Rename the columns accordingly
augment_genre_distribution.columns = ['genre', 'count']

# Sort the dataframe to order by predominant genres
augment_genre_distribution = augment_genre_distribution.sort_values(by='count', ascending=False) \
                                    .reset_index(drop=True)

Visualise the distribution of genres across the movies selected for augmentation

In [None]:
# One distinct colour per genre bar of the augment pool
genres_count = len(augment_genre_distribution)
colors = cm.rainbow(np.linspace(0, 1, genres_count))

plot = augment_genre_distribution.plot(kind='bar', x='genre', y='count', width=0.8, rot=0,
                                       figsize=(15,6), color=colors, legend=None)

# Titles and axis captions
plot.set_title('Movie Genres', fontweight='bold')
plot.set_xlabel('Movie Genre')
plot.set_ylabel('Count of Movies')
plot.set_xticklabels(labels=augment_genre_distribution['genre'], rotation=30)

# Write the movie count just above each bar, horizontally centred on it
rects = plot.patches
labels = [int(count) for count in augment_genre_distribution['count'][:len(rects)]]

for rect, label in zip(rects, labels):
    bar_centre = rect.get_x() + rect.get_width()/2
    plot.text(bar_centre, rect.get_height() + 5, label, ha='center', va='bottom')

plot.get_figure() #.save_fig('')

#### 2. Movie Posters

Create a list of the movie posters names by traversing the directory

In [None]:
# File names present on disk, and the file names expected for each known movie
valid_poster_files = set(os.listdir(MOVIE_POSTERS_PATH))
valid_movie_samples = {imdb_id + '.jpg' for imdb_id in movie_genre_df['imdbid']}
augment_valid_movie_samples = {imdb_id + '.jpg' for imdb_id in augment_movie_genre_df['imdbid']}

# Keep only posters that exist on disk AND have genre metadata. Sorting gives a
# stable order: the samples are later drawn by index, and set iteration order
# for strings can differ between interpreter runs.
poster_files = sorted(valid_poster_files & valid_movie_samples)
augment_poster_files = sorted(valid_poster_files & augment_valid_movie_samples)

# Strip exactly the '.jpg' extension appended above to recover the IMDB id
imdb_ids = [os.path.splitext(name)[0] for name in poster_files]
movie_posters = [MOVIE_POSTERS_PATH + name for name in poster_files]

augment_imdb_ids = [os.path.splitext(name)[0] for name in augment_poster_files]
augment_movie_posters = [MOVIE_POSTERS_PATH + name for name in augment_poster_files]

print("Number of movie posters =", len(movie_posters))
print("Number of augment movie posters =", len(augment_movie_posters))

Create a mini-batch of posters and associated genres.

In [None]:
# Candidate indices into movie_posters / augment_movie_posters
range_movies = list(range(len(movie_posters)))
range_augment_movies = list(range(len(augment_movie_posters)))

if AUGMENT_RATIO is None:
    # No augmentation: take up to half of the dataset from the augment pool
    augment_sample_size = min(len(range_augment_movies), DATASET_SIZE//2)
    sample_size = DATASET_SIZE - augment_sample_size
else:
    sample_size = DATASET_SIZE//AUGMENT_RATIO
    augment_sample_size = (DATASET_SIZE - sample_size)

# random.sample raises ValueError when asked for more items than the population
# holds (or for a negative count), so cap both requests to what is available.
requested_sizes = (sample_size, augment_sample_size)
sample_size = max(0, min(sample_size, len(range_movies)))
augment_sample_size = max(0, min(augment_sample_size, len(range_augment_movies)))
if (sample_size, augment_sample_size) != requested_sizes:
    print("Requested sample sizes", requested_sizes, "reduced to", (sample_size, augment_sample_size))

sample_movies = random.sample(range_movies, sample_size)
sample_augment_movies = random.sample(range_augment_movies, augment_sample_size)

In [None]:
# Poster paths and IMDB ids of the sampled movies
mini_batch_posters = [movie_posters[i] for i in sample_movies]
mini_batch_imdb_ids = [imdb_ids[i] for i in sample_movies]

# Build the id -> genre-list mapping once instead of scanning the dataframe for
# every sampled id. If an imdbid occurs more than once the last row wins, where
# the per-id `.item()` lookup would have raised.
genres_by_imdb_id = dict(zip(movie_genre_df['imdbid'], movie_genre_df['genre']))
mini_batch_genres = [genres_by_imdb_id[i] for i in mini_batch_imdb_ids]

# Same for the augment pool
augment_mini_batch_posters = [augment_movie_posters[i] for i in sample_augment_movies]
augment_mini_batch_imdb_ids = [augment_imdb_ids[i] for i in sample_augment_movies]

augment_genres_by_imdb_id = dict(zip(augment_movie_genre_df['imdbid'], augment_movie_genre_df['genre']))
augment_mini_batch_genres = [augment_genres_by_imdb_id[i] for i in augment_mini_batch_imdb_ids]

num_classes = len(unique_genres)

print("Number of movie posters =", len(mini_batch_posters))
print("Number of augment movie posters =", len(augment_mini_batch_posters))

Inspect the dimensions of one of the posters

In [None]:
print("Poster dimensions :", np.array(PIL.Image.open(mini_batch_posters[0])).shape)

### Dataset preparation

Partition the data into training and test sets

In [None]:
def _poster_to_tensor(path):
    """Load the poster at `path` as a float tensor laid out (channels, HEIGHT, WIDTH).

    The image is converted to RGB first, so grayscale, palette, RGBA or CMYK
    files yield three channels instead of breaking the transpose/assignment.
    The file handle is closed as soon as the pixels are read.
    """
    with PIL.Image.open(path) as img:
        pixels = np.array(img.convert('RGB').resize((WIDTH, HEIGHT)))
    # (H, W, C) -> (C, H, W)
    return torch.from_numpy(pixels.transpose(2, 0, 1).copy()).float()


# First TRAINING_SET_PROPORTION of the sampled posters is the training split
train_size = int(len(mini_batch_posters)*TRAINING_SET_PROPORTION)

train_posters = mini_batch_posters[:train_size]
test_posters = mini_batch_posters[train_size:]

X_train = torch.FloatTensor(train_size, CHANNELS, HEIGHT, WIDTH).zero_()
X_test = torch.FloatTensor(len(mini_batch_posters) - train_size, CHANNELS, HEIGHT, WIDTH).zero_()

# Resize the posters as square images are nice to work with and also apply necessary pre-processing
for i, poster in enumerate(train_posters):
    X_train[i] = _poster_to_tensor(poster)

for i, poster in enumerate(test_posters):
    X_test[i] = _poster_to_tensor(poster)

# Multi-hot label matrix, one row per sampled poster, split like the posters
y = multi_label_binarizer.fit_transform(mini_batch_genres)
y_train = torch.from_numpy(y[:train_size])
y_test = torch.from_numpy(y[train_size:])

In [None]:
def _augment_poster_to_tensor(path, transform=None):
    """Load the poster at `path` as a float tensor laid out (channels, HEIGHT, WIDTH).

    The image is converted to RGB first, so grayscale, palette, RGBA or CMYK
    files yield three channels. `transform`, when given, is applied to the PIL
    image before resizing. The file handle is closed once the pixels are read.
    """
    with PIL.Image.open(path) as img:
        img = img.convert('RGB')
        if transform is not None:
            img = transform(img)
        pixels = np.array(img.resize((WIDTH, HEIGHT)))
    # (H, W, C) -> (C, H, W)
    return torch.from_numpy(pixels.transpose(2, 0, 1).copy()).float()


# First TRAINING_SET_PROPORTION of the augment pool is the training split
augment_train_size = int(len(augment_mini_batch_posters)*TRAINING_SET_PROPORTION)

augment_train_posters = augment_mini_batch_posters[:augment_train_size]
augment_test_posters = augment_mini_batch_posters[augment_train_size:]

augment_X_train = torch.FloatTensor(augment_train_size, CHANNELS, HEIGHT, WIDTH).zero_()
augment_X_test = torch.FloatTensor(len(augment_mini_batch_posters) - augment_train_size, \
                                   CHANNELS, HEIGHT, WIDTH).zero_()

# Augmentation is applied to the training split only, and only when AUGMENT_RATIO is set
augment_transform = None if AUGMENT_RATIO is None else preprocessFn

for i, poster in enumerate(augment_train_posters):
    augment_X_train[i] = _augment_poster_to_tensor(poster, augment_transform)

for i, poster in enumerate(augment_test_posters):
    augment_X_test[i] = _augment_poster_to_tensor(poster)

# Multi-hot label matrix for the augment pool, split like the posters
augment_y = multi_label_binarizer.fit_transform(augment_mini_batch_genres)
augment_y_train = torch.from_numpy(augment_y[:augment_train_size])
augment_y_test = torch.from_numpy(augment_y[augment_train_size:])

Visualise a few movie posters

In [None]:
# Number of posters shown per table row
POSTERS_PER_ROW = 12

temp_posters = mini_batch_posters + augment_mini_batch_posters
temp_genres = mini_batch_genres + augment_mini_batch_genres

# One table cell per poster: thumbnail followed by its genres.
# The tags are well-formed here (quoted src, closed </span> and </td>).
cells = ['<td><img src="' + poster + '" width="70" height="70"/><span>' + ',\n'.join(genres) + '</span></td>'
         for poster, genres in zip(temp_posters, temp_genres)]

# Group the cells into complete <tr>...</tr> rows of POSTERS_PER_ROW cells
rows = ['<tr>' + ''.join(cells[start:start + POSTERS_PER_ROW]) + '</tr>'
        for start in range(0, len(cells), POSTERS_PER_ROW)]

markup = '<table>' + ''.join(rows) + '</table>'

display(HTML(markup))

In [None]:
# Append the augment pool to the regular pool, for features and labels alike
X_train, X_test = torch.cat((X_train, augment_X_train)), torch.cat((X_test, augment_X_test))
y_train, y_test = torch.cat((y_train, augment_y_train)), torch.cat((y_test, augment_y_test))


# Report the final shape of every split
for split_name, split_tensor in (('X_train', X_train), ('y_train', y_train),
                                 ('X_test', X_test), ('y_test', y_test)):
    print(split_name + " shape: ", split_tensor.size())

Wrap the training and test tensors in datasets and create the corresponding data loaders

In [None]:
# Dataset pairing every training image tensor with its multi-hot label row
train = data_utils.TensorDataset(X_train, y_train)

# Use weight-sampling to account for class-imbalance
# One count per genre, in the column order of the binarizer's classes
class_sample_count = [int(genre_distribution[genre_distribution['genre'] == x]['count'].values[0]) \
                          for x in multi_label_binarizer.classes]
# Inverse-frequency weight per genre
weights = (1 / torch.Tensor(class_sample_count)).double()
# NOTE(review): `sampler` is built but not passed to the loader below (the argument is
# commented out). `weights` has one entry per genre, whereas WeightedRandomSampler
# presumably expects one weight per sample — confirm before enabling it.
sampler = data_utils.sampler.WeightedRandomSampler(weights, train_size)
train_loader = data_utils.DataLoader(train, batch_size=BATCH_SIZE, shuffle=True)#, sampler = sampler)

# The test set is iterated in a fixed order
test = data_utils.TensorDataset(X_test, y_test)
test_loader = data_utils.DataLoader(test, batch_size=BATCH_SIZE, shuffle=False)

# Sample counts per split; no validation split is created in this notebook
dataset_sizes = {'train': len(train), 'val': 0, 'test': len(test)}

### Network Architecture

Define a custom architecture on top of the pre-trained model.

Since the dataset is small, it is not a good idea to fine-tune the entire ConvNet due to overfitting concerns.
As the dataset is very different from the ImageNet dataset, it might not be best to train the classifier from the top of the network, which contains more dataset-specific features. Instead, it might work better to train the classifier from activations somewhere earlier in the network.

In [None]:
class CustomNet(nn.Module):
    """Pretrained backbone with frozen layers and a new classification head.

    The wrapped model must expose an ``fc`` linear layer (as torchvision's
    ResNet does); it is replaced by a fresh ``nn.Linear`` with ``num_classes``
    outputs, and ``avgpool`` is replaced by a global adaptive average pool.
    """

    def __init__(self, baseName, model, freeze='all'):
        """
        Freeze the requested parts of `model` and attach a new head.

        Args:
            baseName: label of the backbone; accepted for interface
                compatibility and not used by the class.
            model: the backbone module to adapt (modified in place).
            freeze: 'all' to freeze the backbone's ``features`` sub-module, or
                the whole backbone when it has no ``features`` attribute;
                otherwise an iterable of attribute names of `model` whose
                parameters are frozen. A single attribute name given as a
                string is treated as a one-element list.
        """
        super(CustomNet, self).__init__()
        self.model = model

        if freeze == 'all':
            # Backbones without a `features` block (e.g. ResNet) are frozen
            # entirely; the head created below is new and stays trainable.
            frozen_modules = [getattr(self.model, 'features', self.model)]
        else:
            if isinstance(freeze, str):
                # Avoid iterating over the characters of a bare layer name
                freeze = [freeze]
            frozen_modules = [getattr(self.model, layer) for layer in freeze]

        for module in frozen_modules:
            for param in module.parameters():
                param.requires_grad = False

        # Global pooling to 1x1, then a new head sized for the genre labels
        self.model.avgpool = nn.AdaptiveAvgPool2d(1)
        self.model.fc = nn.Linear(self.model.fc.in_features, num_classes)

    def forward(self, x):
        """Run `x` through the adapted backbone and return its output."""
        return self.model(x)

Define the **train** and **test** functions

In [None]:
def exp_lr_scheduler(optimizer, epoch, init_lr=LEARNING_RATE, lr_decay_epoch=DECAY_EPOCHS,
                     lr_decay=WEIGHT_DECAY):
    """Set a step-decayed learning rate on every parameter group of `optimizer`.

    The rate is ``init_lr * lr_decay ** (epoch // lr_decay_epoch)``: it is
    multiplied by `lr_decay` once every `lr_decay_epoch` epochs.

    NOTE(review): the default factor is WEIGHT_DECAY, kept so that existing
    behaviour is unchanged; the earlier docstring spoke of a factor of 0.1 —
    confirm which value is intended.

    Args:
        optimizer: optimizer whose ``param_groups`` are updated in place.
        epoch: zero-based index of the current epoch.
        init_lr: learning rate used before the first decay step.
        lr_decay_epoch: number of epochs between two decay steps.
        lr_decay: multiplicative factor applied at every decay step.

    Returns:
        The same optimizer object.
    """
    lr = init_lr * (lr_decay**(epoch // lr_decay_epoch))

    # Report the rate whenever a new decay step begins
    if epoch % lr_decay_epoch == 0:
        print('LR is set to {}'.format(lr))

    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    return optimizer

In [None]:
def train_model(model, criterion, optimizer, lr_scheduler, num_epochs=25):
    """Train `model` on the module-level `train_loader`.

    Args:
        model: network to optimise (trained in place).
        criterion: loss called as ``criterion(outputs, targets)`` with float targets.
        optimizer: optimizer holding the trainable parameters.
        lr_scheduler: callable ``(optimizer, epoch) -> optimizer`` invoked at
            the start of every epoch.
        num_epochs: number of passes over the training data.

    Returns:
        ``(model, loss_history)`` where `loss_history` holds the mean batch
        loss of every epoch as plain Python floats.
    """
    start = time.time()
    loss_history = []

    # Honour the num_epochs argument rather than the global NUM_EPOCHS
    for epoch in range(num_epochs):

        optimizer = lr_scheduler(optimizer, epoch)
        # Set model to training mode
        model.train(True)

        running_loss = []

        # Iterate over data.
        for inputs, targets in train_loader:

            if USE_GPU:
                inputs = inputs.cuda()
                targets = targets.cuda()
            inputs = Variable(inputs).float()
            targets = Variable(targets).float()

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            loss.backward()
            optimizer.step()

            # .item() extracts the scalar loss; indexing a 0-dim tensor with
            # [0] raises on current PyTorch releases.
            running_loss.append(loss.item())

        # Plain float so the history can be serialised with json
        epoch_loss = float(np.mean(running_loss))
        loss_history.append(epoch_loss)

        print('Epoch {}/{}'.format(epoch, num_epochs - 1), 'Training loss: {:.4f}'.format(epoch_loss))
        print('-' * 10)

    time_elapsed = time.time() - start
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

    return model, loss_history

In [None]:
def test_model(model):
    
    start = time.time()
    num_labels = []
    num_preds = []
    num_labels_union = []
    preds_correct = []
    diff_labels = []
    target_labels = []
    predicted_labels = []
    data_inputs = []
    predicted_outputs = []
    output_targets = []
    
    # Set model to training mode
    model.train(False)  

    # Iterate over data
    for batch_idx, (inputs, targets) in enumerate(test_loader):

        if USE_GPU:
            inputs = Variable(inputs.cuda())
            targets = Variable(targets.cuda()).float()
        else:
            inputs = Variable(inputs)
            targets = Variable(targets).float()

        # forward
        outputs = model(inputs)
        
            
        for i, t in enumerate(targets.data):
            labels = t.nonzero()[:,0].tolist()
            
            num_labels.append(len(labels)) 
            
            probs = torch.exp(outputs.data[i])
            j, preds = probs.topk(num_labels[-1])
            
            probs[preds] = 1
        
            if USE_GPU:
                probs[torch.LongTensor(list(set(range(num_classes)) - set(preds))).cuda()] = 0
                predicted_outputs.append(probs.cpu().numpy())
                output_targets.append(targets.data[i].cpu().numpy())
            else:
                probs[torch.LongTensor(list(set(range(num_classes)) - set(preds)))] = 0
                predicted_outputs.append(probs.numpy())
                output_targets.append(targets.data[i].numpy())
        
            preds = [p for k, p in enumerate(preds) if j[k] != 0.0]
            
            data_inputs.append(inputs.data[i])
            
            target_labels.append([multi_label_binarizer.classes[i] for i in labels])
            predicted_labels.append([multi_label_binarizer.classes[i] for i in preds])
            
            preds_correct.append(len(set(preds).intersection(labels))) 
            num_preds.append(len(preds))
            num_labels_union.append(len(set(preds).union(labels)))
            diff_labels.append(len(set(preds).symmetric_difference(labels)))

        
    total_preds_correct = sum(preds_correct)
    total_num_labels = sum(num_labels)
    total_num_preds = sum(num_preds)
    
    label_cardinality = np.mean(num_labels)
    label_density = np.mean([i/total_num_labels for i in num_labels])
    
    precision = np.mean([x/num_preds[i] for i,x in enumerate(preds_correct) if num_preds[i] != 0])
    recall = np.mean([x/num_labels[i] for i,x in enumerate(preds_correct)])
    f1_score = (2*precision*recall)/(precision+recall)
    
    jaccard_index = np.mean([x/num_labels_union[i] for i,x in enumerate(preds_correct)])
    hamming_loss = np.mean([x/total_num_labels for i,x in enumerate(diff_labels)])
    
    time_elapsed = time.time() - start
    
    result = {
        'label_cardinality': label_cardinality,
        'label_density': label_density,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        'jaccard_index': jaccard_index,
        'hamming_loss': hamming_loss,
        'total_num_labels': total_num_labels,
        'total_num_preds': total_num_preds,
        'total_preds_correct': total_preds_correct,
        'target_labels': target_labels,
        'predicted_labels': predicted_labels,
        'output_targets': output_targets,
        'predicted_outputs': predicted_outputs
    }
    print('Testing complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    
    return result

Download pretrained models and adapt them by fine-tuning

In [None]:
# Instantiate the candidate backbones.
# NOTE(review): all four models are constructed here although only the 'resnet'
# entry below is active; the pretrained ones presumably download their weights
# on first use — confirm whether the unused ones can be dropped.
resnet152 = models.resnet152(pretrained=True)
inception_v3 = models.inception_v3(pretrained=True)
alexnet = models.alexnet(pretrained=True)
# NOTE(review): unlike the others, vgg16_bn is created without pretrained=True.
vggnet16_bn = models.vgg16_bn()

# Registry of backbones: each entry holds the model and the `freeze` argument
# handed to CustomNet ('all' or a list of attribute names to freeze).
pretrained_models = {
    'resnet': {
        'model': resnet152,
        'freeze': ['conv1', 'bn1', 'relu', 'maxpool', 'layer1', 'layer2', 'layer3']
    },
#     'inception_v3': {
#         'model': inception_v3,
#         'freeze': 'all'
#     },
#     'alexnet': {
#         'model': alexnet,
#         'freeze': 'all'
#     },
#     'vggnet16_bn': {
#         'model': vggnet16_bn,
#         'freeze': 'all'
#     }
}

Define the parameters required for training the model

In [None]:
# Select the backbone entry and wrap it with the custom classification head
base_model = pretrained_models['resnet']
custom_model = CustomNet('resnet', base_model['model'], base_model['freeze'])

# Note: Uncomment to load a saved model
#### custom_model.load_state_dict(torch.load(TARGET_PATH + 'model.pth.tar'))

# Move the model to the current CUDA device when one is available
if USE_GPU:
    print('Using GPU...', torch.cuda.current_device())
    custom_model = custom_model.cuda()

print('Custom Model', custom_model)

# Multi-label loss applied to the raw model outputs and the multi-hot targets
criterion = nn.MultiLabelSoftMarginLoss()
# Only parameters that still require gradients (i.e. not frozen) are optimised;
# L2_REGULARIZATION is passed as the optimizer's weight_decay.
optimizer = optim.Adadelta(filter(lambda p: p.requires_grad, custom_model.model.parameters()), \
                          lr=LEARNING_RATE, weight_decay=L2_REGULARIZATION)

### Deep Learning in Action

#### 1. Train the model

In [None]:
# Create the output directory before training starts, so that a missing path is
# not discovered only after the whole training run has finished.
os.makedirs(TARGET_PATH, exist_ok=True)

best_model, loss_history = train_model(custom_model, criterion, optimizer, exp_lr_scheduler, \
                                             num_epochs=NUM_EPOCHS)

# Persist the per-epoch training loss; values are cast to plain floats so that
# json can serialise them whatever numeric type train_model returned.
with open(TARGET_PATH + 'loss_history' + '.json', 'w') as fp:
    json.dump([float(value) for value in loss_history], fp)

#### 2. Visualise the training loss over epochs

In [None]:
# Draw the mean training loss of every epoch on a fresh figure
plt.figure()
epoch_axis = np.arange(NUM_EPOCHS)
plt.plot(epoch_axis, loss_history)

# Captions
plt.title('Loss history over epochs')
plt.xlabel('Epoch')
plt.ylabel('Training Loss')

# Grid on, then store the figure alongside the other artefacts
plt.grid(True)
plt.savefig(TARGET_PATH + 'loss_curve.jpg')

#### 3. Use the model to get predictions on the test set

In [None]:
result = test_model(best_model)

In [None]:
print(result['f1_score'], result['hamming_loss'], result['jaccard_index'], result['precision'], result['recall'])
# result['label_cardinality'], result['label_density']

In [None]:
torch.save(best_model.state_dict(), TARGET_PATH + 'model.pth.tar')

#### 4. Visualize the ROC curve and compute the AUC

In [None]:
# One row per test sample: true multi-hot labels and the binary predictions
targets = np.array(result['output_targets'])
outputs = np.array(result['predicted_outputs'])

# Start a fresh figure so the curves are not drawn on top of an earlier plot
plt.figure()

auc_values = []
for sample_targets, sample_outputs in zip(targets, outputs):
    # ROC of a single sample, computed across its classes
    fpr, tpr, _ = roc_curve(sample_targets, sample_outputs)
    sample_auc = auc(fpr, tpr)
    auc_values.append(sample_auc)

    print('AUC=', sample_auc)
    plt.plot(fpr, tpr, color='darkorange', lw=0.5)

# Label and save the figure once, after all curves are drawn, instead of on
# every iteration. The title reports the mean over all samples (ignoring
# undefined values) rather than the AUC of whichever sample came last.
mean_auc = float(np.nanmean(auc_values)) if auc_values else float('nan')
plt.title('Mean AUC=' + str(mean_auc))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.savefig(TARGET_PATH + 'roc_curve.jpg')

#### 5. Save the prediction results for inspection later

In [None]:
# Remove the per-sample array entries from the result before serialising it
for array_key in ('output_targets', 'predicted_outputs'):
    del result[array_key]

# Write the remaining metrics and label lists as JSON
results_path = TARGET_PATH + 'prediction_results' + '.json'
with open(results_path, 'w') as fp:
    json.dump(result, fp)