# Import libraries and set up environments

In [1]:
import pandas as pd
import os
from nltk import wordpunct_tokenize
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import cv2
from tqdm import tqdm
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.preprocessing import image
from keras.layers import BatchNormalization

In [2]:
# !pip install -q transformers datasets

# Loading dataset
We divide the assigned task into two subtasks: Predicting genre based on movies' titles and on movies' posters. As each of those two subtasks has different approach, we have to set up Loading dataset phase as two separate phases

In [3]:
!gdown 1hUqu1mbFeTEfBvl-7fc56fHFfCSzIktD

Downloading...
From: https://drive.google.com/uc?id=1hUqu1mbFeTEfBvl-7fc56fHFfCSzIktD
To: /content/ml1m.zip
100% 105M/105M [00:00<00:00, 179MB/s] 


In [4]:
!unzip -qq ml1m.zip -d ml1m

replace ml1m/content/dataset/movies_test.dat? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [5]:
specified_directory = 'ml1m/content/dataset'
os.listdir(specified_directory)

['ratings.dat',
 'movies_train.dat',
 'ml1m-images',
 'users.dat',
 'genres.txt',
 'movies_test.dat']

## Loading dataset for movies' titles preprocessing

In [6]:
from datasets import Features, Value, ClassLabel, load_dataset

# Loading movie_train dataset and movie_test dataset
movies_features = Features({'id': Value('int64'), 'title': Value('string'), 'genre': Value('string')})
file_dict = {'train': 'ml1m/content/dataset/movies_train.dat', 'test':'ml1m/content/dataset/movies_test.dat'}
dataset = load_dataset('csv', data_files=file_dict, delimiter='::', column_names=['id', 'title', 'genre'], features=movies_features, split=None)

As we can see, the dataset contains 2 splits: one for training and one for testing.

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'genre'],
        num_rows: 3106
    })
    test: Dataset({
        features: ['id', 'title', 'genre'],
        num_rows: 777
    })
})


Using genres.txt, we create a list that contains the labels, as well as 2 dictionaries that map labels to integers and back



In [8]:
genres_features = Features({'genre': Value('string')})
genres_dict = {'movie_genres': 'ml1m/content/dataset/genres.txt'}
genres = load_dataset('text', data_files=genres_dict, features=genres_features)

labels = []
for i in range(genres['movie_genres'].num_rows):
  labels.append(genres['movie_genres'][i].get('genre'))

id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
print('id2label: ', id2label)
print('label2id: ', label2id)

id2label:  {0: 'Crime', 1: 'Thriller', 2: 'Fantasy', 3: 'Horror', 4: 'Sci-Fi', 5: 'Comedy', 6: 'Documentary', 7: 'Adventure', 8: 'Film-Noir', 9: 'Animation', 10: 'Romance', 11: 'Drama', 12: 'Western', 13: 'Musical', 14: 'Action', 15: 'Mystery', 16: 'War', 17: "Children's"}
label2id:  {'Crime': 0, 'Thriller': 1, 'Fantasy': 2, 'Horror': 3, 'Sci-Fi': 4, 'Comedy': 5, 'Documentary': 6, 'Adventure': 7, 'Film-Noir': 8, 'Animation': 9, 'Romance': 10, 'Drama': 11, 'Western': 12, 'Musical': 13, 'Action': 14, 'Mystery': 15, 'War': 16, "Children's": 17}


## Loading dataset for movies' posters preprocessing

In [9]:
images_path = specified_directory + '/ml1m-images'

# movies_train dataset
movie_path = specified_directory + '/movies_train.dat'
movies_train = pd.read_csv(movie_path,
                               engine='python',
                               sep='::',
                               names=['movieID'
                                      , 'title'
                                      , 'genres'],
                          encoding='latin-1',
                          index_col = False).set_index('movieID')
movies_train['genres'] = movies_train.genres.str.split('|')

# movies_test dataset
movie_path = specified_directory + '/movies_test.dat'
movies_test = pd.read_csv(movie_path,
                               engine='python',
                               sep='::',
                               names=['movieID'
                                      , 'title'
                                      , 'genres'],
                          encoding='latin-1',
                          index_col = False).set_index('movieID')
movies_test['genres'] = movies_test.genres.str.split('|')

Insert a new column for images path

In [10]:
movies_train['id'] = movies_train.index
movies_train.reset_index(inplace=True)
movies_train['img_path'] = movies_train.apply(lambda row: os.path.join(images_path, f'{row.id}.jpg'), axis = 1)

movies_test['id'] = movies_test.index
movies_test.reset_index(inplace=True)
movies_test['img_path'] = movies_test.apply(lambda row: os.path.join(images_path, f'{row.id}.jpg'), axis = 1)

As we figure out that there are a number of images missing, we define a helper function to see which movies do not have poster image

In [11]:
def is_image_file(file_path):
    return os.path.isfile(file_path)

# Filter movies with missing images
movies_train['image_exists'] = movies_train['img_path'].apply(is_image_file)
movies_test['image_exists'] = movies_test['img_path'].apply(is_image_file)

In [12]:
movies_test

Unnamed: 0,movieID,title,genres,id,img_path,image_exists
0,3397,"Great Muppet Caper, The (1981)","[Children's, Comedy]",3397,ml1m/content/dataset/ml1m-images/3397.jpg,True
1,2067,Doctor Zhivago (1965),"[Drama, Romance, War]",2067,ml1m/content/dataset/ml1m-images/2067.jpg,True
2,2651,Frankenstein Meets the Wolf Man (1943),[Horror],2651,ml1m/content/dataset/ml1m-images/2651.jpg,True
3,2989,For Your Eyes Only (1981),[Action],2989,ml1m/content/dataset/ml1m-images/2989.jpg,True
4,3415,"Mirror, The (Zerkalo) (1975)",[Drama],3415,ml1m/content/dataset/ml1m-images/3415.jpg,True
...,...,...,...,...,...,...
772,2309,"Inheritors, The (Die Siebtelbauern) (1998)",[Drama],2309,ml1m/content/dataset/ml1m-images/2309.jpg,False
773,2421,"Karate Kid, Part II, The (1986)","[Action, Adventure, Drama]",2421,ml1m/content/dataset/ml1m-images/2421.jpg,True
774,3255,"League of Their Own, A (1992)","[Comedy, Drama]",3255,ml1m/content/dataset/ml1m-images/3255.jpg,True
775,974,Algiers (1938),"[Drama, Romance]",974,ml1m/content/dataset/ml1m-images/974.jpg,True


Display movies on train set and test set which do not have poster image

In [13]:
# Display movies with missing images in training dataset
missing_images = movies_train[movies_train['image_exists'] == False]
print("Movies with missing images:")
missing_images[['title', 'img_path']]

Movies with missing images:


Unnamed: 0,title,img_path
4,Dear Jesse (1997),ml1m/content/dataset/ml1m-images/1901.jpg
5,"Jar, The (Khomreh) (1992)",ml1m/content/dataset/ml1m-images/758.jpg
6,Stag (1997),ml1m/content/dataset/ml1m-images/1636.jpg
8,"End of the Affair, The (1955)",ml1m/content/dataset/ml1m-images/3126.jpg
10,Midnight Dancers (Sibak) (1994),ml1m/content/dataset/ml1m-images/794.jpg
...,...,...
3087,Fire on the Mountain (1996),ml1m/content/dataset/ml1m-images/1448.jpg
3092,Condo Painting (2000),ml1m/content/dataset/ml1m-images/3356.jpg
3093,Buck and the Preacher (1972),ml1m/content/dataset/ml1m-images/3373.jpg
3103,Heaven's Burning (1997),ml1m/content/dataset/ml1m-images/1832.jpg


In [14]:
# Display movies with missing images in test set
missing_images = movies_test[movies_test['image_exists'] == False]
print("Movies with missing images:")
missing_images[['title', 'img_path']]

Movies with missing images:


Unnamed: 0,title,img_path
5,Fausto (1993),ml1m/content/dataset/ml1m-images/576.jpg
21,Tashunga (1995),ml1m/content/dataset/ml1m-images/1118.jpg
37,"Paris, France (1993)",ml1m/content/dataset/ml1m-images/559.jpg
48,Squeeze (1996),ml1m/content/dataset/ml1m-images/1557.jpg
55,Mondo (1996),ml1m/content/dataset/ml1m-images/1577.jpg
...,...,...
753,3 Strikes (2000),ml1m/content/dataset/ml1m-images/3322.jpg
760,Color Me Blood Red (1965),ml1m/content/dataset/ml1m-images/3346.jpg
762,Beauty (1998),ml1m/content/dataset/ml1m-images/2563.jpg
767,Trick or Treat (1986),ml1m/content/dataset/ml1m-images/2464.jpg


After filtering movies with missing images, the remainings are the ones with images

In [15]:
# Movies with images in training dataset
movies_train_with_images = movies_train[movies_train['image_exists'] == True]
print("Movies with images")
movies_train_with_images

Movies with images


Unnamed: 0,movieID,title,genres,id,img_path,image_exists
0,1650,Washington Square (1997),[Drama],1650,ml1m/content/dataset/ml1m-images/1650.jpg,True
1,185,"Net, The (1995)","[Sci-Fi, Thriller]",185,ml1m/content/dataset/ml1m-images/185.jpg,True
2,1377,Batman Returns (1992),"[Action, Adventure, Comedy, Crime]",1377,ml1m/content/dataset/ml1m-images/1377.jpg,True
3,3204,"Boys from Brazil, The (1978)",[Thriller],3204,ml1m/content/dataset/ml1m-images/3204.jpg,True
7,2382,Police Academy 5: Assignment: Miami Beach (1988),[Comedy],2382,ml1m/content/dataset/ml1m-images/2382.jpg,True
...,...,...,...,...,...,...
3099,2921,High Plains Drifter (1972),[Western],2921,ml1m/content/dataset/ml1m-images/2921.jpg,True
3100,502,"Next Karate Kid, The (1994)","[Action, Children's]",502,ml1m/content/dataset/ml1m-images/502.jpg,True
3101,2539,Analyze This (1999),[Comedy],2539,ml1m/content/dataset/ml1m-images/2539.jpg,True
3102,3038,"Face in the Crowd, A (1957)",[Drama],3038,ml1m/content/dataset/ml1m-images/3038.jpg,True


In [16]:
# Movies with images in test dataset
movies_test_with_images = movies_test[movies_test['image_exists'] == True]
print("Movies with images")
movies_test_with_images

Movies with images


Unnamed: 0,movieID,title,genres,id,img_path,image_exists
0,3397,"Great Muppet Caper, The (1981)","[Children's, Comedy]",3397,ml1m/content/dataset/ml1m-images/3397.jpg,True
1,2067,Doctor Zhivago (1965),"[Drama, Romance, War]",2067,ml1m/content/dataset/ml1m-images/2067.jpg,True
2,2651,Frankenstein Meets the Wolf Man (1943),[Horror],2651,ml1m/content/dataset/ml1m-images/2651.jpg,True
3,2989,For Your Eyes Only (1981),[Action],2989,ml1m/content/dataset/ml1m-images/2989.jpg,True
4,3415,"Mirror, The (Zerkalo) (1975)",[Drama],3415,ml1m/content/dataset/ml1m-images/3415.jpg,True
...,...,...,...,...,...,...
771,1286,Somewhere in Time (1980),"[Drama, Romance]",1286,ml1m/content/dataset/ml1m-images/1286.jpg,True
773,2421,"Karate Kid, Part II, The (1986)","[Action, Adventure, Drama]",2421,ml1m/content/dataset/ml1m-images/2421.jpg,True
774,3255,"League of Their Own, A (1992)","[Comedy, Drama]",3255,ml1m/content/dataset/ml1m-images/3255.jpg,True
775,974,Algiers (1938),"[Drama, Romance]",974,ml1m/content/dataset/ml1m-images/974.jpg,True


### Genres processing for posters

In [17]:
with open(specified_directory + '/genres.txt', 'r') as f:
   genre_all = f.readlines()
   genre_all = [x.replace('\n','') for x in genre_all]
genre2idx = {genre:idx for idx, genre in enumerate(genre_all)}
genre2idx

{'Crime': 0,
 'Thriller': 1,
 'Fantasy': 2,
 'Horror': 3,
 'Sci-Fi': 4,
 'Comedy': 5,
 'Documentary': 6,
 'Adventure': 7,
 'Film-Noir': 8,
 'Animation': 9,
 'Romance': 10,
 'Drama': 11,
 'Western': 12,
 'Musical': 13,
 'Action': 14,
 'Mystery': 15,
 'War': 16,
 "Children's": 17}

In [18]:
# Function to convert genres to NumPy array of 0s and 1s
def genres_to_array(genres, genre2idx):
    genre_array = np.zeros(len(genre2idx), dtype=int)
    for genre in genres:
        if genre in genre2idx:
            genre_array[genre2idx[genre]] = 1
    return genre_array

In [19]:
# Add a new column 'genre_array' to 'movies_train_with_images'
movies_train_with_images = movies_train_with_images.copy()  # Create a copy of the DataFrame
movies_train_with_images.loc[:, 'genre_array'] = movies_train_with_images.genres.apply(lambda x: genres_to_array(x, genre2idx))
movies_train_with_images

Unnamed: 0,movieID,title,genres,id,img_path,image_exists,genre_array
0,1650,Washington Square (1997),[Drama],1650,ml1m/content/dataset/ml1m-images/1650.jpg,True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
1,185,"Net, The (1995)","[Sci-Fi, Thriller]",185,ml1m/content/dataset/ml1m-images/185.jpg,True,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,1377,Batman Returns (1992),"[Action, Adventure, Comedy, Crime]",1377,ml1m/content/dataset/ml1m-images/1377.jpg,True,"[1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, ..."
3,3204,"Boys from Brazil, The (1978)",[Thriller],3204,ml1m/content/dataset/ml1m-images/3204.jpg,True,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7,2382,Police Academy 5: Assignment: Miami Beach (1988),[Comedy],2382,ml1m/content/dataset/ml1m-images/2382.jpg,True,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...
3099,2921,High Plains Drifter (1972),[Western],2921,ml1m/content/dataset/ml1m-images/2921.jpg,True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
3100,502,"Next Karate Kid, The (1994)","[Action, Children's]",502,ml1m/content/dataset/ml1m-images/502.jpg,True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
3101,2539,Analyze This (1999),[Comedy],2539,ml1m/content/dataset/ml1m-images/2539.jpg,True,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3102,3038,"Face in the Crowd, A (1957)",[Drama],3038,ml1m/content/dataset/ml1m-images/3038.jpg,True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."


In [20]:
# Add a new column 'genre_array' to 'movies_test_with_images'
movies_test_with_images = movies_test_with_images.copy()  # Create a copy of the DataFrame
movies_test_with_images['genre_array'] = movies_test_with_images['genres'].apply(lambda x: genres_to_array(x, genre2idx))
movies_test_with_images

Unnamed: 0,movieID,title,genres,id,img_path,image_exists,genre_array
0,3397,"Great Muppet Caper, The (1981)","[Children's, Comedy]",3397,ml1m/content/dataset/ml1m-images/3397.jpg,True,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2067,Doctor Zhivago (1965),"[Drama, Romance, War]",2067,ml1m/content/dataset/ml1m-images/2067.jpg,True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
2,2651,Frankenstein Meets the Wolf Man (1943),[Horror],2651,ml1m/content/dataset/ml1m-images/2651.jpg,True,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,2989,For Your Eyes Only (1981),[Action],2989,ml1m/content/dataset/ml1m-images/2989.jpg,True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
4,3415,"Mirror, The (Zerkalo) (1975)",[Drama],3415,ml1m/content/dataset/ml1m-images/3415.jpg,True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
...,...,...,...,...,...,...,...
771,1286,Somewhere in Time (1980),"[Drama, Romance]",1286,ml1m/content/dataset/ml1m-images/1286.jpg,True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
773,2421,"Karate Kid, Part II, The (1986)","[Action, Adventure, Drama]",2421,ml1m/content/dataset/ml1m-images/2421.jpg,True,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, ..."
774,3255,"League of Their Own, A (1992)","[Comedy, Drama]",3255,ml1m/content/dataset/ml1m-images/3255.jpg,True,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
775,974,Algiers (1938),"[Drama, Romance]",974,ml1m/content/dataset/ml1m-images/974.jpg,True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."


In [21]:
# Add a new column 'genre_array' to 'movies_test'
movies_test = movies_test.copy()  # Create a copy of the DataFrame
movies_test['genre_array'] = movies_test['genres'].apply(lambda x: genres_to_array(x, genre2idx))
movies_test

Unnamed: 0,movieID,title,genres,id,img_path,image_exists,genre_array
0,3397,"Great Muppet Caper, The (1981)","[Children's, Comedy]",3397,ml1m/content/dataset/ml1m-images/3397.jpg,True,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2067,Doctor Zhivago (1965),"[Drama, Romance, War]",2067,ml1m/content/dataset/ml1m-images/2067.jpg,True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
2,2651,Frankenstein Meets the Wolf Man (1943),[Horror],2651,ml1m/content/dataset/ml1m-images/2651.jpg,True,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,2989,For Your Eyes Only (1981),[Action],2989,ml1m/content/dataset/ml1m-images/2989.jpg,True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
4,3415,"Mirror, The (Zerkalo) (1975)",[Drama],3415,ml1m/content/dataset/ml1m-images/3415.jpg,True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
...,...,...,...,...,...,...,...
772,2309,"Inheritors, The (Die Siebtelbauern) (1998)",[Drama],2309,ml1m/content/dataset/ml1m-images/2309.jpg,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
773,2421,"Karate Kid, Part II, The (1986)","[Action, Adventure, Drama]",2421,ml1m/content/dataset/ml1m-images/2421.jpg,True,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, ..."
774,3255,"League of Their Own, A (1992)","[Comedy, Drama]",3255,ml1m/content/dataset/ml1m-images/3255.jpg,True,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
775,974,Algiers (1938),"[Drama, Romance]",974,ml1m/content/dataset/ml1m-images/974.jpg,True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."


In [22]:
# create y_total_test for evaluation phase
y_total_test = np.array(movies_test.genre_array.tolist())

# Data Preprocessing

## Movies' titles preprocessing

For following works, we create a dictionary for labels and a helper function

In [23]:
labels_dict = {}
for label in labels:
  labels_dict[label] = []
  for key in dataset.keys():
    for s in dataset[key]['genre']:
      gen_list = s.split("|")
      if label in gen_list:
        labels_dict[label].append(True)
      else:
        labels_dict[label].append(False)

def input_labels():
  global milestone
  global added
  labels_dict_input = {}
  for label in labels:
    labels_dict_input[label] = labels_dict[label][milestone:(milestone) + added]
  return labels_dict_input

Preprocessing data for movies' titles

In [24]:
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

counter = 0
milestone = 0
added = 0

def preprocess_data(examples):
  global milestone
  global added
  # take a batch of texts
  text = examples['title']
  added = len(text)
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=7, return_tensors="pt")
  # add labels
  labels_batch = input_labels()
  milestone += added
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()

  return encoding

As a result, we have encoded dataset like following code

In [25]:
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)
encoded_dataset

Map:   0%|          | 0/777 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3106
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 777
    })
})

In [26]:
# Take an example from training dataset
example = encoded_dataset['train'][0]
print("Keys: ", example.keys())
print("Example: ", example)
print("Genre: ", [id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0])

Keys:  dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
Example:  {'input_ids': [101, 2899, 2675, 1006, 2722, 1007, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1], 'labels': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
Genre:  ['Drama']


In [27]:
tokenizer.decode(example['input_ids'])

'[CLS] washington square ( 1997 ) [SEP]'

Finally, we set the format of our data to PyTorch tensors. This will turn the training and the test sets into standard PyTorch datasets

In [28]:
encoded_dataset.set_format("torch")

## Movies' posters preprocessing

In [29]:
from tqdm import tqdm

SIZE = 200
X_train_dataset = []

for i in tqdm(range(movies_train_with_images.shape[0])):
    img_path = movies_train_with_images.iloc[i].img_path

    # Read the image using cv2
    img = cv2.imread(img_path)

    # Resize the image to the target size
    img = cv2.resize(img, (SIZE, SIZE))

    # Convert to float and normalize
    img = img.astype('float32') / 255

    X_train_dataset.append(img)

X_train = np.array(X_train_dataset)
print("X_train shape:", X_train.shape)

100%|██████████| 2602/2602 [00:08<00:00, 312.20it/s]


X_train shape: (2602, 200, 200, 3)


In [30]:
X_test_dataset = []

for i in tqdm(range(movies_test_with_images.shape[0])):
    img_path = movies_test_with_images.iloc[i].img_path

    # Read the image using cv2
    img = cv2.imread(img_path)

    # Resize the image to the target size
    img = cv2.resize(img, (SIZE, SIZE))

    # Convert to float and normalize
    img = img.astype('float32') / 255

    X_test_dataset.append(img)

X_test = np.array(X_test_dataset)
print("X_test shape:", X_test.shape)

100%|██████████| 654/654 [00:01<00:00, 347.67it/s]


X_test shape: (654, 200, 200, 3)


In [31]:
y_train = np.array(movies_train_with_images.genre_array.tolist())
y_train.shape

(2602, 18)

In [32]:
y_test = np.array(movies_test_with_images.genre_array.tolist())
y_test.shape

(654, 18)

# Define Model

## Define model for movies' titles


In [33]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Define model for movies's posters

In [34]:
poster_model = Sequential()

poster_model.add(Conv2D(filters=16, kernel_size=(5, 5),
                        activation="relu", input_shape=(SIZE,SIZE,3)))
poster_model.add(BatchNormalization())
poster_model.add(MaxPooling2D(pool_size=(2, 2)))
poster_model.add(Dropout(0.2))

poster_model.add(Conv2D(filters=32, kernel_size=(5, 5), activation='relu'))
poster_model.add(MaxPooling2D(pool_size=(2, 2)))
poster_model.add(BatchNormalization())
poster_model.add(Dropout(0.2))

poster_model.add(Conv2D(filters=64, kernel_size=(5, 5), activation="relu"))
poster_model.add(MaxPooling2D(pool_size=(2, 2)))
poster_model.add(BatchNormalization())
poster_model.add(Dropout(0.2))

poster_model.add(Conv2D(filters=64, kernel_size=(5, 5), activation='relu'))
poster_model.add(MaxPooling2D(pool_size=(2, 2)))
poster_model.add(BatchNormalization())
poster_model.add(Dropout(0.2))

poster_model.add(Flatten())

poster_model.add(Dense(128, activation='relu'))
poster_model.add(Dropout(0.5))
poster_model.add(Dense(64, activation='relu'))
poster_model.add(Dropout(0.5))

poster_model.add(Dense(y_train.shape[1], activation='sigmoid'))

In [35]:
poster_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 196, 196, 16)      1216      
                                                                 
 batch_normalization (Batch  (None, 196, 196, 16)      64        
 Normalization)                                                  
                                                                 
 max_pooling2d (MaxPooling2  (None, 98, 98, 16)        0         
 D)                                                              
                                                                 
 dropout (Dropout)           (None, 98, 98, 16)        0         
                                                                 
 conv2d_1 (Conv2D)           (None, 94, 94, 32)        12832     
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 47, 47, 32)        0

# Train the models

## Train the model for movies' titles

In [36]:
batch_size = 8
metric_name = "f1"

Installing dependencies.
*The following code cell requires us to `Restart Session` and I could not find any alternative ways`

In [37]:
!pip install -U transformers



In [38]:
!pip install transformers[torch]



In [75]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    overwrite_output_dir = True,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

To compute metrics while training, we define a `compute_metrics` function, that returns a dictionary with the desired metric values.

In [40]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_w_average = f1_score(y_true=y_true, y_pred=y_pred, average='weighted')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'weighted')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_w_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

Verify a batch as well as a forward pass before training the model

In [41]:
encoded_dataset['train'][0]['labels'].type()

'torch.FloatTensor'

In [42]:
encoded_dataset['train']['input_ids'][0]

tensor([ 101, 2899, 2675, 1006, 2722, 1007,  102])

In [43]:
type(encoded_dataset['train']['input_ids'][0])

torch.Tensor

In [44]:
# # forward pass
outputs = model(input_ids=encoded_dataset['train']['input_ids'][0].unsqueeze(0), labels=encoded_dataset['train'][0]['labels'].unsqueeze(0))
outputs

SequenceClassifierOutput(loss=tensor(0.6983, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[ 0.1353, -0.3269,  0.2553, -0.1064,  0.2255, -0.5996, -0.2955, -0.6439,
          0.3274, -0.1154, -0.4446, -0.3539, -0.1046,  0.5551, -0.3357,  0.5982,
          0.1288, -0.0075]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [45]:
type(encoded_dataset['train']['input_ids'][0])

torch.Tensor

Define Trainer object and start training

In [50]:
title_trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

title_trainer.train()

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,2.943673,0.135976,0.5,0.227799
2,2.810500,2.149932,0.013838,0.5,0.005148
3,2.409500,1.774372,0.135976,0.5,0.227799
4,1.938700,1.596754,0.143124,0.5,0.001287
5,1.938700,1.70311,0.027811,0.5,0.005148
6,1.488200,1.432191,0.264322,0.5,0.0
7,1.226800,0.685847,0.135976,0.5,0.227799
8,0.905700,0.5795,0.149813,0.5,0.030888
9,0.634200,0.649544,0.151287,0.5,0.001287
10,0.634200,0.285418,0.0,0.5,0.0


Checkpoint destination directory bert-finetuned-sem_eval-english/checkpoint-389 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory bert-finetuned-sem_eval-english/checkpoint-778 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory bert-finetuned-sem_eval-english/checkpoint-1167 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory bert-finetuned-sem_eval-english/checkpoint-1556 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory bert-finetuned-sem_eval-english/checkpoint-1945 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory bert-finetuned-sem_eval-english/checkpoint-2334 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=3890, training_loss=1.5072259819599534, metrics={'train_runtime': 570.5206, 'train_samples_per_second': 54.442, 'train_steps_per_second': 6.818, 'total_flos': 111745749361680.0, 'train_loss': 1.5072259819599534, 'epoch': 10.0})

## Train the model for movies' posters

In [51]:
# early stopping
from keras import callbacks
earlystopping = callbacks.EarlyStopping(monitor='val_loss',
                                        mode="min", patience=8,
                                        restore_best_weights=True)

In [52]:
#Calculating binary cross entropy for each label.
poster_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

history = poster_model.fit(X_train, y_train, epochs=25, validation_data=(X_test, y_test), batch_size=64, callbacks=[earlystopping])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25


# Outputs and overall evaluation

### Evaluation on titles only

In [53]:
title_trainer.evaluate()

{'eval_loss': 1.4321914911270142,
 'eval_f1': 0.2643224087558948,
 'eval_roc_auc': 0.5,
 'eval_accuracy': 0.0,
 'eval_runtime': 1.3772,
 'eval_samples_per_second': 564.17,
 'eval_steps_per_second': 71.157,
 'epoch': 10.0}

In [54]:
f1_title = title_trainer.evaluate()['eval_f1']

In [55]:
title_f1 = np.float64(f1_title)

### Evaluation on posters only

In [56]:
y_poster_pred = poster_model.predict(X_test)



In [57]:
threshold = 0.05
y_pred_binary = (y_poster_pred > threshold).astype(int)

# Calculate F1 score
poster_f1 = f1_score(y_test, y_pred_binary, average='weighted')

print("Weighted F1 Score:", poster_f1)

Weighted F1 Score: 0.3377952989555883


## Outputs

In [58]:
# Title training output
y_title_pred = title_trainer.predict(encoded_dataset['test'])

# Poster training output
y_poster_pred = poster_model.predict(X_test)



### Combine two outputs

In [59]:
y_title_pred

PredictionOutput(predictions=array([[   0.4038164 ,    1.7303541 , -105.901215  , ...,  -12.580274  ,
         -32.832554  ,  -29.828915  ],
       [   0.4038164 ,    1.7303541 , -105.901215  , ...,  -12.580274  ,
         -32.832554  ,  -29.828915  ],
       [   0.4038164 ,    1.7303541 , -105.901215  , ...,  -12.580274  ,
         -32.832554  ,  -29.828915  ],
       ...,
       [   0.4038164 ,    1.7303541 , -105.901215  , ...,  -12.580274  ,
         -32.832554  ,  -29.828915  ],
       [   0.4038164 ,    1.7303541 , -105.901215  , ...,  -12.580274  ,
         -32.832554  ,  -29.828915  ],
       [   0.40381652,    1.7303531 , -105.90121   , ...,  -12.580272  ,
         -32.832554  ,  -29.82891   ]], dtype=float32), label_ids=array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32), met

In [60]:
sigmoid = torch.nn.Sigmoid()
y_title_tensor = torch.from_numpy(y_title_pred.predictions)
probs = sigmoid(y_title_tensor)

In [61]:
probs[0]

tensor([5.9960e-01, 8.4946e-01, 0.0000e+00, 2.7625e-03, 6.6893e-04, 9.9652e-01,
        1.9637e-04, 3.1785e-06, 2.2069e-12, 3.3185e-13, 7.1944e-09, 9.5516e-01,
        2.0360e-15, 1.2717e-10, 9.3781e-01, 3.4392e-06, 5.5081e-15, 1.1104e-13])

In [62]:
y_final_title_pred = [[0 for i in range(len(labels))] for j in range(dataset['test'].num_rows)]
for i in range(probs.shape[0]):
  for j in range(probs.shape[1]):
    y_final_title_pred[i][j] = probs[i][j].item()


In [63]:
y_final_title_pred = np.array(y_final_title_pred)

In [64]:
y_final_title_pred.shape

(777, 18)

In [65]:
y_title_pred.predictions.shape

(777, 18)

In [66]:
y_poster_pred.shape

(654, 18)

In [67]:
y_final_title_pred[0]

array([5.99604249e-01, 8.49457681e-01, 0.00000000e+00, 2.76252977e-03,
       6.68928609e-04, 9.96524751e-01, 1.96371475e-04, 3.17847071e-06,
       2.20690133e-12, 3.31848318e-13, 7.19443527e-09, 9.55163062e-01,
       2.03597667e-15, 1.27173716e-10, 9.37809527e-01, 3.43918123e-06,
       5.50811550e-15, 1.11036810e-13])

In [68]:
y_poster_pred[0]

array([0.13980074, 0.2096324 , 0.08061279, 0.17881647, 0.16827613,
       0.45823142, 0.06316186, 0.20925595, 0.02946592, 0.12809125,
       0.1989111 , 0.33367866, 0.09255355, 0.15812339, 0.20897184,
       0.05842439, 0.13888429, 0.2911199 ], dtype=float32)

As the performance on images is better than the performance on title, we use define the weights as followed:

In [69]:
w1 = (title_f1)/(title_f1 + poster_f1)
w2 = (poster_f1)/(title_f1 + poster_f1)

In [70]:
print("w1: ", w1)
print("w2: ", w2)

w1:  0.4389879343700522
w2:  0.5610120656299479


In [71]:
# Combined output
y_overall_pred = [[0 for i in range(len(labels))] for j in range(dataset['test'].num_rows)]
y_overall_pred = np.array(y_overall_pred)

i_true = 0
for i in range(dataset['test'].num_rows):
  if movies_test['image_exists'][i] == True:
    for j in range(len(labels)):
      y_overall_pred[i][j] = w1*y_final_title_pred[i][j] + w2*y_poster_pred[i_true][j]
    i_true += 1
  else:
    for j in range(len(labels)):
      y_overall_pred[i][j] = y_final_title_pred[i][j]

In [72]:
y_final_title_pred

array([[5.99604249e-01, 8.49457681e-01, 0.00000000e+00, ...,
        3.43918123e-06, 5.50811550e-15, 1.11036810e-13],
       [5.99604249e-01, 8.49457681e-01, 0.00000000e+00, ...,
        3.43918123e-06, 5.50811550e-15, 1.11036810e-13],
       [5.99604249e-01, 8.49457681e-01, 0.00000000e+00, ...,
        3.43918123e-06, 5.50811550e-15, 1.11036810e-13],
       ...,
       [5.99604249e-01, 8.49457681e-01, 0.00000000e+00, ...,
        3.43918123e-06, 5.50811550e-15, 1.11036810e-13],
       [5.99604249e-01, 8.49457681e-01, 0.00000000e+00, ...,
        3.43918123e-06, 5.50811550e-15, 1.11036810e-13],
       [5.99604309e-01, 8.49457622e-01, 0.00000000e+00, ...,
        3.43918805e-06, 5.50811550e-15, 1.11037231e-13]])

In [73]:
y_test

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [74]:
#f1_score
from sklearn.metrics import f1_score

# Convert the predicted probabilities to binary values based on a threshold
threshold = 0.5

y_overall_pred_binary = (y_final_title_pred > threshold).astype(int)

# Calculate F1 score
f1 = f1_score(y_total_test, y_overall_pred_binary, average='weighted')

print("Weighted F1 Score:", f1)

Weighted F1 Score: 0.2776515607586077
