In [None]:
!pip install -U scikit-learn

In [None]:
!pip install -r https://raw.githubusercontent.com/ultralytics/yolov5/master/requirements.txt

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import shutil
import json
import re
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
import torch.utils.data as data_utils
from torch.nn.modules import MSELoss, L1Loss

import sklearn.preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
import glob
import csv
import cv2
import random
from PIL import Image
from itertools import product

In [2]:
path1 = "./Movie_Poster_Metadata/groundtruth"
temp_path = "./Movie_Poster_Metadata/temp_groundtruth"
path2 = "./Movie_Poster_Metadata/updated_groundtruth"

### Reading the input file and creating a clean one
Note: only run once

In [13]:
dir_list = os.listdir(path1)
 
if not os.path.exists(temp_path):
  os.makedirs(temp_path)    

if not os.path.exists(path2):
  os.makedirs(path2)



for file_name in dir_list:
    
    with open(path1+'/'+file_name,'r',encoding='utf-16-le') as file1:

        temp_file = open(temp_path+'/'+file_name,'w',encoding='utf-8')

        for line in file1.readlines():

            line = line.replace("}\n","},\n")
            
            # reading all lines that begin with "  "_id""
            y = re.findall("^  \"_id\"", line)
            if not y:
                temp_file.write(line)

    file1.close()
    temp_file.close()

In [14]:
dir_list = os.listdir(temp_path)
 
for file_name in dir_list:
    
    with open(temp_path+'/'+file_name,'r',encoding='utf-8') as temp_file:
    
        file2 = open(path2+'/'+file_name,'w',encoding='utf-8')

        lines = temp_file.readlines()
        lines = lines[1:-1]

        file2.write("[{")
        file2.writelines(lines)
        file2.write("}]")
        
    temp_file.close()
    file2.close()

shutil.rmtree(temp_path)  

### Augmenting the data set
Note: only run once

To-Do: Balance data according to occurence of genres. Summarize genres with little data.

In [5]:
path3 = "./Movie_Poster_Dataset"

# Going through all jpg-files, they are chopped up into 100x100 chunks and saved into a new folder
for dirname in os.listdir(path3):
    for filename in os.listdir(path3 + "/" + dirname):
        name, ext = os.path.splitext(filename)
        if(ext == '.jpg'):
            image = Image.open(os.path.join(path3 + "/" + dirname, filename))
            width, height = image.size
            chopsize = 100
            for x0 in range(0, width, chopsize):
                for y0 in range(0, height, chopsize):
                    if(y0+chopsize <= height and x0+chopsize <= width):
                        box = (x0, y0, x0+chopsize, y0+chopsize)
                        image.crop(box).save('./Movie_Poster_Dataset_Cropped/%s.x%03d.y%03d.jpg' % (filename.replace('.jpg',''), x0, y0))


In [34]:
path3 = "./Movie_Poster_Dataset"

# Going through all jpg-files, they are chopped up into 100x100 chunks and saved into a new folder
for dirname in os.listdir(path3):
    for filename in os.listdir(path3 + "/" + dirname):
        name, ext = os.path.splitext(filename)
        if(ext == '.jpg'):
            image = Image.open(os.path.join(path3 + "/" + dirname, filename))
            box = (0, 0, 100, 100)
            image.crop(box).save('./Movie_Poster_Dataset_Cropped_Once/%s.jpg' % (filename))


In [None]:
#to be used later to augment data of underrepresented genres (balance data)

print('Nr of movies in json: '+str(len(dicts)))
missing = []
for obj in dicts:
    genrelist = obj.get('Genre').split(',')
    fname = obj.get('imdbID') + '.jpg'
    if(path.exists(fname)):
        for genre in genrelist:
            #copy the file with name obj.key("imdbID") to each genre folder
            if(genre == 'N/A'):
                shutil.copy2(os.path.join('.', fname), './NotApplicable')
            elif(genre == 'Adult' || genre == 'Game-Show' || genre == 'News' || genre == 'Reality-TV' || genre == 'Talk-Show' || genre == 'Western'):
                shutil.copy2(os.path.join('.', fname), './Other')
            else:
                shutil.copy2(os.path.join('.', fname), './'+genre.lstrip())
    else:
        missing.append(fname)


print('Nr of missing IDs: '+str(len(missing)))


### Function to append all the json objects into dataframe 

In [3]:
dir_list = os.listdir(path2)

movies_df = pd.DataFrame()

for file_name in dir_list:    

#     try:
    df = pd.read_json(path2+'/'+file_name,encoding='utf-8',orient='records')
    df = df[['imdbID','Director','Genre','imdbRating']]
    movies_df = pd.concat([movies_df,df], ignore_index=True)

#     except:
#         print(file_name)
        
print(movies_df.dtypes)
print(movies_df.head(20))
print(movies_df.shape)

imdbID        object
Director      object
Genre         object
imdbRating    object
dtype: object
       imdbID                                  Director  \
0   tt0080684                            Irvin Kershner   
1   tt0081562                            Sidney Poitier   
2   tt0080339  Jim Abrahams, David Zucker, Jerry Zucker   
3   tt0080377                            Buddy Van Horn   
4   tt0081375                              Howard Zieff   
5   tt0080549                             Michael Apted   
6   tt0081529                               Hal Needham   
7   tt0080453                            Randal Kleiser   
8   tt0080455                               John Landis   
9   tt0081283                            Robert Redford   
10  tt0081353                             Robert Altman   
11  tt0081696                             James Bridges   
12  tt0081505                           Stanley Kubrick   
13  tt0081480                              Jay Sandrich   
14  tt0080520    

### Creating multi-hot encoded genre vectors

In [4]:
#remove duplicates and set imdbID as index
movies_df = movies_df.drop_duplicates(subset=["imdbID"], keep="last")
movies_df.set_index("imdbID", inplace=True)

In [5]:
mlb = MultiLabelBinarizer()
multihot = mlb.fit_transform(movies_df["Genre"].dropna().str.split(", "))
genres_df = pd.DataFrame({"multihot":[multihot.astype(int)]}, index = movies_df.index)
movies_df = pd.concat([movies_df, genres_df], axis=1 )
print(mlb.classes_)
print(movies_df.head(10))

['Action' 'Adult' 'Adventure' 'Animation' 'Biography' 'Comedy' 'Crime'
 'Documentary' 'Drama' 'Family' 'Fantasy' 'Game-Show' 'History' 'Horror'
 'Music' 'Musical' 'Mystery' 'N/A' 'News' 'Reality-TV' 'Romance' 'Sci-Fi'
 'Short' 'Sport' 'Talk-Show' 'Thriller' 'War' 'Western']
                                           Director  \
imdbID                                                
tt0080684                            Irvin Kershner   
tt0081562                            Sidney Poitier   
tt0080339  Jim Abrahams, David Zucker, Jerry Zucker   
tt0080377                            Buddy Van Horn   
tt0081375                              Howard Zieff   
tt0080549                             Michael Apted   
tt0081529                               Hal Needham   
tt0080453                            Randal Kleiser   
tt0080455                               John Landis   
tt0081283                            Robert Redford   

                                Genre imdbRating  \
imdbID      

In [6]:
#create a dictionary with multi-hot encoded vectors; index = imdbID
multihot_dict = {movies_df.index.tolist()[i] : multihot[i] for i in range(0, len(multihot))}
#print(multihot_dict)

### Adding the images to the dataframe
Note: not used

In [32]:
flist=glob.glob('./Movie_Poster_Dataset/*/*.jpg')

imdb_id_arr = ["0" for a in range(len(flist))]
image_arr = ["0" for a in range(len(flist))]
index = 0

for filename in flist:
        
    imdb_id = filename[filename.index("tt"):filename.index(".jpg")]
        
    imdb_id_arr[index] = imdb_id
                
    img = np.array(cv2.imread(filename))
    img = np.swapaxes(img, 2,0)
    img = np.swapaxes(img, 2,1)
    
    image_arr[index] = img
    
    index +=1 
        
image_dict = {
    "imdbID": imdb_id_arr,
    "Poster": image_arr
}

images_df = pd.DataFrame.from_dict(image_dict)
images_df = images_df.drop_duplicates(subset=["imdbID"], keep="last")
images_df.set_index("imdbID", inplace=True)
movies_df = pd.concat([movies_df, images_df], axis=1)
print(movies_df.head(10))

                                           Director  \
imdbID                                                
tt0080684                            Irvin Kershner   
tt0081562                            Sidney Poitier   
tt0080339  Jim Abrahams, David Zucker, Jerry Zucker   
tt0080377                            Buddy Van Horn   
tt0081375                              Howard Zieff   
tt0080549                             Michael Apted   
tt0081529                               Hal Needham   
tt0080453                            Randal Kleiser   
tt0080455                               John Landis   
tt0081283                            Robert Redford   

                                Genre imdbRating  \
imdbID                                             
tt0080684  Action, Adventure, Fantasy        8.8   
tt0081562               Comedy, Crime        6.8   
tt0080339                      Comedy        7.8   
tt0080377              Action, Comedy        6.0   
tt0081375                 C

### Passing the images through a convolutional network

In [8]:
#training controls
batch_size = 20
epochs = 2
training_size = 0.7
learning_rate = 0.001
dropout = [0.3, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2, 0.2, 0.15]
# input image dimensions
img_rows, img_cols = 100, 100

In [35]:
# the data holders
x_test = []
x_train = []
y_test = []
y_train = []

#images need to have the same size!!
flist=glob.glob('./Movie_Poster_Dataset_Cropped_Once/*.jpg')

length=int(len(flist)*training_size)
i = 0

#create lists with input data (images) and output data (multi-hot encoded genre vectors)
for filename in flist:
        
    imdb_id = filename[filename.index("tt"):filename.index(".jpg")]
      
    if imdb_id in multihot_dict:
        img = np.array(cv2.imread(filename))
        img = np.swapaxes(img, 2,0)
        img = np.swapaxes(img, 2,1)
        
        genre_arr = np.empty([28])

        for j in range(len(multihot_dict[imdb_id])):
            genre_arr[j] = multihot_dict[imdb_id][j]
    
        if(i<length):  
            x_train.append(img)
            y_train.append(genre_arr)
        else:
            x_test.append(img)
            y_test.append(genre_arr)
        
        i +=1 

In [36]:
print(len(x_train))
print(len(y_train))

print(x_train[0])
print(y_train[0])

5636
5636
[[[ 69   0   8 ... 202 202 202]
  [ 73   8  17 ... 202 204 202]
  [ 48   0   0 ... 206 208 206]
  ...
  [218 219 223 ... 152  98  85]
  [216 217 222 ... 178  88  39]
  [215 216 221 ... 202 101  23]]

 [[ 68   0   7 ... 232 232 232]
  [ 72   7  16 ... 232 232 232]
  [ 48   0   0 ... 234 233 234]
  ...
  [243 244 244 ... 157 103  90]
  [241 242 243 ... 183  93  44]
  [240 241 242 ... 207 106  28]]

 [[ 88  17  27 ... 233 233 233]
  [ 92  27  36 ... 233 233 233]
  [ 66  13  18 ... 235 235 235]
  ...
  [247 248 246 ... 166 112  99]
  [245 246 245 ... 192 102  53]
  [244 245 244 ... 216 115  37]]]
[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 1. 0. 0.]


In [38]:
#converting the data from lists to numpy arrays
x_train=np.asarray(x_train,dtype=float)
x_test=np.asarray(x_test,dtype=float)
y_train=np.asarray(y_train,dtype=float)
y_test=np.asarray(y_test,dtype=float)

#scaling down the RGB data
x_train /= 255
x_test /= 255

#printing stats about the features
print('x_train shape:', x_train.shape)
print(x_train.shape[0], 'train samples')
print(x_test.shape[0], 'test samples')

train_length = x_train.shape[0]

x_train=torch.from_numpy(x_train)
x_test=torch.from_numpy(x_test)
y_train=torch.from_numpy(y_train)
y_test=torch.from_numpy(y_test)

train = data_utils.TensorDataset(x_train, y_train)
train_loader = data_utils.DataLoader(train, batch_size=batch_size, shuffle=True)

test = data_utils.TensorDataset(x_test, y_test)
test_loader = data_utils.DataLoader(test, batch_size=batch_size, shuffle=False)

x_train shape: (5636, 3, 100, 100)
5636 train samples
2416 test samples


In [39]:
class Net(nn.Module):
    def __init__(self, input_shape=(3, img_rows, img_cols)):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 128, kernel_size=2)
        self.conv1_drop = nn.Dropout2d(p=dropout[0])
        self.conv2 = nn.Conv2d(128, 64, kernel_size=2)
        self.conv2_drop = nn.Dropout2d(p=dropout[1])
        self.conv3 = nn.Conv2d(64, 64, kernel_size=2)
        self.conv3_drop = nn.Dropout2d(p=dropout[2])
        self.conv4 = nn.Conv2d(64, 64, kernel_size=2)
        self.conv4_drop = nn.Dropout2d(p=dropout[3])
        self.conv5 = nn.Conv2d(64, 32, kernel_size=2)
        self.conv5_drop = nn.Dropout2d(p=dropout[4])
        self.conv6 = nn.Conv2d(32, 16, kernel_size=2)
        self.conv6_drop = nn.Dropout2d(p=dropout[5])
        
        n_size = self._get_conv_output(input_shape)
        
        self.fc1 = nn.Linear(n_size, 16)
        self.fc1_drop = nn.Dropout(p=dropout[6])
        self.fc2 = nn.Linear(16, 16)
        self.fc2_drop = nn.Dropout(p=dropout[7])
        self.fc3 = nn.Linear(16, 8)
        self.fc3_drop = nn.Dropout(p=dropout[8])
        self.fc4 = nn.Linear(8, 1)
        
    def _get_conv_output(self, shape):
        bs = 1
        input = Variable(torch.rand(bs, *shape))
        output_feat = self._forward_features(input)
        n_size = output_feat.data.view(bs, -1).size(1)
        return n_size
        
    def _forward_features(self, x):
        x = F.relu(F.max_pool2d(self.conv1_drop(self.conv1(x)), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = F.relu(F.max_pool2d(self.conv3_drop(self.conv3(x)), 2))
        x = F.relu(F.max_pool2d(self.conv4_drop(self.conv4(x)), 2))
        x = F.relu(F.max_pool2d(self.conv5_drop(self.conv5(x)), 2))
        #x = F.relu(F.max_pool2d(self.conv6_drop(self.conv6(x)), 2))
        return x
        
    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1_drop(self.conv1(x)), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = F.relu(F.max_pool2d(self.conv3_drop(self.conv3(x)), 2))
        x = F.relu(F.max_pool2d(self.conv4_drop(self.conv4(x)), 2))
        x = F.relu(F.max_pool2d(self.conv5_drop(self.conv5(x)), 2))
        #x = F.relu(F.max_pool2d(self.conv6_drop(self.conv6(x)), 2))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1_drop(self.fc1(x)))
        x = F.relu(self.fc2_drop(self.fc2(x)))
        x = F.relu(self.fc3_drop(self.fc3(x)))
        x = self.fc4(x)
        return x

model = Net()
criterion = MSELoss(size_average=True)
human_criterion = L1Loss(size_average=True)
optimizer = optim.RMSprop(model.parameters(), lr=learning_rate,
            alpha=0.9, eps=1e-08, weight_decay=0.0)

def train(epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = Variable(data).float(), Variable(target).float()
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        human_loss= human_criterion(output, target)
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f} {:.6f}'.format(
            epoch, batch_idx * len(data), len(train_loader.dataset),
            100. * batch_idx / len(train_loader), loss.data, human_loss.data))

def test():
    print('test')
    model.eval()
    test_loss = 0
    correct = 0
    human_loss = 0
    i = 0
    for batch_idx, (data, target) in enumerate(test_loader):
        i+=1
        data, target = Variable(data, volatile=True).float(), Variable(target).float()
        output = model(data)
        loss = criterion(output, target)
        test_loss += loss
        human_loss += human_criterion(output, target)
        if loss==0:
            correct+=1

    print('\nTest set: \nAverage sq_loss: {:.4f} \nAverage abs_loss: {:.4f} \nGuessed 100% correct: {:.4f}\n'.format(test_loss.data/i, human_loss.data/i, correct))

model.float()
print(model)
for epoch in range(0, epochs):
    train(epoch)
    test()

    #cross-entropy for classification evaluation!

  return F.mse_loss(input, target, reduction=self.reduction)


Net(
  (conv1): Conv2d(3, 128, kernel_size=(2, 2), stride=(1, 1))
  (conv1_drop): Dropout2d(p=0.3, inplace=False)
  (conv2): Conv2d(128, 64, kernel_size=(2, 2), stride=(1, 1))
  (conv2_drop): Dropout2d(p=0.3, inplace=False)
  (conv3): Conv2d(64, 64, kernel_size=(2, 2), stride=(1, 1))
  (conv3_drop): Dropout2d(p=0.3, inplace=False)
  (conv4): Conv2d(64, 64, kernel_size=(2, 2), stride=(1, 1))
  (conv4_drop): Dropout2d(p=0.3, inplace=False)
  (conv5): Conv2d(64, 32, kernel_size=(2, 2), stride=(1, 1))
  (conv5_drop): Dropout2d(p=0.2, inplace=False)
  (conv6): Conv2d(32, 16, kernel_size=(2, 2), stride=(1, 1))
  (conv6_drop): Dropout2d(p=0.2, inplace=False)
  (fc1): Linear(in_features=128, out_features=16, bias=True)
  (fc1_drop): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=16, out_features=16, bias=True)
  (fc2_drop): Dropout(p=0.2, inplace=False)
  (fc3): Linear(in_features=16, out_features=8, bias=True)
  (fc3_drop): Dropout(p=0.15, inplace=False)
  (fc4): Linear(in_features

  return F.l1_loss(input, target, reduction=self.reduction)






  return F.mse_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)
  data, target = Variable(data, volatile=True).float(), Variable(target).float()


test

Test set: 
Average sq_loss: 0.0724 
Average abs_loss: 0.1501 
Guessed 100% correct: 0.0000



test

Test set: 
Average sq_loss: 0.0723 
Average abs_loss: 0.1480 
Guessed 100% correct: 0.0000



### Passing the images through object detection

In [8]:
batch_size = 20
epochs = 2
training_size = 0.7
learning_rate = 0.001
img_rows, img_cols = 100, 100

In [10]:
# the data holders
x_test_yolo = []
x_train_yolo = []
y_test_yolo = []
y_train_yolo = []

#images need to have the same size!!
flist=glob.glob('./Movie_Poster_Dataset_Cropped_Once/*.jpg')

#pretrained YOLOv5 model
yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s')

length=int(len(flist)*training_size)
i = 0

#create lists with input data (object confidence vector) and output data (multi-hot encoded genre vectors)
for filename in flist:
        
    imdb_id = filename[filename.index("tt"):filename.index(".jpg")]
      
    if imdb_id in multihot_dict:
        img = np.array(cv2.imread(filename))
        img = np.swapaxes(img, 2,0)
        img = np.swapaxes(img, 2,1)
        
        results = yolo_model(img, size = 100)
            
        #create an array for the 91 object categories and set initial confidence to 0
        obj_arr = np.empty([91])
        for x in range(91):
            obj_arr[x] = 0.0

        #update the confidence values according to the object detection results
        for obj in results.pandas().xyxy[0]:
            index =  results.pandas().xyxy[0]['class']
            obj_arr[index] = obj_arr[index] + results.pandas().xyxy[0].confidence
        
        #create multi-hot encoded genre vector
        genre_arr = np.empty([28])

        for j in range(len(multihot_dict[imdb_id])):
            genre_arr[j] = multihot_dict[imdb_id][j]
        
        if(i<length):                   
            x_train_yolo.append(obj_arr)
            y_train_yolo.append(genre_arr)
        else:
            x_test_yolo.append(obj_arr)
            y_test_yolo.append(genre_arr)
        
        i +=1 

Using cache found in C:\Users\carol/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2022-5-5 torch 1.11.0+cpu CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 


In [12]:
#converting the data from lists to numpy arrays
x_train_yolo=np.asarray(x_train_yolo,dtype=float)
x_test_yolo=np.asarray(x_test_yolo,dtype=float)
y_train_yolo=np.asarray(y_train_yolo,dtype=float)
y_test_yolo=np.asarray(y_test_yolo,dtype=float)

#printing stats about the features
print('x_train shape:', x_train_yolo.shape)
print(x_train_yolo.shape[0], 'train samples')
print(x_test_yolo.shape[0], 'test samples')

train_length = x_train_yolo.shape[0]

x_train_yolo=torch.from_numpy(x_train_yolo)
x_test_yolo=torch.from_numpy(x_test_yolo)
y_train_yolo=torch.from_numpy(y_train_yolo)
y_test_yolo=torch.from_numpy(y_test_yolo)

train = data_utils.TensorDataset(x_train_yolo, y_train_yolo)
train_loader = data_utils.DataLoader(train, batch_size=batch_size, shuffle=True)

test = data_utils.TensorDataset(x_test_yolo, y_test_yolo)
test_loader = data_utils.DataLoader(test, batch_size=batch_size, shuffle=False)

x_train shape: (5636, 91)
5636 train samples
2416 test samples


In [13]:
#fully connected layer after object detection
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        #fully connected layer
        self.fc1 = nn.Linear(91, 28)
        
    def forward(self, x):
        x = self.fc1(x)
        output = F.log_softmax(x, dim=1)
        return output

model = Net()

result = model.train()
criterion = MSELoss(size_average=True)
human_criterion = L1Loss(size_average=True)
optimizer = optim.RMSprop(model.parameters(), lr=learning_rate,
            alpha=0.9, eps=1e-08, weight_decay=0.0)


