# Prediction based on Trailer (naive approach)

## Import

In [0]:
import json
import os
import pickle
import urllib

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

try:
    from google.colab import drive
    drive_dir = '/content/drive'
    drive.mount(drive_dir)
    os.chdir("drive/My Drive/AML/Git_lastClone/neural-network/trailer_model/naive_approach")
    root_dir = '/content/drive/My Drive/AML/'
    git_dir = root_dir+'Git/'
    COLAB_IN = True
except:
    COLAB_IN = False

if COLAB_IN:
    !pip install youtube-dl
    !pip install livelossplot
    
import tensorflow as tf
from keras import layers, models, applications
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, BatchNormalization, Lambda
import cv2
from keras import backend as K
import numpy as np
from tensorflow.python.keras.utils import Sequence
import time
from sklearn.model_selection import train_test_split
from livelossplot.keras import PlotLossesCallback
from IPython.display import clear_output
# Clear this cell's displayed output; wait=True defers the clearing until new
# output is available to replace it.
clear_output(wait=True)

## Dataset

### Loader

In [0]:
# IPython shell escape: overwrite /etc/youtube-dl.conf with the single option
# --force-ipv4.
!echo "--force-ipv4" > /etc/youtube-dl.conf
class Entry:
  """One movie record: identity, revenues and optional metadata/trailer links.

  The constructor requires the keyword arguments ``movie_id``, ``name``,
  ``revenue_opening`` and ``revenue_total`` (a missing one raises
  ``KeyError``); any other keyword argument is ignored.
  """

  def __init__(self, **kwargs):
    for field in ('movie_id', 'name', 'revenue_opening', 'revenue_total'):
      setattr(self, field, kwargs[field])

  def __repr__(self):
    return 'Name: {}; movie ID: {}'.format(self.name, self.movie_id)

  def set_metadata(self, metadata_dict):
    """Copy the metadata fields from ``metadata_dict`` onto this entry."""
    for field in ('year', 'genres', 'actors', 'directors', 'creators', 'duration'):
      setattr(self, field, metadata_dict[field])

  def set_trailers(self, trailer_dict):
    """Copy the IMDb and YouTube trailer references onto this entry."""
    for field in ('imdb_trailer', 'youtube_trailer'):
      setattr(self, field, trailer_dict[field])
    
 
    

class Dataset:
  """Movie dataset backed by ``Dataset.csv`` plus pickled side files.

  ``entries`` holds one ``Entry`` per CSV row and ``movie_indeces`` maps each
  movie id to the position of its entry in ``entries``.
  """

  def __init__(self, dataset_dir):
    """Read ``Dataset.csv`` from ``dataset_dir`` and create one entry per row."""
    self.basic_data = pd.read_csv(os.path.join(dataset_dir, 'Dataset.csv'))
    self.dataset_dir = dataset_dir

    self.entries = list()
    self.movie_indeces = dict()
    self._create_entries()

    self.Youtube_urlroot = "https://www.youtube.com"
    self.Imdb_urlroot = "https://www.imdb.com"

  def get_entry(self, movie_id):
    """Return the entry stored for ``movie_id`` (``KeyError`` if unknown)."""
    return self.entries[self.movie_indeces[movie_id]]

  def get_revenue(self, movie_id):
    """Return ``revenue_opening`` of the first CSV row matching ``movie_id``.

    Raises ``IndexError`` when no row has that movie id.
    """
    tmp_df = self.basic_data[self.basic_data['movie_id'] == movie_id]
    return tmp_df['revenue_opening'].values[0]

  def load_metadata(self):
    """Load ``metadata.data`` and attach its contents to the dataset and entries.

    Every item of the pickled ``'global'`` mapping becomes an attribute of the
    dataset (decompressed to indicator vectors); every item of ``'entries'``
    is stored on the matching entry, with its categorical fields expanded to
    dense vectors.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load files
    # from a trusted source.
    with open(os.path.join(self.dataset_dir, 'metadata.data'), 'rb') as fr:
      metadata_dict = pickle.load(fr)

    for k, v in metadata_dict['global'].items():
      setattr(self, k, self._decompress_global_categorical(v))

    for movie_id, movie_metadata in metadata_dict['entries'].items():
      entry = self.get_entry(movie_id)
      entry.set_metadata(movie_metadata)

      # decompress categorical
      # Relies on all_actors / all_creators / all_directors / all_genres being
      # present on self — presumably set from the 'global' mapping above;
      # verify against the file that produced metadata.data.
      entry.actors = self._decompress_categorical(entry.actors, len(self.all_actors))
      entry.creators = self._decompress_categorical(entry.creators, len(self.all_creators))
      entry.directors = self._decompress_categorical(entry.directors, len(self.all_directors))
      entry.genres = self._decompress_categorical(entry.genres, len(self.all_genres))

  def load_trailers(self):
    """Load ``trailers.data`` and store each movie's trailer links on its entry."""
    # NOTE(security): pickle.load can execute arbitrary code; only load files
    # from a trusted source.
    with open(os.path.join(self.dataset_dir, 'trailers.data'), 'rb') as fr:
      trailers_dict = pickle.load(fr)

    for movie_id, trailers in trailers_dict.items():
      self.get_entry(movie_id).set_trailers(trailers)

  def GetTrailer(self, IMDbVideoUrl, YoutubeVideoUrl, trailers_dir, filename): # download trailer
    """Download the YouTube trailer to ``<trailers_dir>/<filename>.mp4``.

    Runs the ``youtube-dl`` command line tool and waits for it to finish.
    ``IMDbVideoUrl`` is accepted but not used. Failures are not raised: the
    caller is expected to check whether the output file exists.
    """
    # Local import keeps this block self-contained.
    import subprocess

    file_output = os.path.join(trailers_dir, filename + ".mp4")
    # An argument list (no shell) so quotes or shell metacharacters in the URL
    # or path cannot break or inject into the command; this also works outside
    # IPython, unlike a `!` shell escape.
    command = [
        'youtube-dl', '-f', 'best', '--force-ipv4',
        '--merge-output-format', 'mp4',
        str(YoutubeVideoUrl), '-o', file_output,
    ]
    try:
      # stdout is discarded; the exit status is deliberately not checked.
      subprocess.run(command, stdout=subprocess.DEVNULL, check=False)
    except OSError as err:
      # e.g. youtube-dl is not installed; stay best-effort.
      print(f'youtube-dl could not be started: {err}')

  def _create_entries(self):
    """Fill ``entries`` and ``movie_indeces`` from the rows of ``basic_data``.

    Each row is unpacked into exactly four values, in the order movie id,
    name, opening revenue, total revenue.
    """
    for idx, [movie_id, name, rev_open, rev_total] in self.basic_data.iterrows():
      entry = Entry(movie_id=movie_id, name=name,
                    revenue_opening=rev_open,
                    revenue_total=rev_total)
      self.entries.append(entry)
      self.movie_indeces[movie_id] = idx

  def _compress_global_categorical(self, input_dict):
    """Map every key to the argmax position of its indicator vector."""
    return {k: np.argmax(v) for k, v in input_dict.items()}

  def _decompress_global_categorical(self, input_dict):
    """Map every key to an indicator vector with a 1 at the key's stored index.

    All vectors have length ``len(input_dict)``.
    """
    size = len(input_dict)
    output_dict = dict()
    for k, v in input_dict.items():
      indicator = np.zeros(size)
      indicator[v] = 1
      output_dict[k] = indicator

    return output_dict

  def _compress_categorical(self, arr):
    """Return ``{index: value}`` for the strictly positive values of ``arr``."""
    return {idx: val for idx, val in enumerate(arr) if val > 0}

  def _decompress_categorical(self, arr, total_len):
    """Expand an ``{index: value}`` mapping to a dense vector of ``total_len``."""
    indicator = np.zeros(total_len)
    for idx, val in arr.items():
      indicator[idx] = val

    return indicator

In [0]:
# Build the dataset from Dataset.csv under ../../dataset/ (relative to the
# current working directory).
movies = Dataset(dataset_dir = '../../dataset/')

# Attach the pickled trailer links and metadata to the entries.
movies.load_trailers()
movies.load_metadata()

## Naive

In [0]:
import numpy as np
import keras
# Directory in which DataGenerator looks for <movie_id>.mp4 and into which
# missing trailers are downloaded.
trailers_dir = './'
class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that turns movie trailers into (frames, label) batches.

    For every requested movie the file ``<movie_id>.mp4`` is looked up in the
    module-level ``trailers_dir`` (and downloaded through
    ``movies.GetTrailer`` when missing), sampled at roughly one frame per
    second, resized to ``dim`` and divided by 255.

    Args:
        movies: dataset object exposing ``entries`` and ``GetTrailer``.
        labels: sequence indexed by the same ids as ``movies.entries``.
        batch_size: number of ids drawn per batch.
        n_classes: number of target classes (stored, not used here).
        list_IDs: positions into ``movies.entries`` this generator serves.
        dim: target frame size handed to ``cv2.resize``.
        n_channels: number of colour channels (stored, not used here).
        shuffle: reshuffle the id order at the end of every epoch.

    NOTE(review): movies whose trailer cannot be obtained are skipped, so a
    batch can hold fewer than ``batch_size`` samples (none when ``batch_size``
    is 1), and clips of different lengths only stack into a regular array when
    ``batch_size`` is 1 — confirm the training loop tolerates this.
    """

    def __init__(self, movies, labels, batch_size, n_classes, list_IDs, dim=(300, 300), n_channels=3, shuffle=True):
        """Store the configuration and build the index order of the first epoch."""
        super().__init__()
        self.movies = movies
        self.labels = labels
        self.dim = dim
        self.batch_size = batch_size
        self.list_IDs = list_IDs
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Return the number of full batches per epoch."""
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        """Generate batch number ``index`` as an ``(X, y)`` pair."""
        # Positions (into list_IDs) that belong to this batch.
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        list_IDs_temp = [self.list_IDs[k] for k in indexes]
        return self.__data_generation(list_IDs_temp)

    def on_epoch_end(self):
        """Rebuild the index order, shuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def _locate_trailer(self, entry):
        """Return the path of the entry's trailer file, downloading it if needed.

        Returns ``None`` when the file still does not exist about 20 seconds
        after the download call returned.
        """
        video_file = os.path.join(trailers_dir, entry.movie_id + '.mp4')
        if os.path.isfile(video_file):
            return video_file

        print('Donwload trailer')
        # Use the dataset handed to this generator, not a module-level global.
        self.movies.GetTrailer(entry.imdb_trailer, entry.youtube_trailer, trailers_dir, entry.movie_id)
        started = time.time()
        while not os.path.isfile(video_file):
            time.sleep(1)
            if time.time() - started > 20:
                return None
        return video_file

    def _read_frames(self, video_file, fps=1):
        """Decode ``video_file`` and return a list of sampled, resized frames.

        Roughly ``fps`` frames per second of video are kept; each kept frame
        is resized to ``self.dim`` and converted to float32 after dividing by
        255.
        """
        cap = cv2.VideoCapture(video_file)
        try:
            # NOTE(review): requesting a frame rate on a file source is
            # typically ignored by OpenCV; kept from the original code —
            # confirm whether it is needed.
            cap.set(cv2.CAP_PROP_FPS, 15)
            video_fps = cap.get(cv2.CAP_PROP_FPS)
            # Keep every `step`-th frame; every frame when the video is not
            # faster than the target rate (or its rate is reported as 0).
            step = int(video_fps / fps) if video_fps > fps else 1

            frames = []
            frame_index = -1
            while True:
                ok, frame = cap.read()
                if not ok:
                    break
                frame_index += 1
                if frame_index % step != 0:
                    continue
                try:
                    # Use the frame that was just read; reading again here
                    # would drop it and shift the sampling grid.
                    im = cv2.resize(frame, self.dim)
                except cv2.error:
                    print("error")
                    continue
                frames.append((im / 255.).astype(np.float32))
            return frames
        finally:
            # Always free the decoder, even if decoding raised.
            cap.release()

    def __data_generation(self, list_IDs_temp):
        """Build the arrays ``(X, y)`` for the movies in ``list_IDs_temp``.

        ``X`` holds one frame stack per movie, ``y`` the matching labels.
        Movies without an obtainable trailer are left out of both.
        """
        X = []
        y = []

        for ID in list_IDs_temp:
            print("ID", ID)
            entry = self.movies.entries[ID]
            videoFile = self._locate_trailer(entry)
            if videoFile is None:
                continue
            print('trailer downloaded')

            print("video procesing")
            X.append(np.array(self._read_frames(videoFile)))
            y.append(self.labels[ID])
        return np.array(X), np.array(y)

In [0]:
class InceptionV3LSTM:
    """Video classifier: frozen InceptionV3 frame features fed into an LSTM."""

    def __init__(self):
        """Nothing to set up; the network is built by ``CreateModel``."""

    # ****** model ******
    def CreateModel(self, resolution, n_classes):
        """Build the network and store it in ``self.model``.

        ``resolution`` is the tuple inserted between the (variable) time axis
        and the 3 colour channels of the input shape; ``n_classes`` is the
        width of the softmax output layer.
        """
        # set learning phase to 0
        K.set_learning_phase(0)
        self.n_classes = n_classes

        clip_shape = (None,) + resolution + (3,)
        video = layers.Input(shape=clip_shape, name='video_input')

        frame_encoder = applications.InceptionV3(
            include_top=False,
            weights='imagenet',
            pooling='avg'
        )
        frame_encoder.trainable = False

        # wrap cnn into Lambda and pass it into TimeDistributed
        per_frame = Lambda(lambda x: frame_encoder(x))
        encoded_frame = layers.TimeDistributed(per_frame)(video)
        encoded_vid = layers.LSTM(128)(encoded_frame)
        class_scores = layers.Dense(n_classes, activation='softmax')(encoded_vid)
        self.model = models.Model(inputs=[video], outputs=class_scores)
        

In [0]:
# Number of classes: width of the model's output layer and number of revenue
# buckets requested from labels().
n_classes = 10
# Per-frame size used for the model input and, via params, for frame resizing.
resolution = (200, 200)
naive_classifier = InceptionV3LSTM()
naive_classifier.CreateModel(resolution = resolution, n_classes = n_classes)

In [0]:
def labels(num_classes, revenues):
    """Return the ``num_classes + 1`` class boundaries for ``revenues``.

    The first boundary is 0; the others are the quantiles of ``revenues`` at
    1/num_classes, 2/num_classes, ..., 1.
    """
    # non linear labeling
    fractions = np.linspace(1./num_classes, 1, num_classes)
    upper_bounds = [np.quantile(revenues, fraction) for fraction in fractions]
    return np.array([0] + upper_bounds)

def set_labels(class_limits, revenues):
    """Return the class index of every revenue given the class boundaries.

    A revenue belongs to class ``k`` when exactly ``k + 1`` boundaries are
    strictly below it. The result is clamped to the valid range
    ``0 .. len(class_limits) - 2``, so revenues at or below the first boundary
    fall into class 0 (instead of -1, which would wrap around to the last
    class when used as an index) and revenues above the last boundary fall
    into the top class.
    """
    limits = np.asarray(class_limits).reshape(-1, 1)
    raw = np.sum(limits < np.asarray(revenues), 0) - 1
    top_class = max(limits.size - 2, 0)
    return np.clip(raw, 0, top_class)

def onehot(values, n_values=None):
    """One-hot encode integer class indices.

    Args:
        values: integer class indices (array-like).
        n_values: width of the encoding. When omitted it is inferred as
            ``max(values) + 1``; pass it explicitly when the highest class may
            be absent from ``values`` but the width must still match a fixed
            number of classes.

    Returns:
        Array with one row of length ``n_values`` per entry of ``values``.
    """
    if n_values is None:
        n_values = np.max(values) + 1
    return np.eye(n_values)[values]

# Opening revenue of every movie, in the same order as movies.entries.
revenuesRaw = [entry.revenue_opening for entry in movies.entries]
# Bucket the revenues by quantile class limits, then one-hot encode the classes.
revenuesOneHot = onehot(
    set_labels(
        labels(n_classes, revenuesRaw),
        revenuesRaw,
    )
)

In [0]:
# NOTE(review): the loss is 'mse' on one-hot targets; the trailing comment on
# the compile line names 'categorical_crossentropy' as the alternative.
naive_classifier.model.compile(optimizer=keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=True),loss='mse', metrics=['acc', 'mse']) #loss='categorical_crossentropy'


# Parameters
params = {'dim': resolution,
          'batch_size': 1,
          'n_classes': n_classes,
          'n_channels': 3,
          'shuffle': True,
         }


# shuffle: keep a random subset of at most 2000 movies and reorder the labels
# and the entries with the same permutation so that positions stay aligned.
idx = np.random.permutation(len(movies.entries))[:2000]
# NOTE(review): revenuesRaw is converted to an array but NOT reordered with
# idx, so after this point it no longer lines up with movies.entries.
revenuesRaw = np.array(revenuesRaw)
revenuesOneHot = revenuesOneHot[idx]
movies.entries = [movies.entries[i] for i in idx] 
# The id -> position map must follow the reordering, otherwise
# movies.get_entry() would return the wrong movie (or fail).
movies.movie_indeces = {entry.movie_id: position for position, entry in enumerate(movies.entries)}

# logger

# CSVLogger and ModelCheckpoint write below ./naive; make sure it exists.
os.makedirs('./naive/graph/', exist_ok=True)
csv_logger = keras.callbacks.CSVLogger('./naive/log.csv', separator=',', append=False)
tbCallBack = keras.callbacks.TensorBoard(log_dir='./naive/graph/', histogram_freq=0, write_graph=True, write_images=True)
modelCallback = keras.callbacks.ModelCheckpoint('./naive/weights', monitor='val_loss', verbose=1, save_best_only=False, save_weights_only=False, mode='auto', period=1)

# Generators
# NOTE(review): both generators serve the same ids (0..99); validation is
# currently disabled in the fit call below, but it would not be a held-out set.
training_generator = DataGenerator(movies, revenuesOneHot, list_IDs = list(range(0, 100)), **params)
validation_generator = DataGenerator(movies, revenuesOneHot, list_IDs = list(range(0, 100)), **params)
# Train model on dataset
train_history = naive_classifier.model.fit_generator(generator=training_generator,
                    #validation_data=validation_generator,
                    shuffle=True,
                    max_queue_size=1,
                    use_multiprocessing=False, verbose = 1, callbacks=[tbCallBack, csv_logger, modelCallback])