In [1]:
import torch
import numpy as np

%load_ext autoreload
%autoreload 2

In [2]:
import pickle

# Location of the pickled TMDB poster records. Later cells iterate over `data`
# and read the 'overview', 'title' and 'genre_ids' fields of each record.
data_file = "../data/tmdb_posters.pkl"

# SECURITY NOTE: pickle.load can execute arbitrary code while deserializing.
# Only point these paths at files you created yourself or otherwise trust.
#
# The `with` blocks close each file handle as soon as it has been read, rather
# than leaving the anonymous handle from open(...) open until it happens to be
# garbage-collected.
with open(data_file, 'rb') as posters_file:
    data = pickle.load(posters_file)

with open("../data/tmdb_genres_list.pkl", 'rb') as genres_file:
    genre_list = pickle.load(genres_file)

In [3]:
from common import Data, Split, Batches, load_data, encode_y, load_split

# --- Overviews ---------------------------------------------------------------
# Gather the 'overview' field of every record into a numpy array and wrap it in
# Data. The wrapper is written to ../local and immediately read back
# (presumably so the load line alone can be used on later runs).
OVERVIEWS = Data(
    np.array([record['overview'] for record in data])
)
OVERVIEWS.save("../local/overviews.pkl")
OVERVIEWS = load_data("../local/overviews.pkl")

# --- Titles ------------------------------------------------------------------
# Same treatment for the 'title' field.
TITLES = Data(
    np.array([record['title'] for record in data])
)
TITLES.save("../local/titles.pkl")
TITLES = load_data("../local/titles.pkl")

# --- Genres ------------------------------------------------------------------
# The per-record 'genre_ids' are passed through encode_y before being wrapped;
# per the original note this yields 'one'-hot style label vectors.
# NOTE(review): exact output shape/encoding of encode_y is not visible here.
GENRES = Data(
    encode_y(
        np.array([record['genre_ids'] for record in data])
    )
)
GENRES.save("../local/genres.pkl")
GENRES = load_data("../local/genres.pkl")

In [4]:
# Partition the dataset into train / validation / test splits.
# NOTE(review): 0.8 and 0.1 are presumably the train and validation fractions
# (leaving the remainder for test) — confirm against Data.create_splits.
train, val, test = OVERVIEWS.create_splits(0.8, 0.1)

# Write each split to ../local and read it straight back, one split at a time.
train.save("../local/train.pkl")
train = load_split("../local/train.pkl")

val.save("../local/val.pkl")
val = load_split("../local/val.pkl")

test.save("../local/test.pkl")
test = load_split("../local/test.pkl")

In [60]:
from vocab import Vocab, load_vocab

# Build the vocabulary on top of the GloVe word list. The add_sentences calls
# below exist to support fine-tuning of the embeddings; leave them out if the
# embeddings are not going to be fine-tuned.
vocab = Vocab()
vocab.initialize_glove("../local/glove/glove.6B.300d.txt")
print(len(vocab))  # recorded run: 400001

# Only sentences from the training split are added to the vocabulary.
new_words_overviews = vocab.add_sentences(train.get_data(OVERVIEWS))
new_words_titles = vocab.add_sentences(train.get_data(TITLES))

# One value per line, in this order: vocabulary size, new words from
# overviews, new words from titles.
# Recorded run: 421262, 18086, 3175.
print(
    len(vocab),
    len(new_words_overviews),
    len(new_words_titles),
    sep="\n",
)
vocab.save("../local/vocab.pkl")

# Reload the vocabulary and build the embedding layer from it. For now the
# embedding layer stays frozen (the original note states freeze=True is the
# default of create_pytorch_embeddings).
vocab = load_vocab("../local/vocab.pkl")
embedding = vocab.create_pytorch_embeddings()
torch.save(embedding, "../local/embedding.pth")

# NOTE(review): newer PyTorch releases default torch.load to weights_only=True;
# if `embedding` is a pickled module object this call may need
# weights_only=False there — confirm against the installed version.
embedding = torch.load('../local/embedding.pth')

In [6]:
# Convert the raw text into vocabulary indices, one dataset at a time.
# Each encoded dataset is wrapped in Data, saved under ../local, and read back.

# Overviews
OVERVIEWS_ENCODED = Data(
    vocab.encode_sentences(OVERVIEWS.data)
)
OVERVIEWS_ENCODED.save("../local/overviews_encoded.pkl")
OVERVIEWS_ENCODED = load_data("../local/overviews_encoded.pkl")

# Titles
TITLES_ENCODED = Data(
    vocab.encode_sentences(TITLES.data)
)
TITLES_ENCODED.save("../local/titles_encoded.pkl")
TITLES_ENCODED = load_data("../local/titles_encoded.pkl")