In [1]:
import torch
import numpy as np
import pickle

%load_ext autoreload
%autoreload 2

In [2]:
from vocab import Vocab, load_vocab
from common import Data, Split, Batches, load_data, encode_y, load_split
from utils import ProgressBar

# Load the pickled genre list. The context manager closes the file handle as
# soon as loading finishes (a bare open() inside pickle.load() never closes it).
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open("../data/tmdb_genres_list.pkl", 'rb') as genres_file:
    genre_list = pickle.load(genres_file)

# Targets: GENRES is the label source read by train_batches() and inference().
GENRES = load_data("../local/genres.pkl")

# Dataset splits.
train = load_split("../local/train.pkl")
val = load_split("../local/val.pkl")
test = load_split("../local/test.pkl")

# Embedding handed to Model1 when the model is built.
# NOTE(review): torch.load also unpickles -- same trust caveat as above.
embedding = torch.load('../local/embedding.pth')

# Encoded model inputs, fetched batch-by-batch via Batches.get_data().
OVERVIEWS_ENCODED = load_data("../local/overviews_encoded.pkl")
TITLES_ENCODED = load_data("../local/titles_encoded.pkl")

In [3]:
from pack import Pack
from model import Model1

In [None]:
# Build the network on top of the embedding loaded earlier in the notebook.
model = Model1(embedding, num_layers=2, hidden_dim=256, cuda=True)

# Adam only receives the parameters that still require gradients.
optimizer = torch.optim.Adam(
    [param for param in model.parameters() if param.requires_grad]
)

# Multi-label soft-margin criterion, moved onto the GPU.
loss = torch.nn.MultiLabelSoftMarginLoss()
loss = loss.cuda()

In [None]:
def train_batches(batch_size, display=True):
    """Run one training pass over ``train`` and return the per-batch losses.

    Relies on the notebook-level ``model``, ``optimizer`` and ``loss`` objects.
    Inputs are read from ``TITLES_ENCODED`` / ``OVERVIEWS_ENCODED`` and targets
    from ``GENRES`` through a ``Batches`` view over ``train``.

    Args:
        batch_size: batch size handed to ``Batches``.
        display: forwarded to ``ProgressBar``.

    Returns:
        A list holding one scalar loss value per processed batch.
    """
    losses = []

    batches = Batches(train, batch_size)
    pb = ProgressBar(batches.batch_N, display=display)

    # NOTE(review): the split is shuffled after Batches has been constructed
    # (order kept from the original) -- confirm Batches reads the split lazily.
    train.shuffle()
    pb.reset()
    for i in range(batches.batch_N):
        # Clearing gradients through the model covers every parameter the
        # optimizer holds, so a second optimizer.zero_grad() is not needed.
        model.zero_grad()

        texts = batches.get_data(OVERVIEWS_ENCODED, i)
        titles = batches.get_data(TITLES_ENCODED, i)
        title_pack = Pack(titles, cuda=True)
        text_pack = Pack(texts, cuda=True)

        y_true = batches.get_data(GENRES, i)
        y_true = torch.autograd.Variable(torch.from_numpy(y_true)).cuda().type(torch.cuda.FloatTensor)

        model_output = model(title_pack, text_pack)

        batch_loss = loss(model_output, y_true)
        batch_loss.backward()
        optimizer.step()

        # The loss is a one-element tensor on old PyTorch and a 0-dim tensor
        # on PyTorch >= 0.4; indexing a 0-dim array with [0] raises IndexError,
        # so flatten first to make the lookup valid in both cases.
        losses.append(batch_loss.data.cpu().numpy().reshape(-1)[0])

        pb.tick()

    return losses

In [None]:
import torch.nn.functional as F
def inference(split, model):
    """Predict labels for every batch of ``split`` and collect the targets.

    The model is switched to eval mode for the duration of the call and is
    put back into training mode afterwards only if it was in training mode
    on entry. The restoration also happens when a batch raises.

    Args:
        split: dataset split handed to ``Batches`` (batch size 128).
        model: callable model taking ``(title_pack, text_pack)``.

    Returns:
        Tuple ``(preds, trues)``: the concatenated thresholded predictions
        (sigmoid output > 0.5) and the concatenated ``GENRES`` targets.
    """
    batches = Batches(split, 128)

    # Remember the mode so the caller's state is restored, not overwritten.
    # Falls back to True (the original always re-enabled training) when the
    # object exposes no ``training`` flag.
    was_training = getattr(model, "training", True)
    model = model.eval()

    preds = []
    trues = []

    try:
        for i in range(batches.batch_N):
            texts = batches.get_data(OVERVIEWS_ENCODED, i)
            titles = batches.get_data(TITLES_ENCODED, i)

            title_pack = Pack(titles, cuda=True)
            text_pack = Pack(texts, cuda=True)
            model_output = model(title_pack, text_pack)
            y_pred = (F.sigmoid(model_output).cpu() > 0.5).data.numpy()

            y_true = batches.get_data(GENRES, i)

            preds.append(y_pred)
            trues.append(y_true)
    finally:
        # Never leave the model stuck in eval mode if a batch failed.
        if was_training:
            model.train()

    return np.concatenate(preds), np.concatenate(trues)

In [None]:
def train_epoches(n_epochs, batch_size):
    """Train for ``n_epochs`` epochs, then print validation precision/recall.

    Each epoch calls ``train_batches(batch_size, display=False)`` and prints
    the mean batch loss. After the last epoch the model is evaluated once on
    ``val`` via ``inference``.

    Args:
        n_epochs: number of passes over the training split.
        batch_size: batch size forwarded to ``train_batches``.

    Returns:
        A list with one entry per epoch, each being that epoch's list of
        per-batch losses.
    """
    epoch_losses = []
    for epoch in range(n_epochs):
        losses = train_batches(batch_size, display=False)
        epoch_losses.append(losses)
        print("epoch {}:".format(epoch), np.mean(losses))

    # Validation runs once, after the final epoch.
    y_pred, y_true = inference(val, model)

    # NOTE(review): precision_score / recall_score are not imported anywhere
    # in the visible notebook; presumably sklearn.metrics -- confirm and add
    # the import to the top import cell.
    # Ground truth goes first, predictions second (the sklearn convention);
    # passing them the other way round swaps what "precision" and "recall" mean.
    pr = precision_score(y_true, y_pred, average='weighted')
    rc = recall_score(y_true, y_pred, average='weighted')
    print("P", pr, "\tR:", rc)
    return epoch_losses

In [None]:
# Kick off training: 20 epochs with mini-batches of 128, keeping the
# per-epoch loss lists that train_epoches returns.
epoch_losses = train_epoches(n_epochs=20, batch_size=128)