In [122]:
import pandas as pd
import numpy as np
from sklearn.metrics import jaccard_score
import os
import time
from tqdm import tnrange, tqdm_notebook
# https://nbviewer.jupyter.org/github/kaushaltrivedi/bert-toxic-comments-multilabel/blob/master/toxic-bert-multilabel-classification.ipynb?source=post_page-------

In [123]:

# Locate the MPST CSV relative to the directory the notebook is started from.
# Note: despite its name, DATA_DIR is the path of the CSV file itself.
WD = os.getcwd()
DATA_DIR = os.path.join(
    WD,
    'data',
    'mpst-movie-plot-synopses-with-tags',
    'mpst_full_data.csv',
)

In [124]:
# Read the MPST table and discard its `synopsis_source` column in one chain.
data = pd.read_csv(DATA_DIR).drop(columns=['synopsis_source'])
data.shape

(14828, 5)

In [125]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,imdb_id,title,plot_synopsis,tags,split
0,tt0057603,I tre volti della paura,Note: this synopsis is for the orginal Italian...,"cult, horror, gothic, murder, atmospheric",train
1,tt1733125,Dungeons & Dragons: The Book of Vile Darkness,"Two thousand years ago, Nhagruul the Foul, a s...",violence,train
2,tt0033045,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...",romantic,test
3,tt0113862,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","inspiring, romantic, stupid, feel-good",train
4,tt0086250,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","cruelty, murder, dramatic, cult, violence, atm...",val


In [126]:
# Each movie's `tags` cell is a ', '-separated string: turn it into a list of tags.
split = data['tags'].str.split(pat=', ')
# Number of tags per movie; used below to repeat each imdb_id once per tag.
lens = split.str.len()


In [127]:
 # Flatten the per-movie tag lists into one flat array of tag strings (display only).
 np.concatenate(split)

array(['cult', 'horror', 'gothic', ..., 'anti war', 'murder',
       'christian film'], dtype='<U18')

In [128]:
# Long-format table: one row per (movie, tag) pair, with a constant 1 in `values`
# marking that the tag applies to the movie.
temp_df = pd.DataFrame(
    {
        'imdb_id': np.repeat(data['imdb_id'].values, lens),
        'category': np.concatenate(split),
        'values': 1,
    }
)

# Show the distinct tags, then how many of them there are.
print(temp_df['category'].unique())
print(len(temp_df['category'].unique()))

# Wide format: one row per movie and one column per tag; tags a movie does not
# have come out of the pivot as NaN and are filled with 0.
temp_df = (
    temp_df
    .pivot(index='imdb_id', columns='category', values='values')
    .fillna(0)
    .reset_index()
)



['cult' 'horror' 'gothic' 'murder' 'atmospheric' 'violence' 'romantic'
 'inspiring' 'stupid' 'feel-good' 'cruelty' 'dramatic' 'action' 'revenge'
 'sadist' 'queer' 'flashback' 'mystery' 'suspenseful' 'neo noir' 'prank'
 'psychedelic' 'tragedy' 'autobiographical' 'home movie'
 'good versus evil' 'depressing' 'realism' 'boring' 'haunting'
 'sentimental' 'paranormal' 'historical' 'storytelling' 'comedy' 'fantasy'
 'philosophical' 'adult comedy' 'cute' 'entertaining' 'bleak' 'humor'
 'plot twist' 'christian film' 'pornographic' 'insanity' 'brainwashing'
 'sci-fi' 'dark' 'claustrophobic' 'psychological' 'melodrama'
 'historical fiction' 'absurd' 'satire' 'alternate reality'
 'alternate history' 'comic' 'grindhouse film' 'thought-provoking'
 'clever' 'western' 'blaxploitation' 'whimsical' 'intrigue' 'allegory'
 'anti war' 'avant garde' 'suicidal' 'magical realism' 'non fiction']
71


In [129]:
# Left-join the per-tag indicator columns onto the original rows via imdb_id.
data_separate = pd.merge(data, temp_df, how='left', on='imdb_id')
data_separate.head()

Unnamed: 0,imdb_id,title,plot_synopsis,tags,split,absurd,action,adult comedy,allegory,alternate history,...,sentimental,storytelling,stupid,suicidal,suspenseful,thought-provoking,tragedy,violence,western,whimsical
0,tt0057603,I tre volti della paura,Note: this synopsis is for the orginal Italian...,"cult, horror, gothic, murder, atmospheric",train,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,tt1733125,Dungeons & Dragons: The Book of Vile Darkness,"Two thousand years ago, Nhagruul the Foul, a s...",violence,train,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,tt0033045,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...",romantic,test,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,tt0113862,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","inspiring, romantic, stupid, feel-good",train,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,tt0086250,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","cruelty, murder, dramatic, cult, violence, atm...",val,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [130]:
# Class imbalance ratio: columns from position 5 onward are the tag indicators;
# per movie, take the share of the 71 tags that are set, average over movies,
# and report the complement (the average share of tags that are NOT set).
1 - data_separate.iloc[:, 5:].sum(axis=1).div(71).mean()

0.9580105396338104

In [131]:
# Partition the rows using the dataset's own `split` column.
train_df, val_df, test_df = (
    data_separate[data_separate['split'] == part]
    for part in ('train', 'val', 'test')
)

train_df.shape, val_df.shape, test_df.shape

((9489, 76), (2373, 76), (2966, 76))

# BERT Modeling

In [132]:
import torch
from pytorch_transformers import *
from pytorch_transformers.modeling_bert import BertPreTrainedModel
from pytorch_transformers.optimization import AdamW

from torch.utils.data import Dataset, DataLoader
from torch.nn import BCEWithLogitsLoss
import torch.nn.functional as F

# Select CUDA when it is available, otherwise the CPU ...
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# ... but this assignment unconditionally overrides the choice above, so every
# later `.to(device)` targets the CPU regardless of GPU availability.
device = "cpu"

In [133]:
# Load the tokenizer that belongs to the pretrained 'bert-base-uncased' checkpoint;
# it is passed to MPSTDataset to turn synopses into token ids.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# bert_model = BertModel.from_pretrained('bert-base-uncased')
# bert_config = BertConfig.from_pretrained('bert-base-uncased')

In [134]:
class MPSTDataset(Dataset):
    """Torch dataset turning rows of the MPST dataframe into BERT-style inputs.

    Each item is a 4-tuple of tensors
    ``(input_ids, input_mask, segment_ids, label)`` where the first three have
    length ``max_seq_length`` and ``label`` is a float32 vector built from the
    row's columns at position ``label_start`` onward.

    Args:
        dataframe: frame with a ``plot_synopsis`` text column followed (from
            position ``label_start``) by numeric label columns.
        tokenizer: object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_seq_length: total sequence length including the ``[CLS]`` and
            ``[SEP]`` markers; must be at least 2.
        transform: stored on the instance but not applied by this class.
        label_start: positional index of the first label column (default 5,
            the offset previously hard-coded in ``__getitem__``).

    Raises:
        ValueError: if ``max_seq_length`` is smaller than 2.
    """

    def __init__(self, dataframe, tokenizer, max_seq_length, transform=None, label_start=5):
        if max_seq_length < 2:
            # With fewer than two slots the slice below would use a negative bound
            # and silently produce a wrongly sized sequence.
            raise ValueError("max_seq_length must be at least 2 to hold [CLS] and [SEP]")
        self.dataframe = dataframe
        self.transform = transform
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self.label_start = label_start

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return len(self.dataframe)

    def get_sample_features(self, sample):
        """Tokenize ``sample`` and return ``(input_ids, input_mask, segment_ids)`` lists.

        The token sequence is truncated so that, together with ``[CLS]`` and
        ``[SEP]``, it fits in ``max_seq_length``; shorter sequences are
        right-padded with zeros. ``input_mask`` is 1 on real tokens and 0 on
        padding; ``segment_ids`` is all zeros.
        """
        tokens = self.tokenizer.tokenize(sample)
        tokens = ["[CLS]"] + tokens[:self.max_seq_length - 2] + ["[SEP]"]

        input_ids = list(self.tokenizer.convert_tokens_to_ids(tokens))
        real_len = len(input_ids)
        pad_len = self.max_seq_length - real_len

        # Zero-pad up to the sequence length.
        input_mask = [1] * real_len + [0] * pad_len
        segment_ids = [0] * (real_len + pad_len)
        input_ids = input_ids + [0] * pad_len

        assert len(input_ids) == self.max_seq_length
        assert len(input_mask) == self.max_seq_length
        assert len(segment_ids) == self.max_seq_length

        return input_ids, input_mask, segment_ids

    def __getitem__(self, idx):
        """Return the tensors for the row at positional index ``idx``."""
        # Materialise the row once instead of once per field.
        row = self.dataframe.iloc[idx]
        sample = row['plot_synopsis']

        # A row of a mixed-dtype frame is an object Series; convert the label
        # slice to an explicit float32 array so the tensor dtype does not depend
        # on how torch infers a type from arbitrary Python objects.
        label = np.asarray(row.iloc[self.label_start:], dtype=np.float32)

        input_ids, input_mask, segment_ids = self.get_sample_features(sample)

        return (
            torch.tensor(input_ids),
            torch.tensor(input_mask),
            torch.tensor(segment_ids),
            torch.tensor(label),
        )
        

In [136]:
class BertForMultiLabelClassification(torch.nn.Module):
    """Pretrained 'bert-base-uncased' encoder with a small multi-label head.

    The head applies dropout to the encoder's pooled output, projects it from
    768 to 300 features, applies ReLU and batch normalisation, and maps the
    result to ``num_labels`` logits.
    """

    def __init__(self, num_labels=71):
        super().__init__()
        self.num_labels = num_labels
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = torch.nn.Dropout(0.1)
        self.layer = torch.nn.Linear(768, 300)
        self.classifier = torch.nn.Linear(300, num_labels)
        self.batchnorm = torch.nn.BatchNorm1d(300)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
                position_ids=None, head_mask=None):
        """Return logits, or ``(loss, logits)`` when ``labels`` is given.

        The loss is ``BCEWithLogitsLoss`` between the logits and ``labels``,
        both viewed as ``(-1, num_labels)``.
        """
        encoder_outputs = self.bert(
            input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
        )

        # Element 1 of the encoder outputs is the pooled output.
        hidden = self.layer(self.dropout(encoder_outputs[1]))
        hidden = self.batchnorm(torch.nn.functional.relu(hidden))
        logits = self.classifier(hidden)

        if labels is None:
            return logits

        bce_with_logits = BCEWithLogitsLoss()
        loss = bce_with_logits(
            logits.view(-1, self.num_labels),
            labels.view(-1, self.num_labels),
        )
        return loss, logits

    def _set_bert_requires_grad(self, flag):
        """Set ``requires_grad`` to ``flag`` on every encoder parameter."""
        for encoder_param in self.bert.parameters():
            encoder_param.requires_grad = flag

    def freeze_bert_encoder(self):
        """Exclude all encoder parameters from gradient computation."""
        self._set_bert_requires_grad(False)

    def unfreeze_bert_encoder(self):
        """Re-enable gradient computation for all encoder parameters."""
        self._set_bert_requires_grad(True)

In [144]:
def precision_k(pred, label, k=[1, 3, 5]):
    """Batch-averaged precision@k, in percent, for each cutoff in ``k``.

    Args:
        pred: 2-D integer array; row ``i`` lists label indices for sample ``i``
            in ranked order (best first).
        label: 2-D array of ground-truth indicators, indexed as
            ``label[i, label_index]``.
        k: cutoffs to evaluate.

    Returns:
        A list with one value per cutoff: the mean over the batch of the mean
        ground-truth indicator among each sample's first ``k`` ranked indices,
        multiplied by 100.
    """
    n_rows = pred.shape[0]

    scores = []
    for cutoff in k:
        per_row_total = sum(
            label[row, pred[row, :cutoff]].mean() for row in range(n_rows)
        )
        scores.append(per_row_total * 100 / n_rows)

    return scores

def ndcg_k(pred, label, k=[1, 3, 5]):
    """Batch-averaged nDCG@k, in percent, for each cutoff in ``k``.

    Args:
        pred: 2-D integer array; row ``i`` lists label indices for sample ``i``
            in ranked order and must have at least ``max(k)`` columns.
        label: 2-D array of ground-truth indicators, indexed as
            ``label[i, label_index]``.
        k: cutoffs to evaluate.

    Returns:
        A list with one value per cutoff. Samples with no hit among their
        first ``k`` ranked indices contribute 0 but still count in the batch
        size used for averaging.
    """
    n_rows = pred.shape[0]

    results = []
    for cutoff in k:
        # log2 discount for ranks 1..cutoff: log2(2), log2(3), ...
        discounts = np.log2(np.arange(2, 2 + cutoff))
        total = 0
        for row in range(n_rows):
            gains = label[row, pred[row, :cutoff]]
            if gains.sum() == 0:
                continue

            dcg = (gains / discounts).sum()
            # Ideal DCG: all of the sample's positives (capped at cutoff) ranked first.
            n_positive = label[row].sum()
            ideal_len = np.min((cutoff, n_positive))
            ideal_dcg = (1 / np.log2(np.arange(2, 2 + ideal_len))).sum()
            total += dcg / ideal_dcg

        results.append(total * 100 / n_rows)

    return results

In [185]:
def train_model(dataloaders, model, optimizer, criterion, scheduler, num_epochs=2):
    """Train ``model`` and evaluate it on the validation loader once per epoch.

    Args:
        dataloaders: mapping with ``'train'`` and ``'valid'`` entries; every
            batch must unpack to ``(input_ids, input_mask, segment_ids, label_ids)``.
        model: called as ``model(input_ids, segment_ids, input_mask)`` and
            expected to return logits.
        optimizer: optimizer stepped after every training batch.
        criterion: loss applied to ``sigmoid(logits)`` and the labels. It must
            return an un-reduced, per-element loss because the result is
            multiplied element-wise by per-label weights before averaging.
        scheduler: accepted for interface compatibility; it is not stepped.
        num_epochs: number of passes over the training loader.

    Returns:
        The same ``model`` object after training.

    For each phase the batch-averaged precision@{1,3,5} and ndcg@{1,3,5} are
    printed; after both phases the epoch's train and validation losses are
    printed.
    """
    phases = ['train', 'valid']
    step_sizes = {phase: len(dataloaders[phase]) for phase in phases}

    # Per-element loss weight looked up by label value: index 0 (negative) -> 0.5,
    # index 1 (positive) -> 1.5. Created on `device` so it can be indexed by
    # label tensors that were moved there.
    weight = torch.tensor([0.5, 1.5], device=device)

    for epoch in tnrange(int(num_epochs), desc="Epoch"):
        epoch_loss = {}

        for phase in phases:
            is_train = phase == 'train'
            if is_train:
                model.train()
            else:
                model.eval()

            running_loss = 0
            precision_sums = np.zeros(3)
            ndcg_sums = np.zeros(3)

            for batch in tqdm_notebook(dataloaders[phase], desc=phase):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                # Only build the autograd graph while training; validation
                # needs no gradients.
                with torch.set_grad_enabled(is_train):
                    logits = model(input_ids, segment_ids, input_mask)
                    sigmoid = logits.sigmoid()

                    # Give the target the same dtype as the predictions so the
                    # criterion is not handed mismatched floating-point types.
                    loss = criterion(sigmoid, label_ids.to(sigmoid.dtype))

                    label_weight = weight[label_ids.view(-1).long()].view_as(label_ids)
                    weighted_loss_average = (loss * label_weight).mean()

                running_loss += weighted_loss_average.item()

                # Rank once and reuse the top-5 indices for both metrics.
                top5 = logits.detach().cpu().topk(k=5)[1].numpy()
                labels_np = label_ids.detach().cpu().numpy()
                precision_sums += np.asarray(precision_k(top5, labels_np, k=[1, 3, 5]), dtype=float)
                ndcg_sums += np.asarray(ndcg_k(top5, labels_np, k=[1, 3, 5]), dtype=float)

                if is_train:
                    optimizer.zero_grad()
                    weighted_loss_average.backward()
                    optimizer.step()

            epoch_loss[phase] = running_loss / step_sizes[phase]

            r_p1, r_p3, r_p5 = precision_sums / step_sizes[phase]
            r_ndcg1, r_ndcg3, r_ndcg5 = ndcg_sums / step_sizes[phase]

            print("precision@1 : %.4f , precision@3 : %.4f , precision@5 : %.4f "%(r_p1,r_p3,r_p5))
            print("ndcg@1 : %.4f , ndcg@3 : %.4f , ndcg@5 : %.4f "%(r_ndcg1,r_ndcg3,r_ndcg5))

        # The previous summary also formatted `train_acc` / `valid_acc`, which
        # were never assigned; only the losses that are actually tracked are
        # reported here.
        print('Epoch [{}/{}] train loss: {:.4f} valid loss: {:.4f}'.format(
            epoch + 1, num_epochs,
            epoch_loss['train'],
            epoch_loss['valid']))

    return model

In [186]:
# Wrap the train / validation frames as datasets with sequences of 128 tokens
# and serve each in shuffled mini-batches of 16.
train_ds = MPSTDataset(dataframe=train_df, tokenizer=bert_tokenizer, max_seq_length=128)
train_dl = DataLoader(dataset=train_ds, batch_size=16, shuffle=True)

val_ds = MPSTDataset(dataframe=val_df, tokenizer=bert_tokenizer, max_seq_length=128)
val_dl = DataLoader(dataset=val_ds, batch_size=16, shuffle=True)

# Phase name -> loader, using the keys train_model looks up.
dloaders = dict(train=train_dl, valid=val_dl)

In [187]:
# EPOCHS = 10
# LEARNING_RATE = 3e-4
# ADAM_EPSILON = 1e-6
# WARMUP_STEPS = 0

# t_total= len(train_dl) * EPOCHS

# model = BertForMultiLabelClassification.from_pretrained("bert-base-uncased")
# model.to(device)

# criterion = BCEWithLogitsLoss()

# param_optimizer = list(model.named_parameters())
# no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
# optimizer_grouped_parameters = [
#     {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
#     {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
#     ]
# optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, eps=ADAM_EPSILON)
# scheduler = WarmupLinearSchedule(optimizer, warmup_steps=WARMUP_STEPS, t_total=t_total)

In [188]:
# Build the classifier and freeze the pretrained encoder so that only the
# layers added on top of it receive gradients.
model = BertForMultiLabelClassification()
model.freeze_bert_encoder()
# Place the model on the same device the training loop moves every batch to.
model = model.to(device)

# Element-wise (un-reduced) BCE: the training loop multiplies the per-element
# loss by per-label weights before averaging. `reduction='none'` is the
# supported spelling of the deprecated `reduce=False`.
criterion = torch.nn.BCELoss(reduction='none')

optimizer = torch.optim.Adamax(model.parameters(), lr=0.001)


In [189]:
# Train for 10 epochs (no scheduler) and report the wall-clock time in minutes.
start_time = time.time()
model = train_model(
    dataloaders=dloaders,
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    scheduler=None,
    num_epochs=10,
)
print('Training time: {:10f} minutes'.format((time.time() - start_time) / 60))

HBox(children=(IntProgress(value=0, description='Epoch', max=10, style=ProgressStyle(description_width='initia…

HBox(children=(IntProgress(value=0, description='train', max=594, style=ProgressStyle(description_width='initi…

precision@1 : 6.2500 , precision@3 : 8.3333 , precision@5 : 7.5000 
ndcg@1 : 6.2500 , ndcg@3 : 12.6920 , ndcg@5 : 14.1504 
precision@1 : 6.2500 , precision@3 : 8.3333 , precision@5 : 6.2500 
ndcg@1 : 6.2500 , ndcg@3 : 12.4344 , ndcg@5 : 13.4038 
precision@1 : 18.7500 , precision@3 : 12.5000 , precision@5 : 10.0000 
ndcg@1 : 18.7500 , ndcg@3 : 14.5338 , ndcg@5 : 15.6007 
precision@1 : 12.5000 , precision@3 : 8.3333 , precision@5 : 7.5000 
ndcg@1 : 12.5000 , ndcg@3 : 12.3080 , ndcg@5 : 16.1570 
precision@1 : 18.7500 , precision@3 : 10.4167 , precision@5 : 7.5000 
ndcg@1 : 18.7500 , ndcg@3 : 14.4817 , ndcg@5 : 16.4337 
precision@1 : 18.7500 , precision@3 : 6.2500 , precision@5 : 8.7500 
ndcg@1 : 18.7500 , ndcg@3 : 12.1160 , ndcg@5 : 15.6700 
precision@1 : 12.5000 , precision@3 : 16.6667 , precision@5 : 12.5000 
ndcg@1 : 12.5000 , ndcg@3 : 18.9267 , ndcg@5 : 17.7997 
precision@1 : 25.0000 , precision@3 : 14.5833 , precision@5 : 11.2500 
ndcg@1 : 25.0000 , ndcg@3 : 21.6830 , ndcg@5 : 22.746

KeyboardInterrupt: 

In [None]:
# single_sample = train_ds[0]
# single_sample

# test_ds = MPSTDataset(test_df, bert_tokenizer, 128)
# test_dl = DataLoader(test_ds,batch_size=2, shuffle=True)
# single_sample = iter(test_dl).next()

In [None]:
# model = BertForMultiLabelClassification()
# model.freeze_bert_encoder()
# model.to(device)

# criterion = BCEWithLogitsLoss()

# optimizer = torch.optim.Adamax(model.parameters(), lr=0.001)


In [170]:
# test_p1, test_p3, test_p5 = 0, 0, 0
# test_ndcg1, test_ndcg3, test_ndcg5=0, 0, 0

# for i in range(1):
#     single_sample = tuple(t.to(device) for t in single_sample)
#     input_ids, input_mask, segment_ids, label_ids = single_sample
    
# #     logits = model(input_ids.unsqueeze(0), segment_ids.unsqueeze(0), input_mask.unsqueeze(0))
#     logits = model(input_ids, segment_ids, input_mask)
    
#     loss = criterion(logits.view(-1, 71), label_ids.view(-1,71))
    
#     logits_cpu = logits.data.cpu()
#     labels_cpu = label_ids.data.cpu()
    
# #     print(logits.topk(5))
#     _p1,_p3=precision_k(logits_cpu.topk(k=5)[1].numpy(), labels_cpu.numpy(), k=[1,3])
#     test_p1+=_p1
#     test_p3+=_p3
#     test_p5+=_p5


#     _ndcg1,_ndcg3,_ndcg5=ndcg_k(pred_cpu.topk(k=5)[1].numpy(), labels_cpu.numpy(), k=[1,3,5])
#     test_ndcg1+=_ndcg1
#     test_ndcg3+=_ndcg3
#     test_ndcg5+=_ndcg5
    
    
#     optimizer.zero_grad()
#     loss.backward()
#     optimizer.step()
    
    
# #     print(loss.item())

[1.]
[1.]
[1. 0. 0.]
[1. 1. 0.]
