In [1]:
import os
import time

import numpy as np
import pandas as pd
from tqdm import tqdm

# https://nbviewer.jupyter.org/github/kaushaltrivedi/bert-toxic-comments-multilabel/blob/master/toxic-bert-multilabel-classification.ipynb?source=post_page-------

In [4]:

# Locate the MPST dataset folder relative to the directory the notebook runs from.
WD = os.getcwd()
_DATASET_SUBPATH = ('data', 'mpst-movie-plot-synopses-with-tags')
DATA_DIR = os.path.join(WD, *_DATASET_SUBPATH)

In [64]:
# Build the CSV path with os.path.join: the previous hard-coded '\\' separator
# only resolved on Windows.
data = pd.read_csv(os.path.join(DATA_DIR, 'mpst_full_data.csv'))
# The synopsis source is not used downstream, so drop it right after loading.
data = data.drop(columns=['synopsis_source'])
data.shape

(14828, 5)

In [45]:
# Peek at the first five rows to check the columns that were loaded.
data.head(n=5)

Unnamed: 0,imdb_id,title,plot_synopsis,tags,split
0,tt0057603,I tre volti della paura,Note: this synopsis is for the orginal Italian...,"cult, horror, gothic, murder, atmospheric",train
1,tt1733125,Dungeons & Dragons: The Book of Vile Darkness,"Two thousand years ago, Nhagruul the Foul, a s...",violence,train
2,tt0033045,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...",romantic,test
3,tt0113862,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","inspiring, romantic, stupid, feel-good",train
4,tt0086250,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","cruelty, murder, dramatic, cult, violence, atm...",val


In [48]:
# Each movie stores its tags as a single ", "-separated string:
# turn that into a list per movie and remember how many tags each movie has.
tag_strings = data.tags
split = tag_strings.str.split(pat=', ')
lens = split.str.len()


In [49]:
 # Flatten the per-movie tag lists into one 1-D array (one entry per movie/tag pair).
 np.concatenate(split)

array(['cult', 'horror', 'gothic', ..., 'anti war', 'murder',
       'christian film'], dtype='<U18')

In [63]:
# Long format: one row per (movie, tag) pair, flagged with a constant 1.
tag_pairs = pd.DataFrame({
    'imdb_id': np.repeat(data['imdb_id'].values, lens),
    'category': np.concatenate(split),
    'values': 1,
})

# Show the distinct tags and how many of them there are.
distinct_tags = tag_pairs['category'].unique()
print(distinct_tags)
print(len(distinct_tags))

# Wide format: one row per movie, one column per tag; tags a movie does not
# carry come out of the pivot as NaN and are filled with 0.
tag_matrix = tag_pairs.pivot(index='imdb_id', columns='category', values='values')
temp_df = tag_matrix.fillna(0).reset_index()



['cult' 'horror' 'gothic' 'murder' 'atmospheric' 'violence' 'romantic'
 'inspiring' 'stupid' 'feel-good' 'cruelty' 'dramatic' 'action' 'revenge'
 'sadist' 'queer' 'flashback' 'mystery' 'suspenseful' 'neo noir' 'prank'
 'psychedelic' 'tragedy' 'autobiographical' 'home movie'
 'good versus evil' 'depressing' 'realism' 'boring' 'haunting'
 'sentimental' 'paranormal' 'historical' 'storytelling' 'comedy' 'fantasy'
 'philosophical' 'adult comedy' 'cute' 'entertaining' 'bleak' 'humor'
 'plot twist' 'christian film' 'pornographic' 'insanity' 'brainwashing'
 'sci-fi' 'dark' 'claustrophobic' 'psychological' 'melodrama'
 'historical fiction' 'absurd' 'satire' 'alternate reality'
 'alternate history' 'comic' 'grindhouse film' 'thought-provoking'
 'clever' 'western' 'blaxploitation' 'whimsical' 'intrigue' 'allegory'
 'anti war' 'avant garde' 'suicidal' 'magical realism' 'non fiction']
71


In [61]:
# Attach the per-tag indicator columns to the original rows (left join on the IMDb id).
data_separate = pd.merge(data, temp_df, on='imdb_id', how='left')
data_separate.head(n=5)

Unnamed: 0,imdb_id,title,plot_synopsis,tags,split,absurd,action,adult comedy,allegory,alternate history,...,sentimental,storytelling,stupid,suicidal,suspenseful,thought-provoking,tragedy,violence,western,whimsical
0,tt0057603,I tre volti della paura,Note: this synopsis is for the orginal Italian...,"cult, horror, gothic, murder, atmospheric",train,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,tt1733125,Dungeons & Dragons: The Book of Vile Darkness,"Two thousand years ago, Nhagruul the Foul, a s...",violence,train,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,tt0033045,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...",romantic,test,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,tt0113862,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","inspiring, romantic, stupid, feel-good",train,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,tt0086250,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","cruelty, murder, dramatic, cult, violence, atm...",val,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [67]:
# The dataset ships with its own partition in the 'split' column; carve the
# merged frame into the three subsets it names.
split_column = data_separate['split']
train_df, val_df, test_df = (
    data_separate[split_column == subset] for subset in ('train', 'val', 'test')
)

(train_df.shape, val_df.shape, test_df.shape)

((9489, 76), (2373, 76), (2966, 76))

# BERT Modeling

In [159]:
import torch
from pytorch_transformers import *
from pytorch_transformers.modeling_bert import BertPreTrainedModel
from pytorch_transformers.optimization import AdamW

from torch.utils.data import Dataset, DataLoader
from torch.nn import BCEWithLogitsLoss

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [122]:
# Fetch the tokenizer, the bare encoder and the configuration of the same
# pretrained checkpoint.
PRETRAINED_NAME = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)
bert_model = BertModel.from_pretrained(PRETRAINED_NAME)
bert_config = BertConfig.from_pretrained(PRETRAINED_NAME)

In [77]:
# Number of WordPiece tokens in the first training synopsis. The recorded output
# of this cell is a count, so wrap the token list in len() instead of dumping it.
len(bert_tokenizer.tokenize(train_df['plot_synopsis'].iloc[0]))

1661

In [78]:
# Preview the training rows only; rendering the complete frame bloats the notebook.
train_df.head(10)

Unnamed: 0,imdb_id,title,plot_synopsis,tags,split,absurd,action,adult comedy,allegory,alternate history,...,sentimental,storytelling,stupid,suicidal,suspenseful,thought-provoking,tragedy,violence,western,whimsical
0,tt0057603,I tre volti della paura,Note: this synopsis is for the orginal Italian...,"cult, horror, gothic, murder, atmospheric",train,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,tt1733125,Dungeons & Dragons: The Book of Vile Darkness,"Two thousand years ago, Nhagruul the Foul, a s...",violence,train,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,tt0113862,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","inspiring, romantic, stupid, feel-good",train,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,tt0249380,Baise-moi,Baise-moi tells the story of Nadine and Manu w...,"gothic, cruelty, violence, cult, revenge, sadist",train,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7,tt0408790,Flightplan,Kyle Pratt (Jodie Foster) is a propulsion engi...,"mystery, suspenseful, action, murder, flashback",train,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,tt0021079,Little Caesar,Small-time Italian-American criminals Caesar E...,violence,train,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9,tt1615065,Savages,The movie begins with a video being shot of me...,"revenge, neo noir, murder, violence, flashback",train,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
10,tt0089606,Mitt liv som hund,The action takes place in the years 1958-1959 ...,"cult, prank",train,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,tt0078908,The Brood,"At the Somafree Institute, Dr. Hal Raglan humi...","cult, psychedelic, murder, violence",train,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
12,tt0795493,Cassandra's Dream,Brothers Terry (Colin Farrell) and Ian (Ewan M...,"tragedy, dramatic, murder",train,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [81]:
# Encode one sentence and wrap it in a list to get a batch of size 1;
# the result has shape (1, number_of_token_ids).
demo_ids = bert_tokenizer.encode("Let's see all hidden-states and attentions on this text")
torch.tensor([demo_ids]).size()

torch.Size([1, 14])

In [82]:
# Show the raw vocabulary ids the tokenizer produces for the same sentence.
example_ids = bert_tokenizer.encode("Let's see all hidden-states and attentions on this text")
example_ids

[2292,
 1005,
 1055,
 2156,
 2035,
 5023,
 1011,
 2163,
 1998,
 3086,
 2015,
 2006,
 2023,
 3793]

In [94]:
class MPSTDataset(Dataset):
    """Torch dataset yielding fixed-length BERT inputs and multi-hot tag labels.

    Every item is a tuple ``(input_ids, input_mask, segment_ids, label)`` built
    from the ``plot_synopsis`` column of one row and from all columns located at
    position ``label_start`` and after.

    Args:
        dataframe: frame holding a ``plot_synopsis`` text column followed by the
            numeric label columns.
        tokenizer: object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_seq_length: length of every returned sequence, counting the
            ``[CLS]`` and ``[SEP]`` markers.
        transform: stored on the instance; this class does not apply it.
        label_start: position of the first label column. The default of 5
            matches the frame built in this notebook, where imdb_id, title,
            plot_synopsis, tags and split come before the tag columns.

    Raises:
        ValueError: if ``max_seq_length`` leaves no room for ``[CLS]`` and ``[SEP]``.
    """

    def __init__(self, dataframe, tokenizer, max_seq_length, transform=None, label_start=5):
        if max_seq_length < 2:
            raise ValueError("max_seq_length must be at least 2 to hold [CLS] and [SEP]")
        self.dataframe = dataframe
        self.transform = transform
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self.label_start = label_start

    def __len__(self):
        """Return the number of rows (samples) in the wrapped frame."""
        return len(self.dataframe)

    def get_sample_features(self, sample):
        """Tokenize ``sample`` and return ``(input_ids, input_mask, segment_ids)``.

        The text is truncated so that, with ``[CLS]`` and ``[SEP]`` added, it
        fits in ``max_seq_length``; shorter inputs are right-padded with zeros.
        All three returned lists have exactly ``max_seq_length`` entries.
        """
        tokens = self.tokenizer.tokenize(sample)
        # Reserve two positions for the [CLS] / [SEP] markers.
        tokens = ["[CLS]"] + tokens[:self.max_seq_length - 2] + ["[SEP]"]

        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        n_real = len(input_ids)
        n_pad = self.max_seq_length - n_real

        # Zero-pad up to the sequence length; the mask is 1 on real tokens only.
        input_ids = input_ids + [0] * n_pad
        input_mask = [1] * n_real + [0] * n_pad
        # Single-segment input: every position gets segment id 0.
        segment_ids = [0] * self.max_seq_length

        assert len(input_ids) == self.max_seq_length
        assert len(input_mask) == self.max_seq_length
        assert len(segment_ids) == self.max_seq_length

        return input_ids, input_mask, segment_ids

    def __getitem__(self, idx):
        """Return the feature tensors and the float32 label vector for row ``idx``."""
        row = self.dataframe.iloc[idx]
        input_ids, input_mask, segment_ids = self.get_sample_features(row['plot_synopsis'])

        # A row taken from a mixed text/number frame is an object-dtype Series.
        # Convert the label slice to float32 explicitly so the tensor dtype
        # matches the float32 logits expected by BCEWithLogitsLoss.
        label = np.asarray(row.iloc[self.label_start:].values, dtype=np.float32)

        return (torch.tensor(input_ids),
                torch.tensor(input_mask),
                torch.tensor(segment_ids),
                torch.tensor(label))
        

In [144]:
class BertForMultiLabelClassification(BertPreTrainedModel):
    """BERT encoder with a linear multi-label classification head.

    The head is a dropout layer followed by a linear layer applied to the second
    element returned by the BERT encoder (used here as the pooled output).

    Params:
        `config`: a BertConfig class instance with the configuration to build a new model.
        `num_labels`: the number of independent labels to predict. Default = 71.
    Inputs:
        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
            with the word token indices in the vocabulary.
        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length]
            with the token type indices selected in [0, 1].
        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length]
            with indices selected in [0, 1]; 0 marks padding positions.
        `labels`: optional multi-hot targets of shape [batch_size, num_labels].
            They are cast to the dtype of the logits before the loss is computed.
        `position_ids`, `head_mask`: optional, forwarded unchanged to the encoder.
    Outputs:
        if `labels` is not `None`:
            Outputs the BCEWithLogits loss of the logits against the labels.
        if `labels` is `None`:
            Outputs the classification logits of shape [batch_size, num_labels].
    Example usage:
    ```python
    # Already been converted into WordPiece token ids
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
    num_labels = 71
    model = BertForMultiLabelClassification(config, num_labels)
    logits = model(input_ids, token_type_ids, input_mask)
    ```
    """
    def __init__(self, config, num_labels=71):
        super(BertForMultiLabelClassification, self).__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.classifier = torch.nn.Linear(config.hidden_size, num_labels)
        self.apply(self.init_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
                position_ids=None, head_mask=None):
        """Return the loss when `labels` is given, otherwise the raw logits."""
        outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
                            attention_mask=attention_mask, head_mask=head_mask)
        # Index 1 of the encoder outputs is used as the pooled sequence representation.
        pooled_output = self.dropout(outputs[1])
        logits = self.classifier(pooled_output)

        if labels is None:
            return logits

        # BCEWithLogitsLoss needs targets with the same dtype as the logits;
        # labels may arrive as integer or float64 tensors.
        targets = labels.view(-1, self.num_labels).type_as(logits)
        loss_fct = BCEWithLogitsLoss()
        return loss_fct(logits.view(-1, self.num_labels), targets)

    def freeze_bert_encoder(self):
        """Stop gradient updates for every parameter of the BERT encoder."""
        for param in self.bert.parameters():
            param.requires_grad = False

    def unfreeze_bert_encoder(self):
        """Re-enable gradient updates for every parameter of the BERT encoder."""
        for param in self.bert.parameters():
            param.requires_grad = True

In [187]:
def train_model(dataloaders, model, optimizer, scheduler, num_epochs=2):
    """Fine-tune ``model`` and report the mean train / validation loss per epoch.

    Args:
        dataloaders: dict with 'train' and 'valid' entries; each yields batches
            of ``(input_ids, input_mask, segment_ids, label_ids)`` tensors.
        model: module called as ``model(input_ids, segment_ids, input_mask, label_ids)``
            and returning a scalar loss tensor.
        optimizer: optimizer stepped once per training batch.
        scheduler: learning-rate scheduler stepped once per training batch,
            right after the optimizer.
        num_epochs: number of passes over the training data.

    Returns:
        The same ``model`` object after training.
    """
    step_sizes = {phase: len(dataloaders[phase]) for phase in ('train', 'valid')}

    for epoch in tqdm(range(int(num_epochs)), desc="Epoch"):
        epoch_loss = {}

        for phase in ['train', 'valid']:
            is_train = phase == 'train'
            if is_train:
                model.train()
            else:
                model.eval()

            # One accumulator per phase, so the validation loss is no longer
            # added to the training total and then reported as 0.
            running_loss = 0.0

            for batch in tqdm(dataloaders[phase], desc="Iteration"):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                # Build the autograd graph only while training.
                with torch.set_grad_enabled(is_train):
                    loss = model(input_ids, segment_ids, input_mask, label_ids)

                running_loss += loss.item()

                if is_train:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    # The schedule is advanced per optimisation step only; the
                    # former extra step at the start of each epoch is removed.
                    scheduler.step()

            # Guard against an empty loader to avoid a division by zero.
            epoch_loss[phase] = running_loss / max(step_sizes[phase], 1)

        print('Epoch [{}/{}] train loss: {:.4f} valid loss: {:.4f}'.format(
                epoch+1, num_epochs, epoch_loss['train'], epoch_loss['valid']))

    return model

In [191]:
# Wrap the train / validation frames as datasets truncated or padded to 128 tokens ...
train_ds, val_ds = (
    MPSTDataset(frame, bert_tokenizer, 128) for frame in (train_df, val_df)
)

# ... and serve both in shuffled mini-batches of two samples.
train_dl, val_dl = (
    DataLoader(dataset, batch_size=2, shuffle=True) for dataset in (train_ds, val_ds)
)

dloaders = dict(train=train_dl, valid=val_dl)

In [192]:
EPOCHS = 2
LEARNING_RATE = 3e-5
ADAM_EPSILON = 1e-6
WARMUP_STEPS = 0

# Total number of optimisation steps: one per training batch, for every epoch.
t_total = len(train_dl) * EPOCHS

# Start from the pretrained checkpoint. Building the model from the config alone
# leaves the encoder randomly initialised, so there would be nothing to fine-tune.
model = BertForMultiLabelClassification.from_pretrained('bert-base-uncased')
model.to(device)

# Parameters whose name contains one of these fragments (biases and LayerNorm
# weights) are excluded from weight decay.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, eps=ADAM_EPSILON)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=WARMUP_STEPS, t_total=t_total)



RuntimeError: CUDA out of memory. Tried to allocate 90.00 MiB (GPU 0; 4.00 GiB total capacity; 2.87 GiB already allocated; 29.05 MiB free; 142.10 MiB cached)

In [None]:
start_time = time.time()
# Use EPOCHS rather than a literal so the number of passes always matches the
# t_total the learning-rate schedule was built with.
model = train_model(dloaders, model, optimizer, scheduler=scheduler, num_epochs=EPOCHS)
print('Training time: {:10f} minutes'.format((time.time()-start_time)/60))

In [176]:
import time
from tqdm import tqdm