In [3]:
import torch
import transformers
# Every review is truncated or right-padded to exactly this many token ids.
MAX_LEN = 512

# Tokenizer shared by every BERTdataset instance; loaded once at module level.
TOKENIZER = transformers.BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)


class BERTdataset:
    """Map-style dataset that turns raw review text into fixed-length BERT inputs.

    Each item is tokenized with special tokens added, truncated to
    ``max_length`` and then right-padded to exactly ``max_length`` so that
    samples can be stacked into a batch.

    Parameters
    ----------
    review : sequence
        Indexable collection of review texts (the model input).
    sentiment : sequence
        Indexable collection of numeric labels aligned with ``review``
        (the target).
    tokenizer : optional
        Object exposing ``encode_plus``. Defaults to the module-level
        ``TOKENIZER``.
    max_length : int, optional
        Length every sample is truncated/padded to. Defaults to the
        module-level ``MAX_LEN``.
    """

    def __init__(self, review, sentiment, tokenizer=None, max_length=None):
        self.review = review  # input
        self.sentiment = sentiment  # target
        # Fall back to the notebook-wide defaults only when no override is given,
        # so existing two-argument calls behave exactly as before.
        self.tokenizer = TOKENIZER if tokenizer is None else tokenizer
        self.max_length = MAX_LEN if max_length is None else max_length
        # Pad with the tokenizer's own padding id instead of assuming it is 0;
        # keep 0 as the fallback for tokenizers that do not define one.
        pad_token_id = getattr(self.tokenizer, 'pad_token_id', None)
        self.pad_token_id = 0 if pad_token_id is None else pad_token_id

    def __len__(self):
        """Return the number of reviews in the dataset."""
        return len(self.review)

    def __getitem__(self, item_index):
        """Return the encoded review and its label at ``item_index``.

        Returns
        -------
        dict
            ``'input_ids'``, ``'attention_mask'`` and ``'token_type_ids'`` as
            ``torch.long`` tensors of length ``max_length``, plus
            ``'sentiments'`` as a ``torch.float`` tensor holding the label.
        """
        # Collapse every run of whitespace (tabs, newlines, repeated spaces)
        # into a single space before tokenizing.
        review = ' '.join(str(self.review[item_index]).split())

        # BERT accepts one or two sentences; the second slot is None here
        # because each review is encoded as a single sequence. The [CLS] and
        # [SEP] special tokens are still added, as required by the model.
        inputs = self.tokenizer.encode_plus(
            review,
            None,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
        )

        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']

        # Right-pad manually up to max_length. A non-positive padding length
        # yields an empty pad list, so the sequences are left unchanged.
        padding_length = self.max_length - len(input_ids)
        input_ids = input_ids + [self.pad_token_id] * padding_length
        attention_mask = attention_mask + [0] * padding_length  # 0 = ignore padding
        token_type_ids = token_type_ids + [0] * padding_length

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'sentiments': torch.tensor(self.sentiment[item_index], dtype=torch.float),
        }

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [4]:
import pandas as pd
# Load the raw IMDB reviews; the `review` and `sentiment` columns are used below.
dataframe = pd.read_csv(
    '../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'
)

In [5]:
# Encode labels as integers: 'positive' -> 1, everything else -> 0.
# Already-encoded 1s are also treated as positive so that re-running this cell
# is idempotent; comparing only against 'positive' would silently turn every
# label into 0 on a second run.
dataframe.sentiment = dataframe.sentiment.apply(
    lambda label: 1 if label in ('positive', 1) else 0
)

# Wrap the raw arrays so each item is tokenized and padded lazily on access.
dataset = BERTdataset(
    review=dataframe.review.values,
    sentiment=dataframe.sentiment.values,
)

In [7]:
# Number of reviews in the dataset (BERTdataset.__len__ returns len(review)).
len(dataset)

50000

In [12]:
# Fields returned for a single sample by BERTdataset.__getitem__.
dataset[0].keys()

dict_keys(['input_ids', 'attention_mask', 'token_type_ids', 'sentiments'])

In [13]:
dataset[0]['input_ids']
# Any trailing zeros are right-padding added to bring the sequence up to MAX_LEN.

tensor([  101,  2028,  1997,  1996,  2060, 15814,  2038,  3855,  2008,  2044,
         3666,  2074,  1015, 11472,  2792,  2017,  1005,  2222,  2022, 13322,
         1012,  2027,  2024,  2157,  1010,  2004,  2023,  2003,  3599,  2054,
         3047,  2007,  2033,  1012,  1026,  7987,  1013,  1028,  1026,  7987,
         1013,  1028,  1996,  2034,  2518,  2008,  4930,  2033,  2055, 11472,
         2001,  2049, 24083,  1998,  4895, 10258,  2378,  8450,  5019,  1997,
         4808,  1010,  2029,  2275,  1999,  2157,  2013,  1996,  2773,  2175,
         1012,  3404,  2033,  1010,  2023,  2003,  2025,  1037,  2265,  2005,
         1996,  8143, 18627,  2030,  5199,  3593,  1012,  2023,  2265,  8005,
         2053, 17957,  2007, 12362,  2000,  5850,  1010,  3348,  2030,  4808,
         1012,  2049,  2003, 13076,  1010,  1999,  1996,  4438,  2224,  1997,
         1996,  2773,  1012,  1026,  7987,  1013,  1028,  1026,  7987,  1013,
         1028,  2009,  2003,  2170, 11472,  2004,  2008,  2003, 

In [14]:
dataset[0]['attention_mask'] # 1 for tokens from the review, 0 for padded positions

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [15]:
dataset[0]['token_type_ids'] # only one sentence is encoded, so every position has segment id 0

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
# Label of the first review as a float tensor (1 = positive, 0 = otherwise).
dataset[0]['sentiments']