### Load dataset from huggingface

In [1]:
from datasets import load_dataset

dataset = load_dataset("multi_nli")
train = dataset['train']

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset multi_nli (/home/cas/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39)
100%|██████████| 3/3 [00:00<00:00,  8.81it/s]


### Preprocess
Apply tokenization

In [3]:
from transformers import AutoTokenizer
# load the auto tokenizer from huggingface
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [6]:
"""
returns a dict with 3 items:
- numbers representing the tokens
- indicator to which sequence a token belongs to
- indicates whether a token should be masked or not
"""
tokenizer("hello, hello world!")

{'input_ids': [101, 7592, 1010, 7592, 2088, 999, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [21]:
train[0]

{'promptID': 31193,
 'pairID': '31193n',
 'premise': 'Conceptually cream skimming has two basic dimensions - product and geography.',
 'premise_binary_parse': '( ( Conceptually ( cream skimming ) ) ( ( has ( ( ( two ( basic dimensions ) ) - ) ( ( product and ) geography ) ) ) . ) )',
 'premise_parse': '(ROOT (S (NP (JJ Conceptually) (NN cream) (NN skimming)) (VP (VBZ has) (NP (NP (CD two) (JJ basic) (NNS dimensions)) (: -) (NP (NN product) (CC and) (NN geography)))) (. .)))',
 'hypothesis': 'Product and geography are what make cream skimming work. ',
 'hypothesis_binary_parse': '( ( ( Product and ) geography ) ( ( are ( what ( make ( cream ( skimming work ) ) ) ) ) . ) )',
 'hypothesis_parse': '(ROOT (S (NP (NN Product) (CC and) (NN geography)) (VP (VBP are) (SBAR (WHNP (WP what)) (S (VP (VBP make) (NP (NP (NN cream)) (VP (VBG skimming) (NP (NN work)))))))) (. .)))',
 'genre': 'government',
 'label': 1}

In [9]:
from sentence_transformers import SentenceTransformer, util
import torch
# load the tranformer to create embeddings
st = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [78]:
embeds = st.encode([train[0]['premise'], train[0]['hypothesis']], convert_to_tensor=True)
# creates embeddings for both sentences in the array and returns array of size 2

#util.pytorch_cos_sim(embeds[0], embeds[1])
x = torch.cat((embeds[0], embeds[1]), -1)
print(x.shape)

torch.Size([768])


In [47]:
# first ten rows for testing
sample = load_dataset("multi_nli", split='train[:10]')

Found cached dataset multi_nli (/home/cas/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39)


In [82]:
# map to tokenize & embed the entire dataset
def tokenize(batch):
    embeds = st.encode([batch['premise'], batch['hypothesis']], convert_to_tensor=True)
    #embeds = torch.cat((embeds[0], embeds[1]))
    return {'embed': torch.cat((embeds[0], embeds[1]), -1)}


tk_train = sample.map(
    tokenize, 
    batched=True
)
#remove_columns=sample.column_names,

  0%|          | 0/1 [00:00<?, ?ba/s]


ArrowInvalid: Column 10 named embed expected length 10 but got length 1

In [80]:
tk_train

Dataset({
    features: ['embed'],
    num_rows: 768
})

In [31]:
# set dataset format to pytorch
tk_train.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "label"])
tk_train.format['type']

'torch'

In [32]:
next(iter(tk_train))

{'label': tensor(1),
 'input_ids': tensor([  101, 17158,  2135,  6949,  8301, 25057,  2038,  2048,  3937,  9646,
          1011,  4031,  1998, 10505,  1012,   102]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])}

In [11]:
from torchtext.data import get_tokenizer
from torchtext.vocab import GloVe

# load pretrained GloVe embeddings
tokenizer = get_tokenizer("basic_english")
max_words = 20
embed_len=300
global_vectors = GloVe(name='840B', dim=300)

In [123]:
import torch
# load to GPU for speed up
device = 'cuda'
# takes string, returns 6000 dim GloVe vector
def to_vector(s):
    X = tokenizer(s[0])
    # fill / cut tokens to max size
    if len(X) < max_words:
        X = X+[""]*(max_words-len(X))
    else:
        X = X[:max_words]

    X_tensor = torch.zeros(1, max_words, embed_len).to(device)
    for i, j in enumerate(X):
        X_tensor[0][i] = global_vectors.get_vecs_by_tokens(j)
    return(X_tensor.reshape(1, -1))

def combined_tensor(row):
    x = to_vector(row['premise'])
    y = to_vector(row['hypothesis'])

    #return torch.cat((x, y), 1)
    return {
        'vector': torch.cat((x, y), 1),
        'label': row['label']
    }


In [125]:
combined_tensor(
    dataset['train'][0]
)

{'vector': tensor([[-0.3423, -0.0060, -0.4845,  ...,  0.0000,  0.0000,  0.0000]],
        device='cuda:0'),
 'label': 1}

In [126]:
sample_data = load_dataset("multi_nli", split='train[:1]')

Found cached dataset multi_nli (/home/cas/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39)


In [129]:
tf_datta = sample_data.with_transform(
    combined_tensor,
    columns=['premise', 'hypothesis', 'label'],
    output_all_columns=False
)

In [134]:
# the with_transform applies the custom preprocessing function on-the-fly
tf_datta.__getitem__(0)

{'vector': tensor([-0.0168, -0.2356,  0.0769,  ...,  0.0000,  0.0000,  0.0000],
        device='cuda:0'),
 'label': 1}

In [None]:
# map tokenizer to data
tk_data = sample_data.map(
    # creates a 12k GloVe of premise + hypothesis
    combined_tensor,
    # speed up by only keeping important columns
    input_columns=['premise', 'hypothesis', 'label'],
    remove_columns=sample_data.column_names,
)

In [66]:
tk_data

Dataset({
    features: ['label', 'vector'],
    num_rows: 10
})

In [155]:
# pytorch data loading
from torch.utils.data import DataLoader

def load_dataloader(split: str, batch: int):
    ds = dataset[split].with_format('torch')
    dl = DataLoader(ds, batch_size=batch, shuffle=True)   
    return dl

train_dl = load_dataloader('train', 4)

"""
Since we have batch=4, we get 4 datapoints. 
Each index of the label tensor belongs to one text pair.
"""

# test and display dataloader for 1 batch
for idx, batch in enumerate(train_dl):
    print(batch['hypothesis'], '\n\n', batch['premise'], '\n\n', batch['label'])
    break

['The objectives were realistic.', 'I enjoyed talking with you and must go home. ', 'One of the monsters cried out in English.', 'Road camping trips will be the last thing you have left.'] 

 ['The objectives of our research were to (1) define and describe the characteristics of a worldclass finance organization, (2) identify the factors that are essential for finance organizations to improve their financial management and move towards worldclass standards, and (3) provide case studies which illustrate the efforts of leading finance organizations from private sector companies and state governments to improve their financial management and the overall performance of their organizations.', 'yeah yeah well i need to run i enjoyed talking to you', 'One of the sharp-toothed monsters cried out in a low desert language.', "and then all you'll have will be road camping trips you know let's just go out to the mountains with the car honey and uh"] 

 tensor([1, 1, 2, 0])
