Load the MultiNLI dataset from Hugging Face

In [1]:
from datasets import load_dataset

# Load the MultiNLI dataset; the saved output under this cell lists its splits
# (train, validation_matched, validation_mismatched).
dataset = load_dataset("multi_nli")
# Or reload a local copy written by `save_to_disk` (see below).
# NOTE(review): per the `datasets` docs a `save_to_disk` directory is read back
# with `load_from_disk`, not `load_dataset` — confirm for the installed version:
# from datasets import load_from_disk
# dataset = load_from_disk("data/multi_nli")

# Save a local copy:
# dataset.save_to_disk("data/multi_nli")

# Shortcut to the training split, used by the cells below.
train = dataset["train"]

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset multi_nli (/home/cas/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39)
100%|██████████| 3/3 [00:00<00:00, 247.19it/s]


DatasetDict({
    train: Dataset({
        features: ['promptID', 'pairID', 'premise', 'premise_binary_parse', 'premise_parse', 'hypothesis', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre', 'label'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['promptID', 'pairID', 'premise', 'premise_binary_parse', 'premise_parse', 'hypothesis', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre', 'label'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['promptID', 'pairID', 'premise', 'premise_binary_parse', 'premise_parse', 'hypothesis', 'hypothesis_binary_parse', 'hypothesis_parse', 'genre', 'label'],
        num_rows: 9832
    })
})

Preprocess (create embeddings)

In [21]:
from sentence_transformers import SentenceTransformer
import torch
# Load the sentence transformer used for every `model.encode` call below.
# The original cell also mentioned `all-MiniLM-L6-v2` (presumably an
# alternative checkpoint the author considered).
MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'
model = SentenceTransformer(MODEL_NAME)

In [25]:
# Smoke test: encode the premise and hypothesis of the first training row.
# With convert_to_tensor=True the result is a single tensor with one row per
# input sentence (two rows here).
first_pair = [train[0][field] for field in ('premise', 'hypothesis')]
embeds = model.encode(first_pair, batch_size=2, convert_to_tensor=True)

# Cosine similarity between the two rows is one call away
# (needs `from sentence_transformers import util` first):
# util.pytorch_cos_sim(embeds[0], embeds[1])

embeds

tensor([[ 0.0299, -0.0437, -0.0104,  ..., -0.0308,  0.0235, -0.0164],
        [ 0.0601,  0.0186, -0.0191,  ..., -0.0342,  0.0361, -0.0153]],
       device='cuda:0')

In [68]:
# Ten-row slice of the training split, small enough to iterate on quickly.
sample = load_dataset(path='multi_nli', split='train[:10]')

Found cached dataset multi_nli (/home/cas/.cache/huggingface/datasets/multi_nli/default/0.0.0/591f72eb6263d1ab527561777936b199b714cda156d35716881158a2bd144f39)


In [69]:
def embed(batch):
    """Encode the text columns of a batch with the notebook-level ``model``.

    Written to be passed to ``Dataset.map(..., batched=True)``.

    Args:
        batch: Mapping with ``'hypothesis'``, ``'premise'`` and ``'label'``
            entries (presumably equal-length lists, one item per row —
            confirm against the caller).

    Returns:
        dict with ``'hypothesis_embed'`` and ``'premise_embed'`` holding the
        result of ``model.encode`` for the respective column, plus the
        ``'label'`` entry passed through unchanged.
    """
    # Hypotheses are encoded first, then premises, both with batch_size=100.
    encoded = {
        column: model.encode(batch[column], batch_size=100)
        for column in ('hypothesis', 'premise')
    }
    return {
        'hypothesis_embed': encoded['hypothesis'],
        'premise_embed': encoded['premise'],
        'label': batch['label'],
    }
    
# Apply `embed` batch-wise and drop every original column, so only what
# `embed` returns remains: 'label', 'hypothesis_embed', 'premise_embed'.
tk_train = sample.map(function=embed, batched=True, remove_columns=sample.column_names)

100%|██████████| 1/1 [00:00<00:00,  6.70ba/s]


In [72]:
# Have the dataset hand back torch tensors for these columns, then display the
# active format type to confirm the switch.
torch_columns = ['label', 'hypothesis_embed', 'premise_embed']
tk_train.set_format(type="torch", columns=torch_columns)
tk_train.format['type']

'torch'

In [57]:
# Peek at the first row exactly as the DataLoader will receive it.
# NOTE(review): the saved output under this cell shows a 'hypothesis_embeds'
# key, which does not match the 'hypothesis_embed' key returned by `embed`;
# it looks like it predates the current code — re-run to refresh.
row_iterator = iter(tk_train)
next(row_iterator)

{'label': tensor(1),
 'hypothesis_embeds': tensor([ 6.0065e-02,  1.8641e-02, -1.9076e-02,  7.3474e-03, -3.2977e-02,
         -4.0697e-02,  2.9380e-02, -3.2624e-03,  3.7807e-02,  2.1106e-02,
          3.7621e-02,  2.3885e-03,  3.9276e-02,  6.5623e-02,  2.4037e-02,
         -1.3892e-02, -1.0651e-02,  5.3798e-02, -5.9899e-02,  1.8707e-02,
          1.1617e-02, -3.8418e-02,  2.6686e-02,  5.6760e-03,  7.1867e-02,
          4.1261e-02,  1.0124e-02, -1.7631e-02,  2.6792e-03, -5.3405e-02,
          2.7820e-02, -1.8199e-02, -5.0365e-03,  8.1359e-03,  1.4809e-06,
         -7.8175e-03,  2.2834e-03,  1.1378e-02,  2.8299e-02,  5.2910e-02,
          5.2077e-02, -4.1096e-02, -1.7400e-02, -2.3719e-02,  5.6434e-03,
          1.0195e-01, -1.8178e-02, -4.3675e-02,  7.7006e-03,  3.1065e-02,
         -3.9787e-02,  4.0388e-02, -5.6131e-02, -1.2108e-02, -1.6662e-02,
         -5.0520e-03,  1.0267e-02, -3.4122e-02,  6.5921e-02, -1.6812e-02,
         -1.4708e-02,  1.2280e-02, -2.2507e-02,  2.9897e-02,  1.7379e-

Create data loader

In [62]:
# pytorch data loading
from torch.utils.data import DataLoader

def load_dataloader(dataset, batch: int, shuffle: bool = True) -> DataLoader:
    """Wrap ``dataset`` in a PyTorch ``DataLoader``.

    Args:
        dataset: Any object ``DataLoader`` accepts as its dataset.
        batch: Number of items per batch (forwarded as ``batch_size``).
        shuffle: Whether the loader reshuffles the data. Defaults to ``True``,
            which is what this helper always did; pass ``False`` for
            evaluation data so the iteration order stays stable.

    Returns:
        The configured ``DataLoader``.
    """
    return DataLoader(dataset, batch_size=batch, shuffle=shuffle)

# Loader over the embedded sample, four rows per batch.
train_dl = load_dataloader(dataset=tk_train, batch=4)

In [65]:
"""
Since we have batch=4, we get 4 datapoints. 
Each index of the label tensor belongs to one text pair.
"""

# Print only the first batch together with its index. `range(1)` ends the
# loop after one item, so no second batch is requested from the loader.
for idx, batch in zip(range(1), train_dl):
    print(idx, batch)

0 {'label': tensor([0, 2, 1, 1]), 'hypothesis_embed': tensor([[-0.0315,  0.0099,  0.0073,  ..., -0.0189, -0.0758, -0.0218],
        [ 0.0003,  0.0687,  0.0397,  ..., -0.0538, -0.0435,  0.0168],
        [-0.0455, -0.0296, -0.0246,  ...,  0.0424,  0.0385,  0.0029],
        [ 0.0601,  0.0186, -0.0191,  ..., -0.0342,  0.0361, -0.0153]]), 'premise_embed': tensor([[-0.0546,  0.0178, -0.0055,  ..., -0.0144, -0.0707, -0.0269],
        [-0.0092,  0.0466, -0.0016,  ...,  0.0184,  0.0111, -0.0341],
        [-0.0519,  0.0407, -0.0065,  ...,  0.0528,  0.0076,  0.0056],
        [ 0.0299, -0.0437, -0.0104,  ..., -0.0308,  0.0235, -0.0164]])}


In [75]:
# Sketch of a combined feature vector: premise embedding followed by the
# hypothesis embedding of the first row.
# Open idea: insert a cosine-similarity term in between?
torch.cat([tk_train[0][column] for column in ('premise_embed', 'hypothesis_embed')])

tensor([ 0.0299, -0.0437, -0.0104,  ..., -0.0342,  0.0361, -0.0153])