For the Conversational Entailment dataset, you will need to connect your Google Drive to the Colab environment. To do this, click the folder icon on the left and then the Google Drive icon. Once mounted, you will have access to the class's shared folder.

In [2]:
# Mount Google Drive inside the Colab runtime at /content/drive so files in the
# shared course folder can be read by path from later cells.
# Re-running this cell is harmless: if the drive is already mounted,
# drive.mount() just reports that instead of mounting again.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
try:
  from transformers import BertTokenizer, BertModel, BertForMaskedLM
except:
  !pip install transformers
  from transformers import BertTokenizer, BertModel, BertForMaskedLM
import torch
import numpy as np
import json
from glob import glob


# Load the pretrained uncased BERT tokenizer and encoder, then switch the
# encoder to evaluation mode so inference is deterministic (dropout disabled).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
embedder = BertModel.from_pretrained('bert-base-uncased')
embedder.eval()

dev_file = '/content/drive/Shared drives/EECS595-Fall2020/Final_Project_Common/Conversational_Entailment/dev_set.json'
# Use a context manager so the file handle is closed as soon as parsing ends.
with open(dev_file) as f:
  dev_data = json.load(f)

# Show one example and collect the text of every turn in its dialog.
example = dev_data[10]
print(json.dumps(example, indent=4))
sentences = [x['text'] for x in example['items'][0]['items']]
tokenized_texts = [['[CLS]'] + tokenizer.tokenize(sent) + ['[SEP]'] for sent in sentences]

# Pad every sequence to the length of the longest one in this batch.
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
MAX_LEN = max(len(x) for x in input_ids)
# 1 marks a real token, 0 marks padding. Without this mask BERT attends to the
# padding positions, which changes the embeddings of the shorter sentences.
attention_mask = torch.tensor([[1]*len(x) + [0]*(MAX_LEN - len(x)) for x in input_ids])
# Pad with the tokenizer's own pad id instead of a hard-coded 0.
input_ids = torch.tensor([x + [tokenizer.pad_token_id]*(MAX_LEN - len(x)) for x in input_ids])

# Every input is a single segment, so all segment (token type) ids are 0.
# zeros_like keeps the integer dtype of input_ids, which an embedding lookup needs.
segments = torch.zeros_like(input_ids)
with torch.no_grad():
  outputs = embedder(input_ids, attention_mask=attention_mask, token_type_ids=segments)
# Index instead of tuple-unpacking: position 1 is the pooled [CLS] output both
# when the model returns a plain tuple and when it returns a dict-like output
# object (unpacking the latter would yield its keys, not tensors).
sentence_embeddings = outputs[1]

print('\n'.join(sentences))
print(sentence_embeddings)
print(sentence_embeddings.shape)






Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/f4/9f93f06dd2c57c7cd7aa515ffbf9fcfd8a084b92285732289f4a5696dd91/transformers-3.2.0-py3-none-any.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 2.8MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 17.5MB/s 
Collecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 21.1MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…


{
    "id": 426,
    "entailment": "1",
    "type": "belief",
    "tag": "pair",
    "items": [
        {
            "source": "SW2433",
            "tag": "dialog",
            "items": [
                {
                    "num": "3",
                    "speaker": "A",
                    "text": "I ripped the ligaments in my right ankle.",
                    "tag": "turn"
                },
                {
                    "num": "4",
                    "speaker": "B",
                    "text": "Gosh.",
                    "tag": "turn"
                },
                {
                    "num": "5",
                    "speaker": "A",
                    "text": "Yeah so,",
                    "tag": "turn"
                },
                {
                    "num": "6",
                    "speaker": "B",
                    "text": "Exercise is not supposed to do that to you.",
                    "tag": "turn"
                }
            ]
        },
    

Example code for accessing the CommonsenseQA dataset is shown below.

In [4]:
try:
  from datasets import load_dataset
except:
  !pip install datasets
  from datasets import load_dataset
dataset = load_dataset('commonsense_qa')
print(dataset.keys())
print(dir(dataset['train']))

Collecting datasets
[?25l  Downloading https://files.pythonhosted.org/packages/83/7e/8d9e2fd30e3819e6042927d379f3668a0b49fe38b92d5639194808a1d877/datasets-1.0.2-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 2.8MB/s 
Collecting pyarrow>=0.17.1
[?25l  Downloading https://files.pythonhosted.org/packages/f3/99/0a605f016121ca314d1469dc9069e4978395bc46fda40f73099d90ad3ba4/pyarrow-1.0.1-cp36-cp36m-manylinux2014_x86_64.whl (17.3MB)
[K     |████████████████████████████████| 17.3MB 245kB/s 
Collecting xxhash
[?25l  Downloading https://files.pythonhosted.org/packages/f7/73/826b19f3594756cb1c6c23d2fbd8ca6a77a9cd3b650c9dec5acc85004c38/xxhash-2.0.0-cp36-cp36m-manylinux2010_x86_64.whl (242kB)
[K     |████████████████████████████████| 245kB 48.9MB/s 
Installing collected packages: pyarrow, xxhash, datasets
  Found existing installation: pyarrow 0.14.1
    Uninstalling pyarrow-0.14.1:
      Successfully uninstalled pyarrow-0.14.1
Successfully installed datasets-1.0.2 py

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1586.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1055.0, style=ProgressStyle(description…

Using custom data configuration default



Downloading and preparing dataset commonsense_qa/default (download: 4.46 MiB, generated: 2.08 MiB, post-processed: Unknown size, total: 6.54 MiB) to /root/.cache/huggingface/datasets/commonsense_qa/default/0.1.0/0e60f0ee8c8509e854ed897f65eb5b2e6ca22578d64cbc3812c79b527d7a7a29...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3785890.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=423148.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=471653.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset commonsense_qa downloaded and prepared to /root/.cache/huggingface/datasets/commonsense_qa/default/0.1.0/0e60f0ee8c8509e854ed897f65eb5b2e6ca22578d64cbc3812c79b527d7a7a29. Subsequent calls will reuse this data.
dict_keys(['train', 'validation', 'test'])
['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_check_index_is_initialized', '_convert_outputs', '_data', '_data_files', '_fingerprint', '_format_columns', '_format_kwargs', '_format_type', '_get_cache_file_path', '_getitem', '_indexes', '_indices', '_indices_data_files', '_info', '_inplace_history', '_map_indices', '_map_single', '_nest', '_new_dataset_with_indices'