### Imports

In [44]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import pandas as pd
import torchtext
# import torchtext.data as ttd
import torchtext.legacy.data as ttd
%matplotlib inline

### Functions

In [69]:
def index_to_token(itos,sent):
    out = []
    for i in sent:
        out.append(itos[int(i)])

    return out

### Create data

In [45]:
data = {
    "label" : [0,1,1],
    "data" : [
              "I like eggs and ham",
              "Eggs I like!",
              "Ham and eggs or just ham?",
    ]
}

In [46]:
df = pd.DataFrame(data)

In [47]:
df.head()

Unnamed: 0,label,data
0,0,I like eggs and ham
1,1,Eggs I like!
2,1,Ham and eggs or just ham?


In [48]:
df.to_csv('thedata.csv', index=False)

In [49]:
TEXT = ttd.Field(
    sequential = True,
    batch_first = True,
    lower = True,
    tokenize = 'spacy',
    pad_first = True
)
LABEL = ttd.Field(sequential = False, use_vocab = False, is_target = True)

In [50]:
dataset = ttd.TabularDataset(
    path = 'thedata.csv',
    format = 'csv',
    skip_header = True,
    fields = [('label',LABEL),('data',TEXT)]
    # order is important
)

In [51]:
ex = dataset.examples[0]

In [52]:
type(ex)

torchtext.legacy.data.example.Example

In [53]:
ex.data

['i', 'like', 'eggs', 'and', 'ham']

In [54]:
ex.label

'0'

In [55]:
train_dataset , test_dataset = dataset.split(0.66) # default 0.7

In [56]:
TEXT.build_vocab(train_dataset)

In [57]:
vocab = TEXT.vocab

In [58]:
torchtext.vocab.Vocab

torchtext.vocab.Vocab

In [59]:
vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.legacy.vocab.Vocab object at 0x7f6f4d627dd0>>,
            {'!': 4,
             '<pad>': 1,
             '<unk>': 0,
             '?': 5,
             'and': 6,
             'eggs': 2,
             'ham': 3,
             'i': 7,
             'just': 8,
             'like': 9,
             'or': 10})

In [66]:
vocab.itos

['<unk>', '<pad>', 'eggs', 'ham', '!', '?', 'and', 'i', 'just', 'like', 'or']

In [61]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [62]:
train_iter, test_iter = ttd.Iterator.splits(
    (train_dataset,test_dataset), sort_key = lambda x: len(x.data),
    batch_sizes = (2,2), device=device
)

In [74]:
for inputs, targets in train_iter:
    print("inputs", inputs, "shape", inputs.shape)
    for input in inputs:
        print(index_to_token(vocab.itos,input))
    print("targets", inputs, "shape", targets.shape)
    break

inputs tensor([[ 1,  1,  1,  2,  7,  9,  4],
        [ 3,  6,  2, 10,  8,  3,  5]]) shape torch.Size([2, 7])
['<pad>', '<pad>', '<pad>', 'eggs', 'i', 'like', '!']
['ham', 'and', 'eggs', 'or', 'just', 'ham', '?']
targets tensor([[ 1,  1,  1,  2,  7,  9,  4],
        [ 3,  6,  2, 10,  8,  3,  5]]) shape torch.Size([2])


In [75]:
for inputs, targets in test_iter:
    print("inputs", inputs, "shape", inputs.shape)
    for input in inputs:
        print(index_to_token(vocab.itos,input))
    print("targets", inputs, "shape", targets.shape)
    break

inputs tensor([[7, 9, 2, 6, 3]]) shape torch.Size([1, 5])
['i', 'like', 'eggs', 'and', 'ham']
targets tensor([[7, 9, 2, 6, 3]]) shape torch.Size([1])
