In [1]:
import scipy as sp
import pandas as pd

# Load the three tab-separated splits (train / validation / test) in one pass.
train_data, validate_data, test_data = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (
        "iwsltTokenizedData/combined_train.tsv",
        "iwsltTokenizedData/combined_valid.tsv",
        "iwsltTokenizedData/combined_test.tsv",
    )
)

In [2]:
# Sanity check: (rows, columns) of the train, validation and test frames, in that order.
print(*(frame.shape for frame in (train_data, validate_data, test_data)))

(149135, 2) (6764, 2) (6384, 2)


In [3]:
# Peek at row 0, column 0 of the training frame; the raw text still carries
# '@@' subword-continuation markers.
train_data.iloc[0,0]

'und diese zwei zusammen zu bringen , erscheint vielleicht wie eine gewal@@ tige aufgabe . aber was ich ihnen zu sagen versuche ist , dass es trotz dieser komplexität einige einfache themen gibt , von denen ich denke , wenn wir diese verstehen , können wir uns wirklich weiter entwickeln .'

In [4]:
# Peek at row 0, column 1 of the training frame; besides '@@' markers the raw
# text also contains escaped entities such as '&apos;'.
train_data.iloc[0,1]

'and bringing those two together might seem a very da@@ un@@ ting task , but what i &apos;m going to try to say is that even in that complexity , there &apos;s some simple the@@ mes that i think , if we understand , we can really move forward .'

In [5]:
def fix_sentence(sentence):
    """Undo subword splitting and entity escaping on a tokenized sentence.

    Args:
        sentence: iterable of string tokens (e.g. ``text.split()``). Tokens
            ending in ``'@@'`` are word fragments that continue in the next
            token; tokens such as ``'&apos;m'`` or ``'&quot;'`` are escaped
            character entities.

    Returns:
        A new list of tokens in which
          * fragment runs are merged (``['da@@', 'un@@', 'ting']`` -> ``'daunting'``),
          * ``&apos;`` clitics are glued onto the preceding word
            (``['i', '&apos;m']`` -> ``"i'm"``),
          * any other entity token is decoded and kept as its own token
            (``'&quot;'`` -> ``'"'``).
    """
    import html  # local import so the function is self-contained in any cell

    apostrophe = '&apos;'
    new_sentence = []
    cur_word = ''  # fragments collected so far for the word being rebuilt
    for p in sentence:
        if p.endswith('@@'):
            # Continuation marker: strip it and wait for the rest of the word.
            cur_word += p[:-2]
            continue
        if cur_word:
            # This token completes the word started by earlier fragments.
            new_sentence.append(cur_word + p)
            cur_word = ''
        elif p.startswith(apostrophe) and new_sentence:
            # Clitic such as 'm / 's / 't: attach to the previous word.
            new_sentence[-1] += "'" + p[len(apostrophe):]
        elif '&' in p and ';' in p:
            # Entity with nothing to attach to (sentence start) or a
            # non-apostrophe entity: decode it instead of dropping it.
            new_sentence.append(html.unescape(p))
        else:
            new_sentence.append(p)
    if cur_word:
        # The sentence ended on a fragment; keep what was collected rather
        # than silently losing it.
        new_sentence.append(cur_word)
    return new_sentence

In [6]:
# Try fix_sentence on row 0, column 1 of the training frame, split on whitespace.
fix_sentence(train_data.iloc[0,1].split())

['and',
 'bringing',
 'those',
 'two',
 'together',
 'might',
 'seem',
 'a',
 'very',
 'daunting',
 'task',
 ',',
 'but',
 'what',
 "i'm",
 'going',
 'to',
 'try',
 'to',
 'say',
 'is',
 'that',
 'even',
 'in',
 'that',
 'complexity',
 ',',
 "there's",
 'some',
 'simple',
 'themes',
 'that',
 'i',
 'think',
 ',',
 'if',
 'we',
 'understand',
 ',',
 'we',
 'can',
 'really',
 'move',
 'forward',
 '.']

In [10]:
import torch
from torchtext.data import Field
from torchtext.data import TabularDataset
from torchtext import data, datasets

# Special tokens handed to both torchtext Fields in createIterators.
# These are plain module-level constants: a `global` statement is a no-op at
# module scope, so none is needed here.
BOS_WORD = '<s>'        # passed as init_token
EOS_WORD = '</s>'       # passed as eos_token
BLANK_WORD = "<blank>"  # passed as pad_token

def createIterators(batch_size, data_path):
    """Build torchtext iterators and vocabularies for the pre-tokenized TSV splits.

    Args:
        batch_size: batch size handed to ``BucketIterator.splits``.
        data_path: directory containing ``combined_train.tsv``,
            ``combined_valid.tsv`` and ``combined_test.tsv``; the two columns
            of each row are read as the ('de', 'en') fields.

    Returns:
        dict with keys 'train_iter', 'val_iter', 'test_iter', 'SRC', 'TGT',
        'src_padding_ind', 'tgt_padding_ind' and 'tgt_eos_ind'.
    """
    def split_on_space(text):
        # The files are already tokenized, so splitting on ' ' is all that is needed.
        return text.split(' ')

    # Source and target fields are configured identically.
    field_options = dict(sequential=True, tokenize=split_on_space, init_token=BOS_WORD,
                         eos_token=EOS_WORD, pad_token=BLANK_WORD)
    SRC = Field(**field_options)
    TGT = Field(**field_options)

    # NOTE(review): skip_header=False loads the first line of every file as an
    # example -- confirm the TSVs really have no header row.
    train, valid, test = TabularDataset.splits(
        path=data_path,  # the root directory where the data lies
        train='combined_train.tsv',
        validation='combined_valid.tsv',
        test='combined_test.tsv',
        fields=[('de', SRC), ('en', TGT)],
        format='TSV',
        skip_header=False,
    )

    # Vocabularies are built from the training split only.
    SRC.build_vocab(train.de)
    TGT.build_vocab(train.en)

    def bucket_key(example):
        # Sort on target and source lengths together.
        return data.interleave_keys(len(example.en), len(example.de))

    # docs for Iterator: https://github.com/pytorch/text/blob/c839a7934930819be7e240ea972e4d600966afdc/torchtext/data/iterator.py
    # Author's note: each batch has shape (max num tokens in batch, batch size),
    # and the max token count can differ from batch to batch.
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, valid, test),
        batch_size=batch_size,
        sort_key=bucket_key,
        sort_within_batch=True,
        shuffle=True,
        repeat=False,
    )

    return {
        'train_iter': train_iter,
        'val_iter': val_iter,
        'test_iter': test_iter,
        'SRC': SRC,
        'TGT': TGT,
        'src_padding_ind': SRC.vocab.stoi[BLANK_WORD],
        'tgt_padding_ind': TGT.vocab.stoi[BLANK_WORD],
        'tgt_eos_ind': TGT.vocab.stoi[EOS_WORD],
    }


In [11]:
# Batch size and data directory used to build the iterators and vocabularies.
batch_size, data_path = 64, 'iwsltTokenizedData/'
dataset_dict = createIterators(batch_size=batch_size, data_path=data_path)

In [13]:
# Vocabulary sizes: target first, then source.
print(len(dataset_dict['TGT'].vocab.itos))
print(len(dataset_dict['SRC'].vocab.itos))
# Lower-case every target vocab entry; comparing the de-duplicated count with
# the vocab size shows whether any entries differ only by case.
allStrings = [k.lower() for k in dataset_dict['TGT'].vocab.stoi.keys()]
print(allStrings[:50])
print(len(set(allStrings)))
# Number of batches times the batch size for each split. Uses batch_size
# instead of a hard-coded 64 so the figures stay right if it is changed.
print(len(dataset_dict['train_iter'])*batch_size)
print(len(dataset_dict['val_iter'])*batch_size)
print(len(dataset_dict['test_iter'])*batch_size)

6565
8713
['<unk>', '<blank>', '<s>', '</s>', '.', ',', 'the', 'and', 'to', 'of', 'a', 'that', 'i', 'it', 'in', 'you', 'is', 'we', '&apos;s', 'this', 'so', '&quot;', 'they', 'was', 'for', 'are', 'on', 'have', 'what', 'but', '?', '--', 'can', 'with', '&apos;t', 'there', 'about', 'be', 'at', 'as', 'all', 'do', 'not', 'one', 'my', '&apos;re', 'an', 'people', 'like', 'now']
6565
149184
6784
6400


In [19]:
import torch

In [17]:
class MainParams:
    """Run configuration: compute device, model hyper-parameters and decoding limits.

    Args:
        dropout: dropout rate stored in ``model_params['dropout']``.
        src_vocab_size: source vocabulary size stored in ``model_params``.
        tgt_vocab_size: target vocabulary size stored in ``model_params``.
        batch_size: stored as ``self.batch_size``.

    Attributes:
        device: ``cuda:0`` when CUDA is available, otherwise ``cpu``.
        model_params: keyword arguments for the model constructor.
        batch_size: the batch size given at construction.
        num_decode_steps: maximum number of decoding steps.
    """
    def __init__(self, dropout, src_vocab_size, tgt_vocab_size, batch_size):
        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda:0" if use_cuda else "cpu")
        print(self.device)
        # Use the caller's dropout value; it was previously accepted but
        # overridden by a hard-coded 0.2.
        self.model_params = dict(d_model=128,nhead=8,num_encoder_layers=4, num_decoder_layers=4,
                dim_feedforward=512, dropout=dropout,activation='relu',src_vocab_size=src_vocab_size,
                tgt_vocab_size=tgt_vocab_size)

        self.batch_size = batch_size
        self.num_decode_steps = 60 #MAX NUMBER OF DECODING STEPS WE'LL do (can increase this)
        
# Vocabulary sizes are the lengths of each field's index-to-string table.
src_vocab_size, tgt_vocab_size = (
    len(dataset_dict[field_name].vocab.itos) for field_name in ('SRC', 'TGT')
)
main_params = MainParams(
    dropout=0.2,
    src_vocab_size=src_vocab_size,
    tgt_vocab_size=tgt_vocab_size,
    batch_size=batch_size,
)

cpu


In [21]:
from transformer_model import TransformerModel

# Locations of the saved policy and value checkpoints.
path = 'savedModel/'
path_to_policy = path + 'policy_supervised_small.pt'
path_to_value = path + 'value_supervised_small.pt'

# Build the policy network from the shared hyper-parameters, move it to the
# selected device and switch it to double precision.
policy = TransformerModel(**(main_params.model_params)).to(main_params.device).double()
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU still loads on a CPU-only machine.
policy.load_state_dict(torch.load(path_to_policy, map_location=main_params.device))



NameError: name 'torch' is not defined