In [6]:
import math
import random
import torch
import numpy as np
import pandas as pd
from torch.autograd import Variable

class Dataloader(object):
    """Class to Load Language Pairs and Make Batch
    """   
    def __init__(self, Filename, batch_size, src_lang='en', tgt_lang='zh', v_feat='i3d', max_len=40, cuda=False, volatile=False, sort=True):
        # Need to reload every time because memory error in pickle
        df = pd.read_csv(Filename)
        print(df.shape)
        src_t = []
        src_v = []
        tgt = []
        nb_pairs = 0
        for index, row in df.iterrows():
            src_line, tgt_line = row[src_lang], row[tgt_lang]
            if src_line=='' and tgt_line=='':
                break            
            src_ids = list(map(int, src_line.strip().split()))
            # #Remove SOS and EOS for source 
            # src_ids = src_ids[1:-1]
            tgt_ids = list(map(int, tgt_line.strip().split()))
            if (0 in src_ids or 0 in tgt_ids):
                continue
            if len(src_ids)>0 and len(tgt_ids)>0:
                # Truncate instead of discarding the sentence
                src_t.append(src_ids if len(src_ids)<max_len+1 else src_ids[:max_len]+[3])
                if v_feat == 'i3d':
                    src_v.append(row['i3d_path'])
                tgt.append(tgt_ids if len(tgt_ids)<max_len+1 else tgt_ids[:max_len]+[3])
                nb_pairs += 1
        print('%d pairs are converted in the data' %nb_pairs)
        if sort:
            sorted_idx = sorted(range(nb_pairs), key=lambda i: len(src_t[i]))
        else:
            sorted_idx = [i for i in range(nb_pairs)]
        self.src_t = [src_t[i] for i in sorted_idx]
        self.src_v = [src_v[i] for i in sorted_idx] if src_v else []
        self.tgt = [tgt[i] for i in sorted_idx]
        self.batch_size = batch_size
        self.nb_pairs = nb_pairs
        self.nb_batches = math.ceil(nb_pairs/batch_size)
        self.v_feat = v_feat
        self.cuda = cuda
        self.volatile = volatile
        
    def __len__(self):
        return self.nb_batches  

    def _shuffle_index(self, n, m):
        """Yield indexes for shuffling a length n seq within every m elements"""
        indexes = []
        for i in range(n):
            indexes.append(i)
            if (i+1)%m ==0 or i==n-1:
                random.shuffle(indexes)
                for index in indexes:
                    yield index
                indexes = []
            
    def shuffle(self, m):
        """Shuffle the language pairs within every m elements
        
        This will make sure pairs in the same batch still have similr length.
        """
        shuffled_indexes = self._shuffle_index(self.nb_pairs, m)
        src_t, src_v, tgt = [], [], []
        for index in shuffled_indexes:
            src_t.append(self.src_t[index])
            tgt.append(self.tgt[index])
            if self.src_v:
                src_v.append(sefl.src_v[index])
        self.src_t = src_t
        self.src_v = src_v
        self.tgt = tgt
        
    def _wrap(self, sentences):
        """Pad sentences to same length and wrap into Variable"""
        max_size = max([len(s) for s in sentences])
        out = [s + [0]*(max_size-len(s)) for s in sentences]
        out = torch.LongTensor(out)
        if self.cuda:
            out = out.cuda()
        return Variable(out, volatile=self.volatile)
    
    def _v_feat_preprocess(self, paths):
        out = None
        if self.v_feat == 'i3d':
            # shape:(1, *, 1024)
            arrays = [np.load(path) for path in paths]
            # Pad zeros to make features have same size
            max_size = max([a.shape[1] for a in arrays])
            out = [np.pad(a,[(0, 0), (0, max_size-a.shape[1]), (0, 0)]) for a in arrays]
            out = torch.tensor(out).float()
            out = torch.squeeze(out, 1)
        return out
        # TODO: preprocessing for raw videos or other encoder
        #As shapes of raw video or feature are not fixed, put them in list
        # elif self.v_feat == 'raw'
        # elif self.v_feat == 's3d'

    def __getitem__(self, i): 
        """Generate the i-th batch and wrap in Variable"""
        src_t_batch = self.src_t[i*self.batch_size:(i+1)*self.batch_size]
        src_v_batch = self.src_v[i*self.batch_size:(i+1)*self.batch_size]
        tgt_batch = self.tgt[i*self.batch_size:(i+1)*self.batch_size]

        return [self._wrap(src_t_batch), self._v_feat_preprocess(src_v_batch)], self._wrap(tgt_batch)

In [2]:
import pickle
from preprocess import Lang
from Dataloader import Dataloader
num_epochs = 100
batch_size = 32
MAX_LEN = 40
src_lang = 'en'
tgt_lang = 'zh'
run_testing_during_training = True
preprocessing_type = 'jieba'
print('Loading dict')
src_dict = pickle.load(open('./data/{}/{}_dict.pkl'.format(preprocessing_type, src_lang), 'rb'))
tgt_dict = pickle.load(open('./data/{}/{}_dict.pkl'.format(preprocessing_type, tgt_lang), 'rb'))
print("Building Dataloader ...")
train_path = './data/{}/train.id'.format(preprocessing_type)
valid_path = './data/{}/valid.id'.format(preprocessing_type)
test_path = './data/{}/test.id'.format(preprocessing_type)

# traindataloader = Dataloader(train_path, 1, src_lang=src_lang, tgt_lang=tgt_lang,
#                                 v_feat='None',max_len=MAX_LEN, cuda=True)
devdataloader = Dataloader(valid_path, 1, src_lang=src_lang, tgt_lang=tgt_lang,
                                v_feat='None',max_len=MAX_LEN, cuda=True)
testdataloader = Dataloader(test_path, 1, src_lang=src_lang, tgt_lang=tgt_lang,
    at='None',max_len=MAX_LEN, cuda=True), ososort=False

SyntaxError: can't assign to function call (<ipython-input-2-cf3d05612818>, line 23)

In [3]:
import pickle
from preprocess import Lang
from Dataloader import Dataloader
num_epochs = 100
batch_size = 32
MAX_LEN = 40
src_lang = 'en'
tgt_lang = 'zh'
run_testing_during_training = True
preprocessing_type = 'jieba'
print('Loading dict')
src_dict = pickle.load(open('./data/{}/{}_dict.pkl'.format(preprocessing_type, src_lang), 'rb'))
tgt_dict = pickle.load(open('./data/{}/{}_dict.pkl'.format(preprocessing_type, tgt_lang), 'rb'))
print("Building Dataloader ...")
train_path = './data/{}/train.id'.format(preprocessing_type)
valid_path = './data/{}/valid.id'.format(preprocessing_type)
test_path = './data/{}/test.id'.format(preprocessing_type)

traindataloader = Dataloader(train_path, 1, src_lang=src_lang, tgt_lang=tgt_lang,
                                v_feat='None',max_len=MAX_LEN, cuda=True)
# devdataloader = Dataloader(valid_path, 1, src_lang=src_lang, tgt_lang=tgt_lang,
#                                 v_feat='None',max_len=MAX_LEN, cuda=True)
testdataloader = Dataloader(test_path, 1, src_lang=src_lang, tgt_lang=tgt_lang, v_feat='None', max_len=MAX_LEN, cuda=True, sort=False)

Loading dict
Building Dataloader ...
114955 pairs are converted in the data
15000 pairs are converted in the data


In [7]:
from tqdm import tqdm
from time import sleep
for i in range(len(testdataloader)):
    src, tgt = testdataloader[i]
    x_t, x_v = src[0], src[1]
    x_t = x_t.tolist()[0]
    y = tgt.tolist()[0]


 15
15 13
29 24
17 17
27 30
34 15
18 20
15 12
16 14
23 19
19 12
21 16
18 22
20 16
20 15
22 14
25 20
14 14
20 16
18 18
17 17
21 16
16 18
14 13
17 16
21 14
17 12
13 14
16 14
16 16
16 17
17 16
14 13
16 17
14 16
14 14
13 16
16 13
24 18
19 13
15 16
17 15
19 20
28 19
17 15
15 18
16 13
14 19
15 15
14 18
21 21
15 19
17 14
15 15
15 18
24 16
17 19
18 20
19 15
17 21
27 27
16 19
15 16
22 14
18 17
22 17
16 16
17 18
14 13
20 17
18 19
25 15
21 19
16 12
17 15
26 12
15 14
17 15
15 16
13 13
15 18
35 14
17 22
16 15
13 16
17 17
19 12
16 15
16 19
15 15
18 13
17 13
15 15
16 13
14 16
16 13
14 19
16 16
20 19
16 20
15 17
12 15
19 25
17 15
15 17
21 22
20 17
16 14
19 16
19 19
18 14
22 18
24 15
17 17
17 17
16 14
16 15
17 18
19 19
18 17
26 18
21 16
19 16
16 16
21 18
21 21
16 12
17 13
16 14
17 16
18 19
24 19
20 18
21 17
16 13
16 14
19 19
26 14
23 17
17 13
19 16
18 17
16 17
20 15
19 15
23 17
18 17
16 15
17 19
15 15
19 20
15 13
16 15
18 18
21 16
20 15
18 16
17 20
15 20
17 17
17 13
24 19
16 12
13 13
18 18
18 15
14 18


KeyboardInterrupt: 

In [8]:
max([len(x) for x in traindataloader.tgt])

76

In [16]:
for i in tqdm(range(len(testdataloader))):
    src_batch, tgt_batch = testdataloader[i]





  0%|          | 0/15000 [00:00<?, ?it/s][A[A[A[A



  6%|▌         | 878/15000 [00:00<00:01, 8773.80it/s][A[A[A[A



 13%|█▎        | 1981/15000 [00:00<00:01, 9346.32it/s][A[A[A[A



 21%|██        | 3104/15000 [00:00<00:01, 9839.46it/s][A[A[A[A



 28%|██▊       | 4244/15000 [00:00<00:01, 10259.98it/s][A[A[A[A



 36%|███▌      | 5328/15000 [00:00<00:00, 10425.27it/s][A[A[A[A



 42%|████▏     | 6234/15000 [00:00<00:00, 9804.96it/s][A[A[A[A



 49%|████▉     | 7368/15000 [00:00<00:00, 10217.67it/s][A[A[A[A



 57%|█████▋    | 8526/15000 [00:00<00:00, 10590.58it/s][A[A[A[A



 65%|██████▍   | 9690/15000 [00:00<00:00, 10882.84it/s][A[A[A[A



 72%|███████▏  | 10848/15000 [00:01<00:00, 11081.81it/s][A[A[A[A



 80%|███████▉  | 11956/15000 [00:01<00:00, 11078.77it/s][A[A[A[A



 87%|████████▋ | 13098/15000 [00:01<00:00, 11178.09it/s][A[A[A[A



100%|██████████| 15000/15000 [00:01<00:00, 11045.62it/s]


In [8]:
for i in range(len(testdataloader)):
    src_t_batch = testdataloader.src_t[i:(i+1)]
    tgt_batch = testdataloader.tgt[i:(i+1)]
    testdataloader._wrap(src_t_batch)
    testdataloader._wrap(tgt_batch)
    # if not (torch.tensor(src_batch_t) >=0).all():
    #     print(i, src_batch_t)
    # if not (torch.tensor(tgt_batch) >=0).all():
    #     print(i, tgt_batch)

In [3]:
testdataloader.batch_size


1

In [89]:
testdataloader = Dataloader(test_path, 1, src_lang=src_lang, tgt_lang=tgt_lang, 
                                v_feat='None', max_len=MAX_LEN, cuda=True, volatile=True ,sort=False)
    

(15000, 6)
15000 pairs are converted in the data


In [1]:
from Layers import *
from Model import *
import pickle
from preprocess import Lang
from Dataloader import Dataloader
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.manual_seed(123)
torch.cuda.manual_seed(123)
# params
num_epochs = 100
batch_size = 128
MAX_LEN = 40
src_lang = 'en'
tgt_lang = 'zh'
run_testing_during_training = True
preprocessing_type = 'jieba'
print('Loading dict')
src_dict = pickle.load(open('./data/{}/{}_dict.pkl'.format(preprocessing_type, src_lang), 'rb'))
tgt_dict = pickle.load(open('./data/{}/{}_dict.pkl'.format(preprocessing_type, tgt_lang), 'rb'))
print("Building Dataloader ...")
train_path = './data/{}/train.id'.format(preprocessing_type)
valid_path = './data/{}/valid.id'.format(preprocessing_type)
test_path = './data/{}/test.id'.format(preprocessing_type)

# traindataloader = Dataloader(train_path, batch_size, src_lang=src_lang, tgt_lang=tgt_lang,
#                                 v_feat='None',max_len=MAX_LEN, cuda=True)
# devdataloader = Dataloader(valid_path, batch_size, src_lang=src_lang, tgt_lang=tgt_lang, 
#                             v_feat='None', max_len=MAX_LEN, cuda=True, volatile=True)
if run_testing_during_training:  
    testdataloader = Dataloader(test_path, 1, src_lang=src_lang, tgt_lang=tgt_lang, 
                            v_feat='i3d', max_len=MAX_LEN, cuda=True, volatile=True, sort=False)  # test sentences one by one

print("Building Model ...")
INPUT_DIM = src_dict.n_words + 1
OUTPUT_DIM = tgt_dict.n_words + 1
ENC_EMB_DIM = 512
ENC_V_DIM = 1024
DEC_EMB_DIM = 512
ENC_HID_DIM = 512
DEC_HID_DIM = 1024

attn = Attention(ENC_HID_DIM*2, DEC_HID_DIM)
enc_t = EncoderRNN(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM)
enc_v = EncoderRNN_VFeat(ENC_V_DIM, ENC_HID_DIM)
dec = AttnDecoderRNN_V(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM*2, DEC_HID_DIM, attn)

model = Seq2Seq_VFeat(enc_t, enc_v, dec, device).to(device)

Loading dict
Building Dataloader ...
15000 pairs are converted in the data
Building Model ...


In [2]:
from Translator_s2s import Translator
src_batch, tgt_batch = testdataloader[0]
x_t, x_v = src_batch[0], src_batch[1]
x_t = x_t.tolist()[0]
src_id = Variable(torch.LongTensor(x_t).unsqueeze(0).cuda(), volatile=True) 
# model.load_state_dict(torch.load('checkpoints/jieba/epoch1_acc_0.09_ppl_4851.24.pt'))
translator = Translator(model)
pred = translator.translate_v(x_t, x_v)


torch.Size([1, 512]) torch.Size([1, 1, 1024]) torch.Size([1, 1, 1024])


IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 2)

In [93]:
src_id.shape

torch.Size([1, 14])

In [64]:
(torch.tensor([1,2,3]) >0).all()


asd


In [31]:
import torch
src_len = hidden[0].shape[1]
hidden = torch.tanh(nn.Linear(1024, 1024)(hidden[0].permute(1,0,2).view(src_len,-1)))

RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.

In [36]:
hidden[0].permute(1,0,2).contiguous().view(32,-1).shape

torch.Size([32, 1024])

In [40]:
a = {1: "<unk>", 2: "<BOS>", 3: "<EOS>", 4: 'asda', 7: 'asda'}
a = {i+1:value for i, (key,value) in enumerate(a.items()) if i+1 > 3}
a.update({1: "<unk>", 2: "<BOS>", 3: "<EOS>"})
a

{4: 'asda', 5: 'asda', 1: '<unk>', 2: '<BOS>', 3: '<EOS>'}

In [46]:
 import torch.nn.functional as F
 F.softmax(hidden[0][0], dim=1).shape

torch.Size([32, 256])

In [49]:
hidden[0][0].shape == hidden[0][0].shape

True

In [53]:
hidden[0].shape

torch.Size([4, 32, 256])

In [58]:
import pickle
preprocessing_type = 'jieba'
zh_dict = pickle.load(open('./data/'+preprocessing_type +'/zh_dict.pkl', 'rb'))

AttributeError: Can't get attribute 'Lang' on <module '__main__'>

In [63]:
from preprocess import Lang
with open('./data/'+preprocessing_type +'/zh_dict.pkl', 'rb') as f:
    zh_dict = pickle.load(f)

In [65]:
zh_dict.n_words

8908

In [68]:
import pandas as pd
df = pd.read_csv('/mnt/md0/yingchen_ntu/VMT/VMT/yc_VMT/data/jieba/test.id')

In [69]:
df.shape

(15000, 6)

In [77]:
import tqdm
for i, row in tqdm.tqdm(enumerate(df.iterrows())):
    src_line, tgt_line = row[1]['en'], row[1]['zh']

15000it [00:01, 8332.19it/s]


In [82]:
src_batch[0].tolist()[0]

[82, 33, 613, 188, 958, 84, 4, 1764]

In [83]:
'余  因个'.replace(" ","")

'余因个'