In [1]:
import nltk
import pickle
import argparse
from collections import Counter
from pycocotools.coco import COCO

import pandas as pd

In [3]:
class Vocabulary(object):
    """Two-way lookup between words and consecutive integer ids."""

    def __init__(self):
        # word -> id and id -> word are kept in sync by add_word().
        self.word2idx = {}
        self.idx2word = {}
        # Id that the next previously unseen word will receive.
        self.idx = 0

    def add_word(self, word):
        """Store `word` under the next free id; a known word is left untouched."""
        if word in self.word2idx:
            return
        new_id = self.idx
        self.word2idx[word] = new_id
        self.idx2word[new_id] = word
        self.idx = new_id + 1

    def __call__(self, word):
        """Return the id of `word`, or the id of '<unk>' when `word` is unknown.

        Raises:
            KeyError: if `word` is unknown and '<unk>' was never added.
        """
        if word in self.word2idx:
            return self.word2idx[word]
        return self.word2idx['<unk>']

    def __len__(self):
        """Number of distinct words stored."""
        return len(self.word2idx)

def build_vocab(data='files_sentence.csv', threshold=4):
    """Build a vocabulary from the sentences stored in a CSV file.

    Args:
        data: path of a CSV file that has `id` and `sentence` columns.
        threshold: minimum number of occurrences a token needs in order to
            be added to the vocabulary.

    Returns:
        A Vocabulary holding '<pad>', '<start>', '<end>' and '<unk>'
        (ids 0-3, in that order) followed by every token that occurs at
        least `threshold` times, in order of first appearance.
    """
    # Read the file named by `data` rather than a hard-coded path, so the
    # argument passed by the caller is actually honoured.
    sentences = pd.read_csv(data)
    # Keyed by id: when an id is repeated only its last sentence is counted.
    captions = dict(zip(sentences.id, sentences.sentence))
    total = len(captions)

    counter = Counter()
    for i, caption in enumerate(captions.values()):
        # str() keeps non-string cells (e.g. empty cells, which pandas reads
        # as NaN) from breaking .lower().
        tokens = nltk.tokenize.word_tokenize(str(caption).lower())
        counter.update(tokens)

        if i % 1000 == 0:
            print("[%d/%d] Tokenized the captions." %(i, total))

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Creates a vocab wrapper and adds the special tokens first so that they
    # receive the lowest ids.
    vocab = Vocabulary()
    for special in ('<pad>', '<start>', '<end>', '<unk>'):
        vocab.add_word(special)

    # Adds the words to the vocabulary.
    for word in words:
        vocab.add_word(word)
    return vocab

def main():
    """Build the vocabulary from 'files_sentence.csv' and pickle it to disk."""
    vocab_path = 'data/vocab_try.pkl'
    vocab = build_vocab('files_sentence.csv', threshold=4)

    # The whole Vocabulary object is serialised, not just its dictionaries.
    with open(vocab_path, 'wb') as out_file:
        pickle.dump(vocab, out_file)

    print("Total vocabulary size: %d" % len(vocab))
    print("Saved the vocabulary wrapper to '%s'" % vocab_path)


In [4]:
# Sentence table; later cells read its `id` and `sentence` columns.
sentences = pd.read_csv('files_sentence.csv')

In [5]:
# Placeholder columns; a later cell overwrites both with values derived from `id`.
sentences['file'] = None
sentences['folder'] = None

In [4]:
import glob

In [190]:
def files(string, root='D:/cources/1008/image_caption/data/mouths/'):
    """Return the PNG paths found in the directory named after one clip id.

    Args:
        string: clip id; it is used verbatim as the directory name below
            `root`.
        root: directory prefix (with trailing separator) that holds one
            sub-directory per clip id.

    Returns:
        List of matching '*.png' paths in lexicographic order; empty when
        the directory is missing or holds no PNG file.
    """
    pattern = root + str(string) + '/*.png'
    # glob gives no ordering guarantee, and the result is consumed as a frame
    # sequence, so sort to make the order deterministic.
    # NOTE(review): this is a plain string sort - frame names without
    # zero-padded numbers would need a natural sort instead; confirm naming.
    return sorted(glob.glob(pattern))
def folder(string):
    """Return the mouth-crop directory for an id of the form '<folder>_<name>'.

    Raises:
        ValueError: if `string` does not contain exactly one underscore.
    """
    parent, clip = string.split("_")
    base = 'D:/cources/1008/Project/lip-reading-deeplearning/data/'
    return base + parent + '/' + clip + '/mouth/'

In [191]:
# Per sentence: the list of frame files and the mouth-crop directory, both derived from `id`.
sentences['file'] = sentences['id'].apply(files)
sentences['folder'] = sentences['id'].apply(folder)

In [5]:
import argparse
import os
from PIL import Image

In [6]:
import argparse
import torch
import torch.nn as nn
import numpy as np
import os
import pickle
from data_loader import get_loader 
from build_vocab import Vocabulary
from model import EncoderCNN, DecoderRNN 
from torch.autograd import Variable 
from torch.nn.utils.rnn import pack_padded_sequence
from torchvision import transforms

In [7]:
# Restore the object pickled at 'data/vocab_try.pkl' (the path main() writes the vocabulary to).
# NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source.
with open('data/vocab_try.pkl', 'rb') as f:
    vocab = pickle.load(f)

In [193]:
# Keep only the sentences that have at least one frame file and persist them.
# reset_index() without drop=True keeps the previous index as an 'index' column.
(
    sentences[sentences.file.apply(len) != 0]
    .reset_index()
    .to_pickle('sentence_nonempty.pkl')
)

In [21]:
# Reload the filtered sentence table from 'sentence_nonempty.pkl'.
# NOTE: unpickling can execute arbitrary code - only load pickle files from a trusted source.
with open('sentence_nonempty.pkl', 'rb') as f:
    table = pickle.load(f)

In [1]:
# Preview the first rows of the reloaded table. `table` is bound by the cell that
# unpickles 'sentence_nonempty.pkl', so that cell has to run first in a fresh kernel.
table.head()

NameError: name 'table' is not defined

In [160]:
# Largest number of frame files attached to any sentence.
sentences['file'].apply(len).max()

151

In [27]:
import torch
import torchvision.transforms as transforms
import torch.utils.data as data
import os
import pickle
import numpy as np
import nltk
from PIL import Image
from build_vocab import Vocabulary
import pandas as pd


class Dataset(data.Dataset):
    """Frame-sequence / caption dataset compatible with torch.utils.data.DataLoader."""

    def __init__(self, root, table, vocab, transform=None, max_frames=151,
                 frame_shape=(224, 224)):
        """Load the sentence table and remember how samples are to be built.

        Args:
            root: image directory (stored on the instance; not read by this class).
            table: path of a pickled table that has `id`, `sentence` and
                `file` columns; `file` holds a list of image paths per row.
            vocab: vocabulary wrapper; called with a token, returns its id.
            transform: optional callable applied to every opened image.
            max_frames: number of frames in every returned sequence. Longer
                clips are truncated, shorter ones are padded with zero frames.
            frame_shape: shape of a padding frame, used only when a sample
                has no frame at all; otherwise padding copies the shape of
                the sample's first frame.
        """
        self.root = root
        # NOTE: unpickling can execute arbitrary code - only pass trusted files.
        with open(table, 'rb') as f:
            self.table = pickle.load(f)
        self.ids = list(self.table.id)
        self.vocab = vocab
        self.transform = transform
        self.max_frames = max_frames
        self.frame_shape = frame_shape

    def _load_frames(self, paths):
        """Open up to `max_frames` images and return them as one float tensor."""
        frames = []
        for p in paths[:self.max_frames]:
            # The context manager closes the underlying file once the pixel
            # data has been copied into the array.
            with Image.open(p) as image:
                if self.transform is not None:
                    image = self.transform(image)
                frames.append(np.array(image))

        # Pad with zero frames shaped like the real ones so np.stack cannot
        # fail on a shape mismatch; errors raised while loading or
        # transforming an image propagate instead of being turned into padding.
        pad_shape = frames[0].shape if frames else tuple(self.frame_shape)
        missing = self.max_frames - len(frames)
        frames.extend(np.zeros(pad_shape) for _ in range(missing))

        seq_img = np.stack(frames, 0)
        return torch.from_numpy(seq_img).float()

    def __getitem__(self, index):
        """Returns one data pair (frame sequence and caption).

        Returns:
            seq_img: float tensor whose first dimension is `max_frames`.
            target: float tensor of word ids, '<start>' ... '<end>'.
        """
        # `self.ids` lists the id column in row order, so position `index`
        # identifies the row directly; this avoids scanning the table and
        # also works when an id occurs more than once.
        row = self.table.iloc[index]
        seq_img = self._load_frames(row['file'])

        # Convert caption (string) to word ids.
        vocab = self.vocab
        tokens = nltk.tokenize.word_tokenize(str(row['sentence']).lower())
        caption = [vocab('<start>')]
        caption.extend(vocab(token) for token in tokens)
        caption.append(vocab('<end>'))
        target = torch.Tensor(caption)
        return seq_img, target

    def __len__(self):
        """Number of samples (rows of the table)."""
        return len(self.ids)


def collate_fn(data):
    """Creates mini-batch tensors from a list of (image sequence, caption) tuples.

    A custom collate_fn is needed because the default one cannot merge
    captions of different lengths (that requires padding).

    Args:
        data: non-empty list of tuples (images, caption).
            - images: torch tensor; all samples must share one shape.
            - caption: 1D torch tensor of word ids; variable length.

    Returns:
        images: the per-sample tensors stacked along a new first dimension.
        targets: long tensor of shape (batch_size, padded_length); captions
            are right-padded with 0.
        lengths: list; valid length for each padded caption, in descending order.
    """
    # Order by caption length (descending). sorted() builds a new list, so the
    # list passed in by the caller is not reordered as a side effect.
    batch = sorted(data, key=lambda pair: len(pair[1]), reverse=True)
    images, captions = zip(*batch)

    # Merge the per-sample tensors by adding a leading batch dimension.
    images = torch.stack(images, 0)

    # Merge captions (from tuple of 1D tensor to 2D tensor).
    lengths = [len(cap) for cap in captions]
    targets = torch.zeros(len(captions), max(lengths)).long()
    for i, cap in enumerate(captions):
        end = lengths[i]
        targets[i, :end] = cap[:end]
    return images, targets, lengths

In [28]:
def get_loader(root, table, vocab, transform, batch_size, shuffle, num_workers):
    """Returns a torch.utils.data.DataLoader over the custom Dataset.

    Every iteration of the loader yields what collate_fn returns:
    (images, captions, lengths).
    """
    dataset = Dataset(root=root, table=table, vocab=vocab, transform=transform)

    # collate_fn pads the variable-length captions of a batch to one length.
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        collate_fn=collate_fn,
    )

In [29]:
# Batches of 6 samples, reshuffled on each pass, loaded in the main process, no image transform.
loader = get_loader(
    root='data/',
    table='sentence_nonempty.pkl',
    vocab=vocab,
    transform=None,
    batch_size=6,
    shuffle=True,
    num_workers=0,
)

In [30]:
# Smoke test: pull a single batch from the loader, show its images, then stop.
for i, batch in enumerate(loader):
    images, captions, lengths = batch
    print(images)
    break


( 0 , 0 ,.,.) = 
   67   67   67  ...   155  154  154
   67   67   67  ...   155  154  154
   67   67   67  ...   155  154  154
      ...         ⋱        ...      
  118  118  119  ...   136  136  136
  118  118  119  ...   136  136  136
  118  118  119  ...   136  136  136

( 0 , 1 ,.,.) = 
  126  128  131  ...    95   91   89
  126  128  131  ...    95   91   89
  126  128  131  ...    95   91   90
      ...         ⋱        ...      
  160  160  160  ...    96   96   96
  160  160  160  ...    96   96   96
  160  160  160  ...    96   96   96

( 0 , 2 ,.,.) = 
  121  122  125  ...    94   93   93
  121  122  125  ...    95   94   94
  121  122  125  ...    96   96   96
      ...         ⋱        ...      
  159  159  159  ...    99   99   99
  159  159  159  ...   100  100  100
  159  159  159  ...   100  100  100
    ... 

( 0 ,148,.,.) = 
    0    0    0  ...     0    0    0
    0    0    0  ...     0    0    0
    0    0    0  ...     0    0    0
      ...         ⋱        ... 