# 处理问题
    根据单词进行 tokenize，然后补齐到最大长度。
# 处理图像特征
    转化为 hdf5 文件
# 处理context 
    fasttext, 编码每一个context
    补全到最大context的内容
    采用预处理的方法，先将图片中的 context 都处理一下，处理完了保存为 hdf5 文件

In [12]:
import numpy as np
from torch.utils.data import Dataset
from fasttext import load_model
import os
import numpy as np
import h5py
import pickle
import re

SENTENCE_SPLIT_REGEX = re.compile(r"(\W+)")

#### 文本处理函数

In [13]:
# Lowercase a word, drop commas and question marks, and put a space before "'s".
def word_tokenize(word):
    """Normalize a single word.

    The word is lowercased, every "," and "?" is removed, each "'s" is
    rewritten as " 's", and surrounding whitespace is stripped.

    Args:
        word: the raw string to normalize.

    Returns:
        The normalized string (may still contain spaces).
    """
    # Delete both punctuation characters in one pass.
    drop_punctuation = str.maketrans("", "", ",?")
    cleaned = word.lower().translate(drop_punctuation)
    # Detach the possessive suffix so it can later be split off on spaces.
    spaced = cleaned.replace("'s", " 's")
    return spaced.strip()

# Tokenize a sentence.
def tokenize(sentence, regex=SENTENCE_SPLIT_REGEX, keep=("'s",), remove=(",", "?")):
    """Split a sentence into lowercase tokens.

    Args:
        sentence: the raw sentence.
        regex: compiled pattern used to split the sentence. Separators that
            the pattern captures are kept as tokens if they are not pure
            whitespace.
        keep: substrings that get a space inserted in front of them before
            splitting. Immutable default, so it cannot be shared and mutated
            across calls.
        remove: substrings deleted from the sentence before splitting.

    Returns:
        A list of non-empty, whitespace-stripped tokens.
    """
    sentence = sentence.lower()

    for token in keep:
        sentence = sentence.replace(token, " " + token)

    for token in remove:
        sentence = sentence.replace(token, "")

    # Strip each piece once and drop the ones that end up empty.
    stripped = (piece.strip() for piece in regex.split(sentence))
    return [piece for piece in stripped if piece]

#### fasttext 处理

In [3]:
# Load the fastText model.
def load_fasttext_model():
    """Load and return the fastText model stored under ../pythia/.vector_cache.

    Returns:
        The object returned by ``fasttext.load_model`` for ``wiki.en.bin``.
    """
    return load_model(os.path.join('../pythia', ".vector_cache", "wiki.en.bin"))

def wordToVector(context):
    """Embed one phrase, or a sequence of phrases, with fastText.

    A phrase is embedded as the mean of the fastText vectors of its
    space-separated words.

    Args:
        context: either a single string (one phrase) or a sequence of
            strings (one phrase per element).

    Returns:
        For a string: the mean word vector of that phrase.
        For a sequence: an array of shape ``(len(context), 300)`` with one
        row per phrase.
    """
    # NOTE: the model is loaded from disk on every call; callers that embed
    # many contexts should batch them into one call.
    fasttext_model = load_fasttext_model()

    def _embed_phrase(phrase):
        # Average the vectors of the phrase's space-separated words.
        return np.mean(
            [fasttext_model.get_word_vector(w) for w in phrase.split(" ")],
            axis=0,
        )

    # Dispatch on the type, not on the length: a multi-character string used to
    # be embedded character by character, and a one-element list used to crash
    # on ``.split``.
    if isinstance(context, str):
        return _embed_phrase(context)

    # 300 is the hard-coded embedding width; it has to match the loaded model.
    w_embedding = np.zeros((len(context), 300))
    for i, phrase in enumerate(context):
        w_embedding[i] = _embed_phrase(phrase)
    return w_embedding

#### 处理 ocr 文本，将文本存储为hdf5文件

In [4]:
'''
(1) Get the context of every image.
(2) Encode all contexts with fastText.
(3) Once encoded, save all fastText encodings into a single HDF5 file:
    1. save the feature file
    2. save the map file from image id to context index
File paths:

'''
# Paths of the TextVQA imdb .npy files, keyed by split name.
data_dir = {
    'test' :'../data/imdb/textvqa_0.5/imdb_textvqa_test.npy', 
    'train':'../data/imdb/textvqa_0.5/imdb_textvqa_train.npy',
    'val'  : '../data/imdb/textvqa_0.5/imdb_textvqa_val.npy'}

# Collect every image id and the total number of distinct images.
def statistic_image_id(data_paths=None):
    """Count OCR tokens per distinct image across the imdb files.

    Only the first entry seen for an image id is counted; later entries with
    the same id are ignored.

    Args:
        data_paths: optional mapping of split name to .npy path. When omitted,
            the module-level ``data_dir`` mapping is used.

    Returns:
        A tuple ``(imageid_2_ocrnum, num_images, ocr_statistic)``:
        ``imageid_2_ocrnum`` maps image id to its number of OCR tokens,
        ``num_images`` is the number of distinct image ids, and
        ``ocr_statistic`` maps a token count to how many images have it.
    """
    if data_paths is None:
        data_paths = data_dir

    imageid_2_ocrnum = {}
    ocr_statistic = {}
    for path in data_paths.values():
        # Close the file as soon as the array has been read.
        with open(path, 'rb') as handle:
            data = np.load(handle, allow_pickle=True)
        # Index 0 is skipped: the loop starts at 1 because the first element
        # of an imdb file is a metadata record rather than a sample.
        for i in range(1, data.size):
            entry = data[i]
            image_id = entry['image_id']
            if image_id in imageid_2_ocrnum:
                continue
            ocr_count = len(entry['ocr_tokens'])
            imageid_2_ocrnum[image_id] = ocr_count
            ocr_statistic[ocr_count] = ocr_statistic.get(ocr_count, 0) + 1
    return imageid_2_ocrnum, len(imageid_2_ocrnum), ocr_statistic

def ContextToVectorByFasttext(save = False):
    """Embed every image's OCR tokens with fastText and store them on disk.

    Three files are written under ``../data/imdb/textvqa_0.5/``:
      * ``context_embeddding.hdf5``: dataset ``context_embedding`` with one
        300-wide row per OCR token, in the order the images are first seen.
      * ``context_imageid2index.pkl``: image id -> row index of that image's
        first token.
      * ``imageid2contextnum.pkl``: image id -> number of OCR tokens.

    Each image is processed once, using the first entry that mentions it.

    Args:
        save: currently unused; kept so existing callers keep working.
    """
    # Load the fastText model.
    fasttext_model = load_fasttext_model()
    # Output locations.
    context_feature_hdf5 = '../data/imdb/textvqa_0.5/context_embeddding.hdf5'
    context_imageid2index = "../data/imdb/textvqa_0.5/context_imageid2index.pkl"
    imageid2contextnum = "../data/imdb/textvqa_0.5/imageid2contextnum.pkl"
    # Collect all image ids and their token counts.
    imageid_2_ocrnum, num, _ = statistic_image_id()
    print("总数：", num)
    em_dim = 300

    imageid_2_contextindex = {}
    num_without_ocr = 0
    num_image = 0
    index = 0
    # The context manager closes the HDF5 file even if embedding fails midway.
    with h5py.File(context_feature_hdf5, "w") as context_file:
        context_embedding = context_file.create_dataset(
            'context_embedding', (sum(imageid_2_ocrnum.values()), em_dim), 'f'
        )
        for path in data_dir.values():
            # Load the TextVQA imdb for this split.
            with open(path, 'rb') as handle:
                data = np.load(handle, allow_pickle=True)
            # The loop starts at 1, so element 0 of each file is never read.
            for i in range(1, data.size):
                ocr_tokens = data[i]['ocr_tokens']
                image_id = data[i]['image_id']
                # data[i]['ocr_info'] (text box information) is handled later.
                if image_id in imageid_2_contextindex:
                    continue
                num_image += 1
                # Remember where this image's rows start.
                imageid_2_contextindex[image_id] = index

                if len(ocr_tokens) == 0:
                    num_without_ocr += 1

                for word in ocr_tokens:
                    # A token can normalize to several words; average them.
                    words_embedding = np.mean(
                        [fasttext_model.get_word_vector(w) for w in word_tokenize(word).split(" ")],
                        axis=0,
                    )
                    context_embedding[index, :] = words_embedding
                    index += 1

    with open(context_imageid2index, 'wb') as handle:
        pickle.dump(imageid_2_contextindex, handle)
    with open(imageid2contextnum, 'wb') as handle:
        pickle.dump(imageid_2_ocrnum, handle)
    print("没有ocr的图片个数：%d, 总共%d"%(num_without_ocr, num_image))

# save data
# ContextToVectorByFasttext()

In [9]:
## Read back the OCR (context) features of one image, looked up by image_id.
# NOTE(review): this rebinds data_dir from the split -> path dict to a plain
# string, so code that indexes data_dir by split name fails after this cell runs.
data_dir = '../data/imdb/textvqa_0.5/'
context_feature_file = data_dir + "context_embeddding.hdf5"
context_imageid2index = data_dir + "context_imageid2index.pkl"
imageid2contextnum = data_dir + "imageid2contextnum.pkl"

# Context managers close the pickle files right after loading.
with open(context_imageid2index, 'rb') as handle:
    c_imageid_i = pickle.load(handle)
with open(imageid2contextnum, 'rb') as handle:
    c_imageid_num = pickle.load(handle)
with h5py.File(context_feature_file, 'r') as hf:
    # Copy the whole dataset into memory before the file is closed.
    c_features = np.array(hf.get('context_embedding'))
# val_data must already be loaded by another cell.
image_id = val_data[1]['image_id']
# Rows [start, start + count) belong to this image's OCR tokens.
feature = c_features[c_imageid_i[image_id]:c_imageid_i[image_id]+c_imageid_num[image_id]]

In [5]:
# Load the validation imdb; the context manager closes the file once it is read.
with open(data_dir['val'], 'rb') as handle:
    val_data = np.load(handle, allow_pickle=True)
# d.get_question_sequence(val_data[1]['question_tokens'])

In [29]:
# Display element 0 of the loaded split; the recorded output is a metadata record, not a sample.
val_data[0]

{'creation_time': 1550905257.4633052,
 'version': 0.5,
 'dataset_type': 'val',
 'has_answer': True}

#### 数据加载模型

In [17]:
# Build the order vector for each context item.
# Start from a 10x10 identity matrix, then zero every row from index
# context["length"] onward, and attach the result to the sample.
order_vectors = torch.eye(10)
order_vectors[context["length"] :] = 0
sample.order_vectors = order_vectors

In [None]:
# Keep at most self.max_length tokens.
length = min(len(tokens), self.max_length)
tokens = tokens[:length]

# Pre-fill a (self.max_length, self.model.get_dimension()) float tensor with
# self.PAD_INDEX, so rows past the last token keep the padding value.
output = torch.full(
            (self.max_length, self.model.get_dimension()),
            fill_value=self.PAD_INDEX,
            dtype=torch.float,
        )

# Copy each token's numpy vector, looked up in self.stov, into its row.
for idx, token in enumerate(tokens):
    output[idx] = torch.from_numpy(self.stov[token])

In [1]:
from dataset import TextVQA, Dictionary
from basemodel import build_model
from torch.utils.data import DataLoader, ConcatDataset
from torch.autograd import Variable
import yaml
import torch.nn as nn
import os

# One Dictionary instance is shared by both dataset splits.
d = Dictionary()
val_data = TextVQA('val', d)
train_data = TextVQA('train', d)

# NOTE(review): `weight` is not used again in this cell — presumably the call is
# made for its effect on `d`; confirm against dataset.Dictionary.
weight = d.create_glove_embedding_init(pre=True,pre_dir='../data/vocabs/embedding_weight.npy')
# Read the experiment configuration.
with open('options/9/29/9_29_3.yml', 'r') as handle:
    config = yaml.load(handle, Loader=yaml.FullLoader)

# Build the model from the 'model_attributes' section of the configuration.
model = build_model(val_data, config['model_attributes'])
# model = nn.DataParallel(model)
# model.cuda()
# Despite its name, this loader iterates train followed by val, one sample per
# batch, unshuffled, with two worker processes.
val_loader = DataLoader(ConcatDataset([train_data, val_data]), 1, shuffle=False, num_workers=2)

In [2]:
# Run the model on the first batch only, as a forward-pass smoke test.
gpu = False
for i, sample in enumerate(val_loader):
    # Pull the model inputs and the label out of the batch dict.
    question = Variable(sample["question"])
    img = Variable(sample["img_feature"])
    context = Variable(sample["context_feature"])
    context_order = Variable(sample["order_vectors"])
    label = Variable(sample['answer'])
    
    # Move everything to the GPU only when the flag above is set.
    if gpu:
        question = question.cuda()
        img = img.cuda()
        context = context.cuda()
        context_order = context_order.cuda()
        label = label.cuda()
    
    score = model(img, question, context, context_order)#.cuda()
    # Stop after one batch; `sample` and `score` stay available for inspection.
    break

In [20]:
# Count the validation entries whose "answer_source" is an empty dict, and
# record their positions and question ids.
answer_source = 0
answer_index = []
question_id = []
for i in range(len(val_data)):
    if val_data.entries[i]["answer_source"] == {}:
        answer_source += 1
        answer_index.append(i)
        question_id.append(val_data.entries[i]["question_id"])

In [3]:
# Display the answer tensor of the last batch fetched from the loader.
sample["answer"]

tensor([[0., 0., 0.,  ..., 0., 0., 0.]])

In [3]:
from collections import Counter
# Tally how often each value occurs in answer_source and display the tally.
# NOTE(review): Counter.update needs an iterable or mapping, but the In [20]
# cell binds answer_source to an int; the recorded output presumably came from
# a run where answer_source was a list of ints — confirm before re-running.
counter = Counter()
counter.update(answer_source)
counter

Counter({2: 568,
         3: 1033,
         0: 1072,
         1: 1963,
         7: 48,
         5: 176,
         4: 88,
         6: 19,
         9: 14,
         11: 5,
         12: 3,
         23: 1,
         17: 1,
         14: 1,
         16: 2,
         8: 2,
         19: 2,
         32: 1,
         15: 1})

In [4]:
# Scratch tensor of shape (1, 10) with normally distributed values.
a = torch.randn(1,10)

In [10]:
# Compare a (1, 10) tensor of ones against 0 element-wise; every entry is False.
a.new_ones(1,10).eq(0)

tensor([[False, False, False, False, False, False, False, False, False, False]])

In [6]:
import torch.nn.functional as F

In [7]:
def _mask_attentions(attention, image_locs):
    batch_size, num_loc, n_att = attention.size()
    tmp1 = attention.new_zeros(num_loc)
    tmp1[:num_loc] = torch.arange(0, num_loc, dtype=attention.dtype).unsqueeze(
        dim=0
    )

    tmp1 = tmp1.expand(batch_size, num_loc)
    tmp2 = image_locs.type(tmp1.type())
    tmp2 = tmp2.unsqueeze(dim=1).expand(batch_size, num_loc)
    mask = torch.ge(tmp1, tmp2)
    mask = mask.unsqueeze(dim=2).expand_as(attention)
    attention = attention.masked_fill(mask, 0)
    return attention

Signature: F.sigmoid(input)
Docstring:
sigmoid(input) -> Tensor

Applies the element-wise function :math:`\text{Sigmoid}(x) = \frac{1}{1 + \exp(-x)}`

See :class:`~torch.nn.Sigmoid` for more details.
File:      ~/anaconda2/envs/py3_pt4/lib/python3.7/site-packages/torch/nn/functional.py
Type:      function


In [13]:
import torch

In [22]:
# Cast image_att_feature to text_encode's tensor type and display the resulting shape.
image_att_feature.type(text_encode.type()).shape

torch.Size([10, 2048])

In [8]:
# from utils import compute_result
# model_dir = os.path.join("save/9/exp_9_17_1/", "model_best.pth")
# compute_result("val", model, model_dir , val_loader, "save/9/exp_9_17_1/", 10, gpu=False)

In [5]:
import numpy as np

In [21]:
# Load the TextVQA training imdb; the context manager closes the file once it is read.
with open("../data/imdb/textvqa_0.5/imdb_textvqa_train.npy", "rb") as handle:
    data = np.load(handle, allow_pickle=True)

In [7]:
import torch
from torch.autograd import Variable

In [9]:
import os
import json
# Write a tiny list of dicts to ./save/test.json to check that JSON output works.
a = [{"12":"34"}]
path_rslt = os.path.join("./save/", "test.json")
with open(path_rslt, 'w') as handle:
    json.dump(a, handle)

In [14]:
# Run Channel_attention once on random tensors whose last dimensions match
# dim_x (100) and dim_y (200).
# NOTE(review): Channel_attention is neither defined nor imported in the visible cells.
channel_attention = Channel_attention(dim_x = 100, dim_y=200, h_dim=1280)
x = torch.rand(10,20,100)
y = torch.rand(10,30,200)
x1, y1 = channel_attention(x,y)

In [27]:
# Rebinds `model` to a ContextEmbedding instance.
# NOTE(review): ContextEmbedding is not defined or imported in the visible cells,
# so the meaning of the positional arguments has to be checked at its definition.
model = ContextEmbedding(300,1280,1280,1,[0.2,0],0.2,1)

In [29]:
# Random scratch tensors.
# NOTE(review): presumably inputs for the ContextEmbedding model — confirm.
c = torch.rand(2,50,300)
# Element-wise equality of two independent random tensors: a boolean (2, 50)
# mask that is almost certainly all False.
mask = torch.rand(2,50) == torch.rand(2,50)
q = torch.rand(2,1280)
o = torch.rand(2,50,50)

##### 图像和ocr文本之间进行交互

In [1]:
import torch
from attention import DenseCoAttn

In [2]:
# Run DenseCoAttn once on random inputs.
# NOTE(review): the meaning of the six constructor arguments is defined in
# attention.py, not in this file.
net = DenseCoAttn(1024, 1024, 2, 0, 2, 0.0)
x = torch.randn(128, 100, 1024)
# x_m is all zeros except column 80, which is set to 1 in every row.
x_m = torch.zeros(128, 100).float()
x_m[:,80] = 1
y = torch.randn(128, 50, 1024)
# y_m is all zeros except column 40, which is set to 1 in every row.
y_m = torch.zeros(128, 50).float()
y_m[:,40] = 1
weighted1, weighted2 = net(x,y,x_m,y_m)

In [7]:
import shutil