In [1]:
# Imports
from keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply, Reshape
from keras.layers import RepeatVector, Dense, Activation, Lambda
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.models import load_model, Model
from keras.callbacks import LearningRateScheduler
import keras.backend as K

import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np

import random
import math
import json

# Pinkie Pie was here

Using TensorFlow backend.


In [9]:
# Directory holding the dataset files. This is a machine-specific absolute
# path; change it here (once) when running the notebook elsewhere.
DATA_DIR = 'C:/Users/Ax/Desktop/tqx/关系抽取/Attention_Network_With_Keras-master/data/'

# Decode explicitly as UTF-8 instead of relying on the platform's locale
# encoding, which open() would otherwise use for text mode.
with open(DATA_DIR + 'Time Dataset.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# The vocab file holds two top-level entries: the input-side vocabulary
# followed by the output-side vocabulary.
with open(DATA_DIR + 'Time Vocabs.json', 'r', encoding='utf-8') as f:
    human_vocab, machine_vocab = json.load(f)

human_vocab_size = len(human_vocab)
machine_vocab_size = len(machine_vocab)

# Number of training examples
m = len(dataset)

In [10]:
def preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty):
    """
    Tokenize every (input, output) pair of a dataset and one-hot encode it.

    Inputs:
    dataset - A list of sentence data pairs; element 0 of each pair is the
              input sentence and element 1 is the output sentence.
    human_vocab - A dictionary of tokens (char) to id's for the inputs.
    machine_vocab - A dictionary of tokens (char) to id's for the outputs.
    Tx - Number of tokens kept per input sentence.
    Ty - Number of tokens kept per output sentence.

    Outputs:
    X - int32 array of shape (len(dataset), Tx) with input token ids
    Y - int32 array of shape (len(dataset), Ty) with output token ids
    Xoh - One hot expansion of X, last axis of size len(human_vocab)
    Yoh - One hot expansion of Y, last axis of size len(machine_vocab)
    """
    n_examples = len(dataset)

    # Pre-allocate the id matrices; one row per example.
    X = np.zeros([n_examples, Tx], dtype='int32')
    Y = np.zeros([n_examples, Ty], dtype='int32')

    # Fill each row with the fixed-length token ids of the pair.
    for row, pair in enumerate(dataset):
        X[row] = np.array(tokenize(pair[0], human_vocab, Tx))
        Y[row] = np.array(tokenize(pair[1], machine_vocab, Ty))

    # Expand the id matrices into one hot tensors.
    Xoh = oh_2d(X, len(human_vocab))
    Yoh = oh_2d(Y, len(machine_vocab))

    return X, Y, Xoh, Yoh
    
def tokenize(sentence, vocab, length):
    """
    Map a token sequence to a fixed-length list of vocabulary ids.

    Positions past the end of the sentence are filled with "<pad>", and any
    token missing from the vocab (including "<pad>" itself) is replaced by
    "<unk>". The vocab therefore needs an "<unk>" entry whenever such a
    token occurs, otherwise the lookup raises KeyError.

    Inputs:
    sentence - Series of tokens
    vocab - A dictionary from token to id
    length - Exact number of ids to produce; longer sentences are truncated

    Outputs:
    tokens - A list of `length` ids taken from vocab
    """
    def lookup(position):
        # Real token while inside the sentence, padding afterwards.
        symbol = sentence[position] if position < len(sentence) else "<pad>"
        # Anything the vocab does not know is mapped to the unknown token.
        if symbol not in vocab:
            symbol = "<unk>"
        return vocab[symbol]

    return [lookup(position) for position in range(length)]

def ids_to_keys(sentence, vocab):
    """
    Converts a series of id's into the keys of a dictionary.

    Each id is used as a position into the dictionary's keys in iteration
    order (not matched against the dictionary's values).

    Inputs:
    sentence - Series of integer ids
    vocab - A dictionary whose keys are looked up by position

    Outputs:
    keys - A list with one dictionary key per id in sentence
    """
    # Materialize the key list once instead of rebuilding it for every id.
    keys = list(vocab.keys())
    return [keys[token_id] for token_id in sentence]

def oh_2d(dense, max_value):
    """
    Build a one hot encoding of a 2D array of ids.

    Inputs:
    dense - 2D array of ids
    max_value - Size of the one hot axis (ids must index into it)

    Outputs:
    oh - Float array of shape dense.shape + (max_value,) that is 1 at
         [r, c, dense[r, c]] and 0 everywhere else
    """
    n_rows = dense.shape[0]
    n_cols = dense.shape[1]

    # All-zero canvas with the extra one hot axis appended.
    oh = np.zeros(tuple(dense.shape) + (max_value,))

    # Column vector of row indices and row vector of column indices; they
    # broadcast against the id matrix so every (r, c) cell is addressed once.
    row_idx = np.arange(n_rows)[:, None]
    col_idx = np.arange(n_cols)[None, :]

    oh[row_idx, col_idx, dense.astype(int)] = 1

    return oh

In [11]:
# Fixed sequence lengths used when tokenizing
Tx = 41  # Max x sequence length
Ty = 5  # y sequence length

# Token ids plus their one hot expansions for the whole dataset
X, Y, Xoh, Yoh = preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty)

# 80-20 train/test split in dataset order (no shuffling): the first 80% of
# the examples are used for training, the remainder for testing.
train_size = int(0.8*m)
Xoh_train, Xoh_test = Xoh[:train_size], Xoh[train_size:]
Yoh_train, Yoh_test = Yoh[:train_size], Yoh[train_size:]

In [13]:
# Show one example at every stage of preprocessing: raw strings, token ids
# and one hot encodings.
i = 4
print("Input data point ", i, ".", sep="")
print()
print("The data input is: ", dataset[i][0], sep="")
print("The data output is: ", dataset[i][1], sep="")
print()
print("The tokenized input is:", X[i], sep="")
print("The tokenized output is: ", Y[i], sep="")
print()
print("The one-hot input is:", Xoh[i])
print("The one-hot output is:", Yoh[i])

Input data point 4.

The data input is: 8:25
The data output is: 08:25

The tokenized input is:[11 13  5  8 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40]
The tokenized output is: [ 0  8 10  2  5]

The one-hot input is: [[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  1.]
 [ 0.  0.  0. ...,  0.  0.  1.]
 [ 0.  0.  0. ...,  0.  0.  1.]]
The one-hot output is: [[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.]]
