In [39]:
import matchzoo as mz
from matchzoo import DataPack
from matchzoo.engine.base_preprocessor import BasePreprocessor
from matchzoo.preprocessors.units.unit import Unit
from matchzoo.engine.base_model import BaseModel
from matchzoo.preprocessors import units
from nltk.tokenize import RegexpTokenizer
import gensim.downloader as gensim
import tensorflow as tf
import numpy as np
import pandas as pd
import keras

In [37]:
class AttentionTextMatchingModel(BaseModel):
    """Attention-based text matcher built with the Keras functional API.

    The model takes two inputs named 'text_left' and 'text_right', each
    shaped (300, 30) per sample. For every word of the left text it builds
    an attention-weighted summary of the right text, projects both sides
    through one shared dense layer and outputs their cosine similarity.
    """

    def build(self):
        """Create the Keras model and store it in ``self._backend``."""
        # Per-sample input layout is (embedding_dim, text_length).
        embedding_dim = 300
        text_length = 30

        def pairwise_tanh(docs):
            """Return tanh(query_i + value_j) for every word pair (i, j)."""
            #both inputs have size [batchsize, text_length, embedding_dim]
            query, value = docs
            #query reshaped size is [batchsize, text_length, 1, embedding_dim]
            query_reshaped = tf.expand_dims(query, axis=2)
            #value reshaped size is [batchsize, 1, text_length, embedding_dim]
            value_reshaped = tf.expand_dims(value, axis=1)
            #broadcast sum over axes 1 and 2, then squash with tanh
            #result is [batchsize, text_length, text_length, embedding_dim]
            return tf.math.tanh(tf.add(query_reshaped, value_reshaped))

        def attend(args):
            """Softmax the pair scores over the value words and sum the values."""
            #scores is [batchsize, text_length, text_length, 1]
            #value is [batchsize, text_length, embedding_dim]
            scores, value = args
            #each query word gets a weighting over the value words (axis 2)
            weights = tf.nn.softmax(scores, axis=2)
            value_reshaped = tf.expand_dims(value, axis=1)
            #broadcast multiply -> [batchsize, text_length, text_length, embedding_dim]
            weighted_values = tf.multiply(weights, value_reshaped)
            #output has shape [batchsize, text_length, embedding_dim]
            return tf.reduce_sum(weighted_values, axis=2)

        # Inputs are the left and right text.
        input_left = keras.Input(name='text_left', shape=(embedding_dim, text_length))
        input_right = keras.Input(name='text_right', shape=(embedding_dim, text_length))

        # The attention code indexes words on axis 1 and embedding components
        # on axis 2, so swap the two non-batch axes of the inputs first.
        # Without this the softmax would run over embedding components.
        to_word_major = keras.layers.Permute((2, 1))
        words_left = to_word_major(input_left)
        words_right = to_word_major(input_right)

        # Attention layer. The scoring Dense is a real layer applied between
        # two weight-free Lambdas: a Dense created inside a Lambda is not
        # tracked by the model, so its weights would never be trained.
        rectified = keras.layers.Lambda(
            pairwise_tanh,
            output_shape=(text_length, text_length, embedding_dim),
        )([words_left, words_right])
        scores = keras.layers.Dense(1, activation='linear')(rectified)
        weighted_right = keras.layers.Lambda(
            attend,
            output_shape=(text_length, embedding_dim),
        )([scores, words_right])

        # Flatten matrices before feeding them to the dense layer.
        flatten_layer = keras.layers.Flatten()
        flat_left = flatten_layer(words_left)
        flat_right = flatten_layer(weighted_right)

        # One shared dense layer reduces the dimensionality of both sides.
        dense = keras.layers.Dense(
            64, activation='elu', use_bias=True,
            kernel_initializer='glorot_uniform',
            bias_initializer='zeros', kernel_regularizer=None,
            bias_regularizer=None, activity_regularizer=None
        )
        dense_left_result = dense(flat_left)
        dense_right_result = dense(flat_right)

        # normalize=True turns the dot product into a cosine similarity.
        dotted = keras.layers.Dot(axes=1, normalize=True)([dense_left_result, dense_right_result])

        self._backend = keras.Model(inputs=[input_left, input_right], outputs=dotted, name="attention_model")


In [22]:
# Load the 'word2vec-google-news-300' model through gensim's downloader module
# (imported above as `gensim`). The preprocessor reads this global to filter
# tokens and to look up their vectors.
wordembeddingmodel = gensim.load('word2vec-google-news-300')

In [28]:
class FixedLengthUnit(Unit):
    """Pad or truncate an embedding matrix to a fixed number of columns.

    The input is expected to be laid out as [embedding_dim, n_words]; the
    result is always [embedding_dim, fixed_length].
    """

    def __init__(self, fixed_length: int, embedding_dim: int = 300):
        """
        :param fixed_length: number of columns (word positions) in the output.
        :param embedding_dim: number of rows used when the input is empty and
            therefore carries no row count of its own.
        """
        self._fixed_length = fixed_length
        self._embedding_dim = embedding_dim

    def transform(self, input_: tf.Tensor) -> tf.Tensor:
        """Return `input_` centred inside zero padding of width `fixed_length`.

        An empty input yields an all-zero matrix. An input wider than
        `fixed_length` keeps only its first `fixed_length` columns, because
        negative pad widths would otherwise make `tf.zeros` fail.
        """
        # An empty token list can arrive as a rank-1 tensor, so check the rank
        # before touching shape[1].
        if len(input_.shape) < 2 or int(input_.shape[1]) == 0:
            return tf.zeros([self._embedding_dim, self._fixed_length])

        input_ = input_[:, :self._fixed_length]
        row_count = int(input_.shape[0])
        input_length = int(input_.shape[1])

        # Split the padding as evenly as possible; the extra column goes last.
        pre_ct = (self._fixed_length - input_length) // 2
        post_ct = self._fixed_length - pre_ct - input_length
        pre = tf.zeros([row_count, pre_ct], dtype=input_.dtype)
        post = tf.zeros([row_count, post_ct], dtype=input_.dtype)
        return tf.concat([pre, input_, post], axis=1)

class AttentionModelPreprocessor(BasePreprocessor):
    """Turn raw text into fixed-width word-embedding matrices.

    Each text is tokenized, reduced to the words known to the module-level
    `wordembeddingmodel`, converted to an [embedding_dim, n_words] tensor and
    finally padded to `fixed_length` columns.
    """

    def __init__(self, fixed_length: int):
        """
        :param fixed_length: number of word positions every text is padded to.
        """
        super().__init__()
        self._fixed_length = fixed_length
        self._fixedlength_unit = FixedLengthUnit(self._fixed_length)
        # Raw string: '\d' in a normal string literal is an invalid escape.
        self._tokenizer = RegexpTokenizer(r'[^\d\W]+')

    def _filter_vocab(self, input_: list) -> list:
        """Keep only the tokens that have a vector in `wordembeddingmodel`."""
        # Membership on the model itself works across gensim versions; the
        # `.vocab` attribute is no longer available in gensim 4.
        return [word for word in input_ if word in wordembeddingmodel]

    def _process(self, input_: list) -> tf.Tensor:
        """Look up the embeddings and return them as [embedding_dim, n_words]."""
        if not input_:
            # Keep the result rank-2 so the padding step can read shape[1].
            return tf.zeros([wordembeddingmodel.vector_size, 0])
        #get word embeddings
        vectors = [wordembeddingmodel[word] for word in input_]
        return tf.transpose(tf.convert_to_tensor(vectors))

    def fit(self, data_pack: DataPack, verbose: int = 1):
        """No-op: this preprocessor has nothing to learn from the data."""
        return self

    def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
        """Return a processed copy of `data_pack`; the argument is not modified.

        :param data_pack: pack whose texts are to be converted.
        :param verbose: forwarded to every `apply_on_text` call.
        """
        data_pack = data_pack.copy()
        data_pack.apply_on_text(self._tokenizer.tokenize, inplace=True, verbose=verbose)
        data_pack.apply_on_text(self._filter_vocab, inplace=True, verbose=verbose)
        data_pack.apply_on_text(self._process, inplace=True, verbose=verbose)
        data_pack.apply_on_text(self._fixedlength_unit.transform, inplace=True, verbose=verbose)
        return data_pack

In [29]:
import pandas as pd

# Every text is padded to 30 word positions, matching the model's input width.
preprocessor = AttentionModelPreprocessor(fixed_length=30)

In [30]:
# Forward slash instead of a backslash: it resolves on Windows as well as on
# POSIX systems, where the backslash would be treated as part of the file name.
df = pd.read_csv("matchzoo/international-goldstandard.csv")

# Sequential 80/10/10 split in file order (no shuffling).
train_size = .8
valid_size = .1
test_size = .1
train_offset = int(len(df) * train_size)
valid_offset = int(len(df) * valid_size + train_offset)
# Not used by the slicing below: the test split is everything after valid_offset.
test_offset = int(len(df) * test_size)

# reset_index(drop=True) returns a new, re-indexed frame for each split,
# so no frame is mutated in place after creation.
train_df = df.iloc[:train_offset].reset_index(drop=True)
valid_df = df.iloc[train_offset:valid_offset].reset_index(drop=True)
test_df = df.iloc[valid_offset:].reset_index(drop=True)

train_pack = mz.pack(train_df)
valid_pack = mz.pack(valid_df)
test_pack = mz.pack(test_df)

In [31]:
# Apply the same preprocessing to each split, in train / valid / test order.
train_pack_processed, valid_pack_processed, test_pack_processed = (
    preprocessor.transform(split_pack, verbose=0)
    for split_pack in (train_pack, valid_pack, test_pack)
)

In [32]:
# Ranking task trained with a rank hinge loss and reported with
# MAP, MRR and NDCG at cutoffs 1, 3 and 5.
ranking_task = mz.tasks.Ranking(loss=mz.losses.RankHingeLoss())
ranking_task.metrics = [
    mz.metrics.MeanAveragePrecision(),
    mz.metrics.MeanReciprocalRank(),
    *(
        mz.metrics.NormalizedDiscountedCumulativeGain(k=cutoff)
        for cutoff in (1, 3, 5)
    ),
]


In [33]:
# Pair-mode generator over the processed training pack, in batches of 10.
# NOTE(review): num_dup / num_neg presumably control positive duplication and
# negatives sampled per positive; confirm against the MatchZoo docs.
generator_options = dict(mode='pair', num_dup=5, num_neg=1, batch_size=10)
train_generator = mz.DataGenerator(train_pack_processed, **generator_options)

In [40]:
# Instantiate the custom model, attach the ranking task (loss and metrics)
# to its parameters, then build the Keras graph and compile it.
# The order matters: the task is set before build() and compile() are called.
model = AttentionTextMatchingModel()
model.params['task'] = ranking_task
model.build()
model.compile()

In [41]:
# Unpack the validation split into model inputs and labels, and create a
# callback that evaluates all task metrics on it. batch_size equals the number
# of validation labels, i.e. the whole split is evaluated as a single batch.
valid_x, valid_y = valid_pack_processed.unpack()
evaluate = mz.callbacks.EvaluateAllMetrics(model, x=valid_x, y=valid_y, batch_size=len(valid_y))

In [None]:
# Train for 10 epochs from the pair generator, running the validation callback.
# NOTE(review): use_multiprocessing=True with workers=4 requires the generator
# (and the tensors it holds) to be transferable to worker processes; confirm
# this works on the target platform.
history = model.fit_generator(train_generator, epochs=10, callbacks=[evaluate], workers=4, use_multiprocessing=True, shuffle=True)

In [None]:
# Unpack the held-out test split and evaluate the trained model on it.
test_x, test_y = test_pack_processed.unpack()

# The result is stored in `performances` but not displayed by this cell.
performances=model.evaluate(test_x, test_y)

In [None]:
# Attach the model's scores to the test frame and display it.
# NOTE(review): assumes model.predict returns one score per row, in the same
# order as test_df; confirm against unpack().
test_df['predictions'] = model.predict(test_x)
test_df

In [None]:
# Save the test rows together with their predictions.
# The opening quote was a typographic character (U+2018), which is a syntax error.
test_df.to_csv('AttentionModelResults.csv')