# Hate Intensity Prediction (HIP): Regression

HIP Module takes a sentence (whether normalised or not) and predicts the hateful intensity of the sentence.

The hate intensity is annotated on a scale of 1-10, 0 is reserved for non-hateful sentences which we do not use in our dataset.
1 is the lowest hate intensity and 10 is the highest.

If using final activation layer is linear then range stays same.
If using sigmoid activation layer then input label is normalised to 0-1 range.


## Install these inside colab

In [1]:
# %pip install numpy==1.19.5
%pip install tensorflow==2.2.0
# %pip install transformers==3.4.0
# %pip install sklearn scipy

import numpy
# assert numpy.__version__=="1.19.5"

[31mERROR: Could not find a version that satisfies the requirement tensorflow==2.2.0 (from versions: 2.8.0rc0, 2.8.0rc1, 2.8.0, 2.8.1, 2.8.2, 2.8.3, 2.8.4, 2.9.0rc0, 2.9.0rc1, 2.9.0rc2, 2.9.0, 2.9.1, 2.9.2, 2.9.3, 2.10.0rc0, 2.10.0rc1, 2.10.0rc2, 2.10.0rc3, 2.10.0, 2.10.1, 2.11.0rc0, 2.11.0rc1, 2.11.0rc2, 2.11.0, 2.11.1, 2.12.0rc0, 2.12.0rc1, 2.12.0, 2.12.1, 2.13.0rc0, 2.13.0rc1, 2.13.0rc2, 2.13.0, 2.13.1, 2.14.0rc0, 2.14.0rc1, 2.14.0, 2.15.0rc0, 2.15.0rc1)[0m[31m
[0m[31mERROR: No matching distribution found for tensorflow==2.2.0[0m[31m
[0m

In [2]:
%pip install pandas



In [3]:
## The folder is setup to from google drive. If used else only the following lines needs commenting

from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
!pip install transformers



In [5]:
import torch

In [6]:
import tensorflow as tf
from tqdm import tqdm
import numpy as np
from tensorflow import keras
from transformers import BertTokenizer
from transformers import DistilBertTokenizer, RobertaTokenizer, BertConfig, TFBertModel
from sklearn.model_selection import train_test_split
import pickle
import random
import sys
import math
from scipy import stats
from scipy.spatial import distance
import random
import os

In [7]:
BASE_FOLDER = "data/"
INPUT_FILE = "hate_norm_combined.pkl"
OUTPUT_FOLDER = "hate_intensity_linear_weights_att/"
OUTPUT_FILE = "hate_int_linear_trans42_ATT"
BERT_MODEL = "distilbert-base-uncased"
MAX_LENGTH = 128
TEST_SIZE = 0.2
SEED = 42

USE_ATT = True

BERT_DROPOUT = 0.2
LSTM_UNITS = 50
DENSE_UNITS = 50
LSTM_DROPOUT = 0.1
DENSE_DROPOUT = 0.2
EPOCHS = 2 #(Default 10)
BATCH_SIZE = 32


def random_seed(SEED):
    random.seed(SEED)
    os.environ['PYTHONHASHSEED'] = str(SEED)
    np.random.seed(SEED)
    tf.random.set_seed(SEED)

random_seed(SEED)

### Base TRANSFORMER MODEL definitions

In [8]:
def tokenize(sentences, tokenizer):
    input_ids, input_masks, input_segments = [], [], []
    for sentence in tqdm(sentences):
        inputs = tokenizer.encode_plus(sentence,
                                       add_special_tokens=True,
                                       max_length=MAX_LENGTH,
                                       pad_to_max_length=True,
                                       return_attention_mask=True,
                                       return_token_type_ids=True)
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])
        input_segments.append(inputs['token_type_ids'])

    return np.asarray(input_ids, dtype='int32'), np.asarray(
        input_masks, dtype='int32'), np.asarray(input_segments, dtype='int32')


## Define base bert configs
config = BertConfig(dropout=BERT_DROPOUT,
                    attention_dropout=BERT_DROPOUT,
                    output_attentions=True)
config.output_hidden_states = False
transformer_model = TFBertModel.from_pretrained(BERT_MODEL, config=config)
for layer in transformer_model.layers[:3]:  ## We are freezing first 3 layers
    layer.trainable = False

# Defining tokonizer
tokenizer = DistilBertTokenizer.from_pretrained(BERT_MODEL,
                                                do_lower_case=True,
                                                add_special_tokens=True,
                                                max_length=MAX_LENGTH,
                                                pad_to_max_length=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['distilbert.transformer.layer.5.output_layer_norm.bias', 'distilbert.embeddings.LayerNorm.bias', 'distilbert.transformer.layer.0.attention.out_lin.weight', 'distilbert.transformer.layer.1.attention.v_lin.bias', 'distilbert.transformer.layer.5.output_layer_norm.weight', 'distilbert.transformer.layer.3.attention.out_lin.bias', 'distilbert.transformer.layer.3.ffn.lin1.weight', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.transformer.layer.5.attention.q_lin.weight', 'distilbert.transformer.layer.3.ffn.lin2.weight', 'distilbert.transformer.layer.1.attention.q_lin.bias', 'distilbert.transformer.layer.0.attention.q_lin.bias', 'distilbert.transformer.layer.2.ffn.lin2.bias', 'distilbert.transformer.layer.0.ffn.lin2.weight', 'vocab_layer_norm.bias', 'distilbert.transformer.layer.2.sa_layer_norm.bias', 'distilbert.transformer.layer.5.attention.k_lin.bias', 'distilbert.transformer.layer.4.ffn.

### Model Design

In [9]:
input_ids_in = tf.keras.layers.Input(shape=(MAX_LENGTH, ),
                                     name='input_token',
                                     dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(MAX_LENGTH, ),
                                       name='masked_token',
                                       dtype='int32')
embedding_layer = transformer_model(input_ids_in,
                                    attention_mask=input_masks_in)[0]
X = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(LSTM_UNITS,
                         return_sequences=True,
                         dropout=LSTM_DROPOUT,
                         recurrent_dropout=LSTM_DROPOUT,
                         kernel_initializer='normal'))(embedding_layer)
if USE_ATT:
    X = tf.keras.layers.Attention(use_scale=True)([X, X])  # Use attention.
X = tf.keras.layers.GlobalMaxPool1D()(X)
X = tf.keras.layers.Dense(DENSE_UNITS,
                          activation='relu',
                          kernel_initializer='normal')(X)
X = tf.keras.layers.Dropout(DENSE_DROPOUT)(X)
X = tf.keras.layers.Dense(
    1,
    activation='linear',  # Can be with activation="sigmoid" here.
    kernel_initializer='normal')(X)
model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=X)
model.compile(
    optimizer='adam',
    loss='mean_squared_error',  # Treat HIP as a regression problem
    metrics=['acc', tf.keras.metrics.RootMeanSquaredError()])
model.summary()



Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_token (InputLayer)    [(None, 128)]                0         []                            
                                                                                                  
 masked_token (InputLayer)   [(None, 128)]                0         []                            
                                                                                                  
 tf_bert_model (TFBertModel  TFBaseModelOutputWithPooli   1094822   ['input_token[0][0]',         
 )                           ngAndCrossAttentions(last_   40         'masked_token[0][0]']        
                             hidden_state=(None, 128, 7                                           
                             68),                                                             

In [10]:
input_ids_in = tf.keras.layers.Input(shape=(MAX_LENGTH, ),
                                     name='input_token',
                                     dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(MAX_LENGTH, ),
                                       name='masked_token',
                                       dtype='int32')
embedding_layer = transformer_model(input_ids_in,
                                    attention_mask=input_masks_in)[0]

# embedding_layer = tf.keras.layers.Dense(DENSE_UNITS,
#                           activation='relu',
#                           kernel_initializer='normal')(embedding_layer)
# X = tf.keras.layers.Bidirectional(
#     tf.keras.layers.LSTM(LSTM_UNITS,
#                          return_sequences=True,
#                          dropout=LSTM_DROPOUT,
#                          recurrent_dropout=LSTM_DROPOUT,
#                          kernel_initializer='normal'))(embedding_layer)
# if USE_ATT:
#     X = tf.keras.layers.Attention(use_scale=True)([X, X])  # Use attention.
# X = tf.keras.layers.GlobalMaxPool1D()(X)
# X = tf.keras.layers.Dense(DENSE_UNITS,
#                           activation='relu',
#                           kernel_initializer='normal')(X)
# X = tf.keras.layers.Dropout(DENSE_DROPOUT)(X)
# X = tf.keras.layers.Dense(
#     1,
#     activation='linear',  # Can be with activation="sigmoid" here.
#     kernel_initializer='normal')(X)
model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=embedding_layer)
model.compile(
    optimizer='adam',
    loss='mean_squared_error',  # Treat HIP as a regression problem
    metrics=['acc', tf.keras.metrics.RootMeanSquaredError()],run_eagerly=True)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_token (InputLayer)    [(None, 128)]                0         []                            
                                                                                                  
 masked_token (InputLayer)   [(None, 128)]                0         []                            
                                                                                                  
 tf_bert_model (TFBertModel  TFBaseModelOutputWithPooli   1094822   ['input_token[0][0]',         
 )                           ngAndCrossAttentions(last_   40         'masked_token[0][0]']        
                             hidden_state=(None, 128, 7                                           
                             68),                                                           

In [11]:
input_ids_in = tf.keras.layers.Input(shape=(MAX_LENGTH, ),
                                     name='input_token',
                                     dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(MAX_LENGTH, ),
                                       name='masked_token',
                                       dtype='int32')
embedding_layer = transformer_model(input_ids_in,
                                    attention_mask=input_masks_in)[0]
X = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(LSTM_UNITS,
                         return_sequences=True,
                         dropout=LSTM_DROPOUT,
                         recurrent_dropout=LSTM_DROPOUT,
                         kernel_initializer='normal'))(embedding_layer)
if USE_ATT:
    X = tf.keras.layers.Attention(use_scale=True)([X, X])  # Use attention.
X = tf.keras.layers.GlobalMaxPool1D()(X)
X = tf.keras.layers.Dense(DENSE_UNITS,
                          activation='relu',
                          kernel_initializer='normal')(X)
X = tf.keras.layers.Dropout(DENSE_DROPOUT)(X)
X = tf.keras.layers.Dense(
    1,
    activation='linear',  # Can be with activation="sigmoid" here.
    kernel_initializer='normal')(X)
model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=embedding_layer)
model.compile(
    optimizer='adam',
    loss='mean_squared_error',  # Treat HIP as a regression problem
    metrics=['acc', tf.keras.metrics.RootMeanSquaredError()],
    run_eagerly=True)
model.summary()



Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_token (InputLayer)    [(None, 128)]                0         []                            
                                                                                                  
 masked_token (InputLayer)   [(None, 128)]                0         []                            
                                                                                                  
 tf_bert_model (TFBertModel  TFBaseModelOutputWithPooli   1094822   ['input_token[0][0]',         
 )                           ngAndCrossAttentions(last_   40         'masked_token[0][0]']        
                             hidden_state=(None, 128, 7                                           
                             68),                                                           

In [12]:
!ls
%cd drive
%cd MyDrive
%cd data

drive  sample_data
/content/drive
/content/drive/MyDrive
/content/drive/MyDrive/data


In [13]:
!ls


hate_norm_combined.pkl	hate_norm_with_span.csv  pkl_to_csv.py


### Dataset prep

In [14]:
print(BASE_FOLDER, INPUT_FILE)
# START = "./drive/MyDrive/"
with open(INPUT_FILE, 'rb') as f:
# with open("./hate_norm_with_span.pkl", 'rb') as f:
    input_data = pickle.load(f)

intensity_value = []
hate_sentences = []

for i in range(len(input_data)):
    intensity_value.append(int(input_data['Original_Intensity'][i]))
    hate_sentences.append(input_data['Sentence'][i])
    intensity_value.append(int(input_data['Normalized_Intensity'][i]))
    hate_sentences.append(input_data['Normalized_Sentence'][i])

c = list(zip(intensity_value, hate_sentences))
random.shuffle(c)
intensity_value, hate_sentences = zip(*c)

X_tr, X_te, y_tr, y_te = train_test_split(hate_sentences,
                                          intensity_value,
                                          test_size=TEST_SIZE,
                                          random_state=1)

train_input_ids, train_input_masks, train_input_segment = tokenize(
    X_tr, tokenizer)
test_input_ids, test_input_masks, test_input_segment = tokenize(
    X_te, tokenizer)
y_tr = np.asarray(y_tr)
y_te = np.asarray(y_te)

data/ hate_norm_combined.pkl


  0%|          | 0/4843 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 4843/4843 [00:02<00:00, 1795.01it/s]
100%|██████████| 1211/1211 [00:00<00:00, 1851.43it/s]


In [15]:
print(X_tr) # Training sentences
print("\n\nTrain input ids", train_input_ids, "\n\nAttention masks", train_input_masks, "\n\nToken type ids", train_input_segment) # input_ids, attention_masks, token_type_ids



Train input ids [[  101 18520 20464 ...     0     0     0]
 [  101  3198  2068 ...     0     0     0]
 [  101  1045  2245 ...     0     0     0]
 ...
 [  101  1045 10657 ...     0     0     0]
 [  101  2023  6638 ...     0     0     0]
 [  101  2053  2008 ...     0     0     0]] 

Attention masks [[1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 ...
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]] 

Token type ids [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [16]:
input_ids_in = tf.keras.layers.Input(shape=(MAX_LENGTH, ),
                                     name='input_token',
                                     dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(MAX_LENGTH, ),
                                       name='masked_token',
                                       dtype='int32')
embedding_layer = transformer_model(input_ids_in,
                                    attention_mask=input_masks_in)[0]

model2 = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs=embedding_layer)
model2.compile(
    optimizer='adam',
    loss='mean_squared_error',  # Treat HIP as a regression problem
    metrics=['acc', tf.keras.metrics.RootMeanSquaredError()])
model2.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_token (InputLayer)    [(None, 128)]                0         []                            
                                                                                                  
 masked_token (InputLayer)   [(None, 128)]                0         []                            
                                                                                                  
 tf_bert_model (TFBertModel  TFBaseModelOutputWithPooli   1094822   ['input_token[0][0]',         
 )                           ngAndCrossAttentions(last_   40         'masked_token[0][0]']        
                             hidden_state=(None, 128, 7                                           
                             68),                                                           

In [17]:
pip install tqdm boto3 requests regex sentencepiece sacremoses



In [18]:
# tf.config.run_functions_eagerly(True)
print(train_input_ids.shape,"TRAINSHAPE")
# print(embedding_layer.shape, "embedding layer shape")
# print(train_input_ids.shape, train_input_masks.shape,train_input_segment.shape)
input_ids_in = train_input_ids[:5]
input_masks_in = train_input_masks[:5]
# print(input_ids_in)
model3 = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased')

# tf.enable_eager_execution()
tf.executing_eagerly()

input_y = y_tr[:5]
input_ids_in = torch.tensor(input_ids_in)
input_masks_in = torch.tensor(input_masks_in)
pred = model3(input_ids_in, input_masks_in)[0]
print(pred.shape,"pred value")
# print(pred[0][0])


(4843, 128) TRAINSHAPE


Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_main


torch.Size([5, 128, 768]) pred value


In [19]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
from torch.autograd import Variable
import matplotlib.pyplot as plt
import seaborn

def piecewise_index(relative_position, alpha, beta, gamma, dtype):
    rp_abs = relative_position.abs()
    mask = rp_abs <= alpha
    not_mask = ~mask
    rp_out = relative_position[not_mask]
    rp_abs_out = rp_abs[not_mask]
    y_out = (torch.sign(rp_out)* (alpha +
                                 torch.log(rp_abs_out/alpha)/
                                 math.log(gamma/alpha)*
                                 (beta - alpha)).round().clip(max=beta)).to(dtype)
    idx = relative_position.clone()
    if idx.dtype in [torch.float32,torch.float64]:
        idx = idx.round().to(dtype)

    idx[not_mask] = y_out
    return idx

class PositionalEncoding2(nn.Module):
    def __init__(self, n_functions):
        super(PositionalEncoding2, self).__init__()

        self.register_buffer('frequencies', 2.0 ** torch.arange(n_functions))

    def forward(self, x):
        """
        Args:
            x: tensor of shape [..., dim]

        Returns:
            embedding: a temporal embedding of `x` of shape [..., n_functions * dim * 2]
        """
        freq = (x[..., None] * self.frequencies).view(*x.shape[:-1], -1)

        embedding = torch.zeros(*freq.shape[:-1], freq.shape[-1] * 2).cuda()
        embedding[..., 0::2] = freq.sin()
        embedding[..., 1::2] = freq.cos()

        return embedding

class SirenBlock(nn.Module):
    def __init__(self, in_features, out_features, w0=30, c=6, is_first=False, use_bias=True, activation=None):
        super(SirenBlock, self).__init__()

        self.in_features = in_features
        self.is_first = is_first

        weight = torch.zeros(out_features, in_features)
        bias = torch.zeros(out_features) if use_bias else None
        self.init(weight, bias, c=c, w0=w0)

        self.weight = nn.Parameter(weight)
        self.bias = nn.Parameter(bias) if use_bias else None
        self.activation = Sine(w0) if activation is None else activation

    def init(self, weight, bias, c, w0):
        n = self.in_features

        w_std = (1 / n) if self.is_first else (np.sqrt(c / n) / w0)
        weight.uniform_(-w_std, w_std)

        if bias is not None:
            bias.uniform_(-w_std, w_std)

    def forward(self, x):
        out = F.linear(x, self.weight, self.bias)

        return self.activation(out)


class FCBlock(nn.Module):
    def __init__(self, in_features, out_features, norm_layer=False, activation=None):
        super(FCBlock, self).__init__()

        self.fc = nn.Linear(in_features, out_features)
        self.residual = (in_features == out_features)  # when the input and output have the same dimensions, build a residual block
        self.norm_layer = nn.LayerNorm(out_features) if norm_layer else None
        self.activation = nn.ReLU(inplace=True) if activation is None else activation
        dropout = 0.1
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        Args:
            x: (B, T, D), features are in the last dimension.
        """
        out = self.fc(x)

        if self.norm_layer is not None:
            out = self.norm_layer(out)

        if self.residual:
            return self.activation(out) + x

        # return self.activation(out)
        return self.dropout(self.activation(out))

class EncoderDecoder2(nn.Module):
    """
    A standard Encoder-Decoder architecture. Base for this and many
    other models.
    """
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super(EncoderDecoder2, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator
        self.relu = torch.nn.ReLU()
        # self.fl1 = nn.Linear(512,512)
        # self.fl = nn.Linear(512*2,18*3*2)
        self.fl = nn.Linear(128*128,1)
        # d_model = 256
        # self.inskele = 100
        # self.ingli = 100
        # self.convskele = nn.Sequential(nn.Conv1d(in_channels=self.inskele, out_channels=d_model, kernel_size=1, bias=False),
        #                            nn.ReLU(),
        #                            nn.Conv1d(in_channels=d_model, out_channels=d_model, kernel_size=6, bias=False),
        #                            nn.ReLU(),
        #                            nn.Conv1d(in_channels=d_model, out_channels=self.ingli, kernel_size=5, bias=False),
        #                            nn.ReLU())
        # self.convgli = nn.Sequential(nn.Conv1d(in_channels=self.ingli, out_channels=d_model, kernel_size=1, bias=False),
        #                            nn.ReLU(),
        #                            nn.Conv1d(in_channels=d_model, out_channels=d_model, kernel_size=6, bias=False),
        #                            nn.ReLU(),
        #                            nn.Conv1d(in_channels=d_model, out_channels=self.ingli, kernel_size=5, bias=False),
        #                            nn.ReLU())

        self.bandwidth = 7
        if self.bandwidth != 0:
            # print(self.bandwidth,"BANDWITH")
            self.positional_encoding = PositionalEncoding2(self.bandwidth)
            embedding_dim = self.bandwidth * 2
        else:
            embedding_dim = 1

        global_z = 50
        hidden_neuron = 256
        local_z = 18*3 + 15
        skip_connection = True
        siren = False
        norm_layer = True
        hidden_neuron = hidden_neuron
        in_features = 1 + local_z if siren else embedding_dim + local_z
        local_in_features = hidden_neuron + in_features if skip_connection else hidden_neuron
        global_in_features = local_in_features + global_z if skip_connection else hidden_neuron
        local_output = 50

        layers = [
            SirenBlock(in_features, hidden_neuron, is_first=True) if siren else FCBlock(in_features, hidden_neuron, norm_layer=norm_layer),
            SirenBlock(local_in_features, hidden_neuron) if siren else FCBlock(local_in_features, hidden_neuron, norm_layer=norm_layer),
            SirenBlock(local_in_features, hidden_neuron) if siren else FCBlock(local_in_features, hidden_neuron, norm_layer=norm_layer),
            SirenBlock(local_in_features, hidden_neuron) if siren else FCBlock(local_in_features, hidden_neuron, norm_layer=norm_layer),
            SirenBlock(local_in_features, hidden_neuron) if siren else FCBlock(local_in_features, hidden_neuron, norm_layer=norm_layer),
            SirenBlock(local_in_features, hidden_neuron) if siren else FCBlock(local_in_features, hidden_neuron, norm_layer=norm_layer),
            SirenBlock(local_in_features, hidden_neuron) if siren else FCBlock(local_in_features, hidden_neuron, norm_layer=norm_layer),
            SirenBlock(local_in_features, hidden_neuron) if siren else FCBlock(local_in_features, hidden_neuron, norm_layer=norm_layer),
            SirenBlock(global_in_features, hidden_neuron) if siren else FCBlock(global_in_features, hidden_neuron, norm_layer=norm_layer),
            SirenBlock(global_in_features, hidden_neuron) if siren else FCBlock(global_in_features, hidden_neuron, norm_layer=norm_layer),
            SirenBlock(global_in_features, hidden_neuron) if siren else FCBlock(global_in_features, hidden_neuron, norm_layer=norm_layer)
        ]
        # layers = [
        #     SirenBlock(in_features, hidden_neuron, is_first=True) if siren else FCBlock(in_features, hidden_neuron, norm_layer=norm_layer),
        #     SirenBlock(local_in_features, hidden_neuron) if siren else FCBlock(local_in_features, hidden_neuron, norm_layer=norm_layer),
        #     SirenBlock(local_in_features, hidden_neuron) if siren else FCBlock(local_in_features, hidden_neuron, norm_layer=norm_layer),
        #     SirenBlock(local_in_features, hidden_neuron) if siren else FCBlock(local_in_features, hidden_neuron, norm_layer=norm_layer),
        #     SirenBlock(global_in_features, hidden_neuron) if siren else FCBlock(global_in_features, hidden_neuron, norm_layer=norm_layer),
        #     SirenBlock(global_in_features, hidden_neuron) if siren else FCBlock(global_in_features, hidden_neuron, norm_layer=norm_layer),
        #     SirenBlock(global_in_features, hidden_neuron) if siren else FCBlock(global_in_features, hidden_neuron, norm_layer=norm_layer)
        # ]
        self.mlp = nn.ModuleList(layers)
        self.skip_layers = [] if not skip_connection else list(range(1, len(self.mlp)))
        self.local_layers = list(range(8))
        # self.local_layers = list(range(4))
        self.local_linear = nn.Sequential(nn.Linear(hidden_neuron, local_output))

        # w = 100
        # self.dctstackdec = []
        # self.dctstackdec.append(0)
        # for w in range(1,101):
        #     self.dct_m, self.idct_m = util.get_dct_matrix(w) # (20, 20)
        #     # self.dct_m = torch.from_numpy(self.dct_m).float().cuda()
        #     self.idct_m = torch.from_numpy(self.idct_m).float().cuda()
        #     self.dctstackdec.append(self.idct_m)

    # def forward(self, src, tgt, src_mask, tgt_mask):
    #     "Take in and process masked src and target sequences."
    #     y =self.decode(self.encode(src, src_mask), src_mask,
    #                         tgt, tgt_mask)
    #     # y = self.fl1(y)
    #     # y = self.relu(y)
    #     y = self.fl(y)

    #     # q,w,e = y.shape

    #     # self.dct_m, self.idct_m = util.get_dct_matrix(w) # (20, 20)
    #     # self.dct_m = torch.from_numpy(self.dct_m).float().cuda()
    #     # self.idct_m = torch.from_numpy(self.idct_m).float().cuda()
    #     # y = torch.matmul(self.idct_m,y)
    #     # y = torch.matmul(self.dctstackdec[w],y)
    #     return y

    def forward(self, src, relenc, tgt, src_mask, tgt_mask):
        "Take in and process masked src and target sequences."



        y = self.encode(src,relenc ,src_mask)
        a,b,c = y.shape
        y = y.reshape(a,b*c)
        # y =self.decode(self.encode(src,relenc ,src_mask), src_mask,
        #                     tgt, tgt_mask)
        # y = self.fl1(y)
        # y = self.relu(y)
        y = self.fl(y)

        # q,w,e = y.shape

        # self.dct_m, self.idct_m = util.get_dct_matrix(w) # (20, 20)
        # self.dct_m = torch.from_numpy(self.dct_m).float().cuda()
        # self.idct_m = torch.from_numpy(self.idct_m).float().cuda()
        # y = torch.matmul(self.idct_m,y)
        # y = torch.matmul(self.dctstackdec[w],y)
        print(y.shape,"Y SHAPE")
        return y
    # def encode(self, src, src_mask):
        # return self.encoder(self.src_embed(src), src_mask)

    def encode(self, src, relenc,src_mask):
        return self.encoder(self.src_embed(src),relenc,src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        return self.decoder(self.tgt_embed(tgt), memory, src_mask, tgt_mask)

class Generator(nn.Module):
    "Define standard linear + softmax generation step."
    def __init__(self, d_model, vocab):
        super(Generator, self).__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        return F.log_softmax(self.proj(x), dim=-1)

def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

class Encoder2(nn.Module):
    "Core encoder is a stack of N layers"
    def __init__(self, layer, N):
        super(Encoder2, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    # def forward(self, x, mask):
    #     "Pass the input (and mask) through each layer in turn."
    #     for layer in self.layers:
    #         x = layer(x, mask)
    #     return self.norm(x)

    def forward(self, x, relenc,mask):
        "Pass the input (and mask) through each layer in turn."
        for layer in self.layers:
            x = layer(x, relenc, mask)
        return self.norm(x)

class LayerNorm(nn.Module):
    "Construct a layernorm module (See citation for details)."
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply residual connection to any sublayer with the same size."
        return x + self.dropout(sublayer(self.norm(x)))

class EncoderLayer(nn.Module):
    "Encoder is made up of self-attn and feed forward (defined below)"
    def __init__(self, size, self_attn, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

#     def forward(self, x, mask):
#         "Follow Figure 1 (left) for connections."
# #         print(x.shape,"X SHAPE HEJRE")
#         x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
#         return self.sublayer[1](x, self.feed_forward)

    def forward(self, x,relenc, mask):
        "Follow Figure 1 (left) for connections."
#         print(x.shape,"X SHAPE HEJRE")
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, relenc,mask))
        return self.sublayer[1](x, self.feed_forward)

class Decoder2(nn.Module):
    "Generic N layer decoder with masking."
    def __init__(self, layer, N):
        super(Decoder2, self).__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)
#         self.fl = nn.Linear(512,108)

    def forward(self, x, memory, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)
#         print(x.shape,"X OUT DEC SHAPE")
#         print(x.dtype,"X DTYPE HERE")
        return self.norm(x)

class DecoderLayer(nn.Module):
    "Decoder is made of self-attn, src-attn, and feed forward (defined below)"
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        "Follow Figure 1 (right) for connections."
        m = memory
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
        return self.sublayer[2](x, self.feed_forward)

def subsequent_mask(size):
    "Mask out subsequent positions."
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
    return torch.from_numpy(subsequent_mask) == 0

# plt.figure(figsize=(5,5))
# plt.imshow(subsequent_mask(20)[0])
# None

def attention(query, key, value, mask=None, dropout=None):
    "Compute 'Scaled Dot Product Attention'"
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) \
             / math.sqrt(d_k)
    if mask is not None:
#         print(scores.shape,"scores")
#         print(mask.shape,"mask shape")
#         exit()
        scores = scores.masked_fill(mask == 0, -1e9)
    # print(scores.shape,"SCORES SHAPE HERE")
    p_attn = F.softmax(scores, dim = -1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

def attentionrel(query, key, value,relencq,relenck,relencv,mask=None, dropout=None):
    "Compute 'Scaled Dot Product Attention'"
    d_k = query.size(-1)
    a =torch.matmul(query,relencq.transpose(-2,-1))
    # print(a.shape,"A SHAPE")
    b = torch.matmul(relenck,key.transpose(-2,-1))
    # print(b.shape,"B SHAPE")
    pos_bias = a + b
    # global flagenc
    # global lastposbias
    # flagenc = flagenc + 1
    # if(flagenc==600):
    #     lastposbias = pos_bias
    #     lastposbias.retain_grad()
    # print(flagenc,"FLAGENC HERE CHECK IT OUT")
    # pos_bias = torch.matmul(query,relencq) + torch.matmul(relenck,key.transpose(-2,-1))
    # temp = (torch.matmul(query, key.transpose(-2, -1)) + pos_bias) \
            #  / math.sqrt(d_k)
    # print(temp.shape,"TEMP SHAPE")
    # exit()
    # scores = torch.matmul((query, key.transpose(-2, -1))) \
            #  / math.sqrt(d_k)
    scores = (torch.matmul(query, key.transpose(-2, -1)) + pos_bias) \
             / math.sqrt(d_k)
    if mask is not None:
#         print(scores.shape,"scores")
#         print(mask.shape,"mask shape")
#         exit()
        scores = scores.masked_fill(mask == 0, -1e9)

    p_attn = F.softmax(scores, dim = -1)
    # print(scores.shape,"SCORES SHAPE HERE")
    # exit()
    if dropout is not None:
        p_attn = dropout(p_attn)
    # print(torch.matmul(p_attn, value).shape,"FINAL SHAPE HERE")
    # exit()
    return torch.matmul(p_attn, value+relencv), p_attn

class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        "Implements Figure 2"
        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)

        # 1) Do all the linear projections in batch from d_model => h x d_k
#         print(query.shape,key.shape,value.shape,"before linear trans")
        query, key, value = \
            [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
             for l, x in zip(self.linears, (query, key, value))]

        # 2) Apply attention on all the projected vectors in batch.
#         print(query.shape,key.shape,value.shape,mask.shape,"attention stuff here")
        x, self.attn = attention(query, key, value, mask=mask,
                                 dropout=self.dropout)
#         print(query.shape,key.shape,value.shape,mask.shape,"later attention stuff here")
        # 3) "Concat" using a view and apply a final linear.
        x = x.transpose(1, 2).contiguous() \
             .view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](x)

class MultiHeadedAttentionRel(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadedAttentionRel, self).__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)
        # self.relfeat = 15*15
        self.relfeat = 15*30
        self.linearrelposq = nn.Linear(self.relfeat,d_model)
        self.linearrelposk = nn.Linear(self.relfeat,d_model)
        self.linearrelposv = nn.Linear(self.relfeat,d_model)

    def forward(self, query, key, value,relenc, mask=None):
        "Implements Figure 2"
        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)

        # 1) Do all the linear projections in batch from d_model => h x d_k
#         print(query.shape,key.shape,value.shape,"before linear trans")
        query, key, value = \
            [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
             for l, x in zip(self.linears, (query, key, value))]
        # print(query.shape,"QUERY SHAPE")
        # print(relenc.shape,"RELENC SHAPE")
        relencq = self.linearrelposq(relenc).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
        relenck = self.linearrelposk(relenc).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
        relencv = self.linearrelposv(relenc).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
        # print(relencq.shape,"RELENCqqq   SHAPE")
        # 2) Apply attention on all the projected vectors in batch.
#         print(query.shape,key.shape,value.shape,mask.shape,"attention stuff here")
        x, self.attn = attentionrel(query, key, value,relencq,relenck,relencv, mask=mask,
                                 dropout=self.dropout)
#         print(query.shape,key.shape,value.shape,mask.shape,"later attention stuff here")
        # 3) "Concat" using a view and apply a final linear.
        x = x.transpose(1, 2).contiguous() \
             .view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](x)

class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(F.relu(self.w_1(x))))

class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
#         print(vocab,"VOCAB SIZE")
#         self.lut = nn.Embedding(vocab, d_model)
        self.lut1 = nn.Linear(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
#         print(x.shape,"EMBEDDING X SHAPE")
#         print(x.dtype,"EMBEDDING TYPE")
#         y = self.lut(x) * math.sqrt(self.d_model)
        y = self.lut1(x)*math.sqrt(self.d_model)
#         print(y.shape,"EMBEDDING Y SHAPE")
        return y


class PositionalEncoding(nn.Module):
    "Implement the PE function."
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
#         print(pe[:,0::2].shape)
#         print(pe[:,1::2].shape)
#         print(pe.shape,"PE shape")
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
#         (print(pe.shape))
        self.register_buffer('pe', pe)
        # w = 100
        # self.dctstack = []
        # self.dctstack.append(0)
        # for w in range(1,101):
        #     self.dct_m, self.idct_m = util.get_dct_matrix(w) # (20, 20)
        #     self.dct_m = torch.from_numpy(self.dct_m).float().cuda()
        #     self.dctstack.append(self.dct_m)
        # self.dct_m, self.idct_m = util.get_dct_matrix(w) # (20, 20)
        # self.dct_m = torch.from_numpy(self.dct_m).float().cuda()
        # self.idct_m = torch.from_numpy(self.idct_m).float().cuda()

    def forward(self, x):
#         print(x.shape,"XSHAPE")
#         print(Variable(self.pe[:, :x.size(1)],
#                          requires_grad=False).repeat(1,1,108).shape,"pe shape")
        a,b,c = Variable(self.pe[:, :x.size(1)],
                         requires_grad=False).repeat(1,1,18*3).shape
#         print(x.shape,"x pos shape")
#         x = x + Variable(self.pe[:, :x.size(1)],
#                          requires_grad=False).repeat(1,1,108).reshape(a,b,108,-1)
        q,w,e = x.shape
        # self.dct_m, self.idct_m = util.get_dct_matrix(w) # (20, 20)
        # self.dct_m = torch.from_numpy(self.dct_m).float().cuda()
        # self.idct_m = torch.from_numpy(self.idct_m).float().cuda()
        # x = torch.matmul(self.dct_m,x)
        x = x + Variable(self.pe[:, :x.size(1)],
                         requires_grad=False)
        # self.dct_m, self.idct_m = util.get_dct_matrix(w) # (20, 20)
        # self.dct_m = torch.from_numpy(self.dct_m).float().cuda()
        # self.idct_m = torch.from_numpy(self.idct_m).float().cuda()
        # x = torch.matmul(self.dct_m,x)
        # print(w,"W VALUE")
        # x = torch.matmul(self.dctstack[w],x)
        # x = x
        # print(x.shape,"x pos hspae")
        return self.dropout(x)

# plt.figure(figsize=(15, 5))
# pe = PositionalEncoding(20, 0)
# y = pe.forward(Variable(torch.zeros(1, 100, 20)))
# print(y.shape)
# plt.plot(np.arange(100), y[0, :, 4:8].data.numpy())
# plt.legend(["dim %d"%p for p in [4,5,6,7]])
# None

def make_modeltrans(src_vocab, tgt_vocab, N=6,
               d_model=512, d_ff=2048, h=8, dropout=0.1):
    "Helper: Construct a model from hyperparameters."

    d_model=128
    h=4
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    attnRel = MultiHeadedAttentionRel(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    # model = EncoderDecoder2(
    #     Encoder2(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
    #     Decoder2(DecoderLayer(d_model, c(attn), c(attn),
    #                          c(ff), dropout), N),
    #     # nn.Sequential(Embeddings(d_model, src_vocab+7+36+15+15), c(position)),
    #     nn.Sequential(Embeddings(d_model, src_vocab+7+15), c(position)),
    #     nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
    #     Generator(d_model, tgt_vocab))

    # model = EncoderDecoder2(
    #     Encoder2(EncoderLayer(d_model, c(attnRel), c(ff), dropout), N),
    #     Decoder2(DecoderLayer(d_model, c(attn), c(attn),
    #                          c(ff), dropout), N),
    #     # nn.Sequential(Embeddings(d_model, src_vocab+7+36+15+15), c(position)),
    #     nn.Sequential(Embeddings(d_model, 58), c(position)),
    #     nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
    #     Generator(d_model, tgt_vocab))

    model = EncoderDecoder2(
        Encoder2(EncoderLayer(d_model, c(attnRel), c(ff), dropout), N),
        Decoder2(DecoderLayer(d_model, c(attn), c(attn),
                             c(ff), dropout), N),
        # nn.Sequential(Embeddings(d_model, src_vocab+7+36+15+15), c(position)),
        nn.Sequential(Embeddings(d_model, 768), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform(p)
    return model

In [20]:
V = 18*3*2
model_trans = make_modeltrans(V, V, N=6).cuda()
model_opt = torch.optim.Adam(model_trans.parameters(),lr=0.00003)
criterion = nn.MSELoss()
data_ = pred
a,b,c = data_.shape
src = torch.randn(a,b)
pad = 0
src_mask = (src != pad).unsqueeze(-2).cuda()
dtrg = src
dtrg_y = src[:,1:]

def make_std_mask(tgt, pad):
  "Create a mask to hide padding and future words."
  tgt_mask = (tgt != pad).unsqueeze(-2)
  tgt_mask = tgt_mask & Variable(
      subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data))
  return tgt_mask

trg_mask = make_std_mask(dtrg, pad).cuda()

  nn.init.xavier_uniform(p)


In [21]:
model3 = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-base-uncased')
batch, b = train_input_ids.shape
epochs = 10
batch_size = 15
criterion = nn.MSELoss()
for epoch in range(epochs):
  for part in range(int(batch/batch_size)-1):

    input_ids_in = train_input_ids[part*batch_size:(part+1)*batch_size]
    input_masks_in = train_input_masks[part*batch_size:(part+1)*batch_size]

    print(input_ids_in.shape,input_masks_in.shape)
    tf.executing_eagerly()

    input_y = y_tr[part*batch_size:(part+1)*batch_size]
    input_ids_in = torch.tensor(input_ids_in)
    input_masks_in = torch.tensor(input_masks_in)
    input_ids_in = torch.tensor(input_ids_in)
    input_masks_in = torch.tensor(input_masks_in)
    pred = model3(input_ids_in, input_masks_in)[0]
    data_ = pred
    a,b,c = data_.shape
    src = torch.randn(a,b)
    pad = 0
    src_mask = (src != pad).unsqueeze(-2).cuda()
    dtrg = src
    dtrg_y = src[:,1:]

    def make_std_mask(tgt, pad):
      "Create a mask to hide padding and future words."
      tgt_mask = (tgt != pad).unsqueeze(-2)
      tgt_mask = tgt_mask & Variable(
          subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data))
      return tgt_mask

    trg_mask = make_std_mask(dtrg, pad).cuda()
    i = 127
    A_traind = data_.cuda()
    Bpad = torch.ones(a,i+1,18*3).cuda()
    relenc = torch.ones(a,i+1,15*30).cuda()
    dtrg = src[:,:i+1]
    trg_mask = make_std_mask(dtrg, pad).cuda()
    out = model_trans.forward(A_traind,relenc,Bpad,src_mask,trg_mask)
    input_y = torch.tensor(input_y).cuda().float()
    loss = criterion(out,input_y)
    print("loss value",loss.item())

    model_opt.zero_grad()
    loss.backward()
    model_opt.step()
  if(epoch%3==0):
    torch.save(model_trans.state_dict(),"./svotransformer"+str(epoch))

Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_main


(15, 128) (15, 128)


  input_ids_in = torch.tensor(input_ids_in)
  input_masks_in = torch.tensor(input_masks_in)


torch.Size([15, 1]) Y SHAPE
loss value 47.86116409301758


  return F.mse_loss(input, target, reduction=self.reduction)


(15, 128) (15, 128)
torch.Size([15, 1]) Y SHAPE
loss value 46.9682731628418
(15, 128) (15, 128)
torch.Size([15, 1]) Y SHAPE
loss value 39.77360153198242
(15, 128) (15, 128)
torch.Size([15, 1]) Y SHAPE
loss value 38.516632080078125
(15, 128) (15, 128)
torch.Size([15, 1]) Y SHAPE
loss value 20.418434143066406
(15, 128) (15, 128)
torch.Size([15, 1]) Y SHAPE
loss value 24.377490997314453
(15, 128) (15, 128)
torch.Size([15, 1]) Y SHAPE
loss value 13.269116401672363


KeyboardInterrupt: ignored

### Train and evlauate

In [None]:
EPOCHS = 1
model.fit(x=[train_input_ids, train_input_masks],
          y=y_tr,
          epochs=EPOCHS,
          validation_split=0.1,
          batch_size=BATCH_SIZE)

print("TEST split", TEST_SIZE)
results = model.evaluate(x=[test_input_ids, test_input_masks], y=y_te)
print(results)
result = model.predict(x=[test_input_ids, test_input_masks])
result = np.array(result, dtype=np.float)
result = result.flatten()
print("pear", stats.pearsonr(result, y_te))
print("cosine", 1 - distance.cosine(result, y_te))

### To save model
Run
```
# model.save_weights(BASE_FOLDER + OUTPUT_FOLDER + OUTPUT_FILE)
```

### To load model
Run upto the cells up till `model_design` part and then do
```
model.load_weights(BASE_FOLDER+OUTPUT_FOLDER+OUTPUT_FILE)
```