In [1]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('../input/sentlen/sentence_length.txt', sep = "\t", header = None)
df

Unnamed: 0,0,1,2
0,tr,0,But it was not here .
1,tr,0,The one no one cared about .
2,tr,0,The Bible would be staying here .
3,tr,0,"It 's a long hike . """
4,tr,0,A very bad idea .
...,...,...,...
104563,te,5,The Shadow Stone cannot bring people back fro...
104564,te,5,His shadow that until this very moment had alw...
104565,te,5,"They each seemed frozen on the spot , their we..."
104566,te,5,"The Red Army and the Sack Swords , both formid..."


In [3]:
#shuffling the dataset
df = df.sample(104568)
df

Unnamed: 0,0,1,2
5301,tr,0,Hang it !
59035,tr,4,Taylor knew that they had almost reached the f...
4621,tr,0,Tell us another one .\ntr\t0\tHow were you sa...
25546,tr,1,But the prospect now confronting his congealed...
98182,te,1,Then the wagon came out with a loud pop .
...,...,...,...
33747,tr,2,Lord Brogan quelled a sudden rush of anger at ...
51748,tr,3,"' They 're trying to kill you and Lucas , ' he..."
32677,tr,2,She could hear his laugh following her all the...
45778,tr,3,Dusk turned into night we kept a lookout expec...


In [4]:
#using a subset of the entire data, using the entire data resulted in excess memory allocation
#splitting the data into train,validation and test set
task_train = df[2][:60000].to_numpy()
task_val = df[2][60000:61000].to_numpy()
task_test = df[2][61000:66000].to_numpy()

task_train_labels = df[1][:60000].to_numpy()
task_val_labels = df[1][60000:61000].to_numpy()
task_test_labels = df[1][61000:66000].to_numpy()

In [5]:
task_train[0],task_train_labels[0]

(' Hang it ! ', 0)

In [6]:
task_train = task_train.reshape(-1,1)
task_val = task_val.reshape(-1,1)
task_test = task_test.reshape(-1,1)


task_train_labels = task_train_labels.reshape(-1,1)
task_val_labels = task_val_labels.reshape(-1,1)
task_test_labels = task_test_labels.reshape(-1,1)

In [7]:
x_concat = np.vstack((task_train,task_val))

In [8]:
#Transforming a batch of strings (one sample = one string) into a list of token indices using TextVectorization layer.
#Applying text vectoriztion to the entire text dataset, then feeding it to a model that expects integer sequences as inputs.

from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

vectorizer = TextVectorization(max_tokens=30000, output_sequence_length=20)

#calling the vectorization layer's adapt() method on the dataset. When this layer is adapted, it analyzes the dataset, determine the frequency of individual string values, and creates a 'vocabulary' from them.
vectorizer.adapt(x_concat)


task_train = task_train.reshape(-1)
task_val = task_val.reshape(-1)
task_test = task_test.reshape(-1)


#vectorizing the dataset
task_train_1 = vectorizer(np.array([[s] for s in task_train])).numpy()
task_val_1 = vectorizer(np.array([[s] for s in task_val])).numpy()
task_test_1 = vectorizer(np.array([[s] for s in task_test])).numpy()




In [9]:
#creating a dict mapping words to their indices:
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [10]:
#loading pre-trained word embeddings
path_to_glove_file = '../input/sentlen/glove.6B.100d.txt'

embeddings_index = {}
with open(path_to_glove_file,encoding="utf-8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [11]:
#Preparing a corresponding embedding matrix that we can use in a Keras Embedding layer.
#The embedding matrix is a NumPy matrix where entry at index i is the pre-trained vector for the word of index i in our vectorizer's vocabulary.num_tokens = len(voc) + 2
embedding_dim = 100
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 26227 words (534 misses)


In [12]:
#Loading the pre-trained word embeddings matrix into an Embedding layer.
from tensorflow.keras.layers import Embedding

embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False, name = "embed_layer"
)

In [13]:
#using Keras callbacks function to save the model with the best validation set accuracy
from keras.callbacks import ModelCheckpoint
filepath="model.{epoch:03d}-{val_accuracy:.3f}.h5"
checkpoint = ModelCheckpoint(filepath, monitor = 'val_accuracy', save_best_only = True, mode = 'max')

In [14]:
#loading the model used for the NLI task and building another model over it to test whether the length of the sentence is encoded in the representation output by the NLI model
def get_model():
        model = tf.keras.models.load_model('../input/sentlen/NLI_model.030-0.793.h5')
        layer_name = "encoder_layer"
        probe_intermediate_layer = tf.keras.Model(inputs=model.get_layer('input_layer').input, outputs=model.get_layer(layer_name).output)
        probe_intermediate_layer.trainable = False
        probe_input = tf.keras.layers.Input(shape=(None,), dtype="int64")

        p = probe_intermediate_layer(probe_input)
        p = tf.keras.layers.Dense(300, activation = 'relu')(p)
        p = tf.keras.layers.Dropout(0.3)(p)
        p = tf.keras.layers.Dense(300, activation = 'relu')(p)
        p = tf.keras.layers.Dropout(0.3)(p)
        outputs = tf.keras.layers.Dense(6, activation = 'softmax')(p)

        probe_model = tf.keras.Model(inputs = probe_input, outputs = outputs)
        return probe_model

In [15]:
# detect and init the TPU
tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()

# instantiate a distribution strategy
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

# instantiating the model in the strategy scope creates the model on the TPU
with tpu_strategy.scope():
        probe_model = get_model()
        probe_model.compile(optimizer='rmsprop',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])   



In [16]:
probe_model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
model (Functional)           (None, 300)               2376550   
_________________________________________________________________
dense (Dense)                (None, 300)               90300     
_________________________________________________________________
dropout (Dropout)            (None, 300)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 300)               90300     
_________________________________________________________________
dropout_1 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 1806

In [17]:
#summary of the pre-trained model (this model was used for the primary task)
model = tf.keras.models.load_model('../input/sentlen/NLI_model.030-0.793.h5')
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_layer (InputLayer)        [(None, None)]       0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embed_layer (Embedding)         (None, None, 100)    2000200     input_layer[0][0]                
                                                                 input_1[0][0]                    
__________________________________________________________________________________________________
dense_layer (Dense)             (None, None, 150)    15150       embed_layer[0][0]            

In [18]:
from keras.callbacks import ModelCheckpoint
filepath="probe_model.{epoch:03d}-{val_accuracy:.3f}.h5"
checkpoint = ModelCheckpoint(filepath, monitor = 'val_accuracy', save_best_only = True, mode = 'max')

In [19]:
probe_model.fit(task_train_1,task_train_labels,validation_data = (task_val_1,task_val_labels), epochs = 20, batch_size = 1024, callbacks = [checkpoint])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fed4615bad0>

In [20]:
task_train_1[2],task_train_labels[0]

(array([ 164,   84,  161,   47,   18,   73,   95,   41,   13, 1391,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0]),
 array([0]))

In [21]:
probe_model.evaluate(task_test_1,task_test_labels)



[0.7221677899360657, 0.6743999719619751]

In [22]:
from keras import backend as K

#saving the word representations output by the model (which was trained on the primary task) and using it to test the probe's performance

# with a Sequential model
get_3rd_layer_output = K.function([probe_model.layers[0].input],
                                  [probe_model.layers[3].output])
x_test_representation = get_3rd_layer_output(task_test_1)[0]
x_test_representation

array([[0.        , 0.        , 0.01833662, ..., 0.        , 0.13808796,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.3209723 , 0.23934594,
        0.10782446],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.11698917, 0.        , ..., 0.18727249, 0.2615693 ,
        0.        ]], dtype=float32)

In [23]:
x_test_representation = x_test_representation.reshape(-1)

In [24]:
x_test_representation.shape

(1500000,)

In [25]:
#saving the output of the model
df_test_1 = pd.DataFrame(x_test_representation, columns = ['test_sentences'])

df_test_1.to_csv('x_test.csv', index = False)
