In [None]:
from google.colab import drive
drive.mount('/content/drive/')


import os
# Use the absolute path under the mount point so this cell is idempotent:
# a relative chdir only works from the initial working directory and raises
# FileNotFoundError when the cell is run a second time.
os.chdir("/content/drive/My Drive/Advanced NLP/Exam")

import pickle
import random
import time
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import torch
import torch.nn as nn
from torch import optim

plt.switch_backend('agg')
import numpy as np
from tqdm import tqdm

from models import EncoderGRU, AttnDecoderGRU, EncoderLSTM, DecoderLSTM, AttnDecoderLSTM
from utils import Lang, tensorsFromPair, timeSince, showPlot


Mounted at /content/drive/


In [None]:
# Reserved vocabulary indices for the start- and end-of-sequence markers.
SOS_token = 0
EOS_token = 1


class Format:
    """Bidirectional word <-> index vocabulary.

    Indices 0 and 1 are pre-assigned to the "SOS" and "EOS" markers; every
    new word receives the next free index in order of first appearance.
    """

    def __init__(self, name):
        """Create a vocabulary labelled `name` that holds only SOS and EOS."""
        self.name = name
        self.word2index = {}
        self.index2word = {SOS_token: "SOS", EOS_token: "EOS"}
        # Starts at 2 because SOS and EOS are already registered.
        self.n_words = 2

    def addSentence(self, sentence):
        """Register every token of `sentence`, splitting on single spaces."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Assign the next free index to `word` unless it is already known."""
        if word in self.word2index:
            return
        next_index = self.n_words
        self.word2index[word] = next_index
        self.index2word[next_index] = word
        self.n_words = next_index + 1
     
     
def indexesFromSentence(lang, sentence):
    """Map each space-separated token of `sentence` to its index in `lang`.

    Raises KeyError when a token is missing from `lang.word2index`.
    """
    token_ids = []
    for token in sentence.split(' '):
        token_ids.append(lang.word2index[token])
    return token_ids


def tensorFromSentence(lang, sentence):
    """Encode `sentence` as a column tensor of word indices ending in EOS.

    Reads the module-level `device`, which this notebook defines in a later
    cell, so that cell must have run before this function is called.
    Returns a `torch.long` tensor of shape (number of tokens + 1, 1).
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    column = torch.tensor(token_ids, dtype=torch.long, device=device)
    return column.view(-1, 1)


def tensorsFromPair(pair):
    """Convert an (input sentence, output sentence) pair into two tensors.

    Uses the module-level `input_lang` and `output_lang` vocabularies.
    Note: this definition rebinds the `tensorsFromPair` name that the first
    cell imports from `utils`.
    """
    return (
        tensorFromSentence(input_lang, pair[0]),
        tensorFromSentence(output_lang, pair[1]),
    )

def readFile(filename):
    """Read a SCAN split file and return empty vocabularies plus its pairs.

    Args:
        filename: path of the split relative to `../SCAN-master/`, without
            the `.txt` extension.

    Returns:
        (input_lang, output_lang, pairs): two fresh, still-empty `Format`
        vocabularies named "input" and "output", and a list with one entry
        per line, each the result of splitting that line on ' OUT: '.
    """
    print("Reading lines...")

    # Read the file and split into lines; the context manager guarantees the
    # handle is closed (the previous bare open() left it to the GC).
    with open('../SCAN-master/%s.txt' % filename, encoding='utf-8') as scan_file:
        lines = scan_file.read().strip().split('\n')

    # s[4:] drops the first four characters of every line — presumably the
    # "IN: " prefix of the SCAN format; verify against the data files.
    pairs = [s[4:].split(' OUT: ') for s in lines]

    input_lang = Format("input")
    output_lang = Format("output")

    return input_lang, output_lang, pairs

def prepareData(filename):
    """Load a split with `readFile` and fill both vocabularies from its pairs.

    Prints progress plus the final size of each vocabulary, then returns
    (input vocabulary, output vocabulary, pairs).
    """
    src_vocab, tgt_vocab, examples = readFile(filename)
    print("Read %s sentence pairs" % len(examples))

    print("Counting words...")
    for example in examples:
        src_vocab.addSentence(example[0])
        tgt_vocab.addSentence(example[1])

    print("Counted words:")
    for vocab in (src_vocab, tgt_vocab):
        print(vocab.name, vocab.n_words)

    return src_vocab, tgt_vocab, examples

# Alternative split (simple split, size variation p2), kept for reference:
#input_lang, output_lang, pairs = prepareData("simple_split/size_variations/tasks_train_simple_p2")
#input_lang_test, output_lang_test, pairs_test = prepareData("simple_split/size_variations/tasks_test_simple_p2")
# Active split: add-primitive "turn left". Each prepareData call builds its own
# pair of vocabularies, so the *_test vocabularies are derived from the test
# file alone. NOTE(review): word indices follow first-appearance order per
# file, so train and test indices may not agree — confirm before mixing them.
input_lang, output_lang, pairs = prepareData("add_prim_split/tasks_train_addprim_turn_left")
input_lang_test, output_lang_test, pairs_test = prepareData("add_prim_split/tasks_test_addprim_turn_left")
# Sanity check: show one random example from each split (unseeded, so the
# printed examples change between runs).
print(random.choice(pairs))
print(random.choice(pairs_test))



Reading lines...
Read 21890 sentence pairs
Counting words...
Counted words:
input 15
output 8
Reading lines...
Read 1208 sentence pairs
Counting words...
Counted words:
input 15
output 8
['look right twice after walk', 'I_WALK I_TURN_RIGHT I_LOOK I_TURN_RIGHT I_LOOK']
['jump around right thrice and turn left thrice', 'I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_RIGHT I_JUMP I_TURN_LEFT I_TURN_LEFT I_TURN_LEFT']


In [None]:
# Load saved models and results
input_size = input_lang.n_words
output_size = output_lang.n_words
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import pickle
cosineEncoder = EncoderLSTM(input_size, 200, num_layers=2, dropout=0.5).to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved on GPU still loads on a CPU-only runtime.
cosineEncoder.load_state_dict(torch.load("encoder_3b2_lstm_False.pt", map_location=device))
# Switch to inference mode: otherwise the dropout layers (p=0.5) stay active
# and every forward pass used for analysis is randomly perturbed.
# eval() returns the module, so the cell still displays the model summary.
cosineEncoder.eval()


EncoderLSTM(
  (embedding): Embedding(15, 200)
  (lstm): LSTM(200, 200, num_layers=2, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [None]:
# Cosine similarities
# focusing on the median-performance run of the overall-best model

# Input command of every training pair, in file order (duplicates kept).
all_train = [pair[0] for pair in pairs]
# The same commands with duplicates removed.
train_unique = set(all_train)
# Probe commands whose encodings are compared against the training commands.
keywords = ["run", "jump", "run twice", "jump twice"]

def _encodeFinalOutput(encoder, sentence):
    """Run `sentence` through `encoder` token by token and return the output
    of the last step, flattened to a 1-D vector."""
    sentence_tensor = tensorFromSentence(input_lang, sentence)
    hidden = encoder.initHidden()
    output = None
    for ei in range(sentence_tensor.size()[0]):
        output, hidden = encoder(sentence_tensor[ei], hidden)
    # Flatten so the similarity below is taken over the whole vector no matter
    # how many singleton dimensions the encoder output carries.
    return output.reshape(-1)


def cosineSimilariy(keywords, cosineEncoder):
    """Cosine similarity between each keyword and every training command.

    Each sentence is represented by the encoder output after its last token
    (the EOS appended by `tensorFromSentence`). Queries come from the
    module-level `all_train` list.

    Args:
        keywords: iterable of command strings to probe.
        cosineEncoder: encoder module exposing `initHidden()` and a forward
            call of the form `encoder(token_tensor, hidden) -> (output, hidden)`.

    Returns:
        dict mapping keyword -> {query: 0-dim tensor holding the similarity}.

    Side effects:
        Prints the six highest-scoring queries for each keyword.
    """
    # dim=0 on flattened vectors gives one cosine over all features. The
    # default dim=1 reduces over the wrong axis when the encoder output has a
    # singleton second dimension (e.g. (1, 1, hidden) — TODO confirm against
    # models.EncoderLSTM), yielding per-feature +/-1 values instead.
    cos = nn.CosineSimilarity(dim=0)
    hidden_representations = {}

    # Dropout must be disabled or the encodings are random: without eval()
    # even a sentence compared with itself does not score 1.0. The previous
    # mode is restored afterwards so the caller's module state is unchanged.
    was_training = cosineEncoder.training
    cosineEncoder.eval()
    try:
        with torch.no_grad():
            # Encode every distinct query once instead of once per keyword;
            # in eval mode the encoding does not depend on the keyword.
            query_vectors = {}
            for query in all_train:  # train_unique
                if query not in query_vectors:
                    query_vectors[query] = _encodeFinalOutput(cosineEncoder, query)

            for keyword in keywords:
                keyword_vector = _encodeFinalOutput(cosineEncoder, keyword)
                hidden_representations[keyword] = {
                    query: cos(keyword_vector, query_vector)
                    for query, query_vector in query_vectors.items()
                }
    finally:
        cosineEncoder.train(was_training)

    for keyword in keywords:
        # Six entries rather than five: the keyword itself may be one of the
        # training commands and would then occupy the first slot.
        top_matches = Counter(hidden_representations[keyword]).most_common()[:6]
        print(keyword)
        print()
        print(top_matches)
        print()

    return hidden_representations

In [None]:
# Bind the result instead of leaving it as the cell's last expression:
# otherwise the notebook renders the entire nested dict (one entry per
# training command for every keyword) below the printed top matches.
keyword_similarities = cosineSimilariy(keywords, cosineEncoder)

run

[('walk left after run opposite right', tensor(0.4900, device='cuda:0')), ('run opposite left thrice and run twice', tensor(0.4800, device='cuda:0')), ('run right twice after walk twice', tensor(0.4800, device='cuda:0')), ('walk right twice and run twice', tensor(0.4700, device='cuda:0')), ('run right twice and run right twice', tensor(0.4700, device='cuda:0')), ('run right twice and run right', tensor(0.4600, device='cuda:0'))]

jump

[('jump opposite left twice after jump', tensor(0.4400, device='cuda:0')), ('jump right twice and jump opposite left', tensor(0.4400, device='cuda:0')), ('walk and jump opposite left', tensor(0.4300, device='cuda:0')), ('run opposite left twice after jump opposite left', tensor(0.4100, device='cuda:0')), ('jump', tensor(0.4100, device='cuda:0')), ('jump twice and jump', tensor(0.4000, device='cuda:0'))]

run twice

[('jump twice and run right twice', tensor(0.4800, device='cuda:0')), ('run opposite right twice after jump left twice', tensor(0.4800, 

{'run': {'turn left': tensor(0.0700, device='cuda:0'),
  'walk opposite right thrice after walk opposite left twice': tensor(0.0800, device='cuda:0'),
  'turn opposite right after jump opposite left': tensor(0.1200, device='cuda:0'),
  'walk thrice and run opposite left twice': tensor(0.1900, device='cuda:0'),
  'look after turn opposite right twice': tensor(0.1000, device='cuda:0'),
  'walk twice and look opposite right': tensor(0.1600, device='cuda:0'),
  'look around right twice and walk': tensor(0.2200, device='cuda:0'),
  'turn opposite left twice and run around left thrice': tensor(0.1300, device='cuda:0'),
  'turn right twice after run opposite left twice': tensor(0.2700, device='cuda:0'),
  'walk left after walk thrice': tensor(0.0400, device='cuda:0'),
  'turn opposite right after run left thrice': tensor(0.1400, device='cuda:0'),
  'jump right twice after walk opposite left twice': tensor(0.2000, device='cuda:0'),
  'turn opposite left thrice and run left thrice': tensor(0.22