In [1]:
import math
import datetime

import torch
from torch import nn
from torch.utils.data import DataLoader
from datasets import load_dataset
import numpy as np
import spacy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from sentence_embedding_model import SentenceEmbedding
from train_sentence_embedding import (
    CFG,
    FIXED_SEQUENCE_LENGTH,
    vectorise_sequence,
    get_word_vector_matrix,
    dataset_map_fn,
    train
)


In [3]:
BATCH_SIZE = 10
EPOCHS = 1
CURRENT_EPOCH = 0
DATASET_SIZE = 3534
CHECKPOINT_NUM = 5

# BATCH_SIZE = 512
# EPOCHS = 200
# CURRENT_EPOCH = 48
# DATASET_SIZE = 1466076
# CHECKPOINT_NUM = 2

# BATCH_SIZE = 3
# EPOCHS = 1
# CURRENT_EPOCH = 0
# DATASET_SIZE = 8
# CHECKPOINT_NUM = 2

LEARNING_RATE = 0.001
DEVICE = 'cuda'
NUM_WORKERS = 8

print('training configurations: ', CFG)

nlp = spacy.load('en_core_web_lg')
nlp.vocab.vectors.resize((nlp.vocab.vectors.shape[0] + 1, nlp.vocab.vectors.shape[1]))
item_id = nlp.vocab.strings.add("vector_zero")
nlp.vocab.vectors.add(item_id, vector=np.zeros(300, dtype=np.float32))

dataset_map = dataset_map_fn(nlp, FIXED_SEQUENCE_LENGTH)
word_embedding = nn.Embedding.from_pretrained(get_word_vector_matrix(nlp, DEVICE))

data_files = [
    # 'sentence_embedding_training_data/samples.csv',
    # 'sentence_embedding_training_data/abcnews-date-text.csv',
    # 'sentence_embedding_training_data/cnbc_headlines.csv',
    'sentence_embedding_training_data/guardian_headlines.csv',
    # 'sentence_embedding_training_data/reuters_headlines.csv',
    # 'sentence_embedding_training_data/processed-imdb-movie-rating.csv',
]
ds = load_dataset(
    'csv',
    data_files=data_files,
    delimiter='\t', split='train'
)

# ds = ds.to_iterable_dataset(num_shards=NUM_WORKERS).map(
#     dataset_map, batched=True).shuffle(
#         seed=42, buffer_size=math.ceil(BATCH_SIZE * 2.1))
# ds = ds.with_format('torch')
# dataloader = DataLoader(ds, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS,
#                         persistent_workers=True, pin_memory=True,
#                         pin_memory_device=DEVICE)

encoder1 = SentenceEmbedding(CFG).to(DEVICE)
encoder2 = SentenceEmbedding(CFG).to(DEVICE)

if CURRENT_EPOCH > 0:
    checkpoint1 = f'tmp/checkpoints/v{CHECKPOINT_NUM}/epoch{CURRENT_EPOCH}_encoder1'
    checkpoint2 = f'tmp/checkpoints/v{CHECKPOINT_NUM}/epoch{CURRENT_EPOCH}_encoder2'
    print(f'Load checkpoint: {checkpoint1}')
    print(f'Load checkpoint: {checkpoint2}')
    encoder1.load_state_dict(torch.load(checkpoint1))
    encoder2.load_state_dict(torch.load(checkpoint2))

ds = ds.map(dataset_map, keep_in_memory=True, batched=True, num_proc=NUM_WORKERS)
ds = ds.with_format('torch', device=DEVICE)
dataloader = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=True)

loss_fn = nn.L1Loss()
optimizer1 = torch.optim.Adam(encoder1.parameters(), lr=LEARNING_RATE)
optimizer2 = torch.optim.Adam(encoder2.parameters(), lr=LEARNING_RATE)

for epoch in range(CURRENT_EPOCH + 1, EPOCHS + CURRENT_EPOCH + 1):
    print(f'\n\nEpoch {epoch}\n----------------------------------')

    train(dataloader, word_embedding, nlp, encoder1, encoder2, loss_fn,
          optimizer1, optimizer2, CFG, DATASET_SIZE, epoch, CHECKPOINT_NUM)

    torch.save(encoder1.state_dict(),
               f'tmp/checkpoints/v{CHECKPOINT_NUM}/epoch{epoch}_encoder1')
    torch.save(encoder2.state_dict(),
               f'tmp/checkpoints/v{CHECKPOINT_NUM}/epoch{epoch}_encoder2')

    torch.save(encoder1.state_dict(), f'tmp/encoder1_v{CHECKPOINT_NUM}')
    torch.save(encoder2.state_dict(), f'tmp/encoder2_v{CHECKPOINT_NUM}')


training configurations:  {'embed_size': 300, 'hidden_size1': 32, 'hidden_size2': 512, 'dropout1': 0.23, 'dropout2': 0.89, 'num_layers1': 2, 'num_layers2': 2, 'device': 'cuda', 'batch_size': 10, 'fixed_sequence_length': 40, 'token_noise_magnitue': 100, 'sequence_noise_ratio': 0.31}


Map (num_proc=8): 100%|██████████████████████████████████████████████████████████████████████████████| 3534/3534 [00:48<00:00, 73.51 examples/s]




Epoch 1
----------------------------------
data['sample1'][0][0:10]:  tensor([  22, 2337, 1917,  135,    3, 5715,   12], device='cuda:0')
noised_transposed_s1:  tensor([-7.2681, -0.8572,  5.8105,  1.9771,  8.8147, -5.8579,  3.7143],
       device='cuda:0')
noised_transposed_s1:  tensor([-0.3384, -0.9287, -2.3704,  0.8171,  0.8808, -0.7651, -0.3098],
       device='cuda:0')
noised_transposed_s1:  tensor([1.3152, 0.0778, 0.4965, 2.0356, 3.0808, 0.7814, 0.0461],
       device='cuda:0')
noised_transposed_s1:  tensor([-16.1231,  34.4578, -19.0979,  29.2738, 139.0638,  87.6554, 150.4476],
       device='cuda:0')
noised_transposed_s1:  tensor([-5.1043,  2.3496,  3.2472,  2.8424, 11.4590, -2.4137,  0.5106],
       device='cuda:0')
noised_transposed_s1:  tensor([-3.6701,  0.2606,  0.6184,  2.1362,  4.9983,  2.1625,  1.8479],
       device='cuda:0')
noised_transposed_s1:  tensor([ 1.4750,  6.0078,  1.1205, -3.5874,  3.7638,  3.1987, -2.2060],
       device='cuda:0')
noised_transposed_s1:  tens

In [17]:
encoder1.eval
encoder2.eval()

SentenceEmbedding(
  (compress1): GRU(300, 32, num_layers=2)
  (decode1): GRU(32, 32, num_layers=2, dropout=0.23)
  (decode2): GRU(32, 512, num_layers=2, dropout=0.89)
  (cos): CosineSimilarity()
)

In [9]:
encoder1.batch_size = 1
encoder1.dropout1 = 0.0
encoder1.dropout2 = 0.0

In [10]:
encoder2.batch_size = 1
encoder2.dropout1 = 0.0
encoder2.dropout2 = 0.0

In [13]:
tests = [
    (
        "Is Donald Trump 's Iran strategy working ?",
        "Why Facebook 's Libra currency gets the thumbs down",
    ),

]

cos = nn.CosineSimilarity(dim=1, eps=1e-6)
word_embedding = nn.Embedding.from_pretrained(get_word_vector_matrix(nlp, 'cuda'))


for text1, text2 in tests:
    v1 = vectorise_sequence(nlp, FIXED_SEQUENCE_LENGTH, [text1])
    t1 = torch.from_numpy(np.array(v1)).to('cuda')
    t1 = word_embedding(t1)
    t1 = t1.transpose(0, 1)

    v2 = vectorise_sequence(nlp, FIXED_SEQUENCE_LENGTH, [text2])
    t2 = torch.from_numpy(np.array(v2)).to('cuda')
    t2 = word_embedding(t2)
    t2 = t2.transpose(0, 1)

    en1_embedding1 = encoder1(t1)
    en1_embedding2 = encoder1(t2)

    
    #print(t1.shape)
    #print(transposed_t1.shape)
    #print(en1_embedding1.shape)

    #print('v1: ', v1[0][0:10])
    #for idx in range(t1.shape[1]):
    #    print('t1: ', t1[0, idx, 0:7])
    #print('en1_embedding1: ', en1_embedding1[0][0:10])
    
    #print('v2: ', v2[0][0:10])
    #for idx in range(t2.shape[1]):
    #    print('t2: ', t2[0, idx, 0:7])
    #print('en1_embedding2: ', en1_embedding2[0][0:10])
    
    score = cos(en1_embedding1, en1_embedding2)
    score = score[0].item()

    print(f'{text1} - {text2} - {score}')


Is Donald Trump 's Iran strategy working ? - Why Facebook 's Libra currency gets the thumbs down - 0.09094953536987305
