In [None]:
# Install/upgrade the fastText bindings and the OpenNMT tokenizer used below.
!pip install -U fasttext pyonmttok
# Pin PyTorch 1.5.0 / torchvision 0.6.0 CUDA 10.1 wheels from the PyTorch wheel index.
!pip install torch==1.5.0+cu101 torchvision==0.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html

In [2]:
# Fetch the en_tg_1101_0510 dataset: wget streams the download to stdout (-O -)
# and tar -xz unpacks the gzip-compressed stream.
# NOTE(review): tar -x normally writes archive members to disk rather than stdout,
# so the `> en_tg_1101_0510.jsonl` redirect presumably only pre-creates the file — confirm.
!wget -nc https://www.dropbox.com/s/c0lun02076n968t/en_tg_1101_0510.jsonl -O - | tar -xz > en_tg_1101_0510.jsonl
# Fetch en_vectors_v1.bin, which a later cell loads with fasttext.load_model.
!wget https://www.dropbox.com/s/e8ewd75cc3yagim/en_vectors_v1.bin

--2021-07-26 14:33:55--  https://www.dropbox.com/s/c0lun02076n968t/en_tg_1101_0510.jsonl
Resolving www.dropbox.com (www.dropbox.com)... 162.125.5.18, 2620:100:601d:18::a27d:512
Connecting to www.dropbox.com (www.dropbox.com)|162.125.5.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/c0lun02076n968t/en_tg_1101_0510.jsonl [following]
--2021-07-26 14:33:55--  https://www.dropbox.com/s/raw/c0lun02076n968t/en_tg_1101_0510.jsonl
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc4281e4b663ad017eadc54f60e8.dl.dropboxusercontent.com/cd/0/inline/BTCVRZdFCQ2uz4FmgKNv5cvF6DEMIbtS4wRJ4dgk8FLlLtaq99otbdy1G9BovXUIaDjBuAdcnrsHGcl621okt4em4JElecUZzrVu2oVMe9bCkMhlwRfyzOOb3BhP0LCc_ByQb3IW2qlO80oLzIL7rb3i/file# [following]
--2021-07-26 14:33:55--  https://uc4281e4b663ad017eadc54f60e8.dl.dropboxusercontent.com/cd/0/inline/BTCVRZdFCQ2uz4FmgKNv5cvF6DEMIbtS4wRJ4dgk8FLlLtaq99otbdy1G9BovXU

In [None]:
# Remove any previous copy of the archive so wget does not save under a numbered name.
!rm -f all-the-news.zip
# Download and unpack the all-the-news archive; a later cell reads articles1.csv,
# articles2.csv and articles3.csv (presumably extracted from it — confirm).
!wget https://www.dropbox.com/s/y5kblk5a5x35odg/all-the-news.zip
!unzip all-the-news.zip

In [4]:
import fasttext

# Load the downloaded fastText binary; ft_model is later passed to get_samples()
# and used there through get_word_vector().
ft_model = fasttext.load_model('en_vectors_v1.bin')



In [5]:
import json

# Load the JSON Lines file: one JSON document per line.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale, and blank lines (e.g. a trailing newline at the
# end of the file) are skipped instead of crashing json.loads.
with open("en_tg_1101_0510.jsonl", "r", encoding="utf-8") as r:
    tg_data = [json.loads(line) for line in r if line.strip()]
# Order records by their 'timestamp' field; get_samples() relies on input order
# when it pairs a document with the previous one from the same site.
tg_data.sort(key=lambda x: x['timestamp'])

In [6]:
import csv
import sys
import re
# Raise the csv module's maximum field size to sys.maxsize so that very long
# fields are not rejected (presumably needed for the full article texts — confirm).
csv.field_size_limit(sys.maxsize)

def get_date(st):
    """Return the first ``dddd-dd-dd`` digit pattern found in ``st``.

    Args:
        st: string to scan.

    Returns:
        The first matching substring, or ``None`` when ``st`` contains no
        such pattern.
    """
    match = re.search(r"\d\d\d\d\-\d\d\-\d\d", st)
    if match is None:
        return None
    return match.group(0)

# Read the three all-the-news CSV files into a list of dicts with the keys
# "title", "text", "site_name" and "date", keeping only rows whose date column
# contains a dddd-dd-dd pattern.
all_the_news_files = ("articles1.csv", "articles2.csv", "articles3.csv")
atn_data = []
for file_name in all_the_news_files:
    # newline="" is what the csv module requires so that it handles line
    # endings itself; otherwise newlines embedded in quoted fields can be
    # misinterpreted. The encoding is explicit so parsing does not depend on
    # the platform's default locale.
    with open(file_name, "r", newline="", encoding="utf-8") as r:
        reader = csv.reader(r, delimiter=',')
        # Skip the header through the reader (not the raw file) so a quoted
        # header is consumed as one record; an empty file is tolerated.
        next(reader, None)
        for row in reader:
            # Each row must have exactly ten columns; only four are used.
            _, _, title, host, _, date, _, _, _, text = row
            # get_date() returns None for 'nan' as well, so a single check
            # covers both missing and malformed dates.
            if get_date(date) is None:
                continue
            atn_data.append({"title": title, "text": text, "site_name": host, "date": date})
# Sort by the raw date string; get_samples() relies on input order when it
# pairs a document with the previous one from the same site.
atn_data.sort(key=lambda x: x["date"])
print(atn_data[0])
print(len(atn_data))

{'title': 'How Nirvana’s ’Smells Like Teen Spirit’ Became An Anthem', 'text': 'In the early 1990s, Seattle stood at the center of a new rock ’n’ roll genre called grunge. The music was loud, pared down, and largely unrestrained. Hundreds of garage bands formed in Seattle over a short period of time. One of them, Nirvana, achieved mammoth success with its first   single, ”Smells Like Teen Spirit.” That song, the band and lead singer Kurt Cobain would come to represent the genre.  By the time Nirvana started playing in the small clubs around Pioneer Square in Seattle, pop music was witnessing the decline of the highly produced synthesized sound that had dominated it for years. At that time, rock subgenres were pretty  . Heavy metal was loud, and the   alternative bands from England cranked out such earnest tunes that critics called the style New Romantic music. American punk, while still vital, wasn’t commercially viable. In 1991, all that was about to change. Nirvana performed a version

In [7]:
def words_to_embed(model, words):
    """Build a fixed-size embedding for a sequence of words.

    Every word is looked up with ``model.get_word_vector`` and scaled to unit
    L2 length. The result is the concatenation of the element-wise mean,
    maximum and minimum of those unit vectors, i.e. three times as wide as a
    single word vector.

    Args:
        model: object exposing ``get_word_vector(word)``; the notebook passes
            the loaded fastText model.
        words: iterable of word strings; must yield at least one word.

    Returns:
        1-D numpy array ``concat(mean, max, min)``.

    Raises:
        ValueError: if ``words`` is empty.
    """
    vectors = [np.asarray(model.get_word_vector(w)) for w in words]
    if not vectors:
        # np.max/np.min over an empty list would fail with an opaque
        # "zero-size array" error; fail early with a clear message instead.
        raise ValueError("words_to_embed() needs at least one word")

    norm_vectors = []
    for vec in vectors:
        length = np.linalg.norm(vec)
        # An all-zero vector has no direction: keep it as zeros instead of
        # dividing by zero, which would put NaNs into the mean/max/min.
        norm_vectors.append(vec / length if length > 0 else vec)

    avg_wv = np.mean(norm_vectors, axis=0)
    max_wv = np.max(norm_vectors, axis=0)
    min_wv = np.min(norm_vectors, axis=0)
    return np.concatenate((avg_wv, max_wv, min_wv))

In [8]:
import pyonmttok
# Module-level tokenizer shared by preprocess(): "conservative" mode, no joiner annotations.
tokenizer = pyonmttok.Tokenizer("conservative", joiner_annotate=False)

def preprocess(text):
    """Normalize ``text`` and return its tokens joined by single spaces.

    The input is converted to ``str``, stripped, has newlines and
    non-breaking spaces replaced by plain spaces, is lower-cased, and is then
    split with the module-level ``tokenizer``.
    """
    cleaned = str(text).strip()
    for whitespace_char in ("\n", "\xa0"):
        cleaned = cleaned.replace(whitespace_char, " ")
    tokens, _ = tokenizer.tokenize(cleaned.lower())
    return " ".join(tokens)

In [None]:
import numpy as np

def get_samples(data, ft_model, count, min_words=4, max_words=300):
    """Build (pivot, positive, negative) embedding triplets from documents.

    Each kept document is preprocessed, truncated to ``max_words`` words and
    split into two halves, each embedded with ``words_to_embed``. The first
    half is the pivot, the second half of the same document is the positive,
    and the second half of the previously kept document with the same
    ``site_name`` is the negative. The first kept document of every site only
    seeds that site's "previous" slot and produces no triplet.

    Args:
        data: sequence of dicts with "site_name", "title" and "text" keys;
            only ``data[:count]`` is processed, in the given order.
        ft_model: word-vector model forwarded to ``words_to_embed``.
        count: maximum number of leading rows of ``data`` to consider.
        min_words: documents with fewer words are skipped.
        max_words: documents are truncated to this many words before the split.

    Returns:
        List of ``(left_vector, right_vector, negative_right_vector)`` tuples.
    """
    last_host_end = {}
    samples = []
    # A distinct loop name: the original reused ``count`` for the index,
    # shadowing the parameter.
    for index, row in enumerate(data[:count]):
        if index % 10000 == 0:
            # Lightweight progress indicator.
            print(index)

        host = row["site_name"]
        text = preprocess(row["title"] + " " + row["text"])
        words = text.split(" ")
        if len(words) < min_words:
            continue
        words = words[:max_words]

        border = len(words) // 2
        if border == 0:
            # Fewer than two words left: the first half would be empty and
            # cannot be embedded.
            continue
        begin_words = words[:border]
        end_words = words[border:]

        left_vector = words_to_embed(ft_model, begin_words)
        right_vector = words_to_embed(ft_model, end_words)

        if host in last_host_end:
            samples.append((left_vector, right_vector, last_host_end[host]))
        # Remember this document's second half as the next negative for the host.
        last_host_end[host] = right_vector
    return samples

# Build triplets from at most the first 250000 rows of tg_data and the first
# 135000 rows of atn_data (both lists were sorted in earlier cells).
tg_samples = get_samples(tg_data, ft_model, 250000)
atn_samples = get_samples(atn_data, ft_model, 135000)

0
10000
20000
30000
40000
50000


In [None]:
# Hold out the last tenth of each sample list for testing.
tg_test_size = len(tg_samples) // 10
atn_test_size = len(atn_samples) // 10
# Slice with explicit non-negative split points: with a test size of 0 the
# negative-index form breaks, because lst[:-0] is empty and lst[-0:] is the
# whole list (everything would land in test, nothing in train).
tg_split = len(tg_samples) - tg_test_size
atn_split = len(atn_samples) - atn_test_size
train_samples = tg_samples[:tg_split] + atn_samples[:atn_split]
tg_test_samples = tg_samples[tg_split:]
# Concatenation creates a new list, so tg_test_samples stays independent of
# test_samples.
test_samples = tg_test_samples + atn_samples[atn_split:]

In [None]:
from sklearn import metrics
from scipy import spatial

# Baseline without any trained model: score every (left, right) pair by the
# negated cosine distance of the raw embeddings and report ROC AUC. The
# positive pair (label 1) is emitted before the negative pair (label 0).
scores = []
test_y = []
for left_vector, pos_right_vector, neg_right_vector in test_samples:
    labelled_pairs = ((1, pos_right_vector), (0, neg_right_vector))
    for label, right_vector in labelled_pairs:
        test_y.append(label)
        scores.append(-spatial.distance.cosine(left_vector, right_vector))
metrics.roc_auc_score(test_y, scores)

# Model

In [None]:
import torch
import torch.nn as nn

class SiamiseModelTripletLoss(nn.Module):
    """Single linear projection trained with a triplet margin loss.

    Inputs are mapped by one ``nn.Linear`` layer and scaled to unit L2 length.
    ``forward`` returns the mean hinge loss
    ``max(d(pivot, positive) - d(pivot, negative) + margin, 0)``, where ``d``
    is the Euclidean pairwise distance between projections.

    Args:
        embedding_dim: width of the input vectors.
        hidden_dim: width of the projected vectors.
        margin: triplet margin; defaults to the previously hard-coded 0.3.
    """

    def __init__(self, embedding_dim=384, hidden_dim=50, margin=0.3):
        super().__init__()

        self.mapping_layer = nn.Linear(embedding_dim, hidden_dim)
        self.distance = nn.PairwiseDistance(p=2)
        self.margin = margin

    def build_projections(self, in_vectors):
        """Project a ``(batch, embedding_dim)`` tensor and L2-normalize each row."""
        projections = self.mapping_layer(in_vectors)
        # F.normalize divides by max(norm, eps), so an all-zero projection
        # yields zeros instead of the NaNs a plain division would give.
        return nn.functional.normalize(projections, p=2, dim=1)

    def forward(self, pivot_vectors, positive_vectors, negative_vectors):
        """Return the scalar mean triplet loss for a batch of triplets."""
        pivot = self.build_projections(pivot_vectors)
        positive = self.build_projections(positive_vectors)
        negative = self.build_projections(negative_vectors)
        distances = self.distance(pivot, positive) - self.distance(pivot, negative) + self.margin
        # Hinge: triplets that already satisfy the margin contribute zero.
        loss = torch.mean(torch.max(distances, torch.zeros_like(distances)))
        return loss

    def apply(self, vectors):
        """Return normalized projections for ``vectors`` (inference helper).

        NOTE: this shadows ``nn.Module.apply(fn)``. The name is kept because
        the notebook calls ``model.apply(tensor)``; do not call
        ``model.apply(init_fn)`` on this class.
        """
        return self.build_projections(vectors)

# Training

In [None]:
import time
import random
import torch.optim as optim

def get_next_gen_batch(samples, batch_size=64, device="cuda"):
    """Yield shuffled mini-batches of triplets as float32 tensors.

    Args:
        samples: sequence of ``(pivot, positive, negative)`` vectors.
        batch_size: maximum number of triplets per batch; the last batch may
            be smaller.
        device: device the tensors are created on. Defaults to ``"cuda"``,
            which matches the previous hard-coded behaviour.

    Yields:
        ``(pivot, positive, negative)`` tensors of shape ``(batch, dim)``.
    """
    indices = np.arange(len(samples))
    np.random.shuffle(indices)
    for batch_begin in range(0, len(samples), batch_size):
        batch_indices = indices[batch_begin: batch_begin + batch_size]
        columns = zip(*(samples[data_ind] for data_ind in batch_indices))
        # Stack into one contiguous array before converting: building a tensor
        # from a Python list of numpy arrays is much slower.
        yield tuple(
            torch.as_tensor(np.stack(column), dtype=torch.float32, device=device)
            for column in columns
        )


def _average_val_loss(model, val_samples, device):
    """Return the mean batch loss of ``model`` over ``val_samples``.

    Runs in eval mode without autograd; returns NaN when there are no batches.
    """
    model.eval()
    loss_sum = 0.0
    batch_count = 0
    with torch.no_grad():
        for pivot, positive, negative in get_next_gen_batch(val_samples, device=device):
            loss_sum += model(pivot, positive, negative).item()
            batch_count += 1
    return loss_sum / batch_count if batch_count else float("nan")


def train_model(model, train_samples, val_samples, epochs_count=10, 
                loss_every_nsteps=10000, lr=0.01, device_name="cuda"):
    """Train ``model`` in place with Adam, printing train/validation loss.

    ``model(pivot, positive, negative)`` must return a scalar loss tensor.

    Args:
        model: the module to train; it is moved to ``device_name``.
        train_samples: triplets used for optimization.
        val_samples: triplets used for the periodic validation loss.
        epochs_count: number of passes over ``train_samples``.
        loss_every_nsteps: report whenever the in-epoch step index is a
            multiple of this value (which includes step 0 of every epoch).
        lr: Adam learning rate.
        device_name: device for the model and all batches.
    """
    device = torch.device(device_name)
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    total_loss = 0.0
    steps_since_report = 0
    start_time = time.time()
    for epoch in range(epochs_count):
        model.train()
        for step, (pivot, positive, negative) in enumerate(get_next_gen_batch(train_samples, device=device)):
            optimizer.zero_grad()
            loss = model(pivot, positive, negative)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            steps_since_report += 1
            if step % loss_every_nsteps == 0:
                avg_val_loss = _average_val_loss(model, val_samples, device)
                # Average over the steps actually accumulated since the last
                # report, not over the nominal reporting interval.
                print("Epoch = {}, Avg Train Loss = {:.4f}, Avg val loss = {:.4f}, Time = {:.2f}s".format(epoch, total_loss / steps_since_report, avg_val_loss, time.time() - start_time))
                total_loss = 0.0
                steps_since_report = 0
                start_time = time.time()
                # Validation switched the model to eval mode; switch back
                # before the next optimization step.
                model.train()

# Shuffle both sample lists in place (no random seed is set in this cell, so
# the order differs between runs).
random.shuffle(train_samples)
random.shuffle(test_samples)
# Train a fresh model with default hyper-parameters; test_samples is also
# passed as the validation set.
model = SiamiseModelTripletLoss()
train_model(model, train_samples, test_samples)

# Testing

In [None]:
# Expand every test triplet into two labelled pairs:
# (left, positive) -> 1 and (left, negative) -> 0.
test_left = []
test_right = []
test_y = []
for left, pos_right, neg_right in test_samples:
    test_left += [left, left]
    test_right += [pos_right, neg_right]
    test_y += [1, 0]

# Score on whatever device the model currently lives on instead of assuming CUDA.
device = next(model.parameters()).device
batch_size = 32
scores = []
model.eval()
with torch.no_grad():
    for batch_start in range(0, len(test_left), batch_size):
        batch_end = batch_start + batch_size
        left_batch = torch.as_tensor(np.stack(test_left[batch_start:batch_end]), dtype=torch.float32, device=device)
        right_batch = torch.as_tensor(np.stack(test_right[batch_start:batch_end]), dtype=torch.float32, device=device)
        left = model.apply(left_batch).cpu().numpy()
        right = model.apply(right_batch).cpu().numpy()
        # Normalize per row. The previous code divided each batch by its
        # whole-matrix norm, which scaled scores by the batch size and made
        # the (shorter) last batch incomparable with the others.
        left = left / np.linalg.norm(left, axis=1, keepdims=True)
        right = right / np.linalg.norm(right, axis=1, keepdims=True)
        # Row-wise dot product = cosine similarity of each pair; this avoids
        # building a full batch x batch matrix only to read its diagonal.
        cosine = np.sum(left * right, axis=1)
        # Same affine mapping as before; it does not affect ROC AUC.
        score = (cosine + 1.0) / 2.0 - 1.0
        scores.extend(score.tolist())
metrics.roc_auc_score(test_y, scores)

# Saving

In [None]:
# Move the trained model to the CPU before its weights are copied and saved below.
model = model.cpu()

In [None]:
import torch
import torch.nn as nn

class Embedder(nn.Module):
    """Inference-only module: linear projection followed by L2 row normalization.

    It has a single ``mapping_layer`` (``nn.Linear``) mapping
    ``embedding_dim`` inputs to ``hidden_dim`` outputs.
    """

    def __init__(self, embedding_dim=384, hidden_dim=50):
        super(Embedder, self).__init__()
        self.mapping_layer = nn.Linear(embedding_dim, hidden_dim)

    def forward(self, in_vectors):
        """Project ``in_vectors`` and divide every row by its L2 norm."""
        mapped = self.mapping_layer(in_vectors)
        row_lengths = mapped.norm(p=2, dim=1, keepdim=True)
        return mapped / row_lengths

# Take the layer sizes from the trained model instead of hard-coding 384/50,
# so the exported embedder always matches what was trained.
trained_layer = model.mapping_layer
# One-row example input for tracing, built from the first test pivot vector.
examples = torch.tensor(test_samples[0][0], dtype=torch.float32).reshape(1, -1)
embedder = Embedder(embedding_dim=trained_layer.in_features, hidden_dim=trained_layer.out_features)
# Copy the weight and bias values rather than aliasing `.data` / sharing the
# bias Parameter object with the training model.
embedder.mapping_layer.load_state_dict(trained_layer.state_dict())
traced_embedder = torch.jit.trace(embedder.cpu().eval(), examples)

In [None]:
# Save the parameters of the training model (its state_dict) to en_full_model.pt.
torch.save(model.state_dict(), "en_full_model.pt")

In [None]:
# Save the traced (TorchScript) embedder to en_sentence_embedder_v1.pt.
traced_embedder.save("en_sentence_embedder_v1.pt")