In [1]:
import torch
import requests
from helper import bert_split
import multiprocessing as mp
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers import SentenceTransformer

In [2]:
# Name of the sentence-transformers checkpoint used for every embedding below.
MODEL_NAME = 'xlm-r-distilroberta-base-paraphrase-v1'

# Instantiate the encoder once; later cells reuse this module-level `model`.
model = SentenceTransformer(MODEL_NAME)

In [33]:
# Sites to compare, written without scheme; 'https://' is prepended on fetch.
websites = ['www.epfl.ch/en', 
            'www.epfl.ch/fr',
            'www.caltech.edu/', 
            'www.bbc.com', 
            'www.foxnews.com', 
            'www.apple.com/', 
            'www.samsung.com/us/', 
            'www.samsung.com/fr/']


def _fetch_html(url, timeout=10):
    """Download one page over HTTPS and return its body as text.

    Parameters
    ----------
    url : str
        Host and path without scheme (e.g. 'www.epfl.ch/en').
    timeout : float
        Seconds to wait for the server before giving up. Without a timeout
        `requests` can block forever on an unresponsive host.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failure, timeout, or a 4xx/5xx response.
    """
    response = requests.get('https://' + url, timeout=timeout)
    # Fail loudly on HTTP errors: embedding an error page instead of the real
    # site would silently corrupt the similarity results further down.
    response.raise_for_status()
    return response.text


# Raw page text for each entry of `websites`, in the same order.
websites_html = [_fetch_html(url) for url in websites]

In [34]:
def embed_xlmr(body):
    """Embed a document as the mean of the encodings of its pieces.

    `body` is split with `bert_split`, the resulting pieces are encoded with
    the module-level `model`, and the encodings are averaged along axis 0.

    NOTE(review): presumably `body` is the raw page text and `bert_split`
    returns a list of strings -- confirm against helper.py.
    """
    piece_vectors = model.encode(bert_split(body))
    return piece_vectors.mean(axis=0)

In [35]:
# One embedding per fetched page, kept in the same order as `websites_html`.
websites_emb = list(map(embed_xlmr, websites_html))

In [36]:
# Sanity check: smallest value in the embedding of the first site.
websites_emb[0].min()

-0.41392076

In [37]:
# Sanity check: largest value in the embedding of the first site.
websites_emb[0].max()

0.42447823

In [38]:
# Pairwise cosine similarity between all site embeddings; sim[i, j] compares
# site i with site j.
sim = cosine_similarity(websites_emb)

In [42]:
# For every site, print its two most similar *other* sites and their scores.
# Needs at least three sites, since two neighbours are reported per row.
for i in range(len(websites)):
    # Rank all sites from most to least similar to site i.
    ranked = sim[i].argsort()[::-1]
    # Drop site i itself by index rather than assuming it is always the single
    # top entry: with tied scores (identical embeddings) or float rounding the
    # site could otherwise be listed as its own neighbour.
    ix = [j for j in ranked if j != i][:2]
    print('closest to {:<20} : {:>20} ({:.4f}), {:>20} ({:.4f})'.format(websites[i][:22], 
                                                           websites[ix[0]][:22],
                                                           sim[ix[0], i],
                                                           websites[ix[1]][:22],
                                                           sim[ix[1], i]))

closest to www.epfl.ch/en       :       www.epfl.ch/fr (0.9743),     www.caltech.edu/ (0.9317)
closest to www.epfl.ch/fr       :       www.epfl.ch/en (0.9743),     www.caltech.edu/ (0.9181)
closest to www.caltech.edu/     :       www.epfl.ch/en (0.9317),       www.epfl.ch/fr (0.9181)
closest to www.bbc.com          :      www.foxnews.com (0.9325),  www.samsung.com/fr/ (0.8850)
closest to www.foxnews.com      :          www.bbc.com (0.9325),  www.samsung.com/fr/ (0.9005)
closest to www.apple.com/       :  www.samsung.com/us/ (0.8918),  www.samsung.com/fr/ (0.8833)
closest to www.samsung.com/us/  :  www.samsung.com/fr/ (0.9566),       www.apple.com/ (0.8918)
closest to www.samsung.com/fr/  :  www.samsung.com/us/ (0.9566),       www.epfl.ch/fr (0.9113)
