In [1]:
from types import SimpleNamespace

import numpy as np
import pandas as pd
import torch
from razdel import sentenize
from transformers import BertTokenizer

from models.model_builder import AbsSummarizer

In [2]:
class BertData:
    """Converts a sentence-split, word-split document into BERT inputs.

    Holds a WordPiece tokenizer plus the special tokens and their vocabulary
    ids, and exposes ``preprocess`` to build token ids and alternating
    segment ids for a document.
    """

    def __init__(self, bert_model, lower, max_src_tokens, max_tgt_tokens):
        """Load the tokenizer and cache special tokens and their ids.

        Args:
            bert_model: name or path forwarded to ``BertTokenizer.from_pretrained``.
            lower: forwarded as ``do_lower_case``.
            max_src_tokens: maximum number of source word pieces kept by
                ``preprocess`` (the framing [CLS]/[SEP] are added on top).
            max_tgt_tokens: stored on the instance; not used by ``preprocess``.
        """
        self.max_src_tokens = max_src_tokens
        self.max_tgt_tokens = max_tgt_tokens
        self.tokenizer = BertTokenizer.from_pretrained(
            bert_model,
            do_lower_case=lower,
            do_basic_tokenize=False,
        )

        # Special tokens used to frame and separate sentences.
        self.sep_token, self.cls_token, self.pad_token = '[SEP]', '[CLS]', '[PAD]'

        # Target-side markers; stored only, nothing in this class reads them.
        self.tgt_bos = '[unused1] '
        self.tgt_eos = ' [unused2]'
        self.tgt_sent_split = ' [unused3] '

        # Vocabulary ids of the special tokens.
        self.sep_vid = self.tokenizer.vocab[self.sep_token]
        self.cls_vid = self.tokenizer.vocab[self.cls_token]
        self.pad_vid = self.tokenizer.vocab[self.pad_token]

    def preprocess(self, src, tgt):
        """Build token ids and segment ids for one document.

        Args:
            src: iterable of sentences, each an iterable of word strings.
            tgt: accepted for interface compatibility; not used.

        Returns:
            A ``(src_indices, segments_ids)`` pair. ``src_indices`` are the
            ids of ``[CLS] s1 [SEP] [CLS] s2 [SEP] ...`` after truncating the
            tokenized text to ``max_src_tokens`` pieces. ``segments_ids``
            holds one 0/1 flag per position up to and including the last
            [SEP], flipping after every [SEP].
        """
        sentence_joiner = ' {} {} '.format(self.sep_token, self.cls_token)
        document = sentence_joiner.join(' '.join(words) for words in src)

        # Truncate first, then frame with [CLS] ... [SEP] so the sequence
        # always ends on a separator.
        pieces = [self.cls_token]
        pieces.extend(self.tokenizer.tokenize(document)[:self.max_src_tokens])
        pieces.append(self.sep_token)
        src_indices = self.tokenizer.convert_tokens_to_ids(pieces)

        # Walk the ids once: every [SEP] closes a segment whose length is the
        # distance to the previous [SEP]; segments alternate between 0 and 1.
        segments_ids = []
        previous_sep = -1
        segment_no = 0
        for position, token_id in enumerate(src_indices):
            if token_id != self.sep_vid:
                continue
            segments_ids.extend([segment_no % 2] * (position - previous_sep))
            previous_sep = position
            segment_no += 1

        return src_indices, segments_ids

In [3]:
def doc2bert(text):
    """Convert raw text into BERT token ids and segment ids.

    The text is split into sentences with ``sentenize``; each sentence is
    lower-cased and split on whitespace before being handed to the
    module-level ``bert_data.preprocess``.

    Returns:
        dict with keys ``"src"`` (token ids) and ``"segs"`` (segment ids).
    """
    tokenized_sentences = []
    for sentence in sentenize(text):
        tokenized_sentences.append(sentence.text.lower().split())

    # The second argument (target text) is not used by preprocess.
    token_ids, segment_ids = bert_data.preprocess(tokenized_sentences, '')
    return dict(src=token_ids, segs=segment_ids)

def doc2vec(text, model, mode='MeanSum'):
    """Embed a document with the BERT encoder of ``model``.

    Args:
        text: raw document text; converted to ids by ``doc2bert``.
        model: object exposing ``bert(src, segs, mask_src)``.
        mode: ``'FirstCLS'`` returns the encoder output at position 0 (the
            leading [CLS] inserted by ``BertData.preprocess``); ``'MeanSum'``
            returns the mean of the encoder outputs over dimension 0 of the
            first batch item.

    Returns:
        A tensor taken from the encoder output of the single-document batch.
        (Assumes ``model.bert`` returns ``(batch, seq, hidden)`` — TODO confirm.)

    Raises:
        ValueError: if ``mode`` is not one of the supported values. Raised
            before any tokenization or model work is done.
    """
    # Validate up front so a typo in `mode` fails immediately instead of
    # after a full (and slow) encoder forward pass.
    if mode not in ('FirstCLS', 'MeanSum'):
        raise ValueError(
            "Wrong mode: {!r} (expected 'FirstCLS' or 'MeanSum')".format(mode)
        )

    doc_bert = doc2bert(text)

    # Batch of one document.
    src = torch.tensor([doc_bert['src']])
    segs = torch.tensor([doc_bert['segs']])
    # Positions whose id is 0 are masked out (presumably the [PAD] id — verify
    # against the vocabulary).
    mask_src = ~(src == 0)

    # Pure inference: never build an autograd graph, so the result can be
    # copied straight into NumPy arrays and memory stays flat over many calls.
    with torch.no_grad():
        output = model.bert(src, segs, mask_src)

    if mode == 'FirstCLS':
        return output[0][0]
    return output[0].mean(0)

In [4]:
# Deserialize the saved training checkpoint. The map_location callback returns
# every storage unchanged, which keeps the loaded tensors on the CPU rather
# than moving them to the device they were saved from.
# NOTE(review): the path is absolute and machine-specific — adjust before re-running.
checkpoint = torch.load('/data/alolbuhtijarov/rubert_cased_L-12_H-768_A-12_pt/model_step_15000.pt',
                        map_location=lambda storage, loc: storage)

In [5]:
# Settings object handed to AbsSummarizer below. A SimpleNamespace is a plain
# attribute container, so every value is visible in one place and `args`
# prints readably (the previous `lambda` was only used as an attribute bag).
args = SimpleNamespace(
    model_path='/data/alolbuhtijarov/rubert_cased_L-12_H-768_A-12_pt',
    large=False,
    temp_dir='temp',
    finetune_bert=False,
    encoder='bert',
    max_pos=256,
    dec_layers=6,
    share_emb=False,
    dec_hidden_size=768,
    dec_heads=8,
    dec_ff_size=2048,
    dec_dropout=0.2,
    use_bert_emb=False,
)

# Tokenizer wrapper: lower=True, at most 510 source word pieces (+ [CLS] and
# [SEP] = 512 positions), max_tgt_tokens=128.
# NOTE(review): the model directory name says "cased" while lower-casing is
# enabled here — confirm this matches how the checkpoint was trained.
bert_data = BertData(args.model_path, True, 510, 128)

In [6]:
# Build the summarizer from the settings and the loaded checkpoint; 'cpu' is
# passed as the second argument (presumably the target device — confirm in
# models.model_builder).
model = AbsSummarizer(args, 'cpu', checkpoint)
# Switch to evaluation mode. As the last expression of the cell, the returned
# module is also what the notebook displays.
model.eval()

AbsSummarizer(
  (bert): Bert(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(119547, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): LayerNorm(t

### Clustering

In [7]:
import tqdm
from sklearn.cluster import AgglomerativeClustering
from collections import defaultdict
import csv
import json

In [8]:
def read_markup(file_name):
    """Lazily yield one dict per data row of a tab-separated file.

    The first row is treated as the header; each following row is zipped with
    it into a ``{column_name: value}`` dict.

    Args:
        file_name: path to a TSV file using ``"`` as the quote character.

    Yields:
        dict mapping header names to the string values of one row. Nothing is
        yielded for a completely empty file.

    Raises:
        AssertionError: if a row has a different number of fields than the header.
    """
    # newline="" lets the csv module do its own line-ending handling, so line
    # breaks inside quoted fields are preserved instead of being translated.
    with open(file_name, "r", newline="") as r:
        reader = csv.reader(r, delimiter='\t', quotechar='"')
        # An empty file has no header; stop cleanly rather than letting
        # StopIteration escape the generator (which becomes a RuntimeError).
        header = next(reader, None)
        if header is None:
            return
        for row in reader:
            assert len(header) == len(row), (
                "row has {} fields, header has {}".format(len(row), len(header))
            )
            yield dict(zip(header, row))
            

In [9]:
from sklearn.metrics import classification_report

def calc_metrics(gold_markup, url2label, url2record, output_dict=False):
    """Score predicted clusters against labelled URL pairs.

    A pair is predicted positive (1) when both URLs carry the same label in
    ``url2label``, otherwise negative (0). Pairs where either URL is missing
    from ``url2label`` or ``url2record`` are skipped.

    Args:
        gold_markup: mapping ``(first_url, second_url) -> target``. It is
            only read, never modified.
        url2label: mapping ``url -> cluster label``.
        url2record: mapping ``url -> record``; used only to check that the
            URL is known.
        output_dict: forwarded to ``classification_report``.

    Returns:
        Whatever ``classification_report(targets, predictions, output_dict=...)``
        returns for the retained pairs.
    """
    targets = []
    predictions = []
    for (first_url, second_url), target in gold_markup.items():
        # Skip pairs that cannot be scored. Filtering here, rather than
        # popping from gold_markup, keeps the caller's dict intact for
        # later evaluations.
        pair_is_known = all(
            url in url2label and url in url2record
            for url in (first_url, second_url)
        )
        if not pair_is_known:
            continue
        targets.append(target)
        predictions.append(int(url2label[first_url] == url2label[second_url]))
    return classification_report(targets, predictions, output_dict=output_dict)

In [10]:
# Input files for the clustering evaluation:
#   markup_path          - TSV of labelled URL pairs, parsed with read_markup.
#   clustering_data_path - JSON-lines file, one document record per line.
# NOTE(review): both paths are absolute and machine-specific.
markup_path = "/data/alolbuhtijarov/datasets/ru_threads_target.tsv"
clustering_data_path = "/data/alolbuhtijarov/datasets/ru_clustering_data.jsonl"

In [11]:
# Gold labels: (first_url, second_url) -> 1 if the pair was judged "OK", else 0.
# NOTE(review): the stored values are ints, so the `dict` default factory looks
# unintended — looking up a missing pair would silently insert an empty dict.
markup = defaultdict(dict)
for record in read_markup(markup_path):
    first_url = record["INPUT:first_url"]
    second_url = record["INPUT:second_url"]
    # Any quality value other than the exact string "OK" counts as 0.
    quality = int(record["OUTPUT:quality"] == "OK")
    markup[(first_url, second_url)] = quality

In [12]:
# Index the documents: url -> full record, and file_name -> url.
# A later record with the same url or file_name overwrites the earlier one.
url2record = dict()
filename2url = dict()
with open(clustering_data_path, "r") as r:
    for line in r:
        # Each line is a standalone JSON object.
        record = json.loads(line)
        url2record[record["url"]] = record
        filename2url[record["file_name"]] = record["url"]

In [14]:
# One 768-dimensional embedding row per document (768 matches the embedding
# width shown in the model summary above); row i corresponds to the i-th
# entry of url2record in iteration order.
embeds = np.zeros((len(url2record), 768))

# `total` is passed explicitly because enumerate() has no length for tqdm to read.
for i, (url, record) in tqdm.tqdm(enumerate(url2record.items()), total=embeds.shape[0]):
    # Embed title and body together.
    text = record["title"] + ' ' + record["text"]
    # Replace non-breaking spaces with plain spaces; lower-casing here repeats
    # what doc2bert already does per sentence.
    text = text.lower().replace('\xa0', ' ')
    # 'FirstCLS' takes the encoder output at position 0 of the document.
    embeds[i] = doc2vec(text, model, mode='FirstCLS')


  0%|          | 0/10730 [00:00<?, ?it/s][A
  0%|          | 1/10730 [00:00<20:22,  8.78it/s][A
  0%|          | 3/10730 [00:00<19:19,  9.25it/s][A
  0%|          | 4/10730 [00:00<24:44,  7.23it/s][A
  0%|          | 6/10730 [00:00<22:10,  8.06it/s][A
  0%|          | 8/10730 [00:00<19:57,  8.95it/s][A
  0%|          | 10/10730 [00:01<18:26,  9.69it/s][A
  0%|          | 12/10730 [00:01<17:31, 10.19it/s][A
  0%|          | 14/10730 [00:01<16:34, 10.77it/s][A
  0%|          | 16/10730 [00:01<16:54, 10.56it/s][A
  0%|          | 18/10730 [00:01<15:03, 11.85it/s][A
  0%|          | 20/10730 [00:01<15:04, 11.84it/s][A
  0%|          | 22/10730 [00:02<14:38, 12.19it/s][A
  0%|          | 24/10730 [00:02<15:09, 11.78it/s][A
  0%|          | 26/10730 [00:02<18:24,  9.70it/s][A
  0%|          | 28/10730 [00:02<23:35,  7.56it/s][A
  0%|          | 30/10730 [00:03<21:07,  8.44it/s][A
  0%|          | 32/10730 [00:03<18:50,  9.47it/s][A
  0%|          | 34/10730 [00:03<18:03,  

  2%|▏         | 258/10730 [00:27<14:29, 12.05it/s][A
  2%|▏         | 260/10730 [00:27<16:14, 10.75it/s][A
  2%|▏         | 262/10730 [00:27<20:18,  8.59it/s][A
  2%|▏         | 263/10730 [00:27<22:15,  7.84it/s][A
  2%|▏         | 264/10730 [00:28<25:28,  6.85it/s][A
  2%|▏         | 266/10730 [00:28<24:07,  7.23it/s][A
  2%|▏         | 268/10730 [00:28<21:30,  8.11it/s][A
  3%|▎         | 270/10730 [00:28<18:46,  9.28it/s][A
  3%|▎         | 272/10730 [00:28<17:57,  9.70it/s][A
  3%|▎         | 274/10730 [00:29<20:13,  8.62it/s][A
  3%|▎         | 275/10730 [00:29<24:18,  7.17it/s][A
  3%|▎         | 277/10730 [00:29<20:23,  8.54it/s][A
  3%|▎         | 279/10730 [00:29<17:57,  9.70it/s][A
  3%|▎         | 281/10730 [00:29<18:37,  9.35it/s][A
  3%|▎         | 283/10730 [00:29<18:29,  9.42it/s][A
  3%|▎         | 285/10730 [00:30<17:34,  9.90it/s][A
  3%|▎         | 287/10730 [00:30<16:53, 10.30it/s][A
  3%|▎         | 289/10730 [00:30<16:16, 10.70it/s][A
  3%|▎    

  5%|▍         | 534/10730 [00:55<18:13,  9.33it/s][A
  5%|▍         | 536/10730 [00:55<19:03,  8.91it/s][A
  5%|▌         | 537/10730 [00:55<18:58,  8.95it/s][A
  5%|▌         | 538/10730 [00:55<21:21,  7.96it/s][A
  5%|▌         | 539/10730 [00:55<22:47,  7.45it/s][A
  5%|▌         | 540/10730 [00:55<23:14,  7.31it/s][A
  5%|▌         | 542/10730 [00:56<20:43,  8.19it/s][A
  5%|▌         | 544/10730 [00:56<18:26,  9.20it/s][A
  5%|▌         | 546/10730 [00:56<16:29, 10.29it/s][A
  5%|▌         | 548/10730 [00:56<15:35, 10.88it/s][A
  5%|▌         | 550/10730 [00:56<15:04, 11.25it/s][A
  5%|▌         | 552/10730 [00:56<15:15, 11.12it/s][A
  5%|▌         | 554/10730 [00:56<14:13, 11.92it/s][A
  5%|▌         | 556/10730 [00:57<15:29, 10.95it/s][A
  5%|▌         | 558/10730 [00:57<19:32,  8.68it/s][A
  5%|▌         | 559/10730 [00:57<23:57,  7.08it/s][A
  5%|▌         | 561/10730 [00:57<21:14,  7.98it/s][A
  5%|▌         | 563/10730 [00:58<22:06,  7.66it/s][A
  5%|▌    

  7%|▋         | 801/10730 [01:22<16:09, 10.24it/s][A
  7%|▋         | 803/10730 [01:22<14:47, 11.19it/s][A
  8%|▊         | 805/10730 [01:22<13:36, 12.16it/s][A
  8%|▊         | 807/10730 [01:22<14:01, 11.79it/s][A
  8%|▊         | 809/10730 [01:22<13:36, 12.15it/s][A
  8%|▊         | 811/10730 [01:22<13:02, 12.68it/s][A
  8%|▊         | 813/10730 [01:22<12:41, 13.03it/s][A
  8%|▊         | 815/10730 [01:23<13:21, 12.37it/s][A
  8%|▊         | 817/10730 [01:23<13:23, 12.33it/s][A
  8%|▊         | 819/10730 [01:23<15:00, 11.00it/s][A
  8%|▊         | 821/10730 [01:23<17:12,  9.59it/s][A
  8%|▊         | 823/10730 [01:24<19:20,  8.53it/s][A
  8%|▊         | 825/10730 [01:24<18:13,  9.05it/s][A
  8%|▊         | 827/10730 [01:24<16:12, 10.18it/s][A
  8%|▊         | 829/10730 [01:24<15:02, 10.97it/s][A
  8%|▊         | 831/10730 [01:24<15:15, 10.82it/s][A
  8%|▊         | 833/10730 [01:24<13:42, 12.04it/s][A
  8%|▊         | 835/10730 [01:25<13:58, 11.80it/s][A
  8%|▊    

 10%|█         | 1084/10730 [01:49<15:49, 10.16it/s][A
 10%|█         | 1086/10730 [01:50<15:16, 10.53it/s][A
 10%|█         | 1088/10730 [01:50<14:19, 11.21it/s][A
 10%|█         | 1090/10730 [01:50<14:31, 11.06it/s][A
 10%|█         | 1092/10730 [01:50<13:33, 11.84it/s][A
 10%|█         | 1094/10730 [01:50<12:43, 12.62it/s][A
 10%|█         | 1096/10730 [01:50<13:22, 12.01it/s][A
 10%|█         | 1098/10730 [01:50<13:20, 12.03it/s][A
 10%|█         | 1100/10730 [01:51<13:47, 11.64it/s][A
 10%|█         | 1102/10730 [01:51<16:24,  9.78it/s][A
 10%|█         | 1104/10730 [01:51<17:02,  9.41it/s][A
 10%|█         | 1106/10730 [01:51<15:44, 10.19it/s][A
 10%|█         | 1108/10730 [01:51<14:50, 10.81it/s][A
 10%|█         | 1110/10730 [01:52<14:24, 11.13it/s][A
 10%|█         | 1112/10730 [01:52<12:44, 12.58it/s][A
 10%|█         | 1114/10730 [01:52<12:40, 12.65it/s][A
 10%|█         | 1116/10730 [01:52<14:59, 10.69it/s][A
 10%|█         | 1118/10730 [01:52<16:48,  9.53i

 13%|█▎        | 1357/10730 [02:17<18:39,  8.38it/s][A
 13%|█▎        | 1358/10730 [02:17<20:44,  7.53it/s][A
 13%|█▎        | 1359/10730 [02:17<28:27,  5.49it/s][A
 13%|█▎        | 1360/10730 [02:17<31:26,  4.97it/s][A
 13%|█▎        | 1362/10730 [02:17<28:34,  5.46it/s][A
 13%|█▎        | 1363/10730 [02:18<28:34,  5.46it/s][A
 13%|█▎        | 1365/10730 [02:18<23:18,  6.70it/s][A
 13%|█▎        | 1367/10730 [02:18<19:53,  7.84it/s][A
 13%|█▎        | 1369/10730 [02:18<17:14,  9.05it/s][A
 13%|█▎        | 1371/10730 [02:18<17:18,  9.01it/s][A
 13%|█▎        | 1373/10730 [02:18<15:29, 10.07it/s][A
 13%|█▎        | 1375/10730 [02:19<18:54,  8.24it/s][A
 13%|█▎        | 1376/10730 [02:19<18:08,  8.59it/s][A
 13%|█▎        | 1377/10730 [02:19<17:46,  8.77it/s][A
 13%|█▎        | 1379/10730 [02:19<15:40,  9.94it/s][A
 13%|█▎        | 1381/10730 [02:19<17:03,  9.14it/s][A
 13%|█▎        | 1383/10730 [02:20<15:18, 10.18it/s][A
 13%|█▎        | 1385/10730 [02:20<14:34, 10.69i

 15%|█▌        | 1618/10730 [02:45<20:03,  7.57it/s][A
 15%|█▌        | 1619/10730 [02:45<20:22,  7.45it/s][A
 15%|█▌        | 1621/10730 [02:45<17:45,  8.55it/s][A
 15%|█▌        | 1623/10730 [02:45<15:21,  9.89it/s][A
 15%|█▌        | 1625/10730 [02:45<14:27, 10.50it/s][A
 15%|█▌        | 1627/10730 [02:46<17:20,  8.75it/s][A
 15%|█▌        | 1629/10730 [02:46<15:48,  9.60it/s][A
 15%|█▌        | 1631/10730 [02:46<19:40,  7.71it/s][A
 15%|█▌        | 1632/10730 [02:46<18:27,  8.22it/s][A
 15%|█▌        | 1633/10730 [02:46<18:27,  8.22it/s][A
 15%|█▌        | 1634/10730 [02:47<17:38,  8.59it/s][A
 15%|█▌        | 1636/10730 [02:47<16:35,  9.13it/s][A
 15%|█▌        | 1638/10730 [02:47<15:50,  9.57it/s][A
 15%|█▌        | 1640/10730 [02:47<14:52, 10.19it/s][A
 15%|█▌        | 1642/10730 [02:47<13:34, 11.16it/s][A
 15%|█▌        | 1644/10730 [02:47<14:11, 10.67it/s][A
 15%|█▌        | 1646/10730 [02:48<15:05, 10.03it/s][A
 15%|█▌        | 1648/10730 [02:48<17:08,  8.83i

 17%|█▋        | 1870/10730 [03:12<18:01,  8.19it/s][A
 17%|█▋        | 1872/10730 [03:13<16:34,  8.91it/s][A
 17%|█▋        | 1874/10730 [03:13<18:03,  8.17it/s][A
 17%|█▋        | 1876/10730 [03:13<15:44,  9.38it/s][A
 18%|█▊        | 1878/10730 [03:13<13:55, 10.59it/s][A
 18%|█▊        | 1880/10730 [03:13<14:16, 10.33it/s][A
 18%|█▊        | 1882/10730 [03:14<12:26, 11.85it/s][A
 18%|█▊        | 1884/10730 [03:14<15:58,  9.23it/s][A
 18%|█▊        | 1886/10730 [03:14<16:13,  9.08it/s][A
 18%|█▊        | 1888/10730 [03:14<17:18,  8.52it/s][A
 18%|█▊        | 1890/10730 [03:15<15:22,  9.58it/s][A
 18%|█▊        | 1892/10730 [03:15<13:50, 10.64it/s][A
 18%|█▊        | 1894/10730 [03:15<15:28,  9.51it/s][A
 18%|█▊        | 1896/10730 [03:15<16:33,  8.90it/s][A
 18%|█▊        | 1898/10730 [03:15<17:24,  8.46it/s][A
 18%|█▊        | 1900/10730 [03:16<16:06,  9.14it/s][A
 18%|█▊        | 1902/10730 [03:16<15:25,  9.54it/s][A
 18%|█▊        | 1903/10730 [03:16<16:59,  8.66i

 20%|█▉        | 2145/10730 [03:40<09:39, 14.82it/s][A
 20%|██        | 2147/10730 [03:40<09:54, 14.43it/s][A
 20%|██        | 2149/10730 [03:40<11:54, 12.01it/s][A
 20%|██        | 2151/10730 [03:41<11:16, 12.68it/s][A
 20%|██        | 2153/10730 [03:41<13:13, 10.81it/s][A
 20%|██        | 2155/10730 [03:41<11:37, 12.30it/s][A
 20%|██        | 2157/10730 [03:41<13:30, 10.58it/s][A
 20%|██        | 2159/10730 [03:41<13:37, 10.49it/s][A
 20%|██        | 2161/10730 [03:41<12:07, 11.77it/s][A
 20%|██        | 2163/10730 [03:42<11:57, 11.95it/s][A
 20%|██        | 2165/10730 [03:42<13:41, 10.43it/s][A
 20%|██        | 2167/10730 [03:42<13:07, 10.87it/s][A
 20%|██        | 2169/10730 [03:42<11:54, 11.99it/s][A
 20%|██        | 2171/10730 [03:42<14:57,  9.54it/s][A
 20%|██        | 2173/10730 [03:43<16:00,  8.90it/s][A
 20%|██        | 2175/10730 [03:43<15:54,  8.96it/s][A
 20%|██        | 2177/10730 [03:43<15:15,  9.35it/s][A
 20%|██        | 2178/10730 [03:43<15:57,  8.93i

 23%|██▎       | 2419/10730 [04:07<11:07, 12.44it/s][A
 23%|██▎       | 2421/10730 [04:07<11:58, 11.56it/s][A
 23%|██▎       | 2423/10730 [04:07<13:46, 10.05it/s][A
 23%|██▎       | 2425/10730 [04:07<13:42, 10.09it/s][A
 23%|██▎       | 2427/10730 [04:07<14:40,  9.43it/s][A
 23%|██▎       | 2429/10730 [04:08<12:53, 10.73it/s][A
 23%|██▎       | 2431/10730 [04:08<12:10, 11.36it/s][A
 23%|██▎       | 2433/10730 [04:08<13:21, 10.35it/s][A
 23%|██▎       | 2435/10730 [04:08<12:20, 11.20it/s][A
 23%|██▎       | 2437/10730 [04:08<13:16, 10.41it/s][A
 23%|██▎       | 2439/10730 [04:08<12:15, 11.28it/s][A
 23%|██▎       | 2441/10730 [04:09<11:44, 11.77it/s][A
 23%|██▎       | 2443/10730 [04:09<10:36, 13.02it/s][A
 23%|██▎       | 2445/10730 [04:09<10:49, 12.75it/s][A
 23%|██▎       | 2447/10730 [04:09<09:44, 14.18it/s][A
 23%|██▎       | 2449/10730 [04:09<09:06, 15.17it/s][A
 23%|██▎       | 2451/10730 [04:09<12:35, 10.96it/s][A
 23%|██▎       | 2453/10730 [04:10<14:24,  9.58i

 25%|██▌       | 2702/10730 [04:34<14:02,  9.53it/s][A
 25%|██▌       | 2704/10730 [04:34<12:43, 10.51it/s][A
 25%|██▌       | 2706/10730 [04:34<13:49,  9.67it/s][A
 25%|██▌       | 2708/10730 [04:34<13:20, 10.03it/s][A
 25%|██▌       | 2710/10730 [04:34<13:18, 10.05it/s][A
 25%|██▌       | 2712/10730 [04:35<11:48, 11.31it/s][A
 25%|██▌       | 2714/10730 [04:35<10:56, 12.21it/s][A
 25%|██▌       | 2716/10730 [04:35<11:01, 12.11it/s][A
 25%|██▌       | 2718/10730 [04:35<10:58, 12.16it/s][A
 25%|██▌       | 2720/10730 [04:35<10:21, 12.88it/s][A
 25%|██▌       | 2722/10730 [04:35<09:34, 13.94it/s][A
 25%|██▌       | 2724/10730 [04:35<09:38, 13.84it/s][A
 25%|██▌       | 2726/10730 [04:36<10:46, 12.37it/s][A
 25%|██▌       | 2728/10730 [04:36<15:24,  8.66it/s][A
 25%|██▌       | 2730/10730 [04:36<16:33,  8.06it/s][A
 25%|██▌       | 2731/10730 [04:36<19:50,  6.72it/s][A
 25%|██▌       | 2732/10730 [04:37<20:01,  6.66it/s][A
 25%|██▌       | 2733/10730 [04:37<18:38,  7.15i

 28%|██▊       | 2966/10730 [05:02<18:31,  6.98it/s][A
 28%|██▊       | 2968/10730 [05:02<18:02,  7.17it/s][A
 28%|██▊       | 2970/10730 [05:02<16:13,  7.97it/s][A
 28%|██▊       | 2971/10730 [05:02<15:18,  8.45it/s][A
 28%|██▊       | 2973/10730 [05:02<14:51,  8.71it/s][A
 28%|██▊       | 2974/10730 [05:02<16:36,  7.78it/s][A
 28%|██▊       | 2976/10730 [05:03<13:47,  9.37it/s][A
 28%|██▊       | 2978/10730 [05:03<11:42, 11.04it/s][A
 28%|██▊       | 2980/10730 [05:03<10:30, 12.29it/s][A
 28%|██▊       | 2982/10730 [05:03<13:35,  9.50it/s][A
 28%|██▊       | 2984/10730 [05:03<13:03,  9.89it/s][A
 28%|██▊       | 2986/10730 [05:04<14:41,  8.78it/s][A
 28%|██▊       | 2988/10730 [05:04<13:04,  9.87it/s][A
 28%|██▊       | 2990/10730 [05:04<14:47,  8.72it/s][A
 28%|██▊       | 2992/10730 [05:04<15:01,  8.58it/s][A
 28%|██▊       | 2993/10730 [05:04<18:20,  7.03it/s][A
 28%|██▊       | 2995/10730 [05:05<15:55,  8.10it/s][A
 28%|██▊       | 2997/10730 [05:05<14:05,  9.15i

 30%|███       | 3244/10730 [05:29<11:03, 11.28it/s][A
 30%|███       | 3246/10730 [05:29<10:31, 11.85it/s][A
 30%|███       | 3248/10730 [05:29<10:58, 11.37it/s][A
 30%|███       | 3250/10730 [05:30<11:14, 11.10it/s][A
 30%|███       | 3252/10730 [05:30<11:33, 10.79it/s][A
 30%|███       | 3254/10730 [05:30<10:54, 11.43it/s][A
 30%|███       | 3256/10730 [05:30<12:52,  9.67it/s][A
 30%|███       | 3258/10730 [05:31<13:24,  9.29it/s][A
 30%|███       | 3259/10730 [05:31<16:49,  7.40it/s][A
 30%|███       | 3261/10730 [05:31<15:31,  8.02it/s][A
 30%|███       | 3262/10730 [05:31<16:27,  7.57it/s][A
 30%|███       | 3264/10730 [05:31<15:52,  7.83it/s][A
 30%|███       | 3265/10730 [05:32<18:41,  6.66it/s][A
 30%|███       | 3267/10730 [05:32<16:59,  7.32it/s][A
 30%|███       | 3269/10730 [05:32<15:42,  7.92it/s][A
 30%|███       | 3271/10730 [05:32<15:59,  7.77it/s][A
 30%|███       | 3272/10730 [05:32<15:15,  8.15it/s][A
 31%|███       | 3274/10730 [05:33<16:00,  7.76i

 33%|███▎      | 3506/10730 [05:56<10:27, 11.52it/s][A
 33%|███▎      | 3508/10730 [05:56<09:55, 12.12it/s][A
 33%|███▎      | 3510/10730 [05:56<09:48, 12.27it/s][A
 33%|███▎      | 3512/10730 [05:56<10:02, 11.97it/s][A
 33%|███▎      | 3514/10730 [05:57<09:58, 12.07it/s][A
 33%|███▎      | 3516/10730 [05:57<08:48, 13.66it/s][A
 33%|███▎      | 3518/10730 [05:57<08:31, 14.10it/s][A
 33%|███▎      | 3520/10730 [05:57<08:46, 13.70it/s][A
 33%|███▎      | 3522/10730 [05:57<11:28, 10.47it/s][A
 33%|███▎      | 3524/10730 [05:58<14:02,  8.55it/s][A
 33%|███▎      | 3526/10730 [05:58<12:52,  9.32it/s][A
 33%|███▎      | 3528/10730 [05:58<11:45, 10.21it/s][A
 33%|███▎      | 3530/10730 [05:58<10:50, 11.06it/s][A
 33%|███▎      | 3532/10730 [05:58<10:53, 11.02it/s][A
 33%|███▎      | 3534/10730 [05:58<10:30, 11.41it/s][A
 33%|███▎      | 3536/10730 [05:59<11:05, 10.80it/s][A
 33%|███▎      | 3538/10730 [05:59<11:08, 10.77it/s][A
 33%|███▎      | 3540/10730 [05:59<12:11,  9.83i

 35%|███▌      | 3769/10730 [06:23<10:32, 11.00it/s][A
 35%|███▌      | 3771/10730 [06:23<11:51,  9.78it/s][A
 35%|███▌      | 3773/10730 [06:23<10:51, 10.68it/s][A
 35%|███▌      | 3775/10730 [06:24<10:13, 11.34it/s][A
 35%|███▌      | 3777/10730 [06:24<12:09,  9.53it/s][A
 35%|███▌      | 3779/10730 [06:24<11:12, 10.33it/s][A
 35%|███▌      | 3781/10730 [06:24<11:18, 10.24it/s][A
 35%|███▌      | 3783/10730 [06:24<10:43, 10.80it/s][A
 35%|███▌      | 3785/10730 [06:25<09:51, 11.74it/s][A
 35%|███▌      | 3787/10730 [06:25<09:52, 11.72it/s][A
 35%|███▌      | 3789/10730 [06:25<11:23, 10.16it/s][A
 35%|███▌      | 3791/10730 [06:25<12:25,  9.31it/s][A
 35%|███▌      | 3793/10730 [06:26<13:40,  8.46it/s][A
 35%|███▌      | 3794/10730 [06:26<13:57,  8.28it/s][A
 35%|███▌      | 3796/10730 [06:26<12:18,  9.39it/s][A
 35%|███▌      | 3798/10730 [06:26<13:04,  8.84it/s][A
 35%|███▌      | 3799/10730 [06:26<15:01,  7.69it/s][A
 35%|███▌      | 3801/10730 [06:26<13:51,  8.33i

 37%|███▋      | 4008/10730 [06:50<11:01, 10.17it/s][A
 37%|███▋      | 4010/10730 [06:51<11:46,  9.51it/s][A
 37%|███▋      | 4011/10730 [06:51<12:12,  9.17it/s][A
 37%|███▋      | 4013/10730 [06:51<10:53, 10.27it/s][A
 37%|███▋      | 4015/10730 [06:51<09:51, 11.35it/s][A
 37%|███▋      | 4017/10730 [06:51<09:17, 12.03it/s][A
 37%|███▋      | 4019/10730 [06:51<09:39, 11.58it/s][A
 37%|███▋      | 4021/10730 [06:51<09:45, 11.46it/s][A
 37%|███▋      | 4023/10730 [06:52<11:12,  9.98it/s][A
 38%|███▊      | 4025/10730 [06:52<10:35, 10.55it/s][A
 38%|███▊      | 4027/10730 [06:52<14:06,  7.92it/s][A
 38%|███▊      | 4028/10730 [06:52<15:01,  7.44it/s][A
 38%|███▊      | 4030/10730 [06:53<14:22,  7.77it/s][A
 38%|███▊      | 4032/10730 [06:53<12:07,  9.21it/s][A
 38%|███▊      | 4034/10730 [06:53<10:37, 10.50it/s][A
 38%|███▊      | 4036/10730 [06:53<10:44, 10.39it/s][A
 38%|███▊      | 4038/10730 [06:53<11:18,  9.87it/s][A
 38%|███▊      | 4040/10730 [06:53<10:18, 10.83i

 40%|███▉      | 4271/10730 [07:17<08:57, 12.01it/s][A
 40%|███▉      | 4273/10730 [07:17<11:30,  9.35it/s][A
 40%|███▉      | 4275/10730 [07:17<10:50,  9.92it/s][A
 40%|███▉      | 4277/10730 [07:17<09:49, 10.94it/s][A
 40%|███▉      | 4279/10730 [07:18<10:29, 10.25it/s][A
 40%|███▉      | 4281/10730 [07:18<11:39,  9.23it/s][A
 40%|███▉      | 4283/10730 [07:18<11:42,  9.17it/s][A
 40%|███▉      | 4284/10730 [07:18<11:47,  9.11it/s][A
 40%|███▉      | 4286/10730 [07:18<11:27,  9.37it/s][A
 40%|███▉      | 4287/10730 [07:19<14:03,  7.64it/s][A
 40%|███▉      | 4288/10730 [07:19<15:29,  6.93it/s][A
 40%|███▉      | 4289/10730 [07:19<14:55,  7.20it/s][A
 40%|███▉      | 4291/10730 [07:19<12:10,  8.82it/s][A
 40%|████      | 4293/10730 [07:19<10:31, 10.19it/s][A
 40%|████      | 4295/10730 [07:19<09:05, 11.81it/s][A
 40%|████      | 4297/10730 [07:19<09:04, 11.82it/s][A
 40%|████      | 4299/10730 [07:20<09:05, 11.80it/s][A
 40%|████      | 4301/10730 [07:20<09:59, 10.73i

 42%|████▏     | 4533/10730 [07:44<09:45, 10.59it/s][A
 42%|████▏     | 4535/10730 [07:44<11:00,  9.38it/s][A
 42%|████▏     | 4537/10730 [07:45<12:05,  8.53it/s][A
 42%|████▏     | 4538/10730 [07:45<14:35,  7.08it/s][A
 42%|████▏     | 4540/10730 [07:45<11:51,  8.71it/s][A
 42%|████▏     | 4542/10730 [07:45<11:37,  8.87it/s][A
 42%|████▏     | 4544/10730 [07:45<12:34,  8.20it/s][A
 42%|████▏     | 4546/10730 [07:46<11:11,  9.21it/s][A
 42%|████▏     | 4548/10730 [07:46<10:28,  9.84it/s][A
 42%|████▏     | 4550/10730 [07:46<10:36,  9.70it/s][A
 42%|████▏     | 4552/10730 [07:46<10:41,  9.63it/s][A
 42%|████▏     | 4554/10730 [07:46<09:30, 10.82it/s][A
 42%|████▏     | 4556/10730 [07:46<09:08, 11.26it/s][A
 42%|████▏     | 4558/10730 [07:47<10:52,  9.46it/s][A
 42%|████▏     | 4560/10730 [07:47<10:33,  9.75it/s][A
 43%|████▎     | 4562/10730 [07:47<12:38,  8.13it/s][A
 43%|████▎     | 4564/10730 [07:47<11:38,  8.82it/s][A
 43%|████▎     | 4566/10730 [07:48<12:00,  8.56i

 45%|████▍     | 4792/10730 [08:12<09:20, 10.59it/s][A
 45%|████▍     | 4794/10730 [08:12<08:41, 11.39it/s][A
 45%|████▍     | 4796/10730 [08:12<08:26, 11.73it/s][A
 45%|████▍     | 4798/10730 [08:12<09:09, 10.79it/s][A
 45%|████▍     | 4800/10730 [08:13<10:35,  9.33it/s][A
 45%|████▍     | 4802/10730 [08:13<11:41,  8.45it/s][A
 45%|████▍     | 4804/10730 [08:13<10:55,  9.04it/s][A
 45%|████▍     | 4805/10730 [08:13<13:09,  7.51it/s][A
 45%|████▍     | 4806/10730 [08:14<12:39,  7.80it/s][A
 45%|████▍     | 4807/10730 [08:14<14:05,  7.00it/s][A
 45%|████▍     | 4809/10730 [08:14<14:11,  6.95it/s][A
 45%|████▍     | 4811/10730 [08:14<12:58,  7.60it/s][A
 45%|████▍     | 4812/10730 [08:14<14:53,  6.62it/s][A
 45%|████▍     | 4813/10730 [08:15<14:25,  6.83it/s][A
 45%|████▍     | 4814/10730 [08:15<13:06,  7.52it/s][A
 45%|████▍     | 4815/10730 [08:15<12:51,  7.67it/s][A
 45%|████▍     | 4817/10730 [08:15<11:11,  8.80it/s][A
 45%|████▍     | 4819/10730 [08:15<10:18,  9.56i

 47%|████▋     | 5018/10730 [08:39<11:45,  8.09it/s][A
 47%|████▋     | 5019/10730 [08:39<14:10,  6.72it/s][A
 47%|████▋     | 5021/10730 [08:40<11:56,  7.97it/s][A
 47%|████▋     | 5023/10730 [08:40<10:03,  9.45it/s][A
 47%|████▋     | 5025/10730 [08:40<11:27,  8.29it/s][A
 47%|████▋     | 5027/10730 [08:40<10:23,  9.14it/s][A
 47%|████▋     | 5029/10730 [08:40<09:25, 10.09it/s][A
 47%|████▋     | 5031/10730 [08:41<09:56,  9.55it/s][A
 47%|████▋     | 5033/10730 [08:41<09:45,  9.73it/s][A
 47%|████▋     | 5035/10730 [08:41<09:15, 10.26it/s][A
 47%|████▋     | 5037/10730 [08:41<09:33,  9.92it/s][A
 47%|████▋     | 5039/10730 [08:41<09:51,  9.62it/s][A
 47%|████▋     | 5040/10730 [08:41<10:09,  9.34it/s][A
 47%|████▋     | 5042/10730 [08:42<09:34,  9.89it/s][A
 47%|████▋     | 5044/10730 [08:42<08:48, 10.75it/s][A
 47%|████▋     | 5046/10730 [08:42<08:33, 11.08it/s][A
 47%|████▋     | 5048/10730 [08:42<08:28, 11.18it/s][A
 47%|████▋     | 5050/10730 [08:42<08:40, 10.91i

 49%|████▉     | 5273/10730 [09:07<10:29,  8.66it/s][A
 49%|████▉     | 5275/10730 [09:07<10:32,  8.62it/s][A
 49%|████▉     | 5276/10730 [09:08<10:25,  8.73it/s][A
 49%|████▉     | 5277/10730 [09:08<11:47,  7.71it/s][A
 49%|████▉     | 5278/10730 [09:08<11:29,  7.91it/s][A
 49%|████▉     | 5279/10730 [09:08<11:11,  8.11it/s][A
 49%|████▉     | 5281/10730 [09:08<09:52,  9.20it/s][A
 49%|████▉     | 5283/10730 [09:08<09:13,  9.84it/s][A
 49%|████▉     | 5285/10730 [09:09<10:16,  8.83it/s][A
 49%|████▉     | 5287/10730 [09:09<08:54, 10.18it/s][A
 49%|████▉     | 5289/10730 [09:09<07:42, 11.76it/s][A
 49%|████▉     | 5291/10730 [09:09<09:19,  9.71it/s][A
 49%|████▉     | 5293/10730 [09:09<08:08, 11.12it/s][A
 49%|████▉     | 5295/10730 [09:09<08:42, 10.41it/s][A
 49%|████▉     | 5297/10730 [09:10<10:09,  8.91it/s][A
 49%|████▉     | 5299/10730 [09:10<09:20,  9.69it/s][A
 49%|████▉     | 5301/10730 [09:10<10:49,  8.36it/s][A
 49%|████▉     | 5303/10730 [09:10<10:16,  8.81i

 51%|█████     | 5495/10730 [09:33<12:14,  7.13it/s][A
 51%|█████     | 5497/10730 [09:33<11:10,  7.81it/s][A
 51%|█████     | 5499/10730 [09:34<10:21,  8.42it/s][A
 51%|█████▏    | 5501/10730 [09:34<10:56,  7.97it/s][A
 51%|█████▏    | 5502/10730 [09:34<11:46,  7.40it/s][A
 51%|█████▏    | 5504/10730 [09:34<09:39,  9.02it/s][A
 51%|█████▏    | 5506/10730 [09:34<08:13, 10.58it/s][A
 51%|█████▏    | 5508/10730 [09:34<07:48, 11.14it/s][A
 51%|█████▏    | 5510/10730 [09:35<09:34,  9.08it/s][A
 51%|█████▏    | 5512/10730 [09:35<10:30,  8.27it/s][A
 51%|█████▏    | 5514/10730 [09:35<09:36,  9.05it/s][A
 51%|█████▏    | 5516/10730 [09:35<09:37,  9.04it/s][A
 51%|█████▏    | 5517/10730 [09:36<10:43,  8.10it/s][A
 51%|█████▏    | 5518/10730 [09:36<10:50,  8.02it/s][A
 51%|█████▏    | 5520/10730 [09:36<09:23,  9.25it/s][A
 51%|█████▏    | 5522/10730 [09:36<09:08,  9.50it/s][A
 51%|█████▏    | 5524/10730 [09:36<09:03,  9.58it/s][A
 52%|█████▏    | 5526/10730 [09:36<08:38, 10.04i

 53%|█████▎    | 5728/10730 [10:00<10:30,  7.93it/s][A
 53%|█████▎    | 5729/10730 [10:00<10:42,  7.78it/s][A
 53%|█████▎    | 5730/10730 [10:00<12:53,  6.47it/s][A
 53%|█████▎    | 5731/10730 [10:01<14:08,  5.89it/s][A
 53%|█████▎    | 5732/10730 [10:01<12:38,  6.59it/s][A
 53%|█████▎    | 5734/10730 [10:01<11:11,  7.44it/s][A
 53%|█████▎    | 5735/10730 [10:01<10:57,  7.60it/s][A
 53%|█████▎    | 5736/10730 [10:01<11:36,  7.17it/s][A
 53%|█████▎    | 5738/10730 [10:01<09:49,  8.47it/s][A
 53%|█████▎    | 5740/10730 [10:02<09:02,  9.19it/s][A
 54%|█████▎    | 5742/10730 [10:02<08:10, 10.16it/s][A
 54%|█████▎    | 5744/10730 [10:02<08:39,  9.59it/s][A
 54%|█████▎    | 5746/10730 [10:02<08:06, 10.24it/s][A
 54%|█████▎    | 5748/10730 [10:02<08:03, 10.31it/s][A
 54%|█████▎    | 5750/10730 [10:02<07:55, 10.48it/s][A
 54%|█████▎    | 5752/10730 [10:03<10:40,  7.77it/s][A
 54%|█████▎    | 5753/10730 [10:03<12:32,  6.62it/s][A
 54%|█████▎    | 5754/10730 [10:03<13:03,  6.35i

 56%|█████▌    | 5991/10730 [10:27<08:24,  9.39it/s][A
 56%|█████▌    | 5993/10730 [10:27<09:05,  8.68it/s][A
 56%|█████▌    | 5995/10730 [10:27<08:07,  9.72it/s][A
 56%|█████▌    | 5997/10730 [10:28<09:17,  8.49it/s][A
 56%|█████▌    | 5998/10730 [10:28<09:43,  8.11it/s][A
 56%|█████▌    | 5999/10730 [10:28<10:03,  7.84it/s][A
 56%|█████▌    | 6000/10730 [10:28<11:01,  7.15it/s][A
 56%|█████▌    | 6001/10730 [10:28<10:09,  7.76it/s][A
 56%|█████▌    | 6002/10730 [10:28<10:21,  7.61it/s][A
 56%|█████▌    | 6003/10730 [10:29<11:47,  6.68it/s][A
 56%|█████▌    | 6005/10730 [10:29<10:16,  7.66it/s][A
 56%|█████▌    | 6007/10730 [10:29<09:05,  8.65it/s][A
 56%|█████▌    | 6008/10730 [10:29<10:53,  7.23it/s][A
 56%|█████▌    | 6010/10730 [10:29<11:03,  7.11it/s][A
 56%|█████▌    | 6012/10730 [10:30<09:22,  8.38it/s][A
 56%|█████▌    | 6014/10730 [10:30<08:28,  9.28it/s][A
 56%|█████▌    | 6016/10730 [10:30<09:27,  8.31it/s][A
 56%|█████▌    | 6017/10730 [10:30<09:04,  8.65i

 58%|█████▊    | 6244/10730 [10:55<09:15,  8.08it/s][A
 58%|█████▊    | 6245/10730 [10:55<09:29,  7.88it/s][A
 58%|█████▊    | 6246/10730 [10:55<10:15,  7.28it/s][A
 58%|█████▊    | 6247/10730 [10:55<11:14,  6.64it/s][A
 58%|█████▊    | 6248/10730 [10:55<12:32,  5.95it/s][A
 58%|█████▊    | 6249/10730 [10:55<11:30,  6.49it/s][A
 58%|█████▊    | 6250/10730 [10:56<11:23,  6.56it/s][A
 58%|█████▊    | 6252/10730 [10:56<10:05,  7.39it/s][A
 58%|█████▊    | 6254/10730 [10:56<08:55,  8.35it/s][A
 58%|█████▊    | 6256/10730 [10:56<08:32,  8.73it/s][A
 58%|█████▊    | 6258/10730 [10:56<08:26,  8.84it/s][A
 58%|█████▊    | 6259/10730 [10:57<08:35,  8.67it/s][A
 58%|█████▊    | 6261/10730 [10:57<07:38,  9.75it/s][A
 58%|█████▊    | 6263/10730 [10:57<07:21, 10.11it/s][A
 58%|█████▊    | 6265/10730 [10:57<07:39,  9.72it/s][A
 58%|█████▊    | 6267/10730 [10:57<07:47,  9.56it/s][A
 58%|█████▊    | 6269/10730 [10:57<06:43, 11.07it/s][A
 58%|█████▊    | 6271/10730 [10:58<06:33, 11.32i

 61%|██████    | 6511/10730 [11:22<05:51, 11.99it/s][A
 61%|██████    | 6513/10730 [11:22<06:00, 11.71it/s][A
 61%|██████    | 6515/10730 [11:22<05:47, 12.13it/s][A
 61%|██████    | 6517/10730 [11:22<06:54, 10.17it/s][A
 61%|██████    | 6519/10730 [11:23<07:28,  9.38it/s][A
 61%|██████    | 6520/10730 [11:23<07:45,  9.05it/s][A
 61%|██████    | 6522/10730 [11:23<07:13,  9.70it/s][A
 61%|██████    | 6524/10730 [11:23<07:31,  9.31it/s][A
 61%|██████    | 6525/10730 [11:23<07:51,  8.91it/s][A
 61%|██████    | 6527/10730 [11:23<07:11,  9.73it/s][A
 61%|██████    | 6529/10730 [11:24<07:49,  8.95it/s][A
 61%|██████    | 6530/10730 [11:24<08:17,  8.44it/s][A
 61%|██████    | 6531/10730 [11:24<09:41,  7.22it/s][A
 61%|██████    | 6533/10730 [11:24<08:31,  8.20it/s][A
 61%|██████    | 6535/10730 [11:24<07:34,  9.24it/s][A
 61%|██████    | 6537/10730 [11:24<06:48, 10.27it/s][A
 61%|██████    | 6539/10730 [11:25<08:08,  8.58it/s][A
 61%|██████    | 6540/10730 [11:25<09:51,  7.08i

 63%|██████▎   | 6767/10730 [11:48<06:38,  9.93it/s][A
 63%|██████▎   | 6769/10730 [11:48<06:30, 10.13it/s][A
 63%|██████▎   | 6771/10730 [11:48<07:15,  9.08it/s][A
 63%|██████▎   | 6773/10730 [11:48<06:34, 10.03it/s][A
 63%|██████▎   | 6775/10730 [11:49<06:16, 10.50it/s][A
 63%|██████▎   | 6777/10730 [11:49<05:56, 11.08it/s][A
 63%|██████▎   | 6779/10730 [11:49<05:43, 11.52it/s][A
 63%|██████▎   | 6781/10730 [11:49<06:03, 10.86it/s][A
 63%|██████▎   | 6783/10730 [11:49<05:36, 11.72it/s][A
 63%|██████▎   | 6785/10730 [11:49<05:39, 11.62it/s][A
 63%|██████▎   | 6787/10730 [11:50<05:54, 11.12it/s][A
 63%|██████▎   | 6789/10730 [11:50<05:56, 11.06it/s][A
 63%|██████▎   | 6791/10730 [11:50<06:42,  9.79it/s][A
 63%|██████▎   | 6793/10730 [11:50<06:52,  9.55it/s][A
 63%|██████▎   | 6795/10730 [11:50<06:34,  9.97it/s][A
 63%|██████▎   | 6797/10730 [11:51<05:50, 11.21it/s][A
 63%|██████▎   | 6799/10730 [11:51<07:06,  9.22it/s][A
 63%|██████▎   | 6801/10730 [11:51<07:11,  9.11i

 66%|██████▌   | 7034/10730 [12:15<06:57,  8.84it/s][A
 66%|██████▌   | 7036/10730 [12:16<06:38,  9.26it/s][A
 66%|██████▌   | 7038/10730 [12:16<06:01, 10.21it/s][A
 66%|██████▌   | 7040/10730 [12:16<06:54,  8.90it/s][A
 66%|██████▌   | 7042/10730 [12:16<06:34,  9.36it/s][A
 66%|██████▌   | 7043/10730 [12:16<06:41,  9.17it/s][A
 66%|██████▌   | 7044/10730 [12:17<07:02,  8.73it/s][A
 66%|██████▌   | 7046/10730 [12:17<07:00,  8.77it/s][A
 66%|██████▌   | 7047/10730 [12:17<07:05,  8.66it/s][A
 66%|██████▌   | 7049/10730 [12:17<06:12,  9.88it/s][A
 66%|██████▌   | 7051/10730 [12:17<05:53, 10.42it/s][A
 66%|██████▌   | 7053/10730 [12:17<05:34, 11.00it/s][A
 66%|██████▌   | 7055/10730 [12:17<05:19, 11.50it/s][A
 66%|██████▌   | 7057/10730 [12:18<05:06, 12.00it/s][A
 66%|██████▌   | 7059/10730 [12:18<05:58, 10.24it/s][A
 66%|██████▌   | 7061/10730 [12:18<06:02, 10.11it/s][A
 66%|██████▌   | 7063/10730 [12:18<06:01, 10.14it/s][A
 66%|██████▌   | 7065/10730 [12:18<05:37, 10.86i

 68%|██████▊   | 7309/10730 [12:44<08:46,  6.50it/s][A
 68%|██████▊   | 7310/10730 [12:44<07:54,  7.21it/s][A
 68%|██████▊   | 7312/10730 [12:44<07:50,  7.26it/s][A
 68%|██████▊   | 7313/10730 [12:44<07:44,  7.36it/s][A
 68%|██████▊   | 7315/10730 [12:44<06:55,  8.22it/s][A
 68%|██████▊   | 7317/10730 [12:44<06:10,  9.22it/s][A
 68%|██████▊   | 7319/10730 [12:45<06:03,  9.38it/s][A
 68%|██████▊   | 7321/10730 [12:45<05:51,  9.69it/s][A
 68%|██████▊   | 7323/10730 [12:45<05:47,  9.79it/s][A
 68%|██████▊   | 7325/10730 [12:45<06:54,  8.22it/s][A
 68%|██████▊   | 7326/10730 [12:45<07:08,  7.95it/s][A
 68%|██████▊   | 7327/10730 [12:46<08:39,  6.56it/s][A
 68%|██████▊   | 7329/10730 [12:46<07:44,  7.32it/s][A
 68%|██████▊   | 7331/10730 [12:46<06:57,  8.13it/s][A
 68%|██████▊   | 7333/10730 [12:46<06:03,  9.34it/s][A
 68%|██████▊   | 7335/10730 [12:46<05:53,  9.60it/s][A
 68%|██████▊   | 7337/10730 [12:47<07:07,  7.95it/s][A
 68%|██████▊   | 7338/10730 [12:47<07:03,  8.01i

 70%|███████   | 7550/10730 [13:10<06:18,  8.41it/s][A
 70%|███████   | 7551/10730 [13:10<06:34,  8.07it/s][A
 70%|███████   | 7552/10730 [13:10<07:33,  7.00it/s][A
 70%|███████   | 7553/10730 [13:11<06:53,  7.68it/s][A
 70%|███████   | 7554/10730 [13:11<07:56,  6.66it/s][A
 70%|███████   | 7556/10730 [13:11<06:48,  7.77it/s][A
 70%|███████   | 7557/10730 [13:11<06:49,  7.76it/s][A
 70%|███████   | 7559/10730 [13:11<05:47,  9.12it/s][A
 70%|███████   | 7561/10730 [13:12<06:49,  7.74it/s][A
 70%|███████   | 7563/10730 [13:12<05:37,  9.37it/s][A
 71%|███████   | 7565/10730 [13:12<06:06,  8.63it/s][A
 71%|███████   | 7567/10730 [13:12<06:18,  8.35it/s][A
 71%|███████   | 7569/10730 [13:12<05:37,  9.35it/s][A
 71%|███████   | 7571/10730 [13:13<06:29,  8.11it/s][A
 71%|███████   | 7572/10730 [13:13<06:36,  7.97it/s][A
 71%|███████   | 7573/10730 [13:13<06:28,  8.12it/s][A
 71%|███████   | 7575/10730 [13:13<05:52,  8.95it/s][A
 71%|███████   | 7577/10730 [13:13<05:05, 10.33i

 73%|███████▎  | 7804/10730 [13:37<06:10,  7.90it/s][A
 73%|███████▎  | 7805/10730 [13:37<05:52,  8.29it/s][A
 73%|███████▎  | 7806/10730 [13:38<05:55,  8.23it/s][A
 73%|███████▎  | 7808/10730 [13:38<06:03,  8.05it/s][A
 73%|███████▎  | 7810/10730 [13:38<05:20,  9.11it/s][A
 73%|███████▎  | 7812/10730 [13:38<04:59,  9.74it/s][A
 73%|███████▎  | 7814/10730 [13:38<04:52,  9.98it/s][A
 73%|███████▎  | 7816/10730 [13:38<04:34, 10.62it/s][A
 73%|███████▎  | 7818/10730 [13:39<04:28, 10.85it/s][A
 73%|███████▎  | 7820/10730 [13:39<04:45, 10.21it/s][A
 73%|███████▎  | 7822/10730 [13:39<04:20, 11.17it/s][A
 73%|███████▎  | 7824/10730 [13:39<05:17,  9.14it/s][A
 73%|███████▎  | 7826/10730 [13:40<05:19,  9.08it/s][A
 73%|███████▎  | 7827/10730 [13:40<06:36,  7.33it/s][A
 73%|███████▎  | 7829/10730 [13:40<06:00,  8.06it/s][A
 73%|███████▎  | 7830/10730 [13:40<05:44,  8.42it/s][A
 73%|███████▎  | 7832/10730 [13:40<05:07,  9.41it/s][A
 73%|███████▎  | 7834/10730 [13:40<04:45, 10.15i

 75%|███████▌  | 8060/10730 [14:04<03:56, 11.30it/s][A
 75%|███████▌  | 8062/10730 [14:05<04:10, 10.64it/s][A
 75%|███████▌  | 8064/10730 [14:05<05:07,  8.66it/s][A
 75%|███████▌  | 8065/10730 [14:05<05:24,  8.22it/s][A
 75%|███████▌  | 8066/10730 [14:05<06:09,  7.21it/s][A
 75%|███████▌  | 8067/10730 [14:05<06:04,  7.30it/s][A
 75%|███████▌  | 8068/10730 [14:06<05:36,  7.91it/s][A
 75%|███████▌  | 8069/10730 [14:06<05:16,  8.40it/s][A
 75%|███████▌  | 8071/10730 [14:06<04:31,  9.78it/s][A
 75%|███████▌  | 8073/10730 [14:06<04:19, 10.25it/s][A
 75%|███████▌  | 8075/10730 [14:06<04:13, 10.48it/s][A
 75%|███████▌  | 8077/10730 [14:06<05:03,  8.75it/s][A
 75%|███████▌  | 8079/10730 [14:07<04:45,  9.28it/s][A
 75%|███████▌  | 8081/10730 [14:07<05:00,  8.81it/s][A
 75%|███████▌  | 8083/10730 [14:07<04:47,  9.22it/s][A
 75%|███████▌  | 8085/10730 [14:07<04:31,  9.74it/s][A
 75%|███████▌  | 8087/10730 [14:08<05:05,  8.66it/s][A
 75%|███████▌  | 8088/10730 [14:08<04:56,  8.90i

 77%|███████▋  | 8308/10730 [14:32<03:56, 10.24it/s][A
 77%|███████▋  | 8310/10730 [14:32<04:08,  9.74it/s][A
 77%|███████▋  | 8312/10730 [14:32<04:09,  9.70it/s][A
 77%|███████▋  | 8314/10730 [14:33<03:54, 10.31it/s][A
 78%|███████▊  | 8316/10730 [14:33<04:41,  8.58it/s][A
 78%|███████▊  | 8317/10730 [14:33<05:50,  6.89it/s][A
 78%|███████▊  | 8318/10730 [14:33<05:24,  7.44it/s][A
 78%|███████▊  | 8319/10730 [14:33<05:10,  7.77it/s][A
 78%|███████▊  | 8321/10730 [14:34<04:37,  8.69it/s][A
 78%|███████▊  | 8322/10730 [14:34<04:30,  8.91it/s][A
 78%|███████▊  | 8323/10730 [14:34<05:03,  7.93it/s][A
 78%|███████▊  | 8324/10730 [14:34<05:15,  7.63it/s][A
 78%|███████▊  | 8325/10730 [14:34<05:12,  7.69it/s][A
 78%|███████▊  | 8326/10730 [14:34<04:59,  8.04it/s][A
 78%|███████▊  | 8327/10730 [14:34<05:33,  7.20it/s][A
 78%|███████▊  | 8328/10730 [14:34<05:15,  7.61it/s][A
 78%|███████▊  | 8329/10730 [14:35<05:01,  7.95it/s][A
 78%|███████▊  | 8331/10730 [14:35<04:57,  8.08i

 80%|███████▉  | 8547/10730 [14:59<04:12,  8.64it/s][A
 80%|███████▉  | 8549/10730 [14:59<03:52,  9.37it/s][A
 80%|███████▉  | 8551/10730 [15:00<03:48,  9.52it/s][A
 80%|███████▉  | 8552/10730 [15:00<04:49,  7.52it/s][A
 80%|███████▉  | 8553/10730 [15:00<05:20,  6.80it/s][A
 80%|███████▉  | 8554/10730 [15:00<04:54,  7.39it/s][A
 80%|███████▉  | 8556/10730 [15:00<04:20,  8.36it/s][A
 80%|███████▉  | 8558/10730 [15:00<04:24,  8.23it/s][A
 80%|███████▉  | 8559/10730 [15:01<04:12,  8.60it/s][A
 80%|███████▉  | 8561/10730 [15:01<04:03,  8.90it/s][A
 80%|███████▉  | 8562/10730 [15:01<04:16,  8.44it/s][A
 80%|███████▉  | 8563/10730 [15:01<04:21,  8.28it/s][A
 80%|███████▉  | 8565/10730 [15:01<03:56,  9.15it/s][A
 80%|███████▉  | 8567/10730 [15:01<04:10,  8.62it/s][A
 80%|███████▉  | 8569/10730 [15:02<04:06,  8.76it/s][A
 80%|███████▉  | 8571/10730 [15:02<03:53,  9.25it/s][A
 80%|███████▉  | 8573/10730 [15:02<03:26, 10.42it/s][A
 80%|███████▉  | 8575/10730 [15:02<03:38,  9.87i

 82%|████████▏ | 8784/10730 [15:26<03:28,  9.31it/s][A
 82%|████████▏ | 8786/10730 [15:26<03:20,  9.70it/s][A
 82%|████████▏ | 8788/10730 [15:26<03:22,  9.59it/s][A
 82%|████████▏ | 8790/10730 [15:26<03:30,  9.20it/s][A
 82%|████████▏ | 8792/10730 [15:26<03:10, 10.19it/s][A
 82%|████████▏ | 8794/10730 [15:27<03:02, 10.62it/s][A
 82%|████████▏ | 8796/10730 [15:27<02:53, 11.14it/s][A
 82%|████████▏ | 8798/10730 [15:27<02:38, 12.16it/s][A
 82%|████████▏ | 8800/10730 [15:27<02:29, 12.87it/s][A
 82%|████████▏ | 8802/10730 [15:27<02:32, 12.65it/s][A
 82%|████████▏ | 8804/10730 [15:27<02:41, 11.93it/s][A
 82%|████████▏ | 8806/10730 [15:28<03:22,  9.50it/s][A
 82%|████████▏ | 8808/10730 [15:28<03:04, 10.41it/s][A
 82%|████████▏ | 8810/10730 [15:28<02:54, 11.00it/s][A
 82%|████████▏ | 8812/10730 [15:28<03:01, 10.54it/s][A
 82%|████████▏ | 8814/10730 [15:28<02:54, 10.96it/s][A
 82%|████████▏ | 8816/10730 [15:29<02:54, 10.99it/s][A
 82%|████████▏ | 8818/10730 [15:29<02:51, 11.15i

 84%|████████▍ | 9039/10730 [15:53<02:42, 10.39it/s][A
 84%|████████▍ | 9041/10730 [15:53<02:50,  9.91it/s][A
 84%|████████▍ | 9043/10730 [15:53<02:49,  9.98it/s][A
 84%|████████▍ | 9045/10730 [15:53<02:32, 11.08it/s][A
 84%|████████▍ | 9047/10730 [15:54<02:33, 10.95it/s][A
 84%|████████▍ | 9049/10730 [15:54<02:39, 10.55it/s][A
 84%|████████▍ | 9051/10730 [15:54<02:22, 11.74it/s][A
 84%|████████▍ | 9053/10730 [15:54<02:37, 10.63it/s][A
 84%|████████▍ | 9055/10730 [15:54<02:36, 10.68it/s][A
 84%|████████▍ | 9057/10730 [15:55<02:40, 10.42it/s][A
 84%|████████▍ | 9059/10730 [15:55<02:32, 10.93it/s][A
 84%|████████▍ | 9061/10730 [15:55<02:24, 11.59it/s][A
 84%|████████▍ | 9063/10730 [15:55<02:14, 12.38it/s][A
 84%|████████▍ | 9065/10730 [15:55<02:33, 10.84it/s][A
 85%|████████▍ | 9067/10730 [15:55<02:34, 10.77it/s][A
 85%|████████▍ | 9069/10730 [15:56<02:35, 10.69it/s][A
 85%|████████▍ | 9071/10730 [15:56<02:33, 10.77it/s][A
 85%|████████▍ | 9073/10730 [15:56<03:07,  8.83i

 87%|████████▋ | 9305/10730 [16:20<02:44,  8.69it/s][A
 87%|████████▋ | 9306/10730 [16:21<02:43,  8.70it/s][A
 87%|████████▋ | 9307/10730 [16:21<02:58,  7.95it/s][A
 87%|████████▋ | 9308/10730 [16:21<02:48,  8.44it/s][A
 87%|████████▋ | 9310/10730 [16:21<03:00,  7.85it/s][A
 87%|████████▋ | 9311/10730 [16:21<03:24,  6.92it/s][A
 87%|████████▋ | 9312/10730 [16:21<03:48,  6.21it/s][A
 87%|████████▋ | 9314/10730 [16:22<03:14,  7.30it/s][A
 87%|████████▋ | 9316/10730 [16:22<02:55,  8.07it/s][A
 87%|████████▋ | 9317/10730 [16:22<03:39,  6.43it/s][A
 87%|████████▋ | 9318/10730 [16:22<04:03,  5.79it/s][A
 87%|████████▋ | 9319/10730 [16:22<03:50,  6.12it/s][A
 87%|████████▋ | 9321/10730 [16:23<03:26,  6.82it/s][A
 87%|████████▋ | 9322/10730 [16:23<03:10,  7.40it/s][A
 87%|████████▋ | 9324/10730 [16:23<02:41,  8.72it/s][A
 87%|████████▋ | 9326/10730 [16:23<02:16, 10.29it/s][A
 87%|████████▋ | 9328/10730 [16:23<02:22,  9.83it/s][A
 87%|████████▋ | 9330/10730 [16:24<03:01,  7.72i

 89%|████████▉ | 9542/10730 [16:47<02:41,  7.38it/s][A
 89%|████████▉ | 9544/10730 [16:47<02:15,  8.78it/s][A
 89%|████████▉ | 9546/10730 [16:48<02:04,  9.53it/s][A
 89%|████████▉ | 9548/10730 [16:48<01:56, 10.17it/s][A
 89%|████████▉ | 9550/10730 [16:48<02:11,  8.98it/s][A
 89%|████████▉ | 9552/10730 [16:48<01:58,  9.96it/s][A
 89%|████████▉ | 9554/10730 [16:48<01:50, 10.65it/s][A
 89%|████████▉ | 9556/10730 [16:48<01:48, 10.82it/s][A
 89%|████████▉ | 9558/10730 [16:49<01:54, 10.27it/s][A
 89%|████████▉ | 9560/10730 [16:49<02:32,  7.68it/s][A
 89%|████████▉ | 9561/10730 [16:49<02:34,  7.55it/s][A
 89%|████████▉ | 9562/10730 [16:49<03:00,  6.45it/s][A
 89%|████████▉ | 9563/10730 [16:50<03:17,  5.90it/s][A
 89%|████████▉ | 9565/10730 [16:50<02:49,  6.87it/s][A
 89%|████████▉ | 9566/10730 [16:50<02:49,  6.85it/s][A
 89%|████████▉ | 9568/10730 [16:50<02:25,  7.96it/s][A
 89%|████████▉ | 9570/10730 [16:50<02:15,  8.54it/s][A
 89%|████████▉ | 9572/10730 [16:50<02:01,  9.54i

 91%|█████████▏| 9802/10730 [17:15<02:18,  6.68it/s][A
 91%|█████████▏| 9803/10730 [17:16<02:32,  6.09it/s][A
 91%|█████████▏| 9805/10730 [17:16<02:04,  7.43it/s][A
 91%|█████████▏| 9807/10730 [17:16<01:51,  8.29it/s][A
 91%|█████████▏| 9809/10730 [17:16<01:36,  9.50it/s][A
 91%|█████████▏| 9811/10730 [17:16<01:32,  9.95it/s][A
 91%|█████████▏| 9813/10730 [17:17<01:33,  9.82it/s][A
 91%|█████████▏| 9815/10730 [17:17<01:33,  9.82it/s][A
 91%|█████████▏| 9817/10730 [17:17<01:27, 10.41it/s][A
 92%|█████████▏| 9819/10730 [17:17<01:34,  9.64it/s][A
 92%|█████████▏| 9821/10730 [17:17<01:40,  9.03it/s][A
 92%|█████████▏| 9822/10730 [17:18<01:49,  8.32it/s][A
 92%|█████████▏| 9824/10730 [17:18<01:36,  9.42it/s][A
 92%|█████████▏| 9826/10730 [17:18<01:37,  9.26it/s][A
 92%|█████████▏| 9827/10730 [17:18<01:47,  8.43it/s][A
 92%|█████████▏| 9828/10730 [17:18<01:51,  8.07it/s][A
 92%|█████████▏| 9829/10730 [17:18<01:53,  7.95it/s][A
 92%|█████████▏| 9831/10730 [17:19<01:45,  8.53i

 93%|█████████▎| 10031/10730 [17:42<01:41,  6.87it/s][A
 93%|█████████▎| 10032/10730 [17:42<01:54,  6.08it/s][A
 94%|█████████▎| 10033/10730 [17:42<02:02,  5.67it/s][A
 94%|█████████▎| 10035/10730 [17:43<01:57,  5.93it/s][A
 94%|█████████▎| 10036/10730 [17:43<02:04,  5.57it/s][A
 94%|█████████▎| 10038/10730 [17:43<01:39,  6.97it/s][A
 94%|█████████▎| 10039/10730 [17:43<01:54,  6.01it/s][A
 94%|█████████▎| 10041/10730 [17:43<01:36,  7.17it/s][A
 94%|█████████▎| 10043/10730 [17:43<01:25,  8.04it/s][A
 94%|█████████▎| 10044/10730 [17:44<01:43,  6.62it/s][A
 94%|█████████▎| 10045/10730 [17:44<01:40,  6.81it/s][A
 94%|█████████▎| 10046/10730 [17:44<01:35,  7.18it/s][A
 94%|█████████▎| 10047/10730 [17:44<01:48,  6.27it/s][A
 94%|█████████▎| 10048/10730 [17:44<01:38,  6.95it/s][A
 94%|█████████▎| 10050/10730 [17:44<01:24,  8.04it/s][A
 94%|█████████▎| 10052/10730 [17:45<01:12,  9.30it/s][A
 94%|█████████▎| 10054/10730 [17:45<01:12,  9.37it/s][A
 94%|█████████▎| 10056/10730 [1

 96%|█████████▌| 10269/10730 [18:09<01:13,  6.24it/s][A
 96%|█████████▌| 10271/10730 [18:09<01:10,  6.51it/s][A
 96%|█████████▌| 10272/10730 [18:09<01:17,  5.90it/s][A
 96%|█████████▌| 10274/10730 [18:09<01:07,  6.79it/s][A
 96%|█████████▌| 10276/10730 [18:09<00:57,  7.95it/s][A
 96%|█████████▌| 10278/10730 [18:10<00:50,  8.95it/s][A
 96%|█████████▌| 10280/10730 [18:10<00:45,  9.88it/s][A
 96%|█████████▌| 10282/10730 [18:10<00:48,  9.18it/s][A
 96%|█████████▌| 10284/10730 [18:10<00:44, 10.11it/s][A
 96%|█████████▌| 10286/10730 [18:10<00:43, 10.18it/s][A
 96%|█████████▌| 10288/10730 [18:10<00:45,  9.76it/s][A
 96%|█████████▌| 10290/10730 [18:11<00:51,  8.54it/s][A
 96%|█████████▌| 10292/10730 [18:11<00:47,  9.32it/s][A
 96%|█████████▌| 10294/10730 [18:11<00:40, 10.78it/s][A
 96%|█████████▌| 10296/10730 [18:11<00:38, 11.26it/s][A
 96%|█████████▌| 10298/10730 [18:11<00:37, 11.46it/s][A
 96%|█████████▌| 10300/10730 [18:12<00:34, 12.36it/s][A
 96%|█████████▌| 10302/10730 [1

 98%|█████████▊| 10526/10730 [18:36<00:20,  9.78it/s][A
 98%|█████████▊| 10528/10730 [18:36<00:20, 10.08it/s][A
 98%|█████████▊| 10530/10730 [18:36<00:19, 10.41it/s][A
 98%|█████████▊| 10532/10730 [18:36<00:22,  8.91it/s][A
 98%|█████████▊| 10533/10730 [18:36<00:23,  8.34it/s][A
 98%|█████████▊| 10535/10730 [18:37<00:21,  9.09it/s][A
 98%|█████████▊| 10537/10730 [18:37<00:19, 10.02it/s][A
 98%|█████████▊| 10539/10730 [18:37<00:20,  9.28it/s][A
 98%|█████████▊| 10541/10730 [18:37<00:18, 10.20it/s][A
 98%|█████████▊| 10543/10730 [18:37<00:20,  9.35it/s][A
 98%|█████████▊| 10544/10730 [18:38<00:19,  9.34it/s][A
 98%|█████████▊| 10545/10730 [18:38<00:23,  8.02it/s][A
 98%|█████████▊| 10546/10730 [18:38<00:23,  7.97it/s][A
 98%|█████████▊| 10548/10730 [18:38<00:21,  8.41it/s][A
 98%|█████████▊| 10550/10730 [18:38<00:21,  8.42it/s][A
 98%|█████████▊| 10552/10730 [18:38<00:20,  8.64it/s][A
 98%|█████████▊| 10554/10730 [18:39<00:18,  9.68it/s][A
 98%|█████████▊| 10556/10730 [1

In [15]:
# Single-linkage agglomerative clustering over cosine distance. Clusters are
# cut at a fixed distance threshold instead of a fixed cluster count
# (n_clusters=None).
clustering_model = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=0.2476,
    linkage="single",
    affinity="cosine",
)
labels = clustering_model.fit(embeds).labels_

# Row position -> url, following the iteration order of url2record.
# NOTE(review): assumes the rows of `embeds` follow that same order -- confirm.
id2url = dict(enumerate(url for url, _ in url2record.items()))

# url -> cluster label assigned to the document at that position.
url2label = {id2url[idx]: cluster for idx, cluster in enumerate(labels)}

In [22]:
# For every annotated url pair, compare the predicted "same cluster" decision
# with the markup target and dump the disagreements to bad.txt for inspection.
targets = []
predictions = []
# Explicit UTF-8: the titles may contain non-ASCII text and the default
# encoding of open() is platform-dependent.
with open('bad.txt', 'w', encoding='utf-8') as f:
    for (first_url, second_url), target in markup.items():
        # 1 when both urls received the same cluster label, otherwise 0.
        prediction = int(url2label[first_url] == url2label[second_url])
        first = url2record.get(first_url)
        second = url2record.get(second_url)
        targets.append(target)
        predictions.append(prediction)
        if target != prediction:
            f.write(first_url + '\n')
            f.write(second_url + '\n')
            # Reuse the records fetched above instead of looking them up again.
            f.write(first['title'] + '\n')
            f.write(second['title'] + '\n')
            f.write(f'target = {target}\nprediction = {prediction}\n')
            f.write(30*'-' + '\n')

'26 новых случаев коронавируса выявили в Мытищах'

In [None]:
def get_quality(dist_threshold, print_result=False):
    """Cluster `embeds` at the given distance threshold and score the result.

    Reads the notebook-level `embeds`, `url2record` and `markup` variables and
    delegates the scoring to `calc_metrics`.

    Args:
        dist_threshold: value passed as `distance_threshold` to
            AgglomerativeClustering (single linkage, cosine affinity).
        print_result: when true, print whatever `calc_metrics` returns and
            return None instead of a score.

    Returns:
        The 'f1-score' entry of the 'macro avg' section of the metrics
        returned by `calc_metrics(..., output_dict=True)`, or None when
        `print_result` is true.
    """
    model = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=dist_threshold,
        linkage="single",
        affinity="cosine",
    )
    cluster_ids = model.fit(embeds).labels_

    # Row position -> url, in url2record iteration order.
    # NOTE(review): assumes `embeds` rows follow that same order -- confirm.
    id2url = dict(enumerate(url for url, _ in url2record.items()))
    url2label = {id2url[pos]: cluster for pos, cluster in enumerate(cluster_ids)}

    if not print_result:
        report = calc_metrics(markup, url2label, url2record, output_dict=True)
        return report['macro avg']['f1-score']

    print(calc_metrics(markup, url2label, url2record))
    return None

In [None]:
# Coarse sweep: score 70 log-spaced distance thresholds (exponents -5 .. 0).
domain = np.logspace(start=-5, stop=0, num=70)
quals = list(map(get_quality, tqdm.tqdm(domain, total=70)))

In [None]:
import matplotlib.pyplot as plt

In [None]:
# `domain` is log-spaced, so a logarithmic x-axis keeps the small thresholds
# from collapsing onto the left edge of the figure.
plt.plot(domain, quals)
plt.xscale('log')
plt.xlabel('distance threshold')
plt.ylabel('macro avg f1-score')
plt.title('Coarse threshold sweep');

In [None]:
# Fine sweep: 70 evenly spaced thresholds around the best coarse threshold,
# spanning five coarse grid points on either side of it.
best_idx = int(np.argmax(quals))
# Clamp the window to the grid: a negative index would silently wrap around to
# the end of `domain`, and an index past the end would raise IndexError.
lo_idx = max(best_idx - 5, 0)
hi_idx = min(best_idx + 5, len(domain) - 1)
closer_domain = np.linspace(domain[lo_idx], domain[hi_idx], 70)
closer_quals = [get_quality(dist) for dist in tqdm.tqdm(closer_domain, total=70)]

In [None]:
# Quality over the fine (linearly spaced) threshold grid.
plt.plot(closer_domain, closer_quals)
plt.xlabel('distance threshold')
plt.ylabel('macro avg f1-score')
plt.title('Fine threshold sweep');

In [None]:
# Threshold from the fine sweep with the highest quality score
# (np.argmax returns the first maximum if there are ties).
best_dist = closer_domain[np.argmax(closer_quals)]

In [None]:
# Display the selected distance threshold.
best_dist

In [60]:
# Print the full metrics report at the selected threshold
# (get_quality returns None in this mode, so the cell shows only the printout).
get_quality(best_dist, print_result=True)

              precision    recall  f1-score   support

           0       0.88      0.82      0.85      1571
           1       0.77      0.85      0.81      1130

    accuracy                           0.83      2701
   macro avg       0.83      0.83      0.83      2701
weighted avg       0.84      0.83      0.83      2701



### Result

Setup: 40k checkpoint; input text = title + ' ' + text; mode = MeanSum; distance threshold dist = 0.187

In [38]:
# Print the full metrics report for the configuration described above
# (get_quality returns None in this mode, so the cell shows only the printout).
get_quality(best_dist, print_result=True)

              precision    recall  f1-score   support

           0       0.87      0.84      0.85      1571
           1       0.78      0.83      0.81      1130

    accuracy                           0.83      2701
   macro avg       0.83      0.83      0.83      2701
weighted avg       0.84      0.83      0.84      2701

