In [2]:
import os, sys
import glob, json
import pandas as pd
from tqdm import tqdm

# Add repo
git_dir = os.path.abspath('../')
sys.path.append(os.path.join(git_dir, 'lib', 'utils'))

# Define data path
data_path = os.path.join(git_dir, 'data', 'hansard-parsed')

# Get all json files
data_path_senate = os.path.join(data_path, 'senate')
data_path_hofreps = os.path.join(data_path, 'hofreps')

json_files_senate = [y for x in os.walk(data_path_senate) for y in glob.glob(os.path.join(x[0], '*.json'))]
json_files_hofreps = [y for x in os.walk(data_path_hofreps) for y in glob.glob(os.path.join(x[0], '*.json'))]

def load_jsons(json_list):
    data = []
    for js_path in tqdm(json_list):
        with open(js_path) as f:
            data += json.load(f)
            
    return data

In [3]:
# Create single dataframe
df_senate = pd.DataFrame(load_jsons(json_files_senate))
df_senate = df_senate.dropna(subset=['sentences'])
df_senate['chamber'] = 'Senate'


df_hofreps = pd.DataFrame(load_jsons(json_files_hofreps))
df_hofreps = df_hofreps.dropna(subset=['sentences'])
df_hofreps['chamber'] = 'HouseOfReps'

df = df_senate.append(df_hofreps).reset_index(drop=True).copy()

100%|██████████| 485/485 [00:01<00:00, 270.38it/s]
100%|██████████| 562/562 [00:02<00:00, 205.44it/s]


In [3]:
# Drop entries without sentences
print(len(df))
df = df.dropna(subset=['sentences'])
print(len(df))

163428
161961


In [4]:
def batch_gen(iterable, n=1):
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]

In [5]:
# Add repo
git_dir = os.path.abspath('../')
sys.path.append(os.path.join(git_dir, 'lib', 'GoEmotions-pytorch') )
sys.path.append(os.path.join(git_dir, 'lib', 'utils') )
from transformers import BertTokenizer
from model import BertForMultiLabelClassification
from multilabel_pipeline import MultiLabelPipeline
import re
from tqdm import tqdm
import numpy as np 
import pandas as pd
import pickle

tokenizer = BertTokenizer.from_pretrained("monologg/bert-base-cased-goemotions-ekman")
model = BertForMultiLabelClassification.from_pretrained("monologg/bert-base-cased-goemotions-ekman")
model = model.to('cuda:0')

goemotions = MultiLabelPipeline(
    model=model,
    tokenizer=tokenizer,
    threshold=0.3,
    device=0
)


In [6]:
processed_sentence_lists = []
for ind, sentence_list_raw in enumerate(tqdm(df_senate['sentences'])):
    processed_sentences = []
    sentence_list = []
    for sentence_ind, sentence in enumerate(sentence_list_raw):
        tokens = tokenizer(sentence)
        if len(tokens['input_ids'])>500:
            print('Sentence %d in document %d is too long'%(sentence_ind, ind ))
        else:
            sentence_list.append(sentence)
    
    for batch in batch_gen(sentence_list, n=32):
        processed_sentences.extend(goemotions(batch))
        #processed_sentences.append(goemotions(sentence)[0])
            #ts['emotions'].append(goemotions(sentence)[0])
    processed_sentence_lists.append(processed_sentences)
        

  0%|          | 259/75610 [00:09<28:17, 44.38it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (515 > 512). Running this sequence through the model will result in indexing errors
  0%|          | 268/75610 [00:09<27:40, 45.38it/s]

Sentence 0 in document 261 is too long


  0%|          | 322/75610 [00:14<2:40:33,  7.82it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (567 > 512). Running this sequence through the model will result in indexing errors
  0%|          | 324/75610 [00:14<2:25:23,  8.63it/s]

Sentence 14 in document 323 is too long


  2%|▏         | 1374/75610 [00:57<38:34, 32.07it/s]  

Sentence 4 in document 1379 is too long


  3%|▎         | 1934/75610 [01:15<27:12, 45.14it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (583 > 512). Running this sequence through the model will result in indexing errors


Sentence 3 in document 1937 is too long


  3%|▎         | 2122/75610 [01:24<1:06:47, 18.34it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1452 > 512). Running this sequence through the model will result in indexing errors
  3%|▎         | 2130/75610 [01:24<51:46, 23.65it/s]  

Sentence 1 in document 2122 is too long


  3%|▎         | 2422/75610 [01:38<2:24:16,  8.46it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1745 > 512). Running this sequence through the model will result in indexing errors
  3%|▎         | 2436/75610 [01:38<1:19:50, 15.28it/s]

Sentence 5 in document 2422 is too long


  4%|▍         | 2893/75610 [01:56<2:07:49,  9.48it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1543 > 512). Running this sequence through the model will result in indexing errors
  4%|▍         | 2903/75610 [01:57<1:17:12, 15.69it/s]

Sentence 1 in document 2894 is too long


  4%|▍         | 3215/75610 [02:10<31:24, 38.43it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (1939 > 512). Running this sequence through the model will result in indexing errors
  4%|▍         | 3230/75610 [02:10<24:19, 49.60it/s]

Sentence 1 in document 3221 is too long


  4%|▍         | 3298/75610 [02:13<31:06, 38.73it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (2099 > 512). Running this sequence through the model will result in indexing errors
  4%|▍         | 3311/75610 [02:13<34:02, 35.41it/s]

Sentence 7 in document 3302 is too long


  5%|▌         | 4006/75610 [02:48<44:12, 27.00it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (545 > 512). Running this sequence through the model will result in indexing errors
  5%|▌         | 4019/75610 [02:48<33:07, 36.01it/s]

Sentence 0 in document 4010 is too long


  6%|▌         | 4169/75610 [02:58<1:27:57, 13.54it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1771 > 512). Running this sequence through the model will result in indexing errors
  6%|▌         | 4172/75610 [02:58<1:32:21, 12.89it/s]

Sentence 9 in document 4171 is too long


  6%|▌         | 4298/75610 [03:05<1:57:34, 10.11it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1555 > 512). Running this sequence through the model will result in indexing errors


Sentence 79 in document 4298 is too long


  8%|▊         | 5880/75610 [04:09<1:54:11, 10.18it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1047 > 512). Running this sequence through the model will result in indexing errors
  8%|▊         | 5885/75610 [04:09<1:27:08, 13.34it/s]

Sentence 1 in document 5882 is too long


  9%|▊         | 6558/75610 [04:34<43:42, 26.33it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (1791 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1756 > 512). Running this sequence through the model will result in indexing errors
  9%|▊         | 6563/75610 [04:34<58:35, 19.64it/s]

Sentence 18 in document 6562 is too long
Sentence 54 in document 6562 is too long


  9%|▉         | 6797/75610 [04:44<33:22, 34.36it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (542 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (613 > 512). Running this sequence through the model will result in indexing errors
  9%|▉         | 6806/75610 [04:44<27:17, 42.02it/s]

Sentence 0 in document 6802 is too long
Sentence 0 in document 6804 is too long


 10%|█         | 7620/75610 [05:13<28:18, 40.04it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (1786 > 512). Running this sequence through the model will result in indexing errors
 10%|█         | 7629/75610 [05:13<25:23, 44.61it/s]

Sentence 10 in document 7628 is too long


 10%|█         | 7733/75610 [05:19<30:44, 36.80it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (688 > 512). Running this sequence through the model will result in indexing errors
 10%|█         | 7747/75610 [05:20<27:59, 40.41it/s]

Sentence 0 in document 7737 is too long


 12%|█▏        | 8949/75610 [06:11<33:44, 32.92it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (573 > 512). Running this sequence through the model will result in indexing errors
 12%|█▏        | 8954/75610 [06:11<43:14, 25.69it/s]

Sentence 13 in document 8953 is too long


 12%|█▏        | 9287/75610 [06:26<20:59, 52.66it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (565 > 512). Running this sequence through the model will result in indexing errors
 12%|█▏        | 9294/75610 [06:26<22:33, 49.01it/s]

Sentence 0 in document 9289 is too long


 13%|█▎        | 9593/75610 [06:34<18:37, 59.10it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (563 > 512). Running this sequence through the model will result in indexing errors
 13%|█▎        | 9601/75610 [06:35<26:20, 41.77it/s]

Sentence 12 in document 9596 is too long


 13%|█▎        | 10175/75610 [06:54<23:38, 46.12it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (586 > 512). Running this sequence through the model will result in indexing errors
 13%|█▎        | 10184/75610 [06:54<20:39, 52.76it/s]

Sentence 0 in document 10180 is too long


 14%|█▍        | 10693/75610 [07:14<47:04, 22.98it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (1485 > 512). Running this sequence through the model will result in indexing errors
 14%|█▍        | 10697/75610 [07:14<49:48, 21.72it/s]

Sentence 39 in document 10696 is too long


 15%|█▍        | 11023/75610 [07:28<57:18, 18.79it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (1207 > 512). Running this sequence through the model will result in indexing errors
 15%|█▍        | 11029/75610 [07:29<2:08:47,  8.36it/s]

Sentence 1 in document 11024 is too long


 15%|█▍        | 11082/75610 [07:30<27:51, 38.61it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (1763 > 512). Running this sequence through the model will result in indexing errors


Sentence 8 in document 11084 is too long


 15%|█▍        | 11134/75610 [07:33<1:20:50, 13.29it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (529 > 512). Running this sequence through the model will result in indexing errors


Sentence 4 in document 11135 is too long


 15%|█▍        | 11151/75610 [07:34<50:59, 21.07it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors
 15%|█▍        | 11171/75610 [07:34<31:14, 34.38it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (646 > 512). Running this sequence through the model will result in indexing errors


Sentence 0 in document 11161 is too long
Sentence 2 in document 11175 is too long


 15%|█▍        | 11216/75610 [07:36<48:08, 22.30it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (516 > 512). Running this sequence through the model will result in indexing errors


Sentence 13 in document 11218 is too long


 15%|█▌        | 11407/75610 [07:44<22:49, 46.87it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (1564 > 512). Running this sequence through the model will result in indexing errors
 15%|█▌        | 11415/75610 [07:44<25:21, 42.19it/s]

Sentence 6 in document 11410 is too long


 15%|█▌        | 11706/75610 [07:56<21:01, 50.65it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (590 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (555 > 512). Running this sequence through the model will result in indexing errors


Sentence 16 in document 11708 is too long
Sentence 19 in document 11708 is too long


 16%|█▌        | 11947/75610 [08:04<26:02, 40.75it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (564 > 512). Running this sequence through the model will result in indexing errors
 16%|█▌        | 11965/75610 [08:04<18:53, 56.17it/s]

Sentence 0 in document 11952 is too long


 16%|█▋        | 12301/75610 [08:12<43:04, 24.50it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (563 > 512). Running this sequence through the model will result in indexing errors


Sentence 357 in document 12302 is too long


 17%|█▋        | 12533/75610 [08:22<56:22, 18.65it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (1736 > 512). Running this sequence through the model will result in indexing errors
 17%|█▋        | 12538/75610 [08:22<1:01:33, 17.08it/s]

Sentence 19 in document 12536 is too long


 17%|█▋        | 12980/75610 [08:40<25:44, 40.55it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (515 > 512). Running this sequence through the model will result in indexing errors
 17%|█▋        | 12986/75610 [08:40<29:11, 35.76it/s]

Sentence 0 in document 12981 is too long


 17%|█▋        | 13042/75610 [08:44<1:15:12, 13.87it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1373 > 512). Running this sequence through the model will result in indexing errors
 17%|█▋        | 13057/75610 [08:44<44:04, 23.66it/s]  

Sentence 8 in document 13042 is too long


 18%|█▊        | 13406/75610 [08:58<27:35, 37.56it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (1608 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1538 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (3953 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1608 > 512). Running this sequence through the model will result in indexing errors
 18%|█▊        | 13413/75610 [08:58<31:49, 32.57it/s]

Sentence 7 in document 13412 is too long
Sentence 9 in document 13412 is too long
Sentence 38 in document 13412 is too long
Sentence 46 in document 13412 is too long


 18%|█▊        | 13708/75610 [09:11<44:34, 23.14it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (664 > 512). Running this sequence through the model will result in indexing errors
 18%|█▊        | 13712/75610 [09:11<44:56, 22.95it/s]

Sentence 0 in document 13710 is too long


 19%|█▉        | 14240/75610 [09:35<1:37:01, 10.54it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1639 > 512). Running this sequence through the model will result in indexing errors
 19%|█▉        | 14244/75610 [09:35<1:15:55, 13.47it/s]

Sentence 3 in document 14240 is too long


 19%|█▉        | 14701/75610 [09:51<59:46, 16.98it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (604 > 512). Running this sequence through the model will result in indexing errors
 19%|█▉        | 14704/75610 [09:51<1:02:33, 16.22it/s]

Sentence 8 in document 14701 is too long


 20%|██        | 15435/75610 [10:26<1:45:22,  9.52it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (651 > 512). Running this sequence through the model will result in indexing errors
 20%|██        | 15437/75610 [10:26<1:45:37,  9.49it/s]

Sentence 13 in document 15435 is too long


 20%|██        | 15444/75610 [10:26<1:21:30, 12.30it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors
 20%|██        | 15460/75610 [10:27<48:09, 20.82it/s]  

Sentence 0 in document 15449 is too long


 21%|██        | 15697/75610 [10:39<1:25:26, 11.69it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1750 > 512). Running this sequence through the model will result in indexing errors
 21%|██        | 15700/75610 [10:39<1:14:21, 13.43it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1615 > 512). Running this sequence through the model will result in indexing errors
 21%|██        | 15708/75610 [10:39<56:26, 17.69it/s]  

Sentence 7 in document 15698 is too long
Sentence 1 in document 15707 is too long


 22%|██▏       | 16647/75610 [11:11<38:28, 25.54it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (923 > 512). Running this sequence through the model will result in indexing errors
 22%|██▏       | 16664/75610 [11:11<26:22, 37.24it/s]

Sentence 0 in document 16655 is too long


 22%|██▏       | 16987/75610 [11:22<25:00, 39.07it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (691 > 512). Running this sequence through the model will result in indexing errors
 22%|██▏       | 16999/75610 [11:22<26:20, 37.08it/s]

Sentence 0 in document 16992 is too long


 23%|██▎       | 17083/75610 [11:26<44:59, 21.68it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (724 > 512). Running this sequence through the model will result in indexing errors


Sentence 7 in document 17083 is too long


 23%|██▎       | 17498/75610 [11:41<50:44, 19.09it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (627 > 512). Running this sequence through the model will result in indexing errors
 23%|██▎       | 17502/75610 [11:41<44:08, 21.94it/s]

Sentence 0 in document 17501 is too long


 23%|██▎       | 17579/75610 [11:44<34:34, 27.97it/s]  

Sentence 0 in document 17564 is too long


 24%|██▍       | 18027/75610 [12:00<52:57, 18.12it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (534 > 512). Running this sequence through the model will result in indexing errors
 24%|██▍       | 18037/75610 [12:01<40:08, 23.90it/s]

Sentence 0 in document 18031 is too long


 24%|██▍       | 18354/75610 [12:11<2:00:17,  7.93it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (591 > 512). Running this sequence through the model will result in indexing errors
 24%|██▍       | 18373/75610 [12:12<1:04:46, 14.73it/s]

Sentence 0 in document 18360 is too long


 25%|██▍       | 18784/75610 [12:25<21:49, 43.39it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (1769 > 512). Running this sequence through the model will result in indexing errors
 25%|██▍       | 18796/75610 [12:26<20:20, 46.55it/s]

Sentence 21 in document 18785 is too long


 25%|██▌       | 19077/75610 [12:39<22:43, 41.46it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (1661 > 512). Running this sequence through the model will result in indexing errors
 25%|██▌       | 19083/75610 [12:39<28:47, 32.73it/s]

Sentence 9 in document 19080 is too long


 26%|██▋       | 19950/75610 [13:15<28:52, 32.13it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (557 > 512). Running this sequence through the model will result in indexing errors
 26%|██▋       | 19966/75610 [13:15<20:36, 44.99it/s]

Sentence 0 in document 19956 is too long


 27%|██▋       | 20224/75610 [13:27<28:56, 31.90it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (620 > 512). Running this sequence through the model will result in indexing errors
 27%|██▋       | 20241/75610 [13:27<19:49, 46.54it/s]

Sentence 0 in document 20228 is too long


 27%|██▋       | 20303/75610 [13:31<55:04, 16.74it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (551 > 512). Running this sequence through the model will result in indexing errors


Sentence 2 in document 20303 is too long


 28%|██▊       | 20831/75610 [13:53<23:42, 38.51it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (589 > 512). Running this sequence through the model will result in indexing errors
 28%|██▊       | 20844/75610 [13:53<20:36, 44.28it/s]

Sentence 2 in document 20833 is too long


 29%|██▉       | 21873/75610 [14:30<1:48:09,  8.28it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1606 > 512). Running this sequence through the model will result in indexing errors
 29%|██▉       | 21881/75610 [14:30<1:08:48, 13.01it/s]

Sentence 8 in document 21874 is too long


 29%|██▉       | 22083/75610 [14:38<44:17, 20.14it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (550 > 512). Running this sequence through the model will result in indexing errors
 29%|██▉       | 22096/75610 [14:38<29:32, 30.19it/s]

Sentence 0 in document 22086 is too long


 30%|██▉       | 22384/75610 [14:49<37:13, 23.83it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (564 > 512). Running this sequence through the model will result in indexing errors
 30%|██▉       | 22390/75610 [14:50<44:53, 19.76it/s]

Sentence 0 in document 22389 is too long


 30%|██▉       | 22554/75610 [14:58<47:34, 18.59it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (1792 > 512). Running this sequence through the model will result in indexing errors
 30%|██▉       | 22563/75610 [14:58<35:05, 25.20it/s]

Sentence 22 in document 22555 is too long


 30%|███       | 23036/75610 [15:19<41:18, 21.21it/s]  

Sentence 2 in document 23032 is too long


 31%|███▏      | 23728/75610 [15:46<12:34, 68.80it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (1645 > 512). Running this sequence through the model will result in indexing errors
 31%|███▏      | 23736/75610 [15:46<12:31, 69.03it/s]

Sentence 3 in document 23735 is too long


 32%|███▏      | 23905/75610 [15:56<2:02:26,  7.04it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (592 > 512). Running this sequence through the model will result in indexing errors


Sentence 19 in document 23906 is too long


 32%|███▏      | 24463/75610 [16:19<51:02, 16.70it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (1636 > 512). Running this sequence through the model will result in indexing errors
 32%|███▏      | 24466/75610 [16:19<1:06:27, 12.83it/s]

Sentence 68 in document 24465 is too long


 33%|███▎      | 24964/75610 [16:42<29:10, 28.93it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (603 > 512). Running this sequence through the model will result in indexing errors
 33%|███▎      | 24981/75610 [16:43<19:31, 43.23it/s]

Sentence 0 in document 24968 is too long
Sentence 0 in document 24969 is too long


 34%|███▍      | 25812/75610 [17:17<21:08, 39.26it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (657 > 512). Running this sequence through the model will result in indexing errors
 34%|███▍      | 25831/75610 [17:17<15:15, 54.38it/s]

Sentence 0 in document 25814 is too long


 34%|███▍      | 25899/75610 [17:22<1:21:01, 10.23it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (562 > 512). Running this sequence through the model will result in indexing errors
 34%|███▍      | 25918/75610 [17:23<44:12, 18.73it/s]  

Sentence 0 in document 25899 is too long


 34%|███▍      | 26017/75610 [17:26<1:02:20, 13.26it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (723 > 512). Running this sequence through the model will result in indexing errors
 34%|███▍      | 26020/75610 [17:26<58:28, 14.14it/s]  

Sentence 51 in document 26017 is too long


 35%|███▍      | 26285/75610 [17:41<1:55:38,  7.11it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1565 > 512). Running this sequence through the model will result in indexing errors
 35%|███▍      | 26299/75610 [17:41<1:03:45, 12.89it/s]

Sentence 1 in document 26289 is too long


 35%|███▍      | 26417/75610 [17:45<1:01:48, 13.26it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (579 > 512). Running this sequence through the model will result in indexing errors
 35%|███▍      | 26426/75610 [17:45<42:42, 19.19it/s]  

Sentence 0 in document 26420 is too long


 35%|███▌      | 26535/75610 [17:48<27:01, 30.27it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1667 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1651 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1616 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1629 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1582 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is

Sentence 3 in document 26537 is too long
Sentence 12 in document 26537 is too long
Sentence 18 in document 26537 is too long
Sentence 24 in document 26537 is too long
Sentence 30 in document 26537 is too long
Sentence 36 in document 26537 is too long
Sentence 41 in document 26537 is too long
Sentence 45 in document 26537 is too long
Sentence 53 in document 26537 is too long
Sentence 60 in document 26537 is too long
Sentence 66 in document 26537 is too long
Sentence 75 in document 26537 is too long


Token indices sequence length is longer than the specified maximum sequence length for this model (1599 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1604 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1645 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1612 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1571 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length fo

Sentence 101 in document 26537 is too long
Sentence 110 in document 26537 is too long
Sentence 117 in document 26537 is too long
Sentence 121 in document 26537 is too long
Sentence 124 in document 26537 is too long
Sentence 130 in document 26537 is too long
Sentence 136 in document 26537 is too long
Sentence 143 in document 26537 is too long
Sentence 149 in document 26537 is too long
Sentence 168 in document 26537 is too long
Sentence 178 in document 26537 is too long
Sentence 182 in document 26537 is too long
Sentence 186 in document 26537 is too long


Token indices sequence length is longer than the specified maximum sequence length for this model (1581 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1589 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1606 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1599 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1586 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length fo

Sentence 200 in document 26537 is too long
Sentence 204 in document 26537 is too long
Sentence 215 in document 26537 is too long
Sentence 224 in document 26537 is too long
Sentence 229 in document 26537 is too long
Sentence 236 in document 26537 is too long
Sentence 242 in document 26537 is too long
Sentence 251 in document 26537 is too long
Sentence 259 in document 26537 is too long


 35%|███▌      | 26559/75610 [17:51<1:07:33, 12.10it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (725 > 512). Running this sequence through the model will result in indexing errors
 35%|███▌      | 26561/75610 [17:51<1:01:48, 13.22it/s]

Sentence 10 in document 26559 is too long


 36%|███▌      | 27003/75610 [18:05<48:53, 16.57it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (1635 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1573 > 512). Running this sequence through the model will result in indexing errors


Sentence 23 in document 27004 is too long
Sentence 36 in document 27004 is too long


 36%|███▌      | 27115/75610 [18:10<18:33, 43.55it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (528 > 512). Running this sequence through the model will result in indexing errors
 36%|███▌      | 27122/75610 [18:10<16:46, 48.17it/s]

Sentence 0 in document 27115 is too long


 36%|███▌      | 27261/75610 [18:16<28:45, 28.02it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (687 > 512). Running this sequence through the model will result in indexing errors
 36%|███▌      | 27276/75610 [18:17<20:07, 40.04it/s]

Sentence 3 in document 27269 is too long


 36%|███▋      | 27514/75610 [18:25<37:51, 21.18it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (1856 > 512). Running this sequence through the model will result in indexing errors


Sentence 93 in document 27515 is too long


 37%|███▋      | 27624/75610 [18:29<13:51, 57.73it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (1661 > 512). Running this sequence through the model will result in indexing errors
 37%|███▋      | 27632/75610 [18:29<14:31, 55.05it/s]

Sentence 2 in document 27628 is too long


 37%|███▋      | 27806/75610 [18:37<16:34, 48.09it/s]  

Sentence 2 in document 27798 is too long


 38%|███▊      | 28594/75610 [19:13<34:48, 22.51it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (535 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (542 > 512). Running this sequence through the model will result in indexing errors
 38%|███▊      | 28603/75610 [19:13<41:26, 18.90it/s]

Sentence 3 in document 28597 is too long
Sentence 4 in document 28597 is too long


 38%|███▊      | 28843/75610 [19:22<17:10, 45.37it/s]  

Sentence 0 in document 28834 is too long


 38%|███▊      | 28875/75610 [19:24<1:18:37,  9.91it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1212 > 512). Running this sequence through the model will result in indexing errors
 38%|███▊      | 28886/75610 [19:25<51:04, 15.25it/s]  

Sentence 2 in document 28880 is too long


 38%|███▊      | 28899/75610 [19:25<32:30, 23.95it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (659 > 512). Running this sequence through the model will result in indexing errors


Sentence 3 in document 28899 is too long


 38%|███▊      | 29036/75610 [19:30<49:52, 15.56it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (613 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (525 > 512). Running this sequence through the model will result in indexing errors
 38%|███▊      | 29039/75610 [19:30<53:55, 14.39it/s]

Sentence 11 in document 29038 is too long
Sentence 26 in document 29038 is too long


 39%|███▉      | 29827/75610 [20:05<1:37:48,  7.80it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (556 > 512). Running this sequence through the model will result in indexing errors
 39%|███▉      | 29843/75610 [20:05<55:03, 13.85it/s]  

Sentence 0 in document 29834 is too long


 40%|███▉      | 29924/75610 [20:07<15:45, 48.32it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (841 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1643 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (695 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1272 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2125 > 512). Running this sequence through the model will result in indexing errors


Sentence 30 in document 29929 is too long
Sentence 33 in document 29929 is too long
Sentence 38 in document 29929 is too long
Sentence 39 in document 29929 is too long
Sentence 40 in document 29929 is too long


 40%|███▉      | 30066/75610 [20:16<1:27:48,  8.64it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (944 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (570 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1697 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1347 > 512). Running this sequence through the model will result in indexing errors


Sentence 14 in document 30067 is too long
Sentence 15 in document 30067 is too long
Sentence 21 in document 30067 is too long
Sentence 60 in document 30067 is too long


 40%|███▉      | 30068/75610 [20:16<2:10:41,  5.81it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (690 > 512). Running this sequence through the model will result in indexing errors
 40%|███▉      | 30070/75610 [20:17<1:55:36,  6.57it/s]

Sentence 1 in document 30069 is too long


 40%|███▉      | 30230/75610 [20:21<21:10, 35.71it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (782 > 512). Running this sequence through the model will result in indexing errors
 40%|███▉      | 30235/75610 [20:21<20:21, 37.15it/s]

Sentence 35 in document 30233 is too long


 40%|████      | 30291/75610 [20:24<27:37, 27.35it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (603 > 512). Running this sequence through the model will result in indexing errors


Sentence 28 in document 30291 is too long


 40%|████      | 30340/75610 [20:26<27:11, 27.75it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (769 > 512). Running this sequence through the model will result in indexing errors
 40%|████      | 30344/75610 [20:26<28:35, 26.39it/s]

Sentence 0 in document 30341 is too long


 40%|████      | 30616/75610 [20:38<1:07:25, 11.12it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (669 > 512). Running this sequence through the model will result in indexing errors


Sentence 21 in document 30619 is too long


 41%|████      | 30975/75610 [20:53<26:42, 27.85it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (942 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (554 > 512). Running this sequence through the model will result in indexing errors
 41%|████      | 30980/75610 [20:53<25:47, 28.85it/s]

Sentence 4 in document 30976 is too long
Sentence 2 in document 30978 is too long


 41%|████      | 30997/75610 [20:54<22:15, 33.41it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (522 > 512). Running this sequence through the model will result in indexing errors
 41%|████      | 31003/75610 [20:54<22:24, 33.17it/s]

Sentence 2 in document 30998 is too long


 42%|████▏     | 31390/75610 [21:07<39:41, 18.57it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1300 > 512). Running this sequence through the model will result in indexing errors
 42%|████▏     | 31401/75610 [21:08<27:24, 26.88it/s]

Sentence 9 in document 31392 is too long


 42%|████▏     | 31593/75610 [21:16<39:14, 18.70it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (1662 > 512). Running this sequence through the model will result in indexing errors
 42%|████▏     | 31603/75610 [21:16<36:32, 20.07it/s]

Sentence 0 in document 31597 is too long


 42%|████▏     | 31624/75610 [21:16<21:48, 33.61it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (578 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (633 > 512). Running this sequence through the model will result in indexing errors
 42%|████▏     | 31631/75610 [21:17<25:26, 28.81it/s]

Sentence 9 in document 31629 is too long
Sentence 18 in document 31629 is too long


 42%|████▏     | 31769/75610 [21:23<56:14, 12.99it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (603 > 512). Running this sequence through the model will result in indexing errors
 42%|████▏     | 31778/75610 [21:23<37:37, 19.42it/s]

Sentence 13 in document 31772 is too long


 42%|████▏     | 32023/75610 [21:33<16:32, 43.90it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (543 > 512). Running this sequence through the model will result in indexing errors
 42%|████▏     | 32036/75610 [21:33<14:36, 49.70it/s]

Sentence 0 in document 32027 is too long


 42%|████▏     | 32102/75610 [21:36<22:42, 31.93it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (540 > 512). Running this sequence through the model will result in indexing errors
 42%|████▏     | 32109/75610 [21:36<19:04, 38.02it/s]

Sentence 2 in document 32106 is too long


 42%|████▏     | 32122/75610 [21:36<16:36, 43.66it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (622 > 512). Running this sequence through the model will result in indexing errors
 42%|████▏     | 32131/75610 [21:36<14:09, 51.17it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (672 > 512). Running this sequence through the model will result in indexing errors
 43%|████▎     | 32139/75610 [21:37<12:54, 56.11it/s]

Sentence 0 in document 32125 is too long
Sentence 0 in document 32132 is too long
Sentence 2 in document 32138 is too long


 43%|████▎     | 32785/75610 [22:04<33:16, 21.45it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (516 > 512). Running this sequence through the model will result in indexing errors


Sentence 25 in document 32790 is too long


 44%|████▎     | 33033/75610 [22:16<27:06, 26.17it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (1108 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1665 > 512). Running this sequence through the model will result in indexing errors
 44%|████▎     | 33042/75610 [22:16<22:00, 32.24it/s]

Sentence 14 in document 33041 is too long
Sentence 15 in document 33041 is too long


 44%|████▍     | 33212/75610 [22:22<37:28, 18.86it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1795 > 512). Running this sequence through the model will result in indexing errors
 44%|████▍     | 33228/75610 [22:22<23:18, 30.30it/s]

Sentence 11 in document 33219 is too long


 44%|████▍     | 33248/75610 [22:24<1:04:15, 10.99it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1337 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (910 > 512). Running this sequence through the model will result in indexing errors
 44%|████▍     | 33253/75610 [22:24<55:47, 12.66it/s]  

Sentence 12 in document 33252 is too long
Sentence 13 in document 33252 is too long


 44%|████▍     | 33385/75610 [22:29<37:20, 18.85it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (686 > 512). Running this sequence through the model will result in indexing errors


Sentence 8 in document 33385 is too long


 44%|████▍     | 33443/75610 [22:31<17:10, 40.92it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (1257 > 512). Running this sequence through the model will result in indexing errors
 44%|████▍     | 33449/75610 [22:31<17:59, 39.07it/s]

Sentence 6 in document 33443 is too long


 44%|████▍     | 33624/75610 [22:36<15:44, 44.45it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (546 > 512). Running this sequence through the model will result in indexing errors
 44%|████▍     | 33636/75610 [22:37<15:55, 43.94it/s]

Sentence 6 in document 33629 is too long


 45%|████▍     | 33967/75610 [22:50<33:45, 20.55it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (7542 > 512). Running this sequence through the model will result in indexing errors
 45%|████▍     | 33976/75610 [22:50<26:23, 26.29it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (581 > 512). Running this sequence through the model will result in indexing errors


Sentence 17 in document 33975 is too long
Sentence 42 in document 33983 is too long


 45%|████▌     | 34160/75610 [22:59<22:13, 31.08it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (2583 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (3518 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (3895 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1031 > 512). Running this sequence through the model will result in indexing errors


Sentence 3 in document 34161 is too long
Sentence 46 in document 34161 is too long
Sentence 47 in document 34161 is too long
Sentence 48 in document 34161 is too long


 46%|████▌     | 34601/75610 [23:15<18:43, 36.51it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (836 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1338 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (586 > 512). Running this sequence through the model will result in indexing errors


Sentence 30 in document 34602 is too long
Sentence 39 in document 34602 is too long
Sentence 82 in document 34602 is too long


 46%|████▌     | 34690/75610 [23:17<13:46, 49.53it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (944 > 512). Running this sequence through the model will result in indexing errors
 46%|████▌     | 34697/75610 [23:18<25:12, 27.05it/s]

Sentence 2 in document 34695 is too long


 46%|████▌     | 34732/75610 [23:20<51:42, 13.17it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (1311 > 512). Running this sequence through the model will result in indexing errors
 46%|████▌     | 34736/75610 [23:21<43:46, 15.56it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (936 > 512). Running this sequence through the model will result in indexing errors


Sentence 21 in document 34733 is too long
Sentence 5 in document 34737 is too long


 46%|████▌     | 34793/75610 [23:22<15:18, 44.42it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (635 > 512). Running this sequence through the model will result in indexing errors
 46%|████▌     | 34800/75610 [23:22<13:53, 48.94it/s]

Sentence 2 in document 34798 is too long


 46%|████▌     | 34807/75610 [23:22<17:12, 39.50it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors
 46%|████▌     | 34822/75610 [23:22<13:26, 50.57it/s]

Sentence 1 in document 34812 is too long


 46%|████▌     | 34956/75610 [23:27<1:10:28,  9.61it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (604 > 512). Running this sequence through the model will result in indexing errors
 46%|████▋     | 34973/75610 [23:27<38:50, 17.44it/s]  

Sentence 5 in document 34957 is too long


 46%|████▋     | 34989/75610 [23:28<22:37, 29.93it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (784 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (535 > 512). Running this sequence through the model will result in indexing errors


Sentence 352 in document 34991 is too long
Sentence 468 in document 34991 is too long


 46%|████▋     | 35148/75610 [23:38<48:18, 13.96it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (582 > 512). Running this sequence through the model will result in indexing errors
 47%|████▋     | 35166/75610 [23:38<28:49, 23.38it/s]

Sentence 0 in document 35152 is too long


 48%|████▊     | 35997/75610 [24:07<48:23, 13.64it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (1326 > 512). Running this sequence through the model will result in indexing errors


Sentence 2 in document 36000 is too long


 48%|████▊     | 36115/75610 [24:13<1:04:46, 10.16it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (3768 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (956 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (944 > 512). Running this sequence through the model will result in indexing errors
 48%|████▊     | 36117/75610 [24:13<1:07:22,  9.77it/s]

Sentence 0 in document 36116 is too long
Sentence 13 in document 36116 is too long
Sentence 15 in document 36116 is too long


Token indices sequence length is longer than the specified maximum sequence length for this model (1171 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (573 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (808 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (552 > 512). Running this sequence through the model will result in indexing errors


Sentence 20 in document 36118 is too long
Sentence 25 in document 36118 is too long
Sentence 75 in document 36118 is too long
Sentence 103 in document 36118 is too long


 48%|████▊     | 36121/75610 [24:14<2:07:35,  5.16it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (664 > 512). Running this sequence through the model will result in indexing errors


Sentence 1 in document 36122 is too long
Sentence 14 in document 36122 is too long


 48%|████▊     | 36123/75610 [24:15<1:57:32,  5.60it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (532 > 512). Running this sequence through the model will result in indexing errors


Sentence 13 in document 36124 is too long


 48%|████▊     | 36125/75610 [24:15<1:49:45,  6.00it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1378 > 512). Running this sequence through the model will result in indexing errors
 48%|████▊     | 36129/75610 [24:15<1:22:03,  8.02it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2949 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (904 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1976 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (701 > 512). Running this sequence through the model will res

Sentence 13 in document 36126 is too long
Sentence 2 in document 36129 is too long
Sentence 5 in document 36129 is too long
Sentence 0 in document 36132 is too long
Sentence 10 in document 36132 is too long


 48%|████▊     | 36438/75610 [24:31<1:22:40,  7.90it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (570 > 512). Running this sequence through the model will result in indexing errors
 48%|████▊     | 36442/75610 [24:32<1:08:17,  9.56it/s]

Sentence 4 in document 36440 is too long


 49%|████▊     | 36675/75610 [24:41<18:00, 36.05it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (811 > 512). Running this sequence through the model will result in indexing errors
 49%|████▊     | 36680/75610 [24:41<20:41, 31.36it/s]

Sentence 6 in document 36677 is too long


 49%|████▊     | 36817/75610 [24:48<51:39, 12.52it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (961 > 512). Running this sequence through the model will result in indexing errors


Sentence 0 in document 36818 is too long


 49%|████▉     | 36954/75610 [24:55<44:41, 14.42it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors
 49%|████▉     | 36962/75610 [24:55<33:51, 19.03it/s]

Sentence 10 in document 36960 is too long


 49%|████▉     | 37217/75610 [25:05<25:01, 25.56it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (519 > 512). Running this sequence through the model will result in indexing errors
 49%|████▉     | 37223/75610 [25:05<22:11, 28.83it/s]

Sentence 0 in document 37217 is too long


 49%|████▉     | 37385/75610 [25:12<1:11:15,  8.94it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1768 > 512). Running this sequence through the model will result in indexing errors
 49%|████▉     | 37395/75610 [25:13<42:19, 15.05it/s]  

Sentence 1 in document 37387 is too long


 50%|████▉     | 37661/75610 [25:23<24:20, 25.98it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2079 > 512). Running this sequence through the model will result in indexing errors
 50%|████▉     | 37672/75610 [25:23<21:12, 29.82it/s]

Sentence 2 in document 37664 is too long


 50%|████▉     | 37731/75610 [25:25<31:14, 20.21it/s]

Sentence 325 in document 37732 is too long


 50%|████▉     | 37744/75610 [25:27<1:06:33,  9.48it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (638 > 512). Running this sequence through the model will result in indexing errors
 50%|████▉     | 37750/75610 [25:27<50:12, 12.57it/s]  

Sentence 1 in document 37745 is too long


 50%|█████     | 38032/75610 [25:43<1:27:36,  7.15it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (580 > 512). Running this sequence through the model will result in indexing errors


Sentence 5 in document 38033 is too long


 50%|█████     | 38051/75610 [25:44<52:28, 11.93it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (709 > 512). Running this sequence through the model will result in indexing errors


Sentence 24 in document 38054 is too long


 51%|█████     | 38206/75610 [25:49<13:12, 47.20it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (632 > 512). Running this sequence through the model will result in indexing errors
 51%|█████     | 38213/75610 [25:49<16:29, 37.80it/s]

Sentence 0 in document 38210 is too long


 51%|█████     | 38359/75610 [25:52<07:06, 87.37it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (992 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (674 > 512). Running this sequence through the model will result in indexing errors


Sentence 7 in document 38361 is too long
Sentence 9 in document 38361 is too long
Sentence 14 in document 38361 is too long


 51%|█████     | 38527/75610 [25:59<09:35, 64.41it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (2044 > 512). Running this sequence through the model will result in indexing errors
 51%|█████     | 38535/75610 [25:59<11:43, 52.71it/s]

Sentence 1 in document 38531 is too long


 51%|█████▏    | 38915/75610 [26:11<44:41, 13.68it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (547 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (958 > 512). Running this sequence through the model will result in indexing errors


Sentence 3 in document 38915 is too long
Sentence 8 in document 38915 is too long


 52%|█████▏    | 39048/75610 [26:18<1:01:36,  9.89it/s]

Sentence 4 in document 39049 is too long


 52%|█████▏    | 39071/75610 [26:20<50:42, 12.01it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (2054 > 512). Running this sequence through the model will result in indexing errors
 52%|█████▏    | 39086/75610 [26:20<29:30, 20.63it/s]

Sentence 2 in document 39071 is too long


 52%|█████▏    | 39474/75610 [26:40<28:14, 21.33it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (1701 > 512). Running this sequence through the model will result in indexing errors


Sentence 394 in document 39476 is too long


 52%|█████▏    | 39508/75610 [26:46<58:13, 10.33it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (533 > 512). Running this sequence through the model will result in indexing errors


Sentence 29 in document 39509 is too long


 52%|█████▏    | 39510/75610 [26:46<1:16:56,  7.82it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (547 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1061 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (668 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2564 > 512). Running this sequence through the model will result in indexing errors


Sentence 3 in document 39511 is too long
Sentence 4 in document 39511 is too long
Sentence 26 in document 39511 is too long
Sentence 45 in document 39511 is too long


 53%|█████▎    | 39894/75610 [27:05<56:37, 10.51it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (1596 > 512). Running this sequence through the model will result in indexing errors


Sentence 99 in document 39896 is too long


 53%|█████▎    | 40201/75610 [27:20<1:09:25,  8.50it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1646 > 512). Running this sequence through the model will result in indexing errors
 53%|█████▎    | 40204/75610 [27:20<55:33, 10.62it/s]  

Sentence 29 in document 40202 is too long


 54%|█████▍    | 40759/75610 [27:45<17:14, 33.70it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (548 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (641 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (764 > 512). Running this sequence through the model will result in indexing errors
 54%|█████▍    | 40771/75610 [27:45<20:39, 28.10it/s]

Sentence 11 in document 40764 is too long
Sentence 12 in document 40764 is too long
Sentence 20 in document 40764 is too long


Token indices sequence length is longer than the specified maximum sequence length for this model (629 > 512). Running this sequence through the model will result in indexing errors
 54%|█████▍    | 40776/75610 [27:45<19:45, 29.39it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (753 > 512). Running this sequence through the model will result in indexing errors


Sentence 3 in document 40774 is too long
Sentence 15 in document 40776 is too long


 54%|█████▍    | 40942/75610 [27:52<37:25, 15.44it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (6368 > 512). Running this sequence through the model will result in indexing errors
 54%|█████▍    | 40950/75610 [27:52<27:40, 20.87it/s]

Sentence 25 in document 40944 is too long


 54%|█████▍    | 40964/75610 [27:52<19:41, 29.33it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (922 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (819 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1764 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1217 > 512). Running this sequence through the model will result in indexing errors
 54%|█████▍    | 40969/75610 [27:52<17:26, 33.09it/s]

Sentence 2 in document 40964 is too long
Sentence 3 in document 40964 is too long
Sentence 4 in document 40964 is too long
Sentence 5 in document 40964 is too long


Token indices sequence length is longer than the specified maximum sequence length for this model (666 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1594 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1087 > 512). Running this sequence through the model will result in indexing errors
 54%|█████▍    | 40974/75610 [27:53<20:28, 28.19it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1248 > 512). Running this sequence through the model will result in indexing errors


Sentence 39 in document 40972 is too long
Sentence 40 in document 40972 is too long
Sentence 41 in document 40972 is too long
Sentence 5 in document 40974 is too long
Sentence 8 in document 40974 is too long


 55%|█████▍    | 41541/75610 [28:14<16:01, 35.44it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (2077 > 512). Running this sequence through the model will result in indexing errors
 55%|█████▍    | 41553/75610 [28:14<15:42, 36.13it/s]

Sentence 3 in document 41546 is too long


 55%|█████▌    | 41700/75610 [28:20<24:59, 22.61it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (839 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (521 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (582 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (714 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1299 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is l

Sentence 0 in document 41701 is too long
Sentence 2 in document 41701 is too long
Sentence 11 in document 41701 is too long
Sentence 21 in document 41701 is too long
Sentence 24 in document 41701 is too long
Sentence 25 in document 41701 is too long
Sentence 46 in document 41701 is too long


 55%|█████▌    | 41718/75610 [28:21<22:03, 25.60it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1290 > 512). Running this sequence through the model will result in indexing errors
 55%|█████▌    | 41725/75610 [28:21<18:06, 31.20it/s]

Sentence 9 in document 41719 is too long


 55%|█████▌    | 41855/75610 [28:27<1:03:42,  8.83it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (698 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (735 > 512). Running this sequence through the model will result in indexing errors
 55%|█████▌    | 41865/75610 [28:28<37:15, 15.10it/s]  

Sentence 8 in document 41856 is too long
Sentence 4 in document 41858 is too long


 55%|█████▌    | 41922/75610 [28:31<1:07:11,  8.36it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2079 > 512). Running this sequence through the model will result in indexing errors
 55%|█████▌    | 41931/75610 [28:32<43:32, 12.89it/s]  

Sentence 2 in document 41923 is too long


 56%|█████▌    | 42037/75610 [28:36<1:09:27,  8.06it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (551 > 512). Running this sequence through the model will result in indexing errors
 56%|█████▌    | 42039/75610 [28:36<1:05:42,  8.52it/s]

Sentence 3 in document 42037 is too long


 56%|█████▌    | 42147/75610 [28:40<32:10, 17.33it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (979 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1203 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1340 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (952 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (926 > 512). Running this sequence through the model will result in indexing errors


Sentence 31 in document 42148 is too long
Sentence 34 in document 42148 is too long
Sentence 38 in document 42148 is too long
Sentence 39 in document 42148 is too long
Sentence 41 in document 42148 is too long


 56%|█████▌    | 42218/75610 [28:45<23:44, 23.44it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (1996 > 512). Running this sequence through the model will result in indexing errors


Sentence 5 in document 42218 is too long


 56%|█████▌    | 42439/75610 [28:55<14:18, 38.64it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (532 > 512). Running this sequence through the model will result in indexing errors
 56%|█████▌    | 42457/75610 [28:55<10:14, 53.98it/s]

Sentence 0 in document 42440 is too long


 56%|█████▋    | 42608/75610 [29:05<48:37, 11.31it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (566 > 512). Running this sequence through the model will result in indexing errors
 56%|█████▋    | 42614/75610 [29:05<38:34, 14.26it/s]

Sentence 8 in document 42613 is too long


 57%|█████▋    | 42758/75610 [29:12<51:47, 10.57it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (571 > 512). Running this sequence through the model will result in indexing errors
 57%|█████▋    | 42775/75610 [29:12<30:12, 18.11it/s]

Sentence 0 in document 42766 is too long


 57%|█████▋    | 43230/75610 [29:31<13:13, 40.80it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1493 > 512). Running this sequence through the model will result in indexing errors
 57%|█████▋    | 43242/75610 [29:32<11:21, 47.48it/s]

Sentence 13 in document 43231 is too long


 57%|█████▋    | 43448/75610 [29:41<40:40, 13.18it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (5605 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (530 > 512). Running this sequence through the model will result in indexing errors
 57%|█████▋    | 43454/75610 [29:41<30:15, 17.72it/s]

Sentence 2 in document 43450 is too long
Sentence 3 in document 43450 is too long


 57%|█████▋    | 43460/75610 [29:41<24:02, 22.29it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (8336 > 512). Running this sequence through the model will result in indexing errors
 57%|█████▋    | 43466/75610 [29:41<21:02, 25.45it/s]

Sentence 0 in document 43463 is too long


 58%|█████▊    | 43515/75610 [29:42<10:22, 51.54it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2474 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (3510 > 512). Running this sequence through the model will result in indexing errors
 58%|█████▊    | 43522/75610 [29:42<13:48, 38.73it/s]

Sentence 0 in document 43521 is too long
Sentence 6 in document 43521 is too long


 58%|█████▊    | 43670/75610 [29:47<08:41, 61.23it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1905 > 512). Running this sequence through the model will result in indexing errors
 58%|█████▊    | 43678/75610 [29:47<10:12, 52.11it/s]

Sentence 8 in document 43671 is too long


Token indices sequence length is longer than the specified maximum sequence length for this model (517 > 512). Running this sequence through the model will result in indexing errors
 58%|█████▊    | 43693/75610 [29:47<12:32, 42.41it/s]

Sentence 3 in document 43683 is too long


 58%|█████▊    | 43733/75610 [29:48<08:23, 63.30it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (712 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (928 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1003 > 512). Running this sequence through the model will result in indexing errors
 58%|█████▊    | 43741/75610 [29:48<10:29, 50.65it/s]

Sentence 7 in document 43739 is too long
Sentence 10 in document 43739 is too long
Sentence 13 in document 43739 is too long


 58%|█████▊    | 44187/75610 [30:05<20:15, 25.85it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (618 > 512). Running this sequence through the model will result in indexing errors
 58%|█████▊    | 44191/75610 [30:05<22:44, 23.02it/s]

Sentence 26 in document 44190 is too long


 58%|█████▊    | 44230/75610 [30:08<50:54, 10.27it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (873 > 512). Running this sequence through the model will result in indexing errors


Sentence 4 in document 44233 is too long


 59%|█████▉    | 44550/75610 [30:21<19:45, 26.20it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (768 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1881 > 512). Running this sequence through the model will result in indexing errors
 59%|█████▉    | 44556/75610 [30:22<18:59, 27.25it/s]

Sentence 21 in document 44551 is too long
Sentence 22 in document 44551 is too long


 59%|█████▉    | 44850/75610 [30:35<1:03:30,  8.07it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1171 > 512). Running this sequence through the model will result in indexing errors
 59%|█████▉    | 44866/75610 [30:35<34:36, 14.80it/s]  

Sentence 0 in document 44854 is too long


 60%|██████    | 45735/75610 [31:16<16:40, 29.85it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (1301 > 512). Running this sequence through the model will result in indexing errors
 60%|██████    | 45742/75610 [31:17<22:21, 22.26it/s]

Sentence 2 in document 45740 is too long


 62%|██████▏   | 46782/75610 [32:06<32:53, 14.61it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (890 > 512). Running this sequence through the model will result in indexing errors
 62%|██████▏   | 46784/75610 [32:07<36:57, 13.00it/s]

Sentence 0 in document 46782 is too long


 62%|██████▏   | 47040/75610 [32:18<21:45, 21.89it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (1970 > 512). Running this sequence through the model will result in indexing errors
 62%|██████▏   | 47049/75610 [32:18<17:05, 27.85it/s]

Sentence 2 in document 47040 is too long


 62%|██████▏   | 47125/75610 [32:20<19:54, 23.85it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1751 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1700 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1693 > 512). Running this sequence through the model will result in indexing errors
 62%|██████▏   | 47132/75610 [32:20<22:43, 20.88it/s]

Sentence 8 in document 47128 is too long
Sentence 14 in document 47128 is too long
Sentence 20 in document 47128 is too long


 63%|██████▎   | 47266/75610 [32:27<53:42,  8.80it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1771 > 512). Running this sequence through the model will result in indexing errors
 63%|██████▎   | 47269/75610 [32:28<56:19,  8.39it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (694 > 512). Running this sequence through the model will result in indexing errors


Sentence 66 in document 47267 is too long
Sentence 49 in document 47270 is too long


 64%|██████▍   | 48627/75610 [33:32<24:47, 18.15it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (1940 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (553 > 512). Running this sequence through the model will result in indexing errors
 64%|██████▍   | 48633/75610 [33:32<21:40, 20.75it/s]

Sentence 1 in document 48628 is too long
Sentence 1 in document 48629 is too long


 67%|██████▋   | 50983/75610 [35:19<21:12, 19.35it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (829 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (914 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (950 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (613 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (847 > 512). Running this sequence through the model will result in indexing errors


Sentence 30 in document 50983 is too long
Sentence 31 in document 50983 is too long
Sentence 34 in document 50983 is too long
Sentence 35 in document 50983 is too long
Sentence 37 in document 50983 is too long


 68%|██████▊   | 51692/75610 [35:50<09:49, 40.59it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (518 > 512). Running this sequence through the model will result in indexing errors
 68%|██████▊   | 51706/75610 [35:50<08:28, 46.98it/s]

Sentence 0 in document 51697 is too long


 70%|███████   | 53279/75610 [37:09<43:53,  8.48it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (813 > 512). Running this sequence through the model will result in indexing errors
 70%|███████   | 53281/75610 [37:09<49:49,  7.47it/s]

Sentence 56 in document 53280 is too long


 70%|███████   | 53290/75610 [37:10<45:50,  8.12it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors
 70%|███████   | 53292/75610 [37:10<41:12,  9.03it/s]

Sentence 3 in document 53291 is too long


 71%|███████   | 53433/75610 [37:16<15:16, 24.20it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (564 > 512). Running this sequence through the model will result in indexing errors


Sentence 10 in document 53436 is too long


 71%|███████   | 53472/75610 [37:17<10:51, 33.97it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (516 > 512). Running this sequence through the model will result in indexing errors
 71%|███████   | 53481/75610 [37:17<08:50, 41.69it/s]

Sentence 1 in document 53473 is too long


 72%|███████▏  | 54264/75610 [37:43<10:51, 32.76it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (740 > 512). Running this sequence through the model will result in indexing errors
 72%|███████▏  | 54281/75610 [37:44<07:35, 46.79it/s]

Sentence 0 in document 54268 is too long


 72%|███████▏  | 54538/75610 [37:55<25:55, 13.55it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (537 > 512). Running this sequence through the model will result in indexing errors
 72%|███████▏  | 54547/75610 [37:55<18:31, 18.95it/s]

Sentence 4 in document 54539 is too long


Token indices sequence length is longer than the specified maximum sequence length for this model (1661 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1596 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1598 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1618 > 512). Running this sequence through the model will result in indexing errors
 72%|███████▏  | 54551/75610 [37:55<17:49, 19.70it/s]

Sentence 17 in document 54549 is too long
Sentence 21 in document 54549 is too long
Sentence 35 in document 54549 is too long
Sentence 41 in document 54549 is too long


 72%|███████▏  | 54562/75610 [37:56<12:30, 28.05it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (542 > 512). Running this sequence through the model will result in indexing errors
 72%|███████▏  | 54571/75610 [37:56<09:57, 35.24it/s]

Sentence 3 in document 54568 is too long


 73%|███████▎  | 54823/75610 [38:06<09:53, 35.03it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (551 > 512). Running this sequence through the model will result in indexing errors
 73%|███████▎  | 54830/75610 [38:06<08:47, 39.43it/s]

Sentence 1 in document 54823 is too long


 73%|███████▎  | 55041/75610 [38:12<06:15, 54.83it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (575 > 512). Running this sequence through the model will result in indexing errors


Sentence 0 in document 55041 is too long


 74%|███████▎  | 55685/75610 [38:37<11:01, 30.13it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors
 74%|███████▎  | 55697/75610 [38:37<09:24, 35.26it/s]

Sentence 0 in document 55688 is too long


 75%|███████▍  | 56429/75610 [39:03<07:58, 40.10it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (529 > 512). Running this sequence through the model will result in indexing errors
 75%|███████▍  | 56439/75610 [39:03<06:37, 48.25it/s]

Sentence 2 in document 56437 is too long


 75%|███████▍  | 56472/75610 [39:06<44:04,  7.24it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (672 > 512). Running this sequence through the model will result in indexing errors


Sentence 3 in document 56474 is too long


 75%|███████▍  | 56642/75610 [39:11<14:52, 21.25it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (706 > 512). Running this sequence through the model will result in indexing errors
 75%|███████▍  | 56647/75610 [39:11<13:17, 23.77it/s]

Sentence 0 in document 56644 is too long


 75%|███████▌  | 56756/75610 [39:14<08:15, 38.09it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (635 > 512). Running this sequence through the model will result in indexing errors
 75%|███████▌  | 56765/75610 [39:15<07:06, 44.23it/s]

Sentence 0 in document 56758 is too long


 75%|███████▌  | 56887/75610 [39:21<08:01, 38.88it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (591 > 512). Running this sequence through the model will result in indexing errors
 75%|███████▌  | 56901/75610 [39:21<07:01, 44.39it/s]

Sentence 0 in document 56890 is too long


 76%|███████▌  | 57426/75610 [39:41<22:16, 13.60it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1720 > 512). Running this sequence through the model will result in indexing errors
 76%|███████▌  | 57429/75610 [39:42<19:54, 15.22it/s]

Sentence 2 in document 57427 is too long


 77%|███████▋  | 57858/75610 [40:00<24:38, 12.00it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1717 > 512). Running this sequence through the model will result in indexing errors
 77%|███████▋  | 57871/75610 [40:00<14:23, 20.53it/s]

Sentence 2 in document 57859 is too long


 77%|███████▋  | 58131/75610 [40:10<14:13, 20.48it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1691 > 512). Running this sequence through the model will result in indexing errors
 77%|███████▋  | 58138/75610 [40:10<11:13, 25.94it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1604 > 512). Running this sequence through the model will result in indexing errors
 77%|███████▋  | 58145/75610 [40:10<09:12, 31.63it/s]

Sentence 10 in document 58137 is too long
Sentence 22 in document 58138 is too long


Token indices sequence length is longer than the specified maximum sequence length for this model (2902 > 512). Running this sequence through the model will result in indexing errors
 77%|███████▋  | 58150/75610 [40:11<08:46, 33.19it/s]

Sentence 0 in document 58148 is too long


 77%|███████▋  | 58340/75610 [40:21<37:51,  7.60it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1715 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1646 > 512). Running this sequence through the model will result in indexing errors
 77%|███████▋  | 58350/75610 [40:21<21:43, 13.25it/s]

Sentence 1 in document 58343 is too long
Sentence 2 in document 58344 is too long


 77%|███████▋  | 58533/75610 [40:29<12:05, 23.52it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (565 > 512). Running this sequence through the model will result in indexing errors
 77%|███████▋  | 58540/75610 [40:30<09:58, 28.54it/s]

Sentence 3 in document 58533 is too long


 78%|███████▊  | 58688/75610 [40:36<08:21, 33.74it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (560 > 512). Running this sequence through the model will result in indexing errors
 78%|███████▊  | 58699/75610 [40:36<06:37, 42.54it/s]

Sentence 0 in document 58697 is too long


 79%|███████▉  | 59585/75610 [41:16<08:27, 31.58it/s]  Token indices sequence length is longer than the specified maximum sequence length for this model (740 > 512). Running this sequence through the model will result in indexing errors
 79%|███████▉  | 59602/75610 [41:16<05:46, 46.17it/s]

Sentence 0 in document 59589 is too long


 79%|███████▉  | 59947/75610 [41:31<07:13, 36.12it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (583 > 512). Running this sequence through the model will result in indexing errors
 79%|███████▉  | 59962/75610 [41:31<05:41, 45.81it/s]

Sentence 0 in document 59949 is too long


 79%|███████▉  | 60022/75610 [41:33<09:11, 28.28it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (798 > 512). Running this sequence through the model will result in indexing errors
 79%|███████▉  | 60027/75610 [41:33<10:14, 25.37it/s]

Sentence 12 in document 60025 is too long


 80%|███████▉  | 60252/75610 [41:42<07:25, 34.47it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (524 > 512). Running this sequence through the model will result in indexing errors
 80%|███████▉  | 60269/75610 [41:42<05:19, 48.07it/s]

Sentence 0 in document 60254 is too long


 80%|████████  | 60716/75610 [41:57<12:19, 20.13it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1595 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1543 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1563 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1583 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1583 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is

Sentence 9 in document 60718 is too long
Sentence 10 in document 60718 is too long
Sentence 76 in document 60718 is too long
Sentence 77 in document 60718 is too long
Sentence 91 in document 60718 is too long
Sentence 92 in document 60718 is too long
Sentence 105 in document 60718 is too long
Sentence 106 in document 60718 is too long
Sentence 107 in document 60718 is too long
Sentence 108 in document 60718 is too long


 80%|████████  | 60837/75610 [42:01<18:04, 13.63it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (608 > 512). Running this sequence through the model will result in indexing errors
 80%|████████  | 60840/75610 [42:02<16:06, 15.28it/s]

Sentence 8 in document 60839 is too long


 81%|████████  | 61070/75610 [42:09<04:41, 51.57it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (607 > 512). Running this sequence through the model will result in indexing errors
 81%|████████  | 61076/75610 [42:09<06:45, 35.83it/s]

Sentence 0 in document 61071 is too long


 81%|████████  | 61310/75610 [42:17<06:56, 34.32it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1642 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1616 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1616 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1736 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1714 > 512). Running this sequence through the model will result in indexing errors
 81%|████████  | 61316/75610 [42

Sentence 8 in document 61314 is too long
Sentence 12 in document 61314 is too long
Sentence 19 in document 61314 is too long
Sentence 23 in document 61314 is too long
Sentence 24 in document 61314 is too long


 82%|████████▏ | 61629/75610 [42:25<04:50, 48.13it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (588 > 512). Running this sequence through the model will result in indexing errors
 82%|████████▏ | 61644/75610 [42:25<04:02, 57.61it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (878 > 512). Running this sequence through the model will result in indexing errors


Sentence 2 in document 61635 is too long
Sentence 0 in document 61648 is too long


 82%|████████▏ | 61778/75610 [42:30<08:11, 28.16it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (591 > 512). Running this sequence through the model will result in indexing errors
 82%|████████▏ | 61783/75610 [42:30<07:21, 31.32it/s]

Sentence 0 in document 61779 is too long


 82%|████████▏ | 61999/75610 [42:38<16:44, 13.54it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (697 > 512). Running this sequence through the model will result in indexing errors
 82%|████████▏ | 62016/75610 [42:38<09:34, 23.64it/s]

Sentence 0 in document 61999 is too long


 82%|████████▏ | 62146/75610 [42:42<17:53, 12.54it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1727 > 512). Running this sequence through the model will result in indexing errors
 82%|████████▏ | 62158/75610 [42:42<10:46, 20.82it/s]

Sentence 3 in document 62146 is too long


 82%|████████▏ | 62220/75610 [42:44<06:36, 33.74it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (573 > 512). Running this sequence through the model will result in indexing errors
 82%|████████▏ | 62229/75610 [42:44<05:26, 40.94it/s]

Sentence 2 in document 62220 is too long


 83%|████████▎ | 62933/75610 [43:16<15:30, 13.62it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1333 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2136 > 512). Running this sequence through the model will result in indexing errors
 83%|████████▎ | 62938/75610 [43:16<12:18, 17.16it/s]

Sentence 2 in document 62933 is too long
Sentence 2 in document 62934 is too long


 83%|████████▎ | 63004/75610 [43:18<15:43, 13.37it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (551 > 512). Running this sequence through the model will result in indexing errors
 83%|████████▎ | 63012/75610 [43:18<12:04, 17.39it/s]

Sentence 2 in document 63005 is too long


 84%|████████▎ | 63189/75610 [43:24<12:15, 16.89it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (582 > 512). Running this sequence through the model will result in indexing errors
 84%|████████▎ | 63198/75610 [43:24<09:19, 22.20it/s]

Sentence 0 in document 63189 is too long


 84%|████████▍ | 63768/75610 [43:43<08:18, 23.76it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1493 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1366 > 512). Running this sequence through the model will result in indexing errors


Sentence 15 in document 63769 is too long
Sentence 7 in document 63771 is too long


 84%|████████▍ | 63775/75610 [43:44<14:41, 13.43it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1902 > 512). Running this sequence through the model will result in indexing errors
 84%|████████▍ | 63780/75610 [43:44<12:22, 15.93it/s]

Sentence 0 in document 63778 is too long


 85%|████████▍ | 64053/75610 [43:57<18:33, 10.38it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (763 > 512). Running this sequence through the model will result in indexing errors
 85%|████████▍ | 64057/75610 [43:57<15:49, 12.16it/s]

Sentence 2 in document 64053 is too long


 85%|████████▌ | 64368/75610 [44:13<14:34, 12.85it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (658 > 512). Running this sequence through the model will result in indexing errors
 85%|████████▌ | 64378/75610 [44:13<10:47, 17.34it/s]

Sentence 1 in document 64371 is too long


 85%|████████▌ | 64412/75610 [44:14<05:16, 35.33it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (868 > 512). Running this sequence through the model will result in indexing errors
 85%|████████▌ | 64420/75610 [44:14<04:23, 42.42it/s]

Sentence 10 in document 64414 is too long


 85%|████████▌ | 64426/75610 [44:15<09:52, 18.87it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (651 > 512). Running this sequence through the model will result in indexing errors
 85%|████████▌ | 64432/75610 [44:15<07:53, 23.59it/s]

Sentence 0 in document 64431 is too long


 85%|████████▌ | 64561/75610 [44:21<10:53, 16.92it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1436 > 512). Running this sequence through the model will result in indexing errors


Sentence 0 in document 64561 is too long
Sentence 1 in document 64561 is too long


 86%|████████▌ | 64771/75610 [44:28<10:30, 17.18it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2030 > 512). Running this sequence through the model will result in indexing errors
 86%|████████▌ | 64775/75610 [44:28<09:31, 18.95it/s]

Sentence 0 in document 64772 is too long
Sentence 11 in document 64774 is too long


 86%|████████▌ | 64820/75610 [44:28<03:17, 54.67it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (584 > 512). Running this sequence through the model will result in indexing errors


Sentence 35 in document 64826 is too long


 86%|████████▌ | 64848/75610 [44:30<07:01, 25.51it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (691 > 512). Running this sequence through the model will result in indexing errors
 86%|████████▌ | 64853/75610 [44:30<06:57, 25.75it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (519 > 512). Running this sequence through the model will result in indexing errors


Sentence 13 in document 64848 is too long
Sentence 0 in document 64854 is too long


 86%|████████▋ | 65225/75610 [44:47<04:28, 38.70it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors


Sentence 66 in document 65225 is too long


 86%|████████▋ | 65263/75610 [44:52<23:16,  7.41it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2034 > 512). Running this sequence through the model will result in indexing errors
 86%|████████▋ | 65279/75610 [44:52<12:32, 13.73it/s]

Sentence 3 in document 65263 is too long


 86%|████████▋ | 65382/75610 [44:57<14:35, 11.68it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (854 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (882 > 512). Running this sequence through the model will result in indexing errors
 86%|████████▋ | 65387/75610 [44:57<12:19, 13.83it/s]

Sentence 4 in document 65386 is too long
Sentence 8 in document 65386 is too long


 87%|████████▋ | 65408/75610 [44:58<08:08, 20.88it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (924 > 512). Running this sequence through the model will result in indexing errors
 87%|████████▋ | 65412/75610 [44:59<10:40, 15.92it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2208 > 512). Running this sequence through the model will result in indexing errors


Sentence 11 in document 65411 is too long
Sentence 76 in document 65413 is too long


 87%|████████▋ | 65475/75610 [45:04<24:48,  6.81it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (675 > 512). Running this sequence through the model will result in indexing errors


Sentence 11 in document 65476 is too long


 88%|████████▊ | 66243/75610 [45:45<08:49, 17.70it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (944 > 512). Running this sequence through the model will result in indexing errors
 88%|████████▊ | 66249/75610 [45:45<07:09, 21.81it/s]

Sentence 4 in document 66244 is too long


 88%|████████▊ | 66542/75610 [45:53<04:17, 35.23it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (541 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (571 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (547 > 512). Running this sequence through the model will result in indexing errors
 88%|████████▊ | 66549/75610 [45:54<04:35, 32.88it/s]

Sentence 4 in document 66545 is too long
Sentence 5 in document 66545 is too long
Sentence 8 in document 66545 is too long
Sentence 9 in document 66545 is too long


Token indices sequence length is longer than the specified maximum sequence length for this model (1120 > 512). Running this sequence through the model will result in indexing errors
 88%|████████▊ | 66555/75610 [45:54<04:21, 34.69it/s]

Sentence 2 in document 66554 is too long


 88%|████████▊ | 66756/75610 [46:03<04:12, 35.05it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (855 > 512). Running this sequence through the model will result in indexing errors


Sentence 4 in document 66757 is too long


 88%|████████▊ | 66829/75610 [46:05<03:49, 38.25it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (519 > 512). Running this sequence through the model will result in indexing errors
 88%|████████▊ | 66835/75610 [46:06<04:27, 32.81it/s]

Sentence 3 in document 66830 is too long


 89%|████████▉ | 67511/75610 [46:36<11:37, 11.61it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (616 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (673 > 512). Running this sequence through the model will result in indexing errors


Sentence 40 in document 67513 is too long
Sentence 107 in document 67513 is too long


 89%|████████▉ | 67514/75610 [46:37<20:35,  6.55it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2978 > 512). Running this sequence through the model will result in indexing errors
 89%|████████▉ | 67516/75610 [46:37<17:19,  7.78it/s]

Sentence 31 in document 67515 is too long


 89%|████████▉ | 67519/75610 [46:38<15:48,  8.53it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (5937 > 512). Running this sequence through the model will result in indexing errors
 89%|████████▉ | 67523/75610 [46:38<13:26, 10.03it/s]

Sentence 24 in document 67522 is too long


 89%|████████▉ | 67577/75610 [46:40<09:34, 13.97it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (512 > 512). Running this sequence through the model will result in indexing errors
 89%|████████▉ | 67586/75610 [46:40<07:12, 18.57it/s]

Sentence 0 in document 67585 is too long


 90%|████████▉ | 67673/75610 [46:43<02:35, 50.99it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (896 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1806 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1077 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (784 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (667 > 512). Running this sequence through the model will result in indexing errors


Sentence 30 in document 67678 is too long
Sentence 33 in document 67678 is too long
Sentence 34 in document 67678 is too long
Sentence 35 in document 67678 is too long
Sentence 36 in document 67678 is too long


 90%|████████▉ | 67680/75610 [46:43<05:03, 26.11it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (542 > 512). Running this sequence through the model will result in indexing errors


Sentence 2 in document 67682 is too long


 90%|█████████ | 68201/75610 [47:08<05:46, 21.40it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1820 > 512). Running this sequence through the model will result in indexing errors


Sentence 32 in document 68202 is too long


 90%|█████████ | 68218/75610 [47:09<10:15, 12.02it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1777 > 512). Running this sequence through the model will result in indexing errors
 90%|█████████ | 68220/75610 [47:09<11:14, 10.95it/s]

Sentence 12 in document 68218 is too long


 90%|█████████ | 68328/75610 [47:13<03:18, 36.75it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1249 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1483 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (765 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1029 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (968 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is l

Sentence 30 in document 68332 is too long
Sentence 35 in document 68332 is too long
Sentence 38 in document 68332 is too long
Sentence 39 in document 68332 is too long
Sentence 40 in document 68332 is too long
Sentence 41 in document 68332 is too long


 90%|█████████ | 68379/75610 [47:19<13:10,  9.15it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2196 > 512). Running this sequence through the model will result in indexing errors
 90%|█████████ | 68394/75610 [47:19<07:19, 16.42it/s]

Sentence 1 in document 68379 is too long


 92%|█████████▏| 69394/75610 [48:04<05:49, 17.79it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (996 > 512). Running this sequence through the model will result in indexing errors
 92%|█████████▏| 69400/75610 [48:05<04:36, 22.44it/s]

Sentence 2 in document 69399 is too long


 93%|█████████▎| 70226/75610 [48:41<06:00, 14.93it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (677 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (659 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (903 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (655 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (689 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is long

Sentence 13 in document 70226 is too long
Sentence 15 in document 70226 is too long
Sentence 16 in document 70226 is too long
Sentence 17 in document 70226 is too long
Sentence 18 in document 70226 is too long
Sentence 19 in document 70226 is too long
Sentence 20 in document 70226 is too long
Sentence 21 in document 70226 is too long
Sentence 22 in document 70226 is too long
Sentence 23 in document 70226 is too long
Sentence 24 in document 70226 is too long
Sentence 25 in document 70226 is too long


 93%|█████████▎| 70492/75610 [48:53<01:23, 61.25it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1775 > 512). Running this sequence through the model will result in indexing errors


Sentence 2 in document 70492 is too long


 93%|█████████▎| 70619/75610 [49:01<03:19, 25.04it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (663 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1583 > 512). Running this sequence through the model will result in indexing errors


Sentence 1 in document 70621 is too long
Sentence 2 in document 70621 is too long


 94%|█████████▍| 71442/75610 [49:34<06:46, 10.25it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2272 > 512). Running this sequence through the model will result in indexing errors
 95%|█████████▍| 71458/75610 [49:34<03:45, 18.38it/s]

Sentence 6 in document 71443 is too long


 95%|█████████▍| 71687/75610 [49:42<01:55, 34.02it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (557 > 512). Running this sequence through the model will result in indexing errors
 95%|█████████▍| 71703/75610 [49:43<01:25, 45.53it/s]

Sentence 0 in document 71692 is too long


 96%|█████████▌| 72281/75610 [50:13<05:29, 10.10it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2250 > 512). Running this sequence through the model will result in indexing errors
 96%|█████████▌| 72293/75610 [50:13<03:09, 17.48it/s]

Sentence 3 in document 72282 is too long


 96%|█████████▌| 72741/75610 [50:30<02:00, 23.89it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1750 > 512). Running this sequence through the model will result in indexing errors
 96%|█████████▌| 72744/75610 [50:30<02:24, 19.84it/s]

Sentence 20 in document 72742 is too long


 96%|█████████▌| 72762/75610 [50:32<04:38, 10.23it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1745 > 512). Running this sequence through the model will result in indexing errors
 96%|█████████▌| 72766/75610 [50:32<04:06, 11.53it/s]

Sentence 1 in document 72764 is too long


 97%|█████████▋| 73025/75610 [50:41<00:54, 47.11it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1730 > 512). Running this sequence through the model will result in indexing errors


Sentence 30 in document 73029 is too long


 97%|█████████▋| 73037/75610 [50:43<02:43, 15.76it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2771 > 512). Running this sequence through the model will result in indexing errors
 97%|█████████▋| 73041/75610 [50:43<03:34, 11.99it/s]

Sentence 0 in document 73040 is too long


 97%|█████████▋| 73202/75610 [50:50<02:44, 14.64it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2202 > 512). Running this sequence through the model will result in indexing errors
 97%|█████████▋| 73216/75610 [50:50<01:41, 23.65it/s]

Sentence 9 in document 73207 is too long


 98%|█████████▊| 74219/75610 [51:34<00:57, 24.34it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1054 > 512). Running this sequence through the model will result in indexing errors


Sentence 11 in document 74221 is too long


 98%|█████████▊| 74358/75610 [51:40<01:38, 12.75it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1732 > 512). Running this sequence through the model will result in indexing errors
 98%|█████████▊| 74371/75610 [51:41<00:58, 21.33it/s]

Sentence 1 in document 74359 is too long


 99%|█████████▉| 74804/75610 [51:57<00:25, 31.63it/s]

Sentence 0 in document 74796 is too long


 99%|█████████▉| 74858/75610 [52:01<01:29,  8.36it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (555 > 512). Running this sequence through the model will result in indexing errors


Sentence 3 in document 74860 is too long


 99%|█████████▉| 74972/75610 [52:05<00:54, 11.78it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (672 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (669 > 512). Running this sequence through the model will result in indexing errors


Sentence 185 in document 74975 is too long
Sentence 187 in document 74975 is too long


100%|██████████| 75610/75610 [52:34<00:00, 23.97it/s]


In [7]:
df_senate_processed = df_senate.copy()
df_senate_processed['sentences_emotions'] = processed_sentence_lists


# Save
df_senate_processed.to_pickle( os.path.join(data_path, 'hansard-senate-emotions-ekman.pkl') )