In [2]:
from transformers import AutoModelForMaskedLM
from transformers import AutoTokenizer
import torch
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments
from transformers import Trainer
import math
from torch.utils.data import DataLoader
from transformers import default_data_collator
from torch.optim import AdamW
from accelerate import Accelerator
from transformers import get_scheduler
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import preprocessing
import pickle
import pandas as pd
from transformers import PreTrainedTokenizerFast
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [5]:
def insert_random_mask(batch,data_collator):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = data_collator(features)
    # Create a new "masked" column for each column in the dataset
    return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}

In [6]:
model_checkpoint = "KBLab/bert-base-swedish-cased"
model = preprocessing.create_model_MLM(model_checkpoint)

Some weights of the model checkpoint at KBLab/bert-base-swedish-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
tokenizer =preprocessing.create_tokenizer(model_checkpoint)

In [8]:
data_files = {"train": "swerick_data_random_train.pkl", "test": "swerick_data_random_test.pkl"}
swerick_dataset = load_dataset("pandas",data_files=data_files)
print(swerick_dataset)

DatasetDict({
    train: Dataset({
        features: ['protocole', 'texte'],
        num_rows: 12399
    })
    test: Dataset({
        features: ['protocole', 'texte'],
        num_rows: 2673
    })
})


In [7]:
with open("alvis_project/alvis_finetuning/lm_dataset_512.pkl","rb") as f:
    lm_datasets=pickle.load(f)
path=   './alvis_project/alvis_finetuning/lm_datasets_512'
lm_datasets.save_to_disk(path)

from datasets import load_from_disk
lm_datasets = load_from_disk(path)

Saving the dataset (0/21 shards):   0%|          | 0/924484 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/198632 [00:00<?, ? examples/s]

Loading dataset from disk:   0%|          | 0/21 [00:00<?, ?it/s]

In [11]:


swerick_tokenizer= PreTrainedTokenizerFast(
    tokenizer_file="/home/laurinemeier/swerick/pretraining_from_scratch/tokenizer_swerick.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]"
)

In [11]:
old_vocab = [k for k,v in tokenizer.get_vocab().items()]
new_vocab = [k for k,v in swerick_tokenizer.get_vocab().items()]
idx_old_vocab_list = list()
same_tokens_list = list()
different_tokens_list = list()

for idx_new,w in enumerate(new_vocab): 
  try:
    idx_old = old_vocab.index(w)
  except:
    idx_old = -1
  if idx_old>=0:
      idx_old_vocab_list.append(idx_old)
      same_tokens_list.append((w,idx_new))
  else:
      different_tokens_list.append((w,idx_new))

In [12]:
len(same_tokens_list),len(different_tokens_list),len(same_tokens_list)+len(different_tokens_list)

(27546, 22779, 50325)

In [15]:
new_tokens = [k for k,v in different_tokens_list if k.startswith("#") == False]
len(new_tokens), new_tokens[:10]

(18450,
 ['kommendera',
  'markägarna',
  'bandel',
  'jordfrågan',
  'sos',
  'heltidsarbetande',
  'specialstål',
  'covid',
  'tjenstemän',
  'föräldraskap'])

In [16]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at KBLab/bert-base-swedish-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
print("[ BEFORE ] tokenizer vocab size:", len(tokenizer)) 
added_tokens = tokenizer.add_tokens(new_tokens)

print("[ AFTER ] tokenizer vocab size:", len(tokenizer)) 
print()
print('added_tokens:',added_tokens)
print()

# resize the embeddings matrix of the model 
model.resize_token_embeddings(len(tokenizer)) 

[ BEFORE ] tokenizer vocab size: 50325
[ AFTER ] tokenizer vocab size: 68775

added_tokens: 18450



Embedding(68775, 768)

In [18]:
tokenizer_exBERT = tokenizer

In [26]:
nlp = spacy.load("sv_core_news_sm", exclude=['parser', 'ner'])

In [12]:
def spacy_tokenizer(document, nlp=nlp):
    # tokenize the document with spaCY
    doc = nlp(document["texte"])
    # Remove stop words and punctuation symbols
    tokens = [
        token.text for token in doc if (
        token.is_stop == False and \
        token.is_punct == False and \
        token.text.strip() != '' and \
        token.text.find("\n") == -1)]
    return tokens

def dfreq(idf, N):
    return (1+N) / np.exp(idf - 1) - 1

NameError: name 'nlp' is not defined

In [32]:
tfidf_vectorizer = TfidfVectorizer(lowercase=False, tokenizer=spacy_tokenizer, 
                                   norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)

In [33]:
result = tfidf_vectorizer.fit_transform(swerick_dataset["train"])

KeyboardInterrupt: 

In [1]:
from scipy.sparse import load_npz

result = load_npz('tfidf_matrix.npz')

In [3]:
print (result)

  (0, 1657610)	0.0030932953741660816
  (0, 1957760)	0.006823897283469424
  (0, 1655230)	0.013715997081875013
  (0, 1971315)	0.003989194278341275
  (0, 627804)	0.0017911715101956076
  (0, 1430704)	0.004958993736854114
  (0, 1787513)	0.007888196275665338
  (0, 180071)	0.00257776820147333
  (0, 1638963)	0.0029724836130177877
  (0, 1989773)	0.004468364774274971
  (0, 2012710)	0.00380224606686181
  (0, 1998498)	0.002474609877555118
  (0, 440981)	0.003636665245074665
  (0, 970902)	0.005106071097124169
  (0, 1879128)	0.004311139806362657
  (0, 575094)	0.0025950607710754545
  (0, 1638958)	0.004639205687051013
  (0, 1322758)	0.007261488656492904
  (0, 2028749)	0.01010114536352555
  (0, 1317758)	0.002885625572754773
  (0, 628783)	0.003532932826899384
  (0, 1166457)	0.003329821717185065
  (0, 1959918)	0.003298643564961466
  (0, 812496)	0.0020146931634392666
  (0, 694865)	0.0029458595502830113
  :	:
  (12398, 614393)	0.003342090787245752
  (12398, 1128595)	0.009142077442794407
  (12398, 715417)	0.

In [None]:
idf = tfidf_vectorizer.idf_

idf_sorted_indexes = sorted(range(len(idf)), key=lambda k: idf[k])
idf_sorted = idf[idf_sorted_indexes]
tokens_by_df = np.array(tfidf_vectorizer.get_feature_names())[idf_sorted_indexes]
dfreqs_sorted = dfreq(idf_sorted, length).astype(np.int32)
tokens_dfreqs = {tok:dfreq for tok, dfreq in zip(tokens_by_df,dfreqs_sorted)}
tokens_pct_list = [int(round(dfreq/length*100,2)) for token,dfreq in tokens_dfreqs.items()]

In [1]:
import csv

tokens_pct_dict = {}

with open('tokens_pct_list.csv', 'r', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    next(reader) 
    for row in reader:
        token = row[0]
        percentage = float(row[1])
        tokens_pct_dict[token] = percentage

In [9]:
num_docs = len(swerick_dataset["train"])

# Calculate the threshold for the minimum document frequency
min_df_percentage = 3 # Minimum percentage of documents a token must appear in

# Find tokens that meet or exceed this document frequency percentage
tokens_above_threshold = {token: pct for token, pct in tokens_pct_dict.items() if pct >= min_df_percentage}

# Number of tokens with document frequency above the threshold
number_tokens_with_DF_above_pct = len(tokens_above_threshold)

print(f"Number of tokens appearing in at least {min_df_percentage}% of the documents: {number_tokens_with_DF_above_pct}")
print(f"Tokens: {tokens_above_threshold}")

Number of tokens appearing in at least 3% of the documents: 22412
Tokens: {'1': 99.0, '2': 99.0, 'kl.': 98.0, '3': 98.0, '4': 98.0, '5': 97.0, 'förslag': 97.0, '6': 96.0, 'denna': 96.0, '8': 96.0, 'Kammaren': 96.0, 'anledning': 95.0, '7': 95.0, 'samt': 95.0, '10': 94.0, 'är': 94.0, '9': 93.0, 'vissa': 93.0, 'm': 93.0, 'fråga': 93.0, '11': 93.0, 'år': 93.0, '12': 92.0, 'dessa': 92.0, 'åt': 91.0, '15': 91.0, '14': 91.0, '13': 90.0, 'Herr': 90.0, '20': 90.0, 'grund': 89.0, 'fall': 89.0, 'tid': 89.0, '16': 89.0, '18': 89.0, 'sätt': 89.0, '17': 88.0, 'frågor': 88.0, 'proposition': 88.0, 'nya': 88.0, 'hos': 88.0, 'Stockholm': 88.0, '19': 88.0, 'frågan': 87.0, 'sådan': 87.0, 'tiden': 87.0, '21': 87.0, 'åtgärder': 87.0, 'väl': 87.0, 'bör': 87.0, 'sådana': 86.0, '30': 86.0, 'viss': 86.0, 'ytterligare': 86.0, '25': 86.0, '22': 86.0, 'angående': 86.0, 'sådant': 86.0, 'ledamöter': 86.0, 'hela': 85.0, 'borde': 85.0, 'se': 85.0, 'dels': 85.0, '23': 85.0, 'landet': 85.0, '26': 85.0, 'Denna': 85.0, 'j

In [10]:
old_vocab = [k for k,v in tokenizer.get_vocab().items()]
new_vocab = [token for token in tokens_above_threshold.keys()]
idx_old_vocab_list = list()
same_tokens_list = list()
different_tokens_list = list()

for idx_new,w in enumerate(new_vocab): 
  try:
    idx_old = old_vocab.index(w)
  except:
    idx_old = -1
  if idx_old>=0:
      idx_old_vocab_list.append(idx_old)
      same_tokens_list.append((w,idx_new))
  else:
      different_tokens_list.append((w,idx_new))

In [11]:
len(same_tokens_list),len(different_tokens_list),len(same_tokens_list)+len(different_tokens_list)

(13137, 9275, 22412)

In [14]:
new_tokens = [k for k,v in different_tokens_list]
print(len(new_tokens), new_tokens[:100])

9275 ['kl.', 'Kammaren', 'åtskildes', 'bordlades', 'Justerades', 'Föredrogs', 'fidem', 's.', 'Anmäldes', 'kammarens', 'propositioner', 'hemställan', 'tillfredsställande', 'hänvisades', 'Maj:ts', 't.', 'yrka', 'Maj:t', 'framställts', 'bifölls', 'k.', 'o.', 'protokollsutdrag', 'f.', 'PROTOKOLL', 'väckta', 'Utskottets', 'RIKSDAGENS', 'd.', 'slutad', 'votering', 'ärade', 'A.', 'e.', 'Överläggningen', 'reservationen', 'biföll', 'Kammarens', 'anföra', 'hemställa', 'v.', 'hemställer', 'yrkanden', 'bifalla', 'föredragning', 'anhålla', 'motionerna', 'yrkat', 'vidtaga', 'anhåller', 'Kl.', 'socialdemokratiska', 'instämde', 'förutsättningarna', 'komme', 'utlåtanden', 'betänkanden', 'underlåta', 'tillfredsställelse', 'Kungl.', 'protokollen', 'bifaller', 'Föredrogos', 'besvarad', 'antaga', 'vidhåller', 'tillstyrka', 'a.', 'förmenande', 'framlagt', 'ställningstagande', 'härefter', 'yrkandet', 'memorial', 'huru', 'bordlagda', 'frågans', 'lagutskottets', 'bestrida', 'hemställt', 'fogade', 'etc.', 'stat

In [12]:
model_checkpoint = "KBLab/bert-base-swedish-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at KBLab/bert-base-swedish-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
print("[ BEFORE ] tokenizer vocab size:", len(tokenizer)) 
added_tokens = tokenizer.add_tokens(new_tokens)

print("[ AFTER ] tokenizer vocab size:", len(tokenizer)) 
print()
print('added_tokens:',added_tokens)
print()

# resize the embeddings matrix of the model 
model.resize_token_embeddings(len(tokenizer)) 

[ BEFORE ] tokenizer vocab size: 50325
[ AFTER ] tokenizer vocab size: 59600

added_tokens: 9275



Embedding(59600, 768)

In [15]:
tokenizer_exBERT = tokenizer

In [20]:
tokenizer_exBERT.save_pretrained("exbert_tokenizer")

('exbert_tokenizer/tokenizer_config.json',
 'exbert_tokenizer/special_tokens_map.json',
 'exbert_tokenizer/vocab.txt',
 'exbert_tokenizer/added_tokens.json',
 'exbert_tokenizer/tokenizer.json')

In [23]:
tokenized_datasets_train =preprocessing.tokenize_dataset(swerick_dataset["train"],tokenizer_exBERT)
tokenized_datasets_test = preprocessing.tokenize_dataset(swerick_dataset["test"],tokenizer_exBERT)

Map:   0%|          | 0/12399 [00:00<?, ? examples/s]

Map:   0%|          | 0/2673 [00:00<?, ? examples/s]

In [25]:
chunk_size = 128
lm_datasets_train = preprocessing.grouping_dataset(tokenized_datasets_train,chunk_size)
lm_datasets_test = preprocessing.grouping_dataset(tokenized_datasets_test,chunk_size)

Map:   0%|          | 0/12399 [00:00<?, ? examples/s]

Map:   0%|          | 0/2673 [00:00<?, ? examples/s]

In [27]:
from datasets import DatasetDict, concatenate_datasets

lm_dataset = DatasetDict({
    "train": lm_datasets_train,
    "test": lm_datasets_test
})
with open("lm_dataset_exbert.pkl","wb") as f:
    pickle.dump(lm_dataset,f)

In [28]:
data_valid={"valid":"swerick_data_random_valid.pkl"}
valid_dataset_initial = load_dataset("pandas",data_files=data_valid) 
valid_dataset =preprocessing.chunk_and_pad(valid_dataset_initial["valid"],tokenizer,128)
valid_dataset = Dataset.from_dict(valid_dataset)
with open("valid_dataset_exbert.pkl","wb") as f:
     pickle.dump(valid_dataset,f)

NameError: name 'Dataset' is not defined

In [2]:
import csv
tokens_pct_dict = {}

with open('tokens_pct_list.csv', 'r', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    next(reader) 
    for row in reader:
        token = row[0]
        percentage = float(row[1])
        tokens_pct_dict[token] = percentage
    
tokens_pct_dict_KBBERT = {}

with open('tokens_pct_list_KBBERT.csv', 'r', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    next(reader) 
    for row in reader:
        token = row[0]
        percentage = float(row[1])
        tokens_pct_dict_KBBERT[token] = percentage

In [9]:
print(list(tokens_pct_dict_KBBERT.values())[:100])

[11.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [7]:
def top_10000_tokens(dictionnaire):

    tokens_tries = sorted(dictionnaire.items(), key=lambda item: item[1], reverse=True)
    top_tokens = tokens_tries[:10000]
    return top_tokens

# Récupérer les 10000 tokens les plus fréquents pour chaque dictionnaire
top_tokens1 = top_10000_tokens(tokens_pct_dict)
top_tokens2 = top_10000_tokens(tokens_pct_dict_KBBERT)

print(top_tokens1)
print(top_tokens2)

[('1', 99.0), ('2', 99.0), ('kl.', 98.0), ('3', 98.0), ('4', 98.0), ('5', 97.0), ('förslag', 97.0), ('6', 96.0), ('denna', 96.0), ('8', 96.0), ('Kammaren', 96.0), ('anledning', 95.0), ('7', 95.0), ('samt', 95.0), ('10', 94.0), ('är', 94.0), ('9', 93.0), ('vissa', 93.0), ('m', 93.0), ('fråga', 93.0), ('11', 93.0), ('år', 93.0), ('12', 92.0), ('dessa', 92.0), ('åt', 91.0), ('15', 91.0), ('14', 91.0), ('13', 90.0), ('Herr', 90.0), ('20', 90.0), ('grund', 89.0), ('fall', 89.0), ('tid', 89.0), ('16', 89.0), ('18', 89.0), ('sätt', 89.0), ('17', 88.0), ('frågor', 88.0), ('proposition', 88.0), ('nya', 88.0), ('hos', 88.0), ('Stockholm', 88.0), ('19', 88.0), ('frågan', 87.0), ('sådan', 87.0), ('tiden', 87.0), ('21', 87.0), ('åtgärder', 87.0), ('väl', 87.0), ('bör', 87.0), ('sådana', 86.0), ('30', 86.0), ('viss', 86.0), ('ytterligare', 86.0), ('25', 86.0), ('22', 86.0), ('angående', 86.0), ('sådant', 86.0), ('ledamöter', 86.0), ('hela', 85.0), ('borde', 85.0), ('se', 85.0), ('dels', 85.0), ('23'

In [13]:
def calculate_percentage_overlap(top_tokens1, top_tokens2):
    set1 = set(top_tokens1)
    set2 = set(top_tokens2)
    
    # Find the intersection of the two sets
    common_tokens = set1.intersection(set2)
    
    # Calculate the percentage of common tokens with respect to top_tokens1
    if len(set1) == 0:
        return 0  # To handle division by zero if set1 is empty
    percentage_overlap = (len(common_tokens) / len(set1)) * 100
    
    return percentage_overlap

top_token1 = [token for token,pc in top_tokens1]
top_token2 = [token for token,pc in top_tokens2]
pc = calculate_percentage_overlap(top_token1,top_token2)
print(pc)
                                  

58.97


In [29]:
from datasets import Dataset
valid_dataset = Dataset.from_dict(valid_dataset)
with open("valid_dataset_exbert.pkl","wb") as f:
     pickle.dump(valid_dataset,f)

In [3]:
swerick_tokenizer = AutoTokenizer.from_pretrained("evaluation/swerick_tokenizer")

In [6]:
baseline_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [7]:
swerick_voc=list(swerick_tokenizer.vocab.keys())
base_voc = list(baseline_tokenizer.vocab.keys())

In [19]:
from pretraining_from_scratch import get_vocab_sim

inter,f,s,jaccard,vocab_f = get_vocab_sim(swerick_voc,base_voc)

In [20]:
interbis,fbis,sbis,jaccardbis,vocab_fbis= get_vocab_sim(vocab_f,new_tokens)

In [20]:
interb,fb,sb,jaccardb,vocab_fb= get_vocab_sim(new_tokens,vocab_f)

In [22]:
len(vocab_fb)

103

In [21]:
print("new vocab present in the exbert tokenizer but not in Swerick tokenizer" , vocab_fb)

new vocab present in the exbert tokenizer but not in Swerick tokenizer {'P.', 'upp-', 'd.', '1960-talet', 'om-', 't.o.m.', 'kl.', 'hälso-', 'A.', '1980-talet', 'Maj:t', 'kap.', '90-talet', 'k.', 'a.', 'm.m.', 'K.', 'fram-', 'fr.o.m.', 'B.', 'Stats-Utskottets', '2,5', '16.00', 'Kl.', 'T.', 'sam-', 'Maj:ts', 'dvs.', '3,000', '20,000', 'be-', 'EU:s', 'o.', 'E.', 'm.fl.', 'W.', '3:0', 'milj.', '1990-talet', '100,000', 'kungl.', 'bl.a.', 'Forts.', 's.', 'D.', '2,000', 'så-', 'O.', 'forts.', '14.00', '5,000', 'kom-', 'sid.', 'stats-', 'M.', 'osv.', 't.ex.', 'Kungl.', 's.k.', '1,000', 'ö.', 'V.', 'J.', '50,000', '1970-talet', 'ja-propositionen', '30,000', 'miljö-', '10.00', 'e.', 'H.', 'f.', 'v.', 'L.', 'för-', 'R.', '4:0', 'FN:s', 'F.', 'I.', 'resp.', 'före-', 'in-', 'N.', 't.', 'ang.', 'n:o', 'n.', 'C.', 'ut-', 'etc.', 'N:o', 'till-', 'Ang.', '2:0', '10,000', '4,000', 'an-', '1,5', 'kr.', 'S.', '1:0', 'G.'}


In [27]:
print(interbis)
print(fbis,sbis)
print("similarity of jaccard", jaccardbis)
print("new vocab present in the swerick tokenizer but not in new tokens" , vocab_fbis)

{'medgifver', 'utskotts', 'kd', 'lemnat', 'rss', 'anföra', 'verkställd', 'NN', 'afgifva', 'företagits', 'föredragningen', 'avvecklingen', 'sammanslutningar', 'uppdraga', 'utsträcka', 'nedlagt', 'tillvarataga', 'anhållit', 'budgetförslag', 'utjämna', 'tillsatts', 'rubbas', 'reservanten', 'Näringsutskottets', 'oaktadt', 'besvarat', 'medkammaren', 'dragas', 'samhällsekonomiskt', 'reformering', 'lagstifta', 'framskriden', 'Häruti', 'trängande', 'förbise', 'interpellationssvaret', 'kommunikationerna', 'behövliga', 'Skatteutskottets', 'betydliga', 'angelägenheten', 'uttömmande', 'hvilket', 'hitintills', 'rättssäkerheten', 'underdånig', 'påpekats', 'påpekande', 'remissdebatten', 'sättes', 'afgifna', 'skäligt', 'ikraftträdande', 'framlades', 'anslås', 'avgivit', 'intyga', 'riktigare', 'tillgripas', 'synnerligt', 'bilagt', 'stadgande', 'biträder', 'frågans', 'utföll', 'Cronvall', 'afser', 'lämpligheten', 'orimlighet', 'enahanda', 'partivänner', 'beskaffenheten', 'hemställan', 'slopande', 'tills

In [32]:
print(len(set(vocab_f)))
len(interbis)

22779


1527

In [26]:
print(len(interbis)/len(set(vocab_f)))
print(len(interbis)/len(new_tokens))

0.067035427367312
0.9368098159509203


In [31]:
new_vocab_fbis = [k for k in vocab_fbis if k.startswith("#") == False]
print(len(new_vocab_fbis))
print(new_vocab_fbis)


16923
['ändtligen', 'slakthus', 'diskriminerings', 'repetitionsövningar', 'fakulteter', 'vågsk', 'utbreda', 'målsättningarna', 'miljar', 'förbättrande', 'Ines', 'fjerdedel', 'berörande', 'livränta', 'kungsladu', 'Bonarp', 'inkomsttagarna', 'kärnbränsle', 'sökandes', 'WTO', 'eldnings', 'betydligare', 'dagordnings', 'diskriminerar', 'Riksb', 'oegentligheter', 'krigskonjunkturskatten', 'stadsfullmäktiges', 'förräntning', 'säsongs', 'konjunkturned', 'Regnéll', 'avsänt', 'varf', 'landtmäter', 'tvåkammar', 'kräf', 'händt', 'ronor', 'lantbrevbär', 'brottsbalken', 'oafvis', 'författningssamling', 'bruken', 'hufvudvoteringen', 'standardisering', 'kyrkolag', 'aktiebolagslagen', 'upprättar', 'skolbibliotek', 'valkretsen', 'landsbygdsutveckling', 'Öresundsregionen', 'hvilar', 'Biörsmark', 'utomäktenskapliga', 'Tobaks', 'lotterim', 'votera', 'interpeller', 'återuppbyggnads', 'arbetskrafter', 'glidning', 'strukturomvand', 'sjukfrånvaron', 'tydligast', 'Ceballos', 'förglöm', 'uranbrytning', 'näringsp

In [40]:
# deetct if some pre rform words have been added
pre_reform_token=[]
for token in new_tokens:
    if "dt" in token or "fv" in token or "hv" in token:
        print(token)
        pre_reform_token.append(token)
       

vidtaga
vidtagas
blifvit
hafva
hvad
öfver
hvilka
äfven
hvilken
hvilket
blifva
skrifvelse
särskildt
hvar
hvarje
äfvensom
huvudtiteln
vidtagna
gifva
öfvervägande
sagdt
blefve
behöfver
behöfva
hvaraf
hvarför
hvars
hvartill
vidt
hvarom
öfverläggningen
hvarigenom
öfvertygad
bestämdt
medgifva
oaktadt
öfverensstämmelse
hvarken
tvifvel
gifvit
hufvudsakligen
behofvet
hvarpå
gifvet
hvarefter
hvari
hvarandra
hvilkas
sjelfva
öfvertygelse
vidtagande
blefvo
gifver
hvadan
medgifver
utöfver
hvarmed
utöfva
ofvan
blifver
blifvande
hvart
huvudtitel
behöfves
Öfverläggningen
öfverläggning
Grefve
Äfven
Afven
antydt
afgifvit
Norstedt
afgifva
såvidt
föranledt
behöfvas
otvifvelaktigt
angifva
pröfva
hvem
hufvudtitel
öfverensstämmande
önskvärdt
godtycke
afgifvits
tillkännagifva
skrifvelser
kräfver
hårdt
deröfver
lefva
hvardera
kändt
hvaremot
hvarvid
godtyckligt
hufvudsakliga
värdt
drifva
hvila
vidtager
godtaga
hufvudsak
fattadt
vidtog
oförändradt
skrifva
upphäfvande
lifvet
medgifvande
