In [1]:

import torch
from datasets import load_dataset,concatenate_datasets,Dataset
import math
from torch.utils.data import DataLoader
import transformers
from transformers import AutoModelForMaskedLM, AutoModelForSequenceClassification,AutoTokenizer
from transformers import default_data_collator
from transformers import PreTrainedTokenizerFast
from transformers import  BertForSequenceClassification
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import argparse
import sys
import os
sys.path.append(os.path.abspath('../'))
import preprocessing
import preprocessing
import pickle
from sklearn.preprocessing import StandardScaler
from transformers import get_linear_schedule_with_warmup
import random
import re
import numpy as np
import pandas as pd
from evaluation import reform_scratch_classfication
from evaluation import regression_year
from lime.lime_text import LimeTextExplainer
from collections import defaultdict

In [2]:


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


Models Used

In [4]:
# Baseline checkpoint. Its sequence-classification head is newly initialised
# (see the warning printed below), so it must be fine-tuned before use.
model_checkpoint = "KBLab/bert-base-swedish-cased"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_checkpoint),
    AutoModelForSequenceClassification.from_pretrained(model_checkpoint),
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at KBLab/bert-base-swedish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Locally stored checkpoint, loaded with a classification head and placed on `device`.
model_finetuned = AutoModelForSequenceClassification.from_pretrained(
    "/home/laurinemeier/swerick/finetuning_hugging_whitespace_bis-finetuned-imdb/checkpoint-2175500"
).to(device)

In [None]:
# Tokenizer and classification model for the "exbert" checkpoint, both read from
# local disk; the model is placed on `device`.
exbert_tokenizer = AutoTokenizer.from_pretrained(
    "/home/laurinemeier/swerick/exbert_tokenizer"
)
model_exbert = AutoModelForSequenceClassification.from_pretrained(
    "/home/laurinemeier/swerick/exbert-finetuned-imdb/checkpoint-6054000"
).to(device)

In [None]:
# Tokenizer, config and masked-LM weights of the from-scratch checkpoint.
# NOTE: trust_remote_code=True lets the checkpoint supply its own Python code,
# so only load checkpoints you trust.
scratch_checkpoint = "/home/laurinemeier/swerick/pretraining_scratch/checkpoint-5258900"
swerick_tokenizer = AutoTokenizer.from_pretrained(
    "/home/laurinemeier/swerick/swerick_tokenizer"
)
config = transformers.BertConfig.from_pretrained(scratch_checkpoint)
mosaicBert = AutoModelForMaskedLM.from_pretrained(
    scratch_checkpoint,
    config=config,
    trust_remote_code=True,
)

In [3]:
def filter_NaN(subset, example):
    """Tell whether `example` has a value for the column named `subset`.

    Args:
        subset: Key (column name) to look up in `example`.
        example: Mapping holding one dataset row.

    Returns:
        False when ``example[subset]`` is ``None``, True otherwise. Only
        ``None`` is treated as missing; a float NaN is not detected.
    """
    value = example[subset]
    if value is None:
        return False
    return True


In [4]:
def subset_label(dataset, nb_obs, label_name):
    """Draw a random subset of `dataset` that is balanced across labels.

    The budget `nb_obs` is divided evenly (integer division) between the
    distinct values of `label_name`. Each label contributes that many randomly
    chosen rows, or all of its rows when it has fewer. Rows whose label is
    missing are left out, because pandas ``groupby`` drops null keys by default.

    Args:
        dataset: Hugging Face ``Dataset`` (anything exposing ``to_pandas()``
            and ``select(indices)``).
        nb_obs: Total number of rows requested across all labels.
        label_name: Name of the column holding the label.

    Returns:
        The selected rows in random order, as returned by ``dataset.select``.
        The result is empty when there is no label to sample from or when the
        per-label budget rounds down to zero.
    """
    df = dataset.to_pandas()

    # `.indices` maps each label to *positional* row numbers, which is what
    # `select` expects. `.groups` returns index labels instead, and those only
    # coincide with positions when the frame has a default RangeIndex.
    positions_by_label = df.groupby(label_name).indices
    if not positions_by_label:
        return dataset.select([])

    # Even split of the budget between the labels.
    samples_per_label = nb_obs // len(positions_by_label)

    chosen_positions = []
    for positions in positions_by_label.values():
        take = min(len(positions), samples_per_label)
        # Sample without replacement so no row is picked twice.
        picked = np.random.choice(positions, size=take, replace=False)
        chosen_positions.extend(picked.tolist())

    # Shuffle so rows of the same label are not contiguous.
    np.random.shuffle(chosen_positions)

    # Selecting by position avoids materialising every row as a Python dict.
    return dataset.select(chosen_positions)

In [5]:
def subset_random(dataset, nb_ob):
    """Return `nb_ob` rows of `dataset` picked at random.

    The dataset is shuffled (no seed is passed) and the first `nb_ob` rows of
    the shuffled result are selected.

    Args:
        dataset: Object exposing ``shuffle()`` and ``select(indices)``.
        nb_ob: Number of rows to keep.

    Returns:
        Whatever ``select`` returns for the first `nb_ob` shuffled rows.
    """
    return dataset.shuffle().select(range(nb_ob))

In [6]:
def tokenize_function(examples):
    """Tokenize the "Note" field of `examples` with the notebook-level `tokenizer`.

    Sequences are padded and truncated, with a maximum length of 512.

    Args:
        examples: Mapping with a "Note" entry holding the text(s) to tokenize.

    Returns:
        The tokenizer's output for those texts.
    """
    return tokenizer(
        examples["Note"],
        padding=True,
        truncation=True,
        max_length=512,
    )

In [7]:
def evaluate(model, loader):
    """Evaluate a sequence-classification model over every batch of `loader`.

    Args:
        model: Model called as ``model(input_ids, token_type_ids=None,
            attention_mask=..., labels=...)`` whose output exposes ``.loss``
            and ``.logits``.
        loader: Sized iterable of batches, each a mapping with "input_ids",
            "attention_mask" and "labels" tensors.

    Returns:
        A ``(loss, accuracy)`` tuple. ``loss`` is a Python float holding the
        SUM of the per-batch losses (not their average). ``accuracy`` is a
        scalar tensor with the fraction of correct predictions over all
        examples, or NaN when the loader yields nothing.
    """
    total_loss = 0.0
    n_correct, n_seen = 0, 0
    model.eval()
    # Inference only: skipping autograd bookkeeping saves memory and time.
    with torch.no_grad():
        for batch in tqdm(loader, total=len(loader)):
            input_ids = batch["input_ids"].to(device)
            input_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            output = model(input_ids,
                token_type_ids=None,
                attention_mask=input_mask,
                labels=labels)
            total_loss += output.loss.item()
            preds_batch = torch.argmax(output.logits, dim=1)
            hits = preds_batch == labels
            # Count examples rather than averaging per-batch means, so a
            # shorter final batch is not over-weighted.
            n_correct += hits.sum().item()
            n_seen += hits.numel()

    accuracy = torch.tensor(n_correct / n_seen if n_seen else float("nan"))
    return total_loss, accuracy


In [8]:
def extract_date_from_filename(protocole):
    """Read the year from a protocol path such as 'data/1867/prot-1867--ak--0119.xml'.

    The first path segment made up only of digits (enclosed by '/' on both
    sides) is located, and its first four characters are converted to an int.

    Args:
        protocole: Path string of the protocol file.

    Returns:
        The year as an int, or None when the path has no all-digit segment.
    """
    found = re.search(r'/(\d+)/', protocole)
    if not found:
        return None
    digits = found.group(1)
    # Only the leading four characters are used as the year.
    return int(digits[:4])

In [9]:
def determine_reform(example):
    """Add a 'reform' field to `example` based on its 'date'.

    Args:
        example: Mutable mapping with a numeric 'date' entry.

    Returns:
        The same mapping, updated in place with ``example['reform']`` set to
        'pre' when the date is 1912 or earlier and to 'post' otherwise.
    """
    if example['date'] <= 1912:
        label = 'pre'
    else:
        label = 'post'
    example['reform'] = label
    return example

In [10]:
def year_filter(example, year):
    """Keep rows dated `year` or later.

    Args:
        example: Mapping with a numeric 'date' entry.
        year: Earliest year to keep (inclusive).

    Returns:
        True when ``example['date'] >= year``, False otherwise.
    """
    # The leftover debug print was removed: this runs once per row, so it
    # flooded the cell output on datasets with millions of rows.
    return example['date'] >= year

In [11]:
def filter_notes(example):
    """Tell whether the 'Note' text contains 'Herr' or 'Fru'.

    The check is a case-sensitive substring test.

    Args:
        example: Mapping with a string 'Note' entry.

    Returns:
        True when either substring occurs in the note, False otherwise.
    """
    text = example['Note']
    return any(title in text for title in ('Herr', 'Fru'))

In [12]:
def filter_size(example):
    """Tell whether the 'Note' text has more than 30 whitespace-separated words.

    Args:
        example: Mapping with a string 'Note' entry.

    Returns:
        True when the note contains strictly more than 30 words.
    """
    return len(example['Note'].split()) > 30

In [13]:
def predict_proba(texts, model, tokenizer):
    """Return class probabilities for `texts`, in the form LIME expects.

    Args:
        texts: A single string or an iterable of strings.
        model: Classification model (a ``torch.nn.Module``) whose output
            exposes ``.logits``.
        tokenizer: Callable tokenizer returning a mapping of tensors when
            called with ``return_tensors='pt'``.

    Returns:
        NumPy array with one row per text, holding the softmax of the logits
        over the last dimension.
    """
    # A lone string becomes a batch of one; other iterables are materialised
    # so the tokenizer always receives a plain list.
    batch = [texts] if isinstance(texts, str) else list(texts)
    encoded = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=512)

    # Inputs must live on the same device as the model's weights; otherwise a
    # model placed on the GPU fails on CPU inputs.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        encoded = {name: tensor.to(first_param.device) for name, tensor in encoded.items()}

    with torch.no_grad():
        logits = model(**encoded).logits
        # .cpu() is required before .numpy() when the logits are on the GPU.
        probabilities = torch.nn.functional.softmax(logits, dim=-1).cpu().numpy()
    return probabilities

In [14]:
# Train/test splits stored as pickled pandas DataFrames, with paths relative to
# the working directory. Pickle files can execute code on load, so only read
# files you trust.
data_files = {
    "train": "swerick_data_party_train.pkl",
    "test": "swerick_data_party_test.pkl",
}
party_dataset = load_dataset("pandas", data_files=data_files)
print(party_dataset)

DatasetDict({
    train: Dataset({
        features: ['protocole', 'Note', 'id', 'party', 'gender'],
        num_rows: 3378877
    })
    test: Dataset({
        features: ['protocole', 'Note', 'id', 'party', 'gender'],
        num_rows: 725974
    })
})


In [11]:
# Note-vs-segment data (columns 'Text' and 'Note/seg'), stored as pickled pandas
# DataFrames. `data_files` is re-bound here and no longer points at the party files.
data_files = {
    "train": "/home/laurinemeier/swerick/evaluation/swerick_data_seg_train.pkl",
    "test": "/home/laurinemeier/swerick/evaluation/swerick_data_seg_test.pkl",
}
seg_dataset = load_dataset("pandas", data_files=data_files)
print(seg_dataset["train"][:5])

{'Text': ['statsverkspropositionen gjorda framställningar rörande utgifterna\n          för budgetåret 1969/70 inom försvarsdepartementets verksamhetsområde\n          jämte motioner.', 'Lördagen den 1 april e. m. Nr 29. 87', 'Herr Restadius, som yttrade:', 'Nr 65.', 'Regeringens agerande i husläkarfrågan och i propositionen om\n            fri etablering är sådant som har lett fram till politiska låsningar\n            i utformningen av välfärden. Men som jag ser det gäller motsättningarna\n            främst långtgående nedskärningar och privatiseringar i välfärdssamhället.'], 'Note/seg': ['note', 'note', 'note', 'note', 'seg'], '__index_level_0__': [6608920, 3521581, 1119166, 2126533, 10261505]}


Getting the date of the protocol

In [15]:
# Derive the year of each speech from its protocol path.
# Only the 'protocole' column is read: iterating over whole rows would decode
# every column (including the long 'Note' text) for each of the millions of rows.
dates = [extract_date_from_filename(path) for path in party_dataset['train']['protocole']]
dates_test = [extract_date_from_filename(path) for path in party_dataset['test']['protocole']]

# NOTE: add_column makes this cell non-idempotent. Re-running it on a dataset
# that already has a 'date' column is expected to fail.
party_dataset['train'] = party_dataset['train'].add_column('date', dates)
party_dataset['test'] = party_dataset['test'].add_column('date', dates_test)

print(party_dataset["train"][0])
print(party_dataset["test"][0])

{'protocole': 'data/1867/prot-1867--ak--0119.xml', 'Note': 'För min del kan jag ej se något hinder för företagande af dessa val ännu något tidigare än Herr Talmannen föreslagit. Jag har föreställt mig, att vi så mycket som möjligt borde taga vara på den dyrbara tiden, och att nu ifrågakomna val kunde förrättas t. ex. på Tisdagen, och valen till suppleanter i Utskotten på Thorsdagen.', 'id': 'i-PyY1Vo1W6WaajphhpnKHN8', 'party': None, 'gender': 'man', 'date': 1867}
{'protocole': 'data/1867/prot-1867--ak--0118.xml', 'Note': 'Mine Herrar! Sannolikt är det för många bland Eder oväntadt att se en prest intaga talmansstolen i Riksdagens Andra Kammare. Det sker också icke efter min egen önskan. Jag har lika litet eftersträfvat detta höga och ansvarsfulla förtroende, som jag sökt sjelfva riksdagsmannakallets hedrande uppdrag. Men då vahnännen inom den krets jag tillhör lemnat mig det sednare, och Kongl. Maj:t sedermera behagat förläna mig det förra, har jag ej haft giltiga skäl att undandraga m

In [13]:
# Label each row relative to the spelling reform: determine_reform adds a 'reform' column ('pre' / 'post').
party_dataset = party_dataset.map(determine_reform)

NameError: name 'determine_reform' is not defined

In [None]:
# Year filter: keep only rows dated 2000 or later.
# NOTE: this overwrites `party_dataset`, so the unfiltered data is only
# recoverable by reloading it.
party_dataset = party_dataset.filter(year_filter, fn_kwargs={"year": 2000})
party_dataset

In [16]:
# Standardize the date.
# The scaler is fitted on the training split only and then applied to both
# splits, so no test-set statistics are used.
date_scaler = StandardScaler()

dates_train_2d = [[year] for year in party_dataset['train']["date"]]
dates_test_2d = [[year] for year in party_dataset['test']["date"]]

date_scaler.fit(dates_train_2d)
dates_train = date_scaler.transform(dates_train_2d)
dates_test = date_scaler.transform(dates_test_2d)
print(dates_train)

# squeeze() flattens the (n, 1) scaler output into one value per row.
for split_name, scaled_dates in (("train", dates_train), ("test", dates_test)):
    party_dataset[split_name] = party_dataset[split_name].add_column('date_scaled', scaled_dates.squeeze())


[[-2.57723263]
 [-2.57723263]
 [-2.57723263]
 ...
 [ 1.05391207]
 [ 1.05391207]
 [ 1.05391207]]


Filter NaN Values

In [19]:
# Drop rows without a gender label from both splits.
for split_name in ("train", "test"):
    party_dataset[split_name] = party_dataset[split_name].filter(lambda x : filter_NaN("gender",x))

Filter Text for Party Classification

In [None]:
# Keep notes that mention 'Herr' or 'Fru' and are longer than 30 words.
party_dataset_note_train = party_dataset['train'].filter(filter_notes).filter(filter_size)
party_dataset_note_test = party_dataset['test'].filter(filter_notes).filter(filter_size)


Taking a subset for training and testing

In [None]:
# Balanced subset: an equal share of the budget for each party label
# (a label with fewer rows contributes all of them).
# Both splits are drawn from `party_dataset`. The train line used to read
# `party_dataset_year`, which no visible cell defines, so it raised NameError
# on a fresh kernel.
# NOTE(review): confirm `party_dataset` (rather than `party_dataset_note_train`)
# is the intended training source.
party_train_datasets = subset_label(party_dataset["train"],10000,"party")
party_test_datasets = subset_label(party_dataset["test"],5000,"party")
#party_valid_datasets = subset_label(party_valid_dataset["valid"],5000,"party")

In [None]:
# Random (not label-balanced) subsets of each split.
train_set = subset_random(dataset=party_dataset["train"], nb_ob=1000)
test_set = subset_random(dataset=party_dataset["test"], nb_ob=10000)


In [13]:
# Export the training subset as a CSV file with 'content' / 'tag' columns.
# NOTE(review): 'Text' and 'Note/seg' are seg_dataset columns, while `train_set`
# is built from party_dataset in the cell above. Confirm which dataset this
# export is meant for; on party data the rename changes nothing.
df = train_set.to_pandas().rename(columns={"Text":"content","Note/seg" : "tag"})
print(df.head())
df.to_csv("swerick_subsetdata_note_train.csv", index=False)

                                             content   tag  __index_level_0__
0                           Torsdagen den Gjuni 1985  note            8950568
1  Avkastningen av den hälftenandel som faller på...   seg            8239669
2  Beträffande herr Kilboms invändningar så talad...   seg            3530266
3  Transportstyrelsen har givit SMHI i uppdrag at...   seg           12627748
4   Enligt uppgift i pressen, IDAG den 25 april 1991  note            9785354


Finetuning Model

In [17]:
# Training for a specific task: the arguments are an output name, two model
# checkpoints, the train and test CSV files, and the second model's tokenizer.
reform_scratch_classfication(
    "trained_gender_classification",
    "/home/laurinemeier/swerick/finetuning_hugging_whitespace_bis-finetuned-imdb/checkpoint-2061000",
    "/home/laurinemeier/swerick/exbert_60k-finetuned-imdb/checkpoint-4611166",
    "/home/laurinemeier/swerick/evaluation/swerick_subsetdata_gender_train.csv",
    "/home/laurinemeier/swerick/evaluation/swerick_subsetdata_gender_test.csv",
    tokenizer2="/home/laurinemeier/swerick/exbert_tokenizer_60k",
)
#regression_year("/home/laurinemeier/swerick/exbert_60k-finetuned-imdb/checkpoint-4806004","/home/laurinemeier/swerick/evaluation/swerick_subsetdata_date_train100.csv")


Party alignement classification
training


KeyboardInterrupt: 

In [17]:
# Train multiple times, each time on a fresh random training subset.

# One path for both writing and reading. The CSV used to be written relative to
# the working directory but read from this absolute location, so the two only
# matched when the notebook ran from that directory.
date_train_csv = "/home/laurinemeier/swerick/evaluation/swerick_subsetdata_date_train200.csv"

for i in range(5):
    train_set = subset_random(party_dataset["train"], 200)
    df = train_set.to_pandas()
    # regression_year is given a CSV whose text column is 'content' and whose
    # target column is 'tag' (here, the standardized year).
    df = df.rename(columns={"Note": "content", "date_scaled": "tag"})

    df.to_csv(date_train_csv)
    # Report the actual round; every iteration used to print "First round".
    print(f"Round {i + 1}")
    #reform_scratch_classfication("trained_gender_classification","/home/laurinemeier/swerick/finetuning_hugging_whitespace_bis-finetuned-imdb/checkpoint-2061000","/home/laurinemeier/swerick/exbert_60k-finetuned-imdb/checkpoint-4611166","/home/laurinemeier/swerick/evaluation/swerick_subsetdata_gender_train.csv","/home/laurinemeier/swerick/evaluation/swerick_subsetdata_gender_test.csv",tokenizer2="/home/laurinemeier/swerick/exbert_tokenizer_60k")
    regression_year(
        "/home/laurinemeier/swerick/exbert_60k-finetuned-imdb/checkpoint-4806004",
        date_train_csv,
        "/home/laurinemeier/swerick/exbert_tokenizer_60k",
    )
        

First round
Year regression
training
stdout: [32m12:06:03 [INFO] [37m(train-bert)[0m: Load and save tokenizer...[0m
[32m12:06:25 [INFO] [37m(train-bert)[0m: Preprocess datasets...[0m
[32m12:06:25 [INFO] [37m(train-bert)[0m: Labels: tensor([ 0.2522,  0.3465, -1.5869,  0.8653, -1.5398, -1.5162,  1.0539, -0.1015,
        -1.7520,  0.5823, -0.5495, -2.4829,  0.5352, -1.5633,  0.8181, -0.3372,
         0.3937,  1.0068,  0.6767, -0.4316, -0.2193, -1.3747,  0.9360,  0.6059,
        -0.0543,  0.8889,  0.5588,  0.9596, -1.5633,  0.9360,  1.0068,  0.2994,
         0.4409,  0.5588,  0.3701,  0.2994, -2.3650, -1.4926,  0.8889,  0.4173,
        -1.1389,  0.4644, -1.3983,  0.6531,  0.3701, -0.7617,  0.7238,  0.9596,
         0.7474, -2.1764,  0.9596, -1.8934, -0.6909,  1.0068,  0.9596, -2.5065,
        -1.2568, -2.5537, -0.0779,  0.2758,  0.5116, -0.2193,  0.3465,  0.4880,
         0.7945,  0.7945,  0.8889,  0.5352,  0.4644,  1.0539, -1.4926,  1.0068,
         1.0539, -1.0918, -0.3608,  0

Explaining classification: LIME explainer

In [None]:
# Select the examples to explain.
df = pd.read_csv('/home/laurinemeier/swerick/evaluation/trained_note_classification054000/correct_exbert_notKBBERT.csv')

MAX_TEXT_LENGTH = 1000  # longer texts make the explainer crash


def _clip_text(value):
    """Cut strings to MAX_TEXT_LENGTH characters; return non-strings unchanged."""
    if isinstance(value, str):
        return value[:MAX_TEXT_LENGTH]
    return value


df['content'] = df['content'].map(_clip_text)
# Fixed seed, so the same 10 rows are drawn on every run.
df = df.sample(n=10, random_state=42)
print(df)

In [None]:
def create_token_dicts():
    """Create a pair of independent, empty token counters.

    Returns:
        A 2-tuple of ``defaultdict(int)`` objects; missing keys start at 0.
        The loop that consumes them unpacks the pair as (pre, post).
    """
    pre_counts = defaultdict(int)
    post_counts = defaultdict(int)
    return pre_counts, post_counts


# Find the tokens responsible for each classification.
# Each entry maps a model name to (fine-tuned model directory, tokenizer).
models = {
   'kbbert': ('/home/laurinemeier/swerick/evaluation/trained_reform_classification',tokenizer),
    'sbertex': ('/home/laurinemeier/swerick/evaluation/trained_reform_classification054000', exbert_tokenizer),
    'daptbert': ('/home/laurinemeier/swerick/evaluation/trained_reform_classification061000',tokenizer),
   # 'sparlbert': ('/home/laurinemeier/swerick/evaluation/trained_reform_classification258900', swerick_tokenizer),
}

# One setting for both the number of features LIME fits and the number of
# tokens tallied. The explainer was asked for 2 features while the tally sliced
# the top 3, so at most 2 tokens were ever counted.
NUM_LIME_FEATURES = 2

explainer = LimeTextExplainer(class_names=[0, 1])

token_freqs = {model_name: create_token_dicts() for model_name in models}

# The loop variables are named lime_model / lime_tokenizer so they do not
# overwrite the notebook-level `model` and `tokenizer` used by other cells.
for model_name, (model_path, lime_tokenizer) in models.items():
    print(model_name)
    lime_model = AutoModelForSequenceClassification.from_pretrained(model_path)
    pre_tokens_freq, post_tokens_freq = token_freqs[model_name]

    for _, row in df.iterrows():
        text = row['content']
        print(row['tag'])

        exp = explainer.explain_instance(
            text,
            lambda x: predict_proba(x, lime_model, lime_tokenizer),
            num_features=NUM_LIME_FEATURES,
        )
        exp.show_in_notebook(text=True, labels=(exp.available_labels()[0],))

        # Predict the label through predict_proba, so the prediction uses the
        # same truncation (max 512 tokens) and no-grad path that LIME sees.
        prediction = int(predict_proba([text], lime_model, lime_tokenizer).argmax(axis=1)[0])
        predicted_label = 'pre' if prediction == 1 else 'post'

        # Tokens ordered from the highest to the lowest LIME weight.
        sorted_list = sorted(exp.as_list(), key=lambda x: x[1], reverse=True)
        important_tokens = [word for word, importance in sorted_list][:NUM_LIME_FEATURES]

        # Tally the tokens under the class the model predicted.
        target_freq = pre_tokens_freq if predicted_label == 'pre' else post_tokens_freq
        for token in important_tokens:
            target_freq[token] += 1
