In [40]:
from transformers import AutoModelForMaskedLM, AutoModelForSequenceClassification,AutoTokenizer
import torch
from datasets import load_dataset
import math
from torch.utils.data import DataLoader
from transformers import default_data_collator
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import preprocessing
import argparse
import preprocessing
import pickle
from sklearn.preprocessing import LabelEncoder

In [3]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [37]:
# Checkpoint used for both the classification model and the tokenizer.
model_checkpoint = "KBLab/bert-base-swedish-cased"
# NOTE(review): no `num_labels` is passed, so the classification head gets the
# library's default size — confirm that matches the number of party labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at KBLab/bert-base-swedish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Sequence-classification model built from the local checkpoint, placed on `device`.
# NOTE(review): the recorded output of this cell reports the pooler and classifier
# weights as newly initialized for this checkpoint — confirm the checkpoint is the
# one intended for classification.
model_finetuned = AutoModelForSequenceClassification.from_pretrained(
    "finetuning_hugging_python-finetuned-imdb/checkpoint-920384"
).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at finetuning_hugging_python-finetuned-imdb/checkpoint-920384 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
# Tokenizer belonging to the base checkpoint; used by `tokenize_function`.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [5]:
def filter_NaN(subset, example):
    """Tell whether ``example`` holds a value for the field named ``subset``.

    Args:
        subset: Key to look up in ``example``.
        example: Mapping-like row.

    Returns:
        ``True`` when ``example[subset]`` is anything other than ``None``.
        Only ``None`` counts as missing; a float NaN is kept.
    """
    value = example[subset]
    return value is not None


In [23]:
def subset(dataset, nb_obs):
    """Split ``dataset`` into consecutive chunks of at most ``nb_obs`` rows.

    Args:
        dataset: Object supporting ``len()`` and ``select(indices=...)``.
        nb_obs: Maximum number of rows per chunk.

    Returns:
        A list of the chunks, in order. Each chunk is the result of
        ``dataset.select`` over a contiguous index range; the last chunk is
        shorter when ``len(dataset)`` is not a multiple of ``nb_obs``.
    """
    n_rows = len(dataset)
    # Ceiling division: a trailing partial chunk counts as one more chunk.
    n_chunks = -(-n_rows // nb_obs)
    return [
        dataset.select(indices=range(k * nb_obs, min((k + 1) * nb_obs, n_rows)))
        for k in range(n_chunks)
    ]

In [45]:
def tokenize_function(examples):
    """Tokenize the "Note" field of ``examples`` with the notebook-level ``tokenizer``.

    Padding and truncation are both enabled; the tokenizer's output is returned
    unchanged.
    """
    # NOTE(review): padding=True pads to the longest sequence of a single call, so
    # under `map(..., batched=True)` the padded length may differ between map
    # batches — confirm this is what the downstream loaders expect.
    return tokenizer(examples["Note"], padding=True, truncation=True)

In [4]:
# Load the train and test splits from their pickle files through the "pandas" builder.
data_files = {
    "train": "swerick_data_party_train.pkl",
    "test": "swerick_data_party_test.pkl",
}
party_dataset = load_dataset("pandas", data_files=data_files)
print(party_dataset)

DatasetDict({
    train: Dataset({
        features: ['protocole', 'Note', 'id', 'party', 'gender'],
        num_rows: 3378877
    })
    test: Dataset({
        features: ['protocole', 'Note', 'id', 'party', 'gender'],
        num_rows: 725974
    })
})


In [15]:
# Load the validation split from its pickle file (this rebinds `data_files`).
# NOTE(review): the recorded outputs show the same row count for "valid" and
# "test" (725974) — confirm the two files really hold different rows.
data_files = {
    "valid": "swerick_data_party_valid.pkl",
}
party_valid_dataset = load_dataset("pandas", data_files=data_files)
print(party_valid_dataset)

Generating valid split: 0 examples [00:00, ? examples/s]

DatasetDict({
    valid: Dataset({
        features: ['protocole', 'Note', 'id', 'party', 'gender'],
        num_rows: 725974
    })
})


In [6]:
# Keep only the rows of each split whose "party" field is not None.
party_dataset["train"] = party_dataset["train"].filter(
    lambda row: filter_NaN("party", row)
)
party_dataset["test"] = party_dataset["test"].filter(
    lambda row: filter_NaN("party", row)
)

Filter:   0%|          | 0/3378877 [00:00<?, ? examples/s]

Filter:   0%|          | 0/725974 [00:00<?, ? examples/s]

In [17]:
# Keep only the validation rows whose "party" field is not None.
party_valid_dataset["valid"] = party_valid_dataset["valid"].filter(
    lambda row: filter_NaN("party", row)
)

Filter:   0%|          | 0/725974 [00:00<?, ? examples/s]

In [10]:
# Fit the encoder on the training parties, then build an integer id -> party name lookup.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(party_dataset["train"]["party"])
label_names = label_encoder.classes_
label_dict = dict(enumerate(label_names))
print(label_dict)

{0: '"vänstern"', 1: 'Andra kammarens center', 2: 'Andra kammarens frihandelsparti', 3: 'Bondeförbundet', 4: 'Centern (partigrupp 1873-1882)', 5: 'Centern (partigrupp 1885-1887)', 6: 'Centerpartiet', 7: 'Det förenade högerpartiet', 8: 'Ehrenheimska partiet', 9: 'Folkpartiet', 10: 'Folkpartiet (1895–1900)', 11: 'Friesenska diskussionsklubben', 12: 'Frihandelsvänliga centern', 13: 'Frisinnade folkpartiet', 14: 'Frisinnade försvarsvänner', 15: 'Frisinnade landsföreningen', 16: 'Första kammarens konservativa grupp', 17: 'Första kammarens ministeriella grupp', 18: 'Första kammarens minoritetsparti', 19: 'Första kammarens moderata parti', 20: 'Första kammarens nationella parti', 21: 'Första kammarens protektionistiska parti', 22: 'Gamla lantmannapartiet', 23: 'Högerns riksdagsgrupp', 24: 'Högerpartiet', 25: 'Högerpartiet de konservativa', 26: 'Jordbrukarnas fria grupp', 27: 'Junkerpartiet', 28: 'Kilbomspartiet', 29: 'Kommunistiska partiet', 30: 'Kristdemokraterna', 31: 'Lantmanna- och borgar

In [12]:
# Add an integer "party_labels" column to both splits using the fitted encoder.
# The encoder is applied to a whole map batch at a time (batched=True) instead of
# once per row, which avoids millions of one-element `transform` calls while
# producing the same column.
# NOTE(review): `transform` raises on a party that was not present when the
# encoder was fitted on the train split — confirm the test split has no such rows.
party_dataset["train"] = party_dataset["train"].map(
    lambda batch: {"party_labels": label_encoder.transform(batch["party"])},
    batched=True,
)
party_dataset["test"] = party_dataset["test"].map(
    lambda batch: {"party_labels": label_encoder.transform(batch["party"])},
    batched=True,
)

Map:   0%|          | 0/3167750 [00:00<?, ? examples/s]

Map:   0%|          | 0/676215 [00:00<?, ? examples/s]

In [18]:
# Add an integer "party_labels" column to the validation split using the fitted
# encoder. The encoder is applied to a whole map batch at a time (batched=True)
# instead of once per row, which produces the same column with far fewer calls.
# NOTE(review): `transform` raises on a party that was not present when the
# encoder was fitted — confirm the validation split has no such rows.
party_valid_dataset["valid"] = party_valid_dataset["valid"].map(
    lambda batch: {"party_labels": label_encoder.transform(batch["party"])},
    batched=True,
)

Map:   0%|          | 0/676215 [00:00<?, ? examples/s]

In [14]:
import train_binary_bert

In [24]:
# Cut every split into consecutive chunks: 1000 rows per chunk for train,
# 10000 rows per chunk for test and validation.
party_train_datasets = subset(party_dataset["train"], 1000)
party_test_datasets = subset(party_dataset["test"], 10000)
party_valid_datasets = subset(party_valid_dataset["valid"], 10000)

In [25]:
# Work with the first chunk of each split only.
train_set = party_train_datasets[0]
test_set = party_test_datasets[0]
valid_set = party_valid_datasets[0]

In [46]:
# Tokenize the three working sets; `tokenize_function` receives whole batches of rows.
tokenized_train_datasets = train_set.map(tokenize_function, batched=True)
tokenized_test_datasets = test_set.map(tokenize_function, batched=True)
tokenized_valid_datasets = valid_set.map(tokenize_function, batched=True)
# Display the resulting training set (features and row count).
tokenized_train_datasets

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset({
    features: ['protocole', 'Note', 'id', 'party', 'gender', 'party_labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1000
})

In [None]:
 train_loader = DataLoader(
            tokenized_train_datasets,
            shuffle=True,
            batch_size = batch_size,
            num_workers = num_workers
        )

    valid_loader = DataLoader(
            tokenized_valid_datasets,
            shuffle=False,
            batch_size = batch_size,
            num_workers = num_workers
        )

    # Not used atm
    test_loader = DataLoader(
            test_dataset,
            shuffle=False,
            batch_size = args.batch_size,
            num_workers = args.num_workers
        )

In [29]:
# Launch train_binary_bert.py in a shell with label names, the three sets and a batch size of 64.
# NOTE(review): the recorded output shows this invocation failing. `{label_names}` and
# `{train_set}` are expanded to the text form of the Python objects, which the shell then
# splits into separate commands (see the "fish: Unknown command" errors), and argparse
# rejects the --train_set value. `test_set` and `valid_set` are written without braces, so
# they are passed as those literal words.
# TODO confirm what train_binary_bert.py expects for these arguments (presumably file
# paths) and pass the label names as individually quoted arguments.
!python3 train_binary_bert.py --label_names {label_names} --train_set {train_set} --test_set test_set --valid_set valid_set --batch_size 64 

usage: train_binary_bert.py [-h] [--model_filename MODEL_FILENAME]
                            [--base_model BASE_MODEL] [--tokenizer TOKENIZER]
                            [--label_names LABEL_NAMES [LABEL_NAMES ...]]
                            [--train_set TRAIN_SET] [--test_set TEST_SET]
                            [--valid_set VALID_SET] [--device DEVICE]
                            [--n_epochs N_EPOCHS] [--batch_size BATCH_SIZE]
                            [--num_workers NUM_WORKERS]
                            [--learning_rate LEARNING_RATE]
                            [--train_ratio TRAIN_RATIO]
                            [--valid_ratio VALID_RATIO]
train_binary_bert.py: error: argument --train_set: invalid Dataset value: 'swerick_data_party_train.pkl'
fish: Unknown command: Bondeförbundet
fish: 
 'Bondeförbundet' 'Centern (partigrupp 1873-1882)'
 ^
fish: Unknown command: 'Centern (partigrupp 1885-1887)'
fish: 
 'Centern (partigrupp 1885-1887)' 'Centerpartiet'
 ^
fish: Unknown

In [None]:
def compare_models(model1, model2, loader, dataset, label_names, tokenizer, device,
                   output_path='data/compare_misclassified_examples.csv'):
    """Evaluate two classifiers on the same batches and export the examples either one gets wrong.

    The original cell was a pasted function body: it mixed the names ``model`` /
    ``model_hugging_face`` / ``model1`` / ``model2``, read ``args.device`` and
    relied on several names that this notebook never defines. Everything it
    needs is now an explicit parameter.

    Args:
        model1: First model. It is called as
            ``model(input_ids, token_type_ids=None, attention_mask=..., labels=...)``
            and must return an object exposing ``.loss`` and ``.logits``.
        model2: Second model, same calling convention.
        loader: Sized iterable of batches. Each batch is indexed positionally:
            ``batch[0]`` input ids, ``batch[1]`` attention mask, ``batch[2]`` labels.
        dataset: pandas DataFrame with the columns ``content``, ``github`` and
            ``protocol_id``, used to recover metadata for misclassified texts.
        label_names: Sequence mapping a class id to its display name.
        tokenizer: Object with ``decode(ids, skip_special_tokens=True)``.
        device: Device the batch tensors are moved to. The models are not moved
            here and must already live on that device.
        output_path: CSV file receiving the misclassified examples.

    Returns:
        dict with the mean per-batch loss and accuracy of each model
        (``loss1``, ``loss2``, ``accuracy1``, ``accuracy2``) and the
        ``misclassified`` DataFrame that was written to ``output_path``.
    """
    # Imported here because the notebook's import cell does not provide these names.
    import Levenshtein
    import pandas as pd
    from sklearn.metrics import accuracy_score, classification_report

    report_columns = ['text', 'true_label', 'predicted1', 'predicted2', 'github', 'protocol_id']
    class_names = list(label_names)
    # Passing the full id range keeps classification_report aligned with
    # `target_names` even when some classes never occur in the evaluated batches.
    class_ids = list(range(len(class_names)))

    loss1, loss2 = 0.0, 0.0
    accuracy1, accuracy2 = [], []
    true_labels, pred_labels1, pred_labels2 = [], [], []
    misclassified_examples = []

    model1.eval()
    model2.eval()
    # Evaluation only: disable gradient tracking.
    with torch.no_grad():
        for batch in tqdm(loader, total=len(loader)):
            input_ids = batch[0].to(device)
            input_mask = batch[1].to(device)
            labels = batch[2].to(device)
            output1 = model1(input_ids, token_type_ids=None, attention_mask=input_mask, labels=labels)
            output2 = model2(input_ids, token_type_ids=None, attention_mask=input_mask, labels=labels)
            loss1 += output1.loss.item()
            loss2 += output2.loss.item()
            preds_batch1 = torch.argmax(output1.logits, dim=1)
            preds_batch2 = torch.argmax(output2.logits, dim=1)
            # Store plain floats rather than tensors so nothing stays on `device`.
            accuracy1.append(torch.mean((preds_batch1 == labels).float()).item())
            accuracy2.append(torch.mean((preds_batch2 == labels).float()).item())
            true_labels.extend(labels.cpu().numpy())
            pred_labels1.extend(preds_batch1.cpu().numpy())
            pred_labels2.extend(preds_batch2.cpu().numpy())

            for true_label, pred_label1, pred_label2, input_id in zip(labels, preds_batch1, preds_batch2, input_ids):
                if true_label == pred_label1 and true_label == pred_label2:
                    continue  # both models are right
                text = tokenizer.decode(input_id, skip_special_tokens=True)
                # The decoded text is not byte-identical to the source text, so the
                # source row is looked up by similarity (ratio >= 0.9).
                matching_rows = dataset[dataset['content'].apply(lambda x: Levenshtein.ratio(text, x) >= 0.9)]
                if matching_rows.empty:
                    print(f"no matching row for text: {text}")
                    continue
                github = matching_rows['github'].iloc[0]
                protocol_id = matching_rows['protocol_id'].iloc[0]
                misclassified_examples.append({
                    'text': text,
                    'true_label': label_names[true_label.item()],
                    'predicted1': label_names[pred_label1.item()],
                    'predicted2': label_names[pred_label2.item()],
                    'github': github,
                    'protocol_id': protocol_id,
                })

    # Fixing the columns up front keeps the export valid when nothing was misclassified.
    misclassified_df = pd.DataFrame(misclassified_examples, columns=report_columns)
    misclassified_df.to_csv(output_path, index=False)

    # Print misclassified examples
    print("\nMisclassified Examples:")
    print(misclassified_examples)

    print("\nAccuracy model 1:", accuracy_score(true_labels, pred_labels1))
    print("\nClassification Report:")
    print(classification_report(true_labels, pred_labels1, labels=class_ids, target_names=class_names))

    print("\nAccuracy model 2:", accuracy_score(true_labels, pred_labels2))
    print("\nClassification Report:")
    print(classification_report(true_labels, pred_labels2, labels=class_ids, target_names=class_names))

    n_batches = max(len(loader), 1)  # avoid dividing by zero on an empty loader
    return {
        'loss1': loss1 / n_batches,
        'loss2': loss2 / n_batches,
        'accuracy1': sum(accuracy1) / n_batches,
        'accuracy2': sum(accuracy2) / n_batches,
        'misclassified': misclassified_df,
    }