In [1]:
import torch
from transformers.file_utils import is_tf_available, is_torch_available

import numpy as np
import random
import pandas as pd
import re
import math

import mlflow
import torch
mlflow.end_run()
torch.cuda.empty_cache()

# Collectors for misclassified articles, one list per experiment
# (bin_* = binary task, multi_* = multi-class task; suffix = inputs used).

bin_topics, bin_features, bin_embeds = [], [], []
bin_topics_features, bin_topics_embeds, bin_features_embeds = [], [], []

multi_topics, multi_features, multi_embeds = [], [], []
multi_topics_features, multi_topics_embeds, multi_features_embeds = [], [], []





def set_seed(seed: int):
    """
    Seed every random number generator in use so that runs are repeatable.

    ``random`` and ``numpy`` are always seeded; ``torch`` and ``tf`` are seeded only when
    ``transformers`` reports them as installed. Adapted from
    https://www.thepythoncode.com/article/finetuning-bert-using-huggingface-transformers-python

    Args:
        seed (:obj:`int`): The seed to set.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

    if is_torch_available():
        torch.manual_seed(seed)
        # seeding all CUDA devices is safe even when CUDA is not available
        torch.cuda.manual_seed_all(seed)

    if is_tf_available():
        import tensorflow as tf

        tf.random.set_seed(seed)

In [2]:
set_seed(42)
from sklearn.model_selection import train_test_split
import pandas as pd
!pip install bs4 --user
from bs4 import BeautifulSoup

def clean_vector(v):
    """Return ``v`` with every entry greater than 900000000 replaced by 0."""
    too_large = v > 900000000
    return np.where(too_large, 0, v)
    
def normalize(v):
    """
    Scale a vector to unit Euclidean (L2) length.

    Args:
        v: 1-D sequence or array of numbers.

    Returns:
        ``np.ndarray`` of dtype ``float32`` with the same length as ``v``. An all-zero
        or empty input comes back as zeros instead of being divided by a zero norm.
    """
    values = np.asarray(v, dtype=np.float64)
    norm = np.linalg.norm(values)
    if norm == 0:
        # A zero norm would turn every component into NaN, which then poisons the
        # loss of any model fed with this vector.
        return np.zeros(values.shape, dtype=np.float32)
    return (values / norm).astype(np.float32)

def read_data():
    """
    Load the annotated corpus and split it into training and validation parts.

    Reads ``MV_data_final_features_topics.csv``, strips HTML markup from the ``content``
    column, parses the ``topic_vector`` and ``feature_vector`` columns into ``float32``
    arrays (feature vectors are additionally passed through ``normalize``) and remaps the
    ``class`` column to consecutive labels. Rows whose class is not 1, 2 or 4 are dropped.

    Returns:
        The list produced by ``train_test_split`` for documents, labels, topic vectors and
        feature vectors, i.e. ``train_texts, valid_texts, train_labels, valid_labels,
        train_topics, valid_topics, train_features, valid_features``.
    """
    # original class id -> consecutive label id
    label_by_class = {
        1: 0,  # kritiikki
        2: 1,  # kopiointi
        4: 2,  # oma narratiivi
    }

    dataset = pd.read_csv("MV_data_final_features_topics.csv")
    # Filter the whole frame by class first. Skipping unknown classes only while building
    # the label list would leave labels shorter than (and misaligned with) the documents
    # and vectors taken from the same rows.
    dataset = dataset[dataset['class'].isin(list(label_by_class))].reset_index(drop=True)

    # Name the parser explicitly so the extracted text does not depend on which optional
    # parser libraries happen to be installed.
    dataset['content'] = dataset['content'].apply(lambda x: BeautifulSoup(x, "html.parser").get_text())
    print(list(dataset['content'])[0])

    # topic vectors are stored as "[a, b, ...]" strings
    topic_vectors_prep = [vector.strip("[").strip("]") for vector in list(dataset["topic_vector"])]
    topic_vectors = [np.fromstring(vector, dtype=float, sep=',').astype(np.float32) for vector in topic_vectors_prep]
    print(type(topic_vectors[0][0]))

    feature_vectors_prep = [np.fromstring(vector, dtype=float, sep=",").astype(np.float32) for vector in list(dataset["feature_vector"])]
    print(f"feature vec = {feature_vectors_prep[0]}")

    feature_vectors = [normalize(v) for v in feature_vectors_prep]
    print(f"feature vec_clean = {type(feature_vectors[0])}")
    print(f"feature vec_clean = {type(feature_vectors[0][0])}")
    print(f"feature vec_clean = {feature_vectors[0]}")
    print(len(feature_vectors[0]))

    new_labels = [label_by_class[c] for c in dataset['class']]
    documents = list(dataset['content'])
    return train_test_split(documents, new_labels, topic_vectors, feature_vectors, random_state=42)
  
# Load the corpus; the unpacking order follows train_test_split's output
# (train/valid pairs for texts, labels, topic vectors and feature vectors).
train_texts, valid_texts, train_labels, valid_labels, train_topics, valid_topics, train_features, valid_features = read_data()
class_labels = ["kritiikki", "kopiointi", "oma narratiivi"]

# Per-class document counts for each split and for the whole corpus
train_classes, train_class_counts = np.unique(train_labels, return_counts=True)
valid_classes, valid_class_counts = np.unique(valid_labels, return_counts=True)
all_classes, all_class_counts = np.unique(np.concatenate([train_labels, valid_labels]), return_counts=True)
all_classes, all_class_counts

##print(f"len train: {len(train_topics)}")
##print(f"len valid: {len(valid_topics)}")

##print(train_texts[0])

##print(train_labels[0])

##print(type(train_topics[0]))

##print(len(train_labels))


Verkkouuutiset uutisoi tänään, että Perussuomalaisten Teuvo Hakkarainen puhui tiistaina eduskunnassa ulkomaalaislain käsittelyssä.
Hakkaraisen mielestä esityksen sisältämät asiat ovat positiivisia askelia, mutta eivät riittäviä.
Teuvo Hakkarainen.
Teuvo Hakkarainen sanoi:
”Valtaosa turvapaikkaturisteista on tullut ainakin kymmenen turvallisen maan läpi Suomeen, eikä heillä kotimaassakaan ole ollut konkreettista henkeen ja terveyteen kohdistuvaa uhkaa, vaikka kaikenlaisia tarinoita he ovatkin oppineet kertomaan.
Enimmäkseen he ovat ilmaisen sosiaaliturvan perässä reissaavia nuoria miehiä, joita ei kiinnosta rakentaa omaa isänmaataan. Heitä kiinnostaa siivestäminen.”
”Koska alun alkaenkaan he eivät täytä kansainvälistä suojelua koskevia kriteereitä, en tiedä, miksi heillä ylipäänsä pitäisi olla valitusoikeudet.
Valitusoikeus kuormittaa oikeuslaitostamme kohtuuttomasti. Se ensimmäinenkin hakemus pitäisi tehdä pikapäätöksenä rajalla.
Jos asiallisia henkilöpapereita ei ole, hakemusta ei pit

(array([0, 1, 2]), array([ 81, 770, 146]))

In [3]:
set_seed(42)
def split_encodings(labels, topics, features, encodings, max_length):
    """
    Cut every tokenized document into consecutive snippets of ``max_length`` tokens.

    Each snippet inherits the label, topic vector and feature vector of the document it
    came from. The last snippet of a document is right-padded with zeros up to
    ``max_length``; documents without tokens produce no snippet.

    Args:
        labels: One label per document.
        topics: One topic vector per document.
        features: One feature vector per document.
        encodings: Mapping with per-document token lists under ``'input_ids'``,
            ``'token_type_ids'`` and ``'attention_mask'``.
        max_length: Snippet length in tokens; must be positive.

    Returns:
        ``(new_labels, new_topic_vectors, new_feature_vectors, new_encodings)`` where
        ``new_encodings`` has the same three keys as ``encodings`` and all four hold one
        entry per snippet. The last snippet of each document is a NumPy array (result of
        padding); earlier snippets are plain slices of the input sequences.

    Raises:
        ValueError: If ``max_length`` is not positive (the split could never finish).
    """
    if max_length <= 0:
        raise ValueError(f"max_length must be positive, got {max_length}")

    encoding_keys = ('input_ids', 'token_type_ids', 'attention_mask')
    new_labels = []
    new_topic_vectors = []
    new_feature_vectors = []
    new_encodings = {key: [] for key in encoding_keys}

    for index, label in enumerate(labels):
        doc_length = len(encodings['input_ids'][index])
        for start in range(0, doc_length, max_length):
            stop = start + max_length
            is_last = stop >= doc_length
            for key in encoding_keys:
                snippet = encodings[key][index][start:stop]
                if is_last:
                    # np.pad is the public API; np.lib.pad was only an internal alias and
                    # is not available in every NumPy release.
                    snippet = np.pad(snippet, (0, stop - doc_length), constant_values=0)
                new_encodings[key].append(snippet)
            new_labels.append(label)
            new_topic_vectors.append(topics[index])
            new_feature_vectors.append(features[index])

    return (new_labels, new_topic_vectors, new_feature_vectors, new_encodings)


# Pretrained checkpoint and the snippet length used when cutting documents
model_name = "TurkuNLP/bert-base-finnish-uncased-v1"
max_length = 512

from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained(model_name)

# Tokenize whole documents; the splitting into max_length snippets happens afterwards
train_encodings = tokenizer(train_texts)
valid_encodings = tokenizer(valid_texts)

train_snippets_labels, train_snippets_topics, train_snippets_features, train_snippets_encodings = split_encodings(
    train_labels, train_topics, train_features, train_encodings, max_length
)
valid_snippets_labels, valid_snippets_topics, valid_snippets_features, valid_snippets_encodings = split_encodings(
    valid_labels, valid_topics, valid_features, valid_encodings, max_length
)

all_snippets_classes, all_snippets_class_counts = np.unique(
    np.concatenate([train_snippets_labels, valid_snippets_labels]), return_counts=True
)
# Cell output: document vs. snippet counts per split, then snippet counts per class
(
    len(train_labels),
    len(train_snippets_labels),
    len(valid_labels),
    len(valid_snippets_labels),
    all_snippets_classes,
    all_snippets_class_counts,
)

##print(type(valid_snippets_topics[2]))


Token indices sequence length is longer than the specified maximum sequence length for this model (954 > 512). Running this sequence through the model will result in indexing errors


(747, 1144, 250, 367, array([0, 1, 2]), array([ 137, 1040,  334]))

In [4]:
# What is below is mostly a copy of BertForSequenceClassification, but with an added class_weights parameter, 
# which gets used to tune the loss function. While this could be done in other ways (compute_loss in Trainer)
# this copy also acts as a useful insight into what actually happens within the classifier

from transformers.models.bert import BertPreTrainedModel,BertModel
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.models.bert.configuration_bert import BertConfig
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

class BertConfigWithClassWeights(BertConfig):
    """
    ``BertConfig`` carrying the extra options of the weighted sequence classifier.

    Args:
        class_weights: Optional per-class weights for the loss function.
        freeze_bert_weights: Whether the BERT encoder parameters are excluded from training.
        use_topics: Whether topic vectors are part of the classifier input.
        use_features: Whether feature vectors are part of the classifier input.
        use_embeddings: Whether the pooled BERT output is part of the classifier input.
        **kwargs: Passed on unchanged to ``BertConfig``.
    """

    def __init__(
        self,
        class_weights=None,
        freeze_bert_weights=False,
        use_topics=False,
        use_features=False,
        use_embeddings=False,
        **kwargs
    ):
        extra_options = {
            "class_weights": class_weights,
            "freeze_bert_weights": freeze_bert_weights,
            "use_topics": use_topics,
            "use_features": use_features,
            "use_embeddings": use_embeddings,
        }
        # The extra options are stored before the base initialiser runs.
        for option_name, option_value in extra_options.items():
            setattr(self, option_name, option_value)
        super().__init__(**kwargs)

class BertForWeightedSequenceClassification(BertPreTrainedModel):
    """
    BERT sequence classifier with a class-weighted loss and optional extra input vectors.

    Mostly follows ``BertForSequenceClassification``, with two additions:

    * ``config.class_weights`` is passed to the loss function;
    * the classifier input is chosen by ``config.use_embeddings`` / ``config.use_topics`` /
      ``config.use_features`` and may combine the pooled BERT output with the per-example
      ``topics`` and ``features`` vectors.

    NOTE(review): the classification head is only sized for two cases -- all three inputs
    combined (``hidden_size + 80 + 200``) and the pooled output alone (``hidden_size``).
    Any other flag combination whose combined width differs from the head's input size is
    rejected in ``forward`` with a ``ValueError``. Extend ``__init__`` once the individual
    topic / feature widths are confirmed.
    """
    config_class = BertConfigWithClassWeights

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config
        self.class_weights = torch.tensor(self.config.class_weights) if self.config.class_weights else None
        # Without configured weights the attribute stays None, so only move a real tensor.
        if self.class_weights is not None and torch.cuda.is_available():
            self.class_weights = self.class_weights.to("cuda")
        self.bert = BertModel(config)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        print("arvot")
        print(config.hidden_size, config.num_labels)

        if self.config.use_features and self.config.use_embeddings and self.config.use_topics:
            # pooled output plus both extra vectors (80 + 200 extra dimensions)
            self.classifier_in_features = config.hidden_size + 80 + 200
            self.classifier = nn.Sequential(
                nn.Linear(self.classifier_in_features, self.classifier_in_features),
                nn.ReLU(),
                nn.Dropout(),
                nn.Linear(self.classifier_in_features, config.num_labels),
            )
        else:
            # single linear layer over a hidden_size-wide input
            self.classifier_in_features = config.hidden_size
            self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()
        if config.freeze_bert_weights:
            for param in self.bert.parameters():
                param.requires_grad = False

    def _combine_inputs(self, pooled_output, topics, features):
        """Build the classifier input from the pooled output and the extra vectors the config selects."""
        use_topics = bool(self.config.use_topics)
        use_features = bool(self.config.use_features)
        use_embeddings = bool(self.config.use_embeddings)

        if use_topics and topics is None:
            raise ValueError("config.use_topics is set but no `topics` tensor was passed to forward()")
        if use_features and features is None:
            raise ValueError("config.use_features is set but no `features` tensor was passed to forward()")

        # The concatenation order of each combination is kept exactly as before so that
        # previously trained heads still see their inputs in the same layout.
        if use_topics and use_features:
            parts = (pooled_output, features, topics) if use_embeddings else (topics, features)
        elif use_topics:
            parts = (pooled_output, topics) if use_embeddings else (topics,)
        elif use_features:
            parts = (pooled_output, features) if use_embeddings else (features,)
        else:
            # no extra vectors requested: plain pooled BERT output
            return pooled_output

        # Batched concatenation along the feature axis; unlike a per-sample loop over
        # input_ids this also works when only inputs_embeds is supplied.
        return torch.cat(parts, dim=-1)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        topics=None,
        features=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        topics (:obj:`torch.FloatTensor`, `optional`):
            Per-example topic vectors, batch first. Required when :obj:`config.use_topics` is set.
        features (:obj:`torch.FloatTensor`, `optional`):
            Per-example feature vectors, batch first. Required when :obj:`config.use_features` is set.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = self._combine_inputs(outputs[1], topics, features)
        if pooled_output.shape[-1] != self.classifier_in_features:
            # Fail with an actionable message instead of an opaque matmul shape error.
            raise ValueError(
                f"classifier expects inputs of width {self.classifier_in_features} but the configured "
                f"combination (use_embeddings={self.config.use_embeddings}, use_topics={self.config.use_topics}, "
                f"use_features={self.config.use_features}) produced width {pooled_output.shape[-1]}"
            )

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            # The weights are a plain attribute (not a parameter or buffer), so they do not
            # follow the module between devices; align them with the logits here.
            class_weights = self.class_weights
            if class_weights is not None:
                class_weights = class_weights.to(logits.device)

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss(weight=class_weights)
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss(pos_weight=class_weights)
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )



In [5]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Metrics reported by the trainer after every evaluation pass
def compute_metrics(pred):
    """
    Compute accuracy and the confusion matrix for one evaluation pass.

    Args:
        pred: Prediction object exposing ``label_ids`` and ``predictions`` (class scores).

    Returns:
        Dict with ``'accuracy'`` and ``'confusion_matrix'`` (the matrix rendered as a string).
    """
    true_labels = pred.label_ids
    predicted_labels = pred.predictions.argmax(-1)
    metrics = {'accuracy': accuracy_score(true_labels, predicted_labels)}
    metrics['confusion_matrix'] = str(confusion_matrix(true_labels, predicted_labels))
    return metrics

# Thin container whose main job is to hand out torch tensors on access
class Dataset(torch.utils.data.Dataset):
    """Dataset over tokenized snippets together with their labels, topic and feature vectors."""

    def __init__(self, encodings, labels, topics, features):
        self.encodings, self.labels = encodings, labels
        self.topics, self.features = topics, features

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return entry ``idx`` as a dict of tensors: every encoding field plus labels, topics, features."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        extras = (("labels", self.labels), ("topics", self.topics), ("features", self.features))
        for field, values in extras:
            sample[field] = torch.tensor(values[idx])
        return sample
    



# Experiment 1: Binary categorization into kopiointi / other

# TOPICS ONLY

In [None]:
set_seed(42)
mlflow.end_run()

# Binary task: "kopiointi" (label 1) against the two other classes (label 0)
class_labels_binary = ["kritiikki/oma narratiivi", "kopiointi"]
train_snippets_labels_binary = [int(x == 1) for x in train_snippets_labels]
valid_snippets_labels_binary = [int(x == 1) for x in valid_snippets_labels]
all_snippets_classes_binary, all_snippets_class_counts_binary = np.unique(
    train_snippets_labels_binary + valid_snippets_labels_binary, return_counts=True
)
train_dataset_binary = Dataset(
    train_snippets_encodings, train_snippets_labels_binary, train_snippets_topics, train_snippets_features
)
valid_dataset_binary = Dataset(
    valid_snippets_encodings, valid_snippets_labels_binary, valid_snippets_topics, valid_snippets_features
)
num_classes = 2
# inverse class frequency as loss weight
class_weights = (1 / all_snippets_class_counts_binary).tolist()

# Inputs for this run: topic vectors only, BERT encoder frozen
freeze_bert_weights = True
use_topics, use_features, use_embeddings = True, False, False

print(all_snippets_class_counts_binary,class_weights)

from transformers import Trainer, TrainingArguments
from transformers.trainer_utils import SchedulerType

batch_size = 32

training_args_binary = TrainingArguments(
    output_dir='./results_binary',               # checkpoints
    logging_dir='./logs_binary',                 # logs
    num_train_epochs=400,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=512,
    warmup_steps=max(10, 1000 // 2 // batch_size),  # learning-rate warmup
    # log, evaluate and save on the same step interval
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_steps=max(1, 4000 // batch_size),
    save_steps=max(1, 4000 // batch_size),
    # reload the best checkpoint when training ends (default metric is loss)
    load_best_model_at_end=True,
)

model_binary = BertForWeightedSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
    problem_type='single_label_classification',
    class_weights=class_weights,
    freeze_bert_weights=freeze_bert_weights,
    use_topics=use_topics,
    use_features=use_features,
    use_embeddings=use_embeddings,
)
if torch.cuda.is_available():
    model_binary = model_binary.to("cuda")

trainer_binary = Trainer(
    model=model_binary,                   # model to train
    args=training_args_binary,            # training arguments defined above
    train_dataset=train_dataset_binary,
    eval_dataset=valid_dataset_binary,
    compute_metrics=compute_metrics,      # metrics callback
)
trainer_binary.train()

In [None]:


# Predict on the validation snippets and collect every misclassified one
preds = trainer_binary.predict(valid_dataset_binary)
print(preds.metrics['test_confusion_matrix'])

# Empty the collector in place first so that re-running this cell does not append duplicates
bin_topics.clear()
for index, (logits, label) in enumerate(zip(preds.predictions, preds.label_ids)):
    pred_label = logits.argmax()
    if pred_label != label:
        text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(valid_snippets_encodings['input_ids'][index]))
        # raw string: "\[" is an invalid escape sequence in an ordinary string literal
        text = re.sub(r"\[PAD\]", "", text).strip()
        print(f"* Predicted {class_labels_binary[pred_label]} ({logits}) != real {class_labels_binary[label]} for:\n{text}")
        bin_topics.append((text, class_labels_binary[pred_label], class_labels_binary[label]))

# rows are (snippet text, predicted label, real label); the data is a list, not a dict
df = pd.DataFrame(bin_topics)
df.to_csv("bin_topics.csv")

In [None]:
# Show the third collected misclassification; raises IndexError if fewer than three were collected
bin_topics[2]

# FEATURES ONLY

In [None]:
set_seed(42)
mlflow.end_run()

# Binary task: "kopiointi" (label 1) against the two other classes (label 0)
class_labels_binary = ["kritiikki/oma narratiivi", "kopiointi"]
train_snippets_labels_binary = [int(x == 1) for x in train_snippets_labels]
valid_snippets_labels_binary = [int(x == 1) for x in valid_snippets_labels]
all_snippets_classes_binary, all_snippets_class_counts_binary = np.unique(
    train_snippets_labels_binary + valid_snippets_labels_binary, return_counts=True
)
train_dataset_binary = Dataset(
    train_snippets_encodings, train_snippets_labels_binary, train_snippets_topics, train_snippets_features
)
valid_dataset_binary = Dataset(
    valid_snippets_encodings, valid_snippets_labels_binary, valid_snippets_topics, valid_snippets_features
)
num_classes = 2
# inverse class frequency as loss weight
class_weights = (1 / all_snippets_class_counts_binary).tolist()

# Inputs for this run: feature vectors only, BERT encoder frozen
freeze_bert_weights = True
use_features, use_topics, use_embeddings = True, False, False

print(all_snippets_class_counts_binary,class_weights)

from transformers import Trainer, TrainingArguments
from transformers.trainer_utils import SchedulerType

batch_size = 32

training_args_binary = TrainingArguments(
    output_dir='./results_binary',               # checkpoints
    logging_dir='./logs_binary',                 # logs
    num_train_epochs=400,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=512,
    warmup_steps=max(10, 1000 // 2 // batch_size),  # learning-rate warmup
    # log, evaluate and save on the same step interval
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_steps=max(1, 4000 // batch_size),
    save_steps=max(1, 4000 // batch_size),
    # reload the best checkpoint when training ends (default metric is loss)
    load_best_model_at_end=True,
)

model_binary = BertForWeightedSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
    problem_type='single_label_classification',
    class_weights=class_weights,
    freeze_bert_weights=freeze_bert_weights,
    use_topics=use_topics,
    use_features=use_features,
    use_embeddings=use_embeddings,
)
if torch.cuda.is_available():
    model_binary = model_binary.to("cuda")

trainer_binary = Trainer(
    model=model_binary,                   # model to train
    args=training_args_binary,            # training arguments defined above
    train_dataset=train_dataset_binary,
    eval_dataset=valid_dataset_binary,
    compute_metrics=compute_metrics,      # metrics callback
)
trainer_binary.train()

In [None]:

# Predict on the validation snippets and collect every misclassified one
preds = trainer_binary.predict(valid_dataset_binary)
print(preds.metrics['test_confusion_matrix'])

# Empty the collector in place first so that re-running this cell does not append duplicates
bin_features.clear()
for index, (logits, label) in enumerate(zip(preds.predictions, preds.label_ids)):
    pred_label = logits.argmax()
    if pred_label != label:
        text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(valid_snippets_encodings['input_ids'][index]))
        # raw string: "\[" is an invalid escape sequence in an ordinary string literal
        text = re.sub(r"\[PAD\]", "", text).strip()
        print(f"* Predicted {class_labels_binary[pred_label]} ({logits}) != real {class_labels_binary[label]} for:\n{text}")
        bin_features.append((text, class_labels_binary[pred_label], class_labels_binary[label]))

# rows are (snippet text, predicted label, real label); the data is a list, not a dict
df = pd.DataFrame(bin_features)
df.to_csv("bin_features_normalized.csv")

# EMBEDDINGS ONLY

In [None]:
set_seed(42)
mlflow.end_run()

# Binary task: "kopiointi" (label 1) against the two other classes (label 0)
class_labels_binary = ["kritiikki/oma narratiivi", "kopiointi"]
train_snippets_labels_binary = [int(x == 1) for x in train_snippets_labels]
valid_snippets_labels_binary = [int(x == 1) for x in valid_snippets_labels]
all_snippets_classes_binary, all_snippets_class_counts_binary = np.unique(
    train_snippets_labels_binary + valid_snippets_labels_binary, return_counts=True
)
train_dataset_binary = Dataset(
    train_snippets_encodings, train_snippets_labels_binary, train_snippets_topics, train_snippets_features
)
valid_dataset_binary = Dataset(
    valid_snippets_encodings, valid_snippets_labels_binary, valid_snippets_topics, valid_snippets_features
)
num_classes = 2
# inverse class frequency as loss weight
class_weights = (1 / all_snippets_class_counts_binary).tolist()

# Inputs for this run: pooled BERT output only, BERT encoder frozen
freeze_bert_weights = True
use_topics, use_features, use_embeddings = False, False, True

print(all_snippets_class_counts_binary,class_weights)

from transformers import Trainer, TrainingArguments
from transformers.trainer_utils import SchedulerType

batch_size = 32

training_args_binary = TrainingArguments(
    output_dir='./results_binary',               # checkpoints
    logging_dir='./logs_binary',                 # logs
    num_train_epochs=400,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=512,
    warmup_steps=max(10, 1000 // 2 // batch_size),  # learning-rate warmup
    # log, evaluate and save on the same step interval
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_steps=max(1, 4000 // batch_size),
    save_steps=max(1, 4000 // batch_size),
    # reload the best checkpoint when training ends (default metric is loss)
    load_best_model_at_end=True,
)

model_binary = BertForWeightedSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
    problem_type='single_label_classification',
    class_weights=class_weights,
    freeze_bert_weights=freeze_bert_weights,
    use_topics=use_topics,
    use_features=use_features,
    use_embeddings=use_embeddings,
)
if torch.cuda.is_available():
    model_binary = model_binary.to("cuda")

trainer_binary = Trainer(
    model=model_binary,                   # model to train
    args=training_args_binary,            # training arguments defined above
    train_dataset=train_dataset_binary,
    eval_dataset=valid_dataset_binary,
    compute_metrics=compute_metrics,      # metrics callback
)
trainer_binary.train()

In [None]:

# Predict on the validation snippets and collect every misclassified one
preds = trainer_binary.predict(valid_dataset_binary)
print(preds.metrics['test_confusion_matrix'])

# Empty the collector in place first so that re-running this cell does not append duplicates
bin_embeds.clear()
for index, (logits, label) in enumerate(zip(preds.predictions, preds.label_ids)):
    pred_label = logits.argmax()
    if pred_label != label:
        text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(valid_snippets_encodings['input_ids'][index]))
        # raw string: "\[" is an invalid escape sequence in an ordinary string literal
        text = re.sub(r"\[PAD\]", "", text).strip()
        print(f"* Predicted {class_labels_binary[pred_label]} ({logits}) != real {class_labels_binary[label]} for:\n{text}")
        bin_embeds.append((text, class_labels_binary[pred_label], class_labels_binary[label]))

# rows are (snippet text, predicted label, real label); the data is a list, not a dict
df = pd.DataFrame(bin_embeds)
df.to_csv("bin_embeds.csv")

# TOPICS + FEATURES

In [None]:
# Binary experiment: use_topics=True, use_features=True, use_embeddings=False.
set_seed(42)
mlflow.end_run()

# Two-class labelling: original label 1 stays 1, every other label is mapped to 0.
class_labels_binary = ["kritiikki/oma narratiivi", "kopiointi"]
train_snippets_labels_binary = [1 if label == 1 else 0 for label in train_snippets_labels]
valid_snippets_labels_binary = [1 if label == 1 else 0 for label in valid_snippets_labels]

# Class counts are taken over the training and validation labels together.
all_snippets_classes_binary, all_snippets_class_counts_binary = np.unique(
    np.concatenate((train_snippets_labels_binary, valid_snippets_labels_binary)),
    return_counts=True,
)

train_dataset_binary = Dataset(
    train_snippets_encodings,
    train_snippets_labels_binary,
    train_snippets_topics,
    train_snippets_features,
)
valid_dataset_binary = Dataset(
    valid_snippets_encodings,
    valid_snippets_labels_binary,
    valid_snippets_topics,
    valid_snippets_features,
)

num_classes = 2
# Each class is weighted by the inverse of its count.
class_weights = (1 / all_snippets_class_counts_binary).tolist()
# Alternative weightings kept for reference:
#class_weights = ((np.sum(all_snippets_class_counts)-all_snippets_class_counts)/all_snippets_class_counts).tolist()
#class_weights = [5.0,1.0]
freeze_bert_weights = True
use_topics, use_features, use_embeddings = True, True, False

print(all_snippets_class_counts_binary, class_weights)

from transformers import Trainer, TrainingArguments
from transformers.trainer_utils import SchedulerType

batch_size = 32

training_args_binary = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='./results_binary',
    logging_dir='./logs_binary',
    # training length and batch sizes
    num_train_epochs=400,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=512,
    # learning-rate warmup, in steps
    warmup_steps=max(10, 1000 // 2 // batch_size),
    # logging and checkpointing use the same step interval; evaluation is step-based too
    logging_steps=max(1, 4000 // batch_size),
    save_steps=max(1, 4000 // batch_size),
    evaluation_strategy="steps",
    save_strategy="steps",
    # reload the best checkpoint when training ends; no `metric_for_best_model`
    # is passed, so the library default (loss) decides which one is best
    load_best_model_at_end=True,
    # disabled options:
#    weight_decay=0.01,
#    lr_scheduler_type = SchedulerType.COSINE_WITH_RESTARTS
)

model_binary = BertForWeightedSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
    problem_type='single_label_classification',
    class_weights=class_weights,
    freeze_bert_weights=freeze_bert_weights,
    use_topics=use_topics,
    use_features=use_features,
    use_embeddings=use_embeddings,
)
# Move the model to the GPU when one is available; otherwise keep it as loaded.
model_binary = model_binary.to("cuda") if torch.cuda.is_available() else model_binary

trainer_binary = Trainer(
    model=model_binary,
    args=training_args_binary,
    train_dataset=train_dataset_binary,
    eval_dataset=valid_dataset_binary,
    compute_metrics=compute_metrics,  # callback that computes the reported metrics
)
trainer_binary.train()

In [None]:

# Run the model held by `trainer_binary` on the binary validation set and show
# the confusion matrix reported in the prediction metrics.
preds = trainer_binary.predict(valid_dataset_binary)
print(preds.metrics['test_confusion_matrix'])

# Empty the container first so that re-running this cell does not store every
# misclassified snippet a second time.
bin_topics_features.clear()
for index, (logits, label) in enumerate(zip(preds.predictions, preds.label_ids)):
    pred_label = logits.argmax()
    if pred_label != label:
        # Turn the snippet's token ids back into text for manual inspection.
        text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(valid_snippets_encodings['input_ids'][index]))
        # Raw string: "\[" inside a plain literal is an invalid escape sequence.
        text = re.sub(r"\[PAD\]", "", text).strip()
        print(f"* Predicted {class_labels_binary[pred_label]} ({logits}) != real {class_labels_binary[label]} for:\n{text}")
        bin_topics_features.append((text, class_labels_binary[pred_label], class_labels_binary[label]))

# One row per misclassified snippet: text, predicted label, real label.
df = pd.DataFrame(bin_topics_features)
df.to_csv("bin_topics_features.csv")

# TOPICS + EMBEDDINGS

In [None]:
# Binary experiment: use_topics=True, use_features=False, use_embeddings=True.
set_seed(42)
mlflow.end_run()

# Two-class labelling: original label 1 stays 1, every other label is mapped to 0.
class_labels_binary = ["kritiikki/oma narratiivi", "kopiointi"]
train_snippets_labels_binary = [1 if label == 1 else 0 for label in train_snippets_labels]
valid_snippets_labels_binary = [1 if label == 1 else 0 for label in valid_snippets_labels]

# Class counts are taken over the training and validation labels together.
all_snippets_classes_binary, all_snippets_class_counts_binary = np.unique(
    np.concatenate((train_snippets_labels_binary, valid_snippets_labels_binary)),
    return_counts=True,
)

train_dataset_binary = Dataset(
    train_snippets_encodings,
    train_snippets_labels_binary,
    train_snippets_topics,
    train_snippets_features,
)
valid_dataset_binary = Dataset(
    valid_snippets_encodings,
    valid_snippets_labels_binary,
    valid_snippets_topics,
    valid_snippets_features,
)

num_classes = 2
# Each class is weighted by the inverse of its count.
class_weights = (1 / all_snippets_class_counts_binary).tolist()
# Alternative weightings kept for reference:
#class_weights = ((np.sum(all_snippets_class_counts)-all_snippets_class_counts)/all_snippets_class_counts).tolist()
#class_weights = [5.0,1.0]
freeze_bert_weights = True
use_topics, use_features, use_embeddings = True, False, True

print(all_snippets_class_counts_binary, class_weights)

from transformers import Trainer, TrainingArguments
from transformers.trainer_utils import SchedulerType

batch_size = 32

training_args_binary = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='./results_binary',
    logging_dir='./logs_binary',
    # training length and batch sizes
    num_train_epochs=400,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=512,
    # learning-rate warmup, in steps
    warmup_steps=max(10, 1000 // 2 // batch_size),
    # logging and checkpointing use the same step interval; evaluation is step-based too
    logging_steps=max(1, 4000 // batch_size),
    save_steps=max(1, 4000 // batch_size),
    evaluation_strategy="steps",
    save_strategy="steps",
    # reload the best checkpoint when training ends; no `metric_for_best_model`
    # is passed, so the library default (loss) decides which one is best
    load_best_model_at_end=True,
    # disabled options:
#    weight_decay=0.01,
#    lr_scheduler_type = SchedulerType.COSINE_WITH_RESTARTS
)

model_binary = BertForWeightedSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
    problem_type='single_label_classification',
    class_weights=class_weights,
    freeze_bert_weights=freeze_bert_weights,
    use_topics=use_topics,
    use_features=use_features,
    use_embeddings=use_embeddings,
)
# Move the model to the GPU when one is available; otherwise keep it as loaded.
model_binary = model_binary.to("cuda") if torch.cuda.is_available() else model_binary

trainer_binary = Trainer(
    model=model_binary,
    args=training_args_binary,
    train_dataset=train_dataset_binary,
    eval_dataset=valid_dataset_binary,
    compute_metrics=compute_metrics,  # callback that computes the reported metrics
)
trainer_binary.train()

In [None]:

# Run the model held by `trainer_binary` on the binary validation set and show
# the confusion matrix reported in the prediction metrics.
preds = trainer_binary.predict(valid_dataset_binary)
print(preds.metrics['test_confusion_matrix'])

# Empty the container first so that re-running this cell does not store every
# misclassified snippet a second time.
bin_topics_embeds.clear()
for index, (logits, label) in enumerate(zip(preds.predictions, preds.label_ids)):
    pred_label = logits.argmax()
    if pred_label != label:
        # Turn the snippet's token ids back into text for manual inspection.
        text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(valid_snippets_encodings['input_ids'][index]))
        # Raw string: "\[" inside a plain literal is an invalid escape sequence.
        text = re.sub(r"\[PAD\]", "", text).strip()
        print(f"* Predicted {class_labels_binary[pred_label]} ({logits}) != real {class_labels_binary[label]} for:\n{text}")
        bin_topics_embeds.append((text, class_labels_binary[pred_label], class_labels_binary[label]))

# One row per misclassified snippet: text, predicted label, real label.
df = pd.DataFrame(bin_topics_embeds)
df.to_csv("bin_topics_embeds.csv")

# FEATURES + EMBEDDINGS

In [None]:
# Binary experiment: use_topics=False, use_features=True, use_embeddings=True.
set_seed(42)
mlflow.end_run()

# Two-class labelling: original label 1 stays 1, every other label is mapped to 0.
class_labels_binary = ["kritiikki/oma narratiivi", "kopiointi"]
train_snippets_labels_binary = [1 if label == 1 else 0 for label in train_snippets_labels]
valid_snippets_labels_binary = [1 if label == 1 else 0 for label in valid_snippets_labels]

# Class counts are taken over the training and validation labels together.
all_snippets_classes_binary, all_snippets_class_counts_binary = np.unique(
    np.concatenate((train_snippets_labels_binary, valid_snippets_labels_binary)),
    return_counts=True,
)

train_dataset_binary = Dataset(
    train_snippets_encodings,
    train_snippets_labels_binary,
    train_snippets_topics,
    train_snippets_features,
)
valid_dataset_binary = Dataset(
    valid_snippets_encodings,
    valid_snippets_labels_binary,
    valid_snippets_topics,
    valid_snippets_features,
)

num_classes = 2
# Each class is weighted by the inverse of its count.
class_weights = (1 / all_snippets_class_counts_binary).tolist()
# Alternative weightings kept for reference:
#class_weights = ((np.sum(all_snippets_class_counts)-all_snippets_class_counts)/all_snippets_class_counts).tolist()
#class_weights = [5.0,1.0]
freeze_bert_weights = True
use_topics, use_features, use_embeddings = False, True, True

print(all_snippets_class_counts_binary, class_weights)

from transformers import Trainer, TrainingArguments
from transformers.trainer_utils import SchedulerType

batch_size = 32

training_args_binary = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='./results_binary',
    logging_dir='./logs_binary',
    # training length and batch sizes
    num_train_epochs=400,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=512,
    # learning-rate warmup, in steps
    warmup_steps=max(10, 1000 // 2 // batch_size),
    # logging and checkpointing use the same step interval; evaluation is step-based too
    logging_steps=max(1, 4000 // batch_size),
    save_steps=max(1, 4000 // batch_size),
    evaluation_strategy="steps",
    save_strategy="steps",
    # reload the best checkpoint when training ends; no `metric_for_best_model`
    # is passed, so the library default (loss) decides which one is best
    load_best_model_at_end=True,
    # disabled options:
#    weight_decay=0.01,
#    lr_scheduler_type = SchedulerType.COSINE_WITH_RESTARTS
)

model_binary = BertForWeightedSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
    problem_type='single_label_classification',
    class_weights=class_weights,
    freeze_bert_weights=freeze_bert_weights,
    use_topics=use_topics,
    use_features=use_features,
    use_embeddings=use_embeddings,
)
# Move the model to the GPU when one is available; otherwise keep it as loaded.
model_binary = model_binary.to("cuda") if torch.cuda.is_available() else model_binary

trainer_binary = Trainer(
    model=model_binary,
    args=training_args_binary,
    train_dataset=train_dataset_binary,
    eval_dataset=valid_dataset_binary,
    compute_metrics=compute_metrics,  # callback that computes the reported metrics
)
trainer_binary.train()

In [None]:

# Run the model held by `trainer_binary` on the binary validation set and show
# the confusion matrix reported in the prediction metrics.
preds = trainer_binary.predict(valid_dataset_binary)
print(preds.metrics['test_confusion_matrix'])

# Empty the container first so that re-running this cell does not store every
# misclassified snippet a second time.
bin_features_embeds.clear()
for index, (logits, label) in enumerate(zip(preds.predictions, preds.label_ids)):
    pred_label = logits.argmax()
    if pred_label != label:
        # Turn the snippet's token ids back into text for manual inspection.
        text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(valid_snippets_encodings['input_ids'][index]))
        # Raw string: "\[" inside a plain literal is an invalid escape sequence.
        text = re.sub(r"\[PAD\]", "", text).strip()
        print(f"* Predicted {class_labels_binary[pred_label]} ({logits}) != real {class_labels_binary[label]} for:\n{text}")
        bin_features_embeds.append((text, class_labels_binary[pred_label], class_labels_binary[label]))

# One row per misclassified snippet: text, predicted label, real label.
df = pd.DataFrame(bin_features_embeds)
df.to_csv("bin_features_embeds.csv")

# Experiment 2: three categories

# ONLY TOPICS

In [None]:
# Three-class experiment: use_topics=True, use_features=False, use_embeddings=False.
set_seed(42)
mlflow.end_run()

# The datasets always carry encodings, labels, topics and features; which of
# them the model is told to use is set by the use_* flags below.
train_dataset = Dataset(
    train_snippets_encodings,
    train_snippets_labels,
    train_snippets_topics,
    train_snippets_features,
)
valid_dataset = Dataset(
    valid_snippets_encodings,
    valid_snippets_labels,
    valid_snippets_topics,
    valid_snippets_features,
)

# Earlier variant without encodings, kept for reference:
#train_dataset = Dataset(None, train_snippets_labels, train_snippets_topics)
#valid_dataset = Dataset(None, valid_snippets_labels, valid_snippets_topics)

num_classes = 3
# Each class is weighted by the inverse of its count.
class_weights = (1 / all_snippets_class_counts).tolist()
# Alternative weightings kept for reference:
#class_weights = ((np.sum(all_snippets_class_counts)-all_snippets_class_counts)/all_snippets_class_counts).tolist()
#class_weights = [100.0,1.0,100.0]
freeze_bert_weights = True

use_topics, use_features, use_embeddings = True, False, False

print(all_snippets_class_counts, class_weights)

from transformers import Trainer, TrainingArguments
from transformers.trainer_utils import SchedulerType

batch_size = 32

training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    # training length and batch sizes
    num_train_epochs=400,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=512,
    # learning-rate warmup, in steps
    warmup_steps=max(10, 1000 // 2 // batch_size),
    # logging and checkpointing use the same step interval; evaluation is step-based too
    logging_steps=max(1, 4000 // batch_size),
    save_steps=max(1, 4000 // batch_size),
    evaluation_strategy="steps",
    save_strategy="steps",
    # reload the best checkpoint when training ends; no `metric_for_best_model`
    # is passed, so the library default (loss) decides which one is best
    load_best_model_at_end=True,
    # disabled options:
#    weight_decay=0.01,
#    lr_scheduler_type = SchedulerType.COSINE_WITH_RESTARTS
)
model = BertForWeightedSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
    problem_type='single_label_classification',
    class_weights=class_weights,
    freeze_bert_weights=freeze_bert_weights,
    use_topics=use_topics,
    use_features=use_features,
    use_embeddings=use_embeddings,
)
# Move the model to the GPU when one is available; otherwise keep it as loaded.
model = model.to("cuda") if torch.cuda.is_available() else model

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,  # callback that computes the reported metrics
)

trainer.train()

In [None]:



# Run the model held by `trainer` on the three-class validation set and show
# the confusion matrix reported in the prediction metrics.
preds = trainer.predict(valid_dataset)
print(preds.metrics['test_confusion_matrix'])

# Empty the container first so that re-running this cell does not store every
# misclassified snippet a second time.
multi_topics.clear()
for index, (logits, label) in enumerate(zip(preds.predictions, preds.label_ids)):
    pred_label = logits.argmax()
    if pred_label != label:
        # Turn the snippet's token ids back into text for manual inspection.
        text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(valid_snippets_encodings['input_ids'][index]))
        # Raw string: "\[" inside a plain literal is an invalid escape sequence.
        text = re.sub(r"\[PAD\]", "", text).strip()
        print(f"Predicted {class_labels[pred_label]} ({logits}) != real {class_labels[label]} for:\n{text}")
        multi_topics.append((text, class_labels[pred_label], class_labels[label]))

# One row per misclassified snippet: text, predicted label, real label.
df = pd.DataFrame(multi_topics)
df.to_csv("multi_topics.csv")

In [None]:
# Show the third misclassified snippet of the topics-only three-class run.
# The setup cell creates `multi_topics` (not `multi_only_topics`), and that is
# the list the evaluation cell above fills.
multi_topics[2]

# ONLY FEATURES

In [None]:
# Three-class experiment: use_topics=False, use_features=True, use_embeddings=False.
set_seed(42)
mlflow.end_run()

# The datasets always carry encodings, labels, topics and features; which of
# them the model is told to use is set by the use_* flags below.
train_dataset = Dataset(
    train_snippets_encodings,
    train_snippets_labels,
    train_snippets_topics,
    train_snippets_features,
)
valid_dataset = Dataset(
    valid_snippets_encodings,
    valid_snippets_labels,
    valid_snippets_topics,
    valid_snippets_features,
)

# Earlier variant without encodings, kept for reference:
#train_dataset = Dataset(None, train_snippets_labels, train_snippets_topics)
#valid_dataset = Dataset(None, valid_snippets_labels, valid_snippets_topics)

num_classes = 3
# Each class is weighted by the inverse of its count.
class_weights = (1 / all_snippets_class_counts).tolist()
# Alternative weightings kept for reference:
#class_weights = ((np.sum(all_snippets_class_counts)-all_snippets_class_counts)/all_snippets_class_counts).tolist()
#class_weights = [100.0,1.0,100.0]
freeze_bert_weights = True

use_features, use_topics, use_embeddings = True, False, False

print(all_snippets_class_counts, class_weights)

from transformers import Trainer, TrainingArguments
from transformers.trainer_utils import SchedulerType

batch_size = 32

training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    # training length and batch sizes
    num_train_epochs=400,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=512,
    # learning-rate warmup, in steps
    warmup_steps=max(10, 1000 // 2 // batch_size),
    # logging and checkpointing use the same step interval; evaluation is step-based too
    logging_steps=max(1, 4000 // batch_size),
    save_steps=max(1, 4000 // batch_size),
    evaluation_strategy="steps",
    save_strategy="steps",
    # reload the best checkpoint when training ends; no `metric_for_best_model`
    # is passed, so the library default (loss) decides which one is best
    load_best_model_at_end=True,
    # disabled options:
#    weight_decay=0.01,
#    lr_scheduler_type = SchedulerType.COSINE_WITH_RESTARTS
)
model = BertForWeightedSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
    problem_type='single_label_classification',
    class_weights=class_weights,
    freeze_bert_weights=freeze_bert_weights,
    use_topics=use_topics,
    use_features=use_features,
    use_embeddings=use_embeddings,
)
# Move the model to the GPU when one is available; otherwise keep it as loaded.
model = model.to("cuda") if torch.cuda.is_available() else model

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,  # callback that computes the reported metrics
)

trainer.train()

In [None]:


# Run the model held by `trainer` on the three-class validation set and show
# the confusion matrix reported in the prediction metrics.
preds = trainer.predict(valid_dataset)
print(preds.metrics['test_confusion_matrix'])

# Empty the container first so that re-running this cell does not store every
# misclassified snippet a second time.
multi_features.clear()
for index, (logits, label) in enumerate(zip(preds.predictions, preds.label_ids)):
    pred_label = logits.argmax()
    if pred_label != label:
        # Turn the snippet's token ids back into text for manual inspection.
        text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(valid_snippets_encodings['input_ids'][index]))
        # Raw string: "\[" inside a plain literal is an invalid escape sequence.
        text = re.sub(r"\[PAD\]", "", text).strip()
        print(f"Predicted {class_labels[pred_label]} ({logits}) != real {class_labels[label]} for:\n{text}")
        multi_features.append((text, class_labels[pred_label], class_labels[label]))

# One row per misclassified snippet: text, predicted label, real label.
df = pd.DataFrame(multi_features)
df.to_csv("multi_features.csv")

# ONLY EMBEDDINGS

In [None]:
# Three-class experiment: use_topics=False, use_features=False, use_embeddings=True.
set_seed(42)
mlflow.end_run()

# The datasets always carry encodings, labels, topics and features; which of
# them the model is told to use is set by the use_* flags below.
train_dataset = Dataset(
    train_snippets_encodings,
    train_snippets_labels,
    train_snippets_topics,
    train_snippets_features,
)
valid_dataset = Dataset(
    valid_snippets_encodings,
    valid_snippets_labels,
    valid_snippets_topics,
    valid_snippets_features,
)

# Earlier variant without encodings, kept for reference:
#train_dataset = Dataset(None, train_snippets_labels, train_snippets_topics)
#valid_dataset = Dataset(None, valid_snippets_labels, valid_snippets_topics)

num_classes = 3
# Each class is weighted by the inverse of its count.
class_weights = (1 / all_snippets_class_counts).tolist()
# Alternative weightings kept for reference:
#class_weights = ((np.sum(all_snippets_class_counts)-all_snippets_class_counts)/all_snippets_class_counts).tolist()
#class_weights = [100.0,1.0,100.0]
freeze_bert_weights = True
use_topics, use_features, use_embeddings = False, False, True

print(all_snippets_class_counts, class_weights)

from transformers import Trainer, TrainingArguments
from transformers.trainer_utils import SchedulerType

batch_size = 32

training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    # training length and batch sizes
    num_train_epochs=400,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=512,
    # learning-rate warmup, in steps
    warmup_steps=max(10, 1000 // 2 // batch_size),
    # logging and checkpointing use the same step interval; evaluation is step-based too
    logging_steps=max(1, 4000 // batch_size),
    save_steps=max(1, 4000 // batch_size),
    evaluation_strategy="steps",
    save_strategy="steps",
    # reload the best checkpoint when training ends; no `metric_for_best_model`
    # is passed, so the library default (loss) decides which one is best
    load_best_model_at_end=True,
    # disabled options:
#    weight_decay=0.01,
#    lr_scheduler_type = SchedulerType.COSINE_WITH_RESTARTS
)
model = BertForWeightedSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
    problem_type='single_label_classification',
    class_weights=class_weights,
    freeze_bert_weights=freeze_bert_weights,
    use_topics=use_topics,
    use_features=use_features,
    use_embeddings=use_embeddings,
)
# Move the model to the GPU when one is available; otherwise keep it as loaded.
model = model.to("cuda") if torch.cuda.is_available() else model

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,  # callback that computes the reported metrics
)

trainer.train()

In [None]:


# Run the model held by `trainer` on the three-class validation set and show
# the confusion matrix reported in the prediction metrics.
preds = trainer.predict(valid_dataset)
print(preds.metrics['test_confusion_matrix'])

# Empty the container first so that re-running this cell does not store every
# misclassified snippet a second time.
multi_embeds.clear()
for index, (logits, label) in enumerate(zip(preds.predictions, preds.label_ids)):
    pred_label = logits.argmax()
    if pred_label != label:
        # Turn the snippet's token ids back into text for manual inspection.
        text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(valid_snippets_encodings['input_ids'][index]))
        # Raw string: "\[" inside a plain literal is an invalid escape sequence.
        text = re.sub(r"\[PAD\]", "", text).strip()
        print(f"Predicted {class_labels[pred_label]} ({logits}) != real {class_labels[label]} for:\n{text}")
        multi_embeds.append((text, class_labels[pred_label], class_labels[label]))

# One row per misclassified snippet: text, predicted label, real label.
df = pd.DataFrame(multi_embeds)
df.to_csv("multi_embeds.csv")

# TOPICS + FEATURES

In [None]:
# Three-class experiment: use_topics=True, use_features=True, use_embeddings=False.
set_seed(42)
mlflow.end_run()

# The datasets always carry encodings, labels, topics and features; which of
# them the model is told to use is set by the use_* flags below.
train_dataset = Dataset(
    train_snippets_encodings,
    train_snippets_labels,
    train_snippets_topics,
    train_snippets_features,
)
valid_dataset = Dataset(
    valid_snippets_encodings,
    valid_snippets_labels,
    valid_snippets_topics,
    valid_snippets_features,
)

# Earlier variant without encodings, kept for reference:
#train_dataset = Dataset(None, train_snippets_labels, train_snippets_topics)
#valid_dataset = Dataset(None, valid_snippets_labels, valid_snippets_topics)

num_classes = 3
# Each class is weighted by the inverse of its count.
class_weights = (1 / all_snippets_class_counts).tolist()
# Alternative weightings kept for reference:
#class_weights = ((np.sum(all_snippets_class_counts)-all_snippets_class_counts)/all_snippets_class_counts).tolist()
#class_weights = [100.0,1.0,100.0]
freeze_bert_weights = True
use_topics, use_features, use_embeddings = True, True, False

print(all_snippets_class_counts, class_weights)

from transformers import Trainer, TrainingArguments
from transformers.trainer_utils import SchedulerType

batch_size = 32

training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    # training length and batch sizes
    num_train_epochs=400,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=512,
    # learning-rate warmup, in steps
    warmup_steps=max(10, 1000 // 2 // batch_size),
    # logging and checkpointing use the same step interval; evaluation is step-based too
    logging_steps=max(1, 4000 // batch_size),
    save_steps=max(1, 4000 // batch_size),
    evaluation_strategy="steps",
    save_strategy="steps",
    # reload the best checkpoint when training ends; no `metric_for_best_model`
    # is passed, so the library default (loss) decides which one is best
    load_best_model_at_end=True,
    # disabled options:
#    weight_decay=0.01,
#    lr_scheduler_type = SchedulerType.COSINE_WITH_RESTARTS
)
model = BertForWeightedSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
    problem_type='single_label_classification',
    class_weights=class_weights,
    freeze_bert_weights=freeze_bert_weights,
    use_topics=use_topics,
    use_features=use_features,
    use_embeddings=use_embeddings,
)
# Move the model to the GPU when one is available; otherwise keep it as loaded.
model = model.to("cuda") if torch.cuda.is_available() else model

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,  # callback that computes the reported metrics
)

trainer.train()

In [None]:


# Run the model held by `trainer` on the three-class validation set and show
# the confusion matrix reported in the prediction metrics.
preds = trainer.predict(valid_dataset)
print(preds.metrics['test_confusion_matrix'])

# Empty the container first so that re-running this cell does not store every
# misclassified snippet a second time.
multi_topics_features.clear()
for index, (logits, label) in enumerate(zip(preds.predictions, preds.label_ids)):
    pred_label = logits.argmax()
    if pred_label != label:
        # Turn the snippet's token ids back into text for manual inspection.
        text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(valid_snippets_encodings['input_ids'][index]))
        # Raw string: "\[" inside a plain literal is an invalid escape sequence.
        text = re.sub(r"\[PAD\]", "", text).strip()
        print(f"Predicted {class_labels[pred_label]} ({logits}) != real {class_labels[label]} for:\n{text}")
        multi_topics_features.append((text, class_labels[pred_label], class_labels[label]))

# One row per misclassified snippet: text, predicted label, real label.
df = pd.DataFrame(multi_topics_features)
df.to_csv("multi_topics_features.csv")

# TOPICS + EMBEDDINGS

In [None]:
# Three-class experiment: use_topics=True, use_features=False, use_embeddings=True.
set_seed(42)
mlflow.end_run()

# The datasets always carry encodings, labels, topics and features; which of
# them the model is told to use is set by the use_* flags below.
train_dataset = Dataset(
    train_snippets_encodings,
    train_snippets_labels,
    train_snippets_topics,
    train_snippets_features,
)
valid_dataset = Dataset(
    valid_snippets_encodings,
    valid_snippets_labels,
    valid_snippets_topics,
    valid_snippets_features,
)

# Earlier variant without encodings, kept for reference:
#train_dataset = Dataset(None, train_snippets_labels, train_snippets_topics)
#valid_dataset = Dataset(None, valid_snippets_labels, valid_snippets_topics)

num_classes = 3
# Each class is weighted by the inverse of its count.
class_weights = (1 / all_snippets_class_counts).tolist()
# Alternative weightings kept for reference:
#class_weights = ((np.sum(all_snippets_class_counts)-all_snippets_class_counts)/all_snippets_class_counts).tolist()
#class_weights = [100.0,1.0,100.0]
freeze_bert_weights = True
use_topics, use_features, use_embeddings = True, False, True

print(all_snippets_class_counts, class_weights)

from transformers import Trainer, TrainingArguments
from transformers.trainer_utils import SchedulerType

batch_size = 32

training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    # training length and batch sizes
    num_train_epochs=400,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=512,
    # learning-rate warmup, in steps
    warmup_steps=max(10, 1000 // 2 // batch_size),
    # logging and checkpointing use the same step interval; evaluation is step-based too
    logging_steps=max(1, 4000 // batch_size),
    save_steps=max(1, 4000 // batch_size),
    evaluation_strategy="steps",
    save_strategy="steps",
    # reload the best checkpoint when training ends; no `metric_for_best_model`
    # is passed, so the library default (loss) decides which one is best
    load_best_model_at_end=True,
    # disabled options:
#    weight_decay=0.01,
#    lr_scheduler_type = SchedulerType.COSINE_WITH_RESTARTS
)
model = BertForWeightedSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
    problem_type='single_label_classification',
    class_weights=class_weights,
    freeze_bert_weights=freeze_bert_weights,
    use_topics=use_topics,
    use_features=use_features,
    use_embeddings=use_embeddings,
)
# Move the model to the GPU when one is available; otherwise keep it as loaded.
model = model.to("cuda") if torch.cuda.is_available() else model

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,  # callback that computes the reported metrics
)

trainer.train()

In [None]:


# Run the model held by `trainer` on the three-class validation set and show
# the confusion matrix reported in the prediction metrics.
preds = trainer.predict(valid_dataset)
print(preds.metrics['test_confusion_matrix'])

# Empty the container first so that re-running this cell does not store every
# misclassified snippet a second time.
multi_topics_embeds.clear()
for index, (logits, label) in enumerate(zip(preds.predictions, preds.label_ids)):
    pred_label = logits.argmax()
    if pred_label != label:
        # Turn the snippet's token ids back into text for manual inspection.
        text = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(valid_snippets_encodings['input_ids'][index]))
        # Raw string: "\[" inside a plain literal is an invalid escape sequence.
        text = re.sub(r"\[PAD\]", "", text).strip()
        print(f"Predicted {class_labels[pred_label]} ({logits}) != real {class_labels[label]} for:\n{text}")
        multi_topics_embeds.append((text, class_labels[pred_label], class_labels[label]))

# One row per misclassified snippet: text, predicted label, real label.
df = pd.DataFrame(multi_topics_embeds)
df.to_csv("multi_topics_embeds.csv")

# FEATURES + EMBEDDINGS

In [None]:
# Experiment: structural features + embeddings, topic vectors switched off.
from transformers import Trainer, TrainingArguments
from transformers.trainer_utils import SchedulerType

# Reseed before anything is built, then close any MLflow run that is still open.
set_seed(42)
mlflow.end_run()

# ---- data ------------------------------------------------------------------
# Both splits are built with encodings, labels, topic vectors and features;
# which of those are used is decided by the `use_*` switches below.
train_dataset = Dataset(
    train_snippets_encodings,
    train_snippets_labels,
    train_snippets_topics,
    train_snippets_features,
)
valid_dataset = Dataset(
    valid_snippets_encodings,
    valid_snippets_labels,
    valid_snippets_topics,
    valid_snippets_features,
)
# Variant without text encodings, kept for reference:
#train_dataset = Dataset(None, train_snippets_labels, train_snippets_topics)
#valid_dataset = Dataset(None, valid_snippets_labels, valid_snippets_topics)

# ---- model switches --------------------------------------------------------
use_topics = False
use_features = True
use_embeddings = True
freeze_bert_weights = True
num_classes = 3

# Loss weights: inverse of each class count.
class_weights = (1 / all_snippets_class_counts).tolist()
# Other weightings that were tried:
#class_weights = ((np.sum(all_snippets_class_counts)-all_snippets_class_counts)/all_snippets_class_counts).tolist()
#class_weights = [100.0,1.0,100.0]

print(all_snippets_class_counts, class_weights)

# ---- training configuration ------------------------------------------------
batch_size = 32

# NOTE(review): './results' is also the output_dir of the other experiment
# cells and no checkpoint limit is set, so checkpoints accumulate there.
training_args = TrainingArguments(
    output_dir='./results',                    # where checkpoints are written
    logging_dir='./logs',                      # where logs are written
    num_train_epochs=400,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=512,
    warmup_steps=max(10, 500 // batch_size),   # learning-rate warm-up, at least 10 steps
    evaluation_strategy="steps",               # evaluate every `logging_steps`
    save_strategy="steps",
    logging_steps=max(1, 4000 // batch_size),  # log and save on the same interval
    save_steps=max(1, 4000 // batch_size),
    load_best_model_at_end=True,               # default selection metric is the loss;
                                               # set `metric_for_best_model` to change it
#    weight_decay=0.01,
#    lr_scheduler_type = SchedulerType.COSINE_WITH_RESTARTS
)

# ---- model -----------------------------------------------------------------
model = BertForWeightedSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
    problem_type='single_label_classification',
    class_weights=class_weights,
    freeze_bert_weights=freeze_bert_weights,
    use_topics=use_topics,
    use_features=use_features,
    use_embeddings=use_embeddings,
)
if torch.cuda.is_available():
    model = model.to("cuda")

# ---- train -----------------------------------------------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,   # callback computing the reported metrics
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

trainer.train()

# CLS + TOPICS + STRUCTURAL FEATURES

In [6]:
# Experiment: topic vectors + structural features + embeddings all enabled.
from transformers import Trainer, TrainingArguments
from transformers.trainer_utils import SchedulerType

# Reseed before anything is built, then close any MLflow run that is still open.
set_seed(42)
mlflow.end_run()

# ---- data ------------------------------------------------------------------
# Both splits are built with encodings, labels, topic vectors and features;
# which of those are used is decided by the `use_*` switches below.
train_dataset = Dataset(
    train_snippets_encodings,
    train_snippets_labels,
    train_snippets_topics,
    train_snippets_features,
)
valid_dataset = Dataset(
    valid_snippets_encodings,
    valid_snippets_labels,
    valid_snippets_topics,
    valid_snippets_features,
)
# Variant without text encodings, kept for reference:
#train_dataset = Dataset(None, train_snippets_labels, train_snippets_topics)
#valid_dataset = Dataset(None, valid_snippets_labels, valid_snippets_topics)

# ---- model switches --------------------------------------------------------
use_topics = True
use_features = True
use_embeddings = True
freeze_bert_weights = True
num_classes = 3

# Loss weights: inverse of each class count.
class_weights = (1 / all_snippets_class_counts).tolist()
# Other weightings that were tried:
#class_weights = ((np.sum(all_snippets_class_counts)-all_snippets_class_counts)/all_snippets_class_counts).tolist()
#class_weights = [100.0,1.0,100.0]

print(all_snippets_class_counts, class_weights)

# ---- training configuration ------------------------------------------------
batch_size = 32

# NOTE(review): './results' is also the output_dir of the other experiment
# cells and no checkpoint limit is set, so checkpoints accumulate there.
training_args = TrainingArguments(
    output_dir='./results',                    # where checkpoints are written
    logging_dir='./logs',                      # where logs are written
    num_train_epochs=400,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=512,
    warmup_steps=max(10, 500 // batch_size),   # learning-rate warm-up, at least 10 steps
    evaluation_strategy="steps",               # evaluate every `logging_steps`
    save_strategy="steps",
    logging_steps=max(1, 4000 // batch_size),  # log and save on the same interval
    save_steps=max(1, 4000 // batch_size),
    load_best_model_at_end=True,               # default selection metric is the loss;
                                               # set `metric_for_best_model` to change it
#    weight_decay=0.01,
#    lr_scheduler_type = SchedulerType.COSINE_WITH_RESTARTS
)

# ---- model -----------------------------------------------------------------
model = BertForWeightedSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
    problem_type='single_label_classification',
    class_weights=class_weights,
    freeze_bert_weights=freeze_bert_weights,
    use_topics=use_topics,
    use_features=use_features,
    use_embeddings=use_embeddings,
)
if torch.cuda.is_available():
    model = model.to("cuda")

# ---- train -----------------------------------------------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,   # callback computing the reported metrics
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

trainer.train()

[ 137 1040  334] [0.0072992700729927005, 0.0009615384615384616, 0.0029940119760479044]
arvot
768 3


Some weights of the model checkpoint at TurkuNLP/bert-base-finnish-uncased-v1 were not used when initializing BertForWeightedSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForWeightedSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForWeightedSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForWeightedSequenceClassificati

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss,Validation Loss,Accuracy,Confusion Matrix
125,0.9683,0.834119,0.741144,[[ 12 5 6]  [ 26 228 16]  [ 25 17 32]]
250,0.8223,0.747034,0.776567,[[ 16 5 2]  [ 22 236 12]  [ 22 19 33]]
375,0.7499,0.707459,0.809264,[[ 16 5 2]  [ 21 239 10]  [ 12 20 42]]
500,0.7156,0.671065,0.798365,[[ 16 4 3]  [ 23 233 14]  [ 16 14 44]]
625,0.6839,0.654101,0.833787,[[ 15 5 3]  [ 15 241 14]  [ 5 19 50]]
750,0.6582,0.632024,0.822888,[[ 15 5 3]  [ 17 237 16]  [ 10 14 50]]
875,0.6297,0.618591,0.825613,[[ 15 5 3]  [ 16 236 18]  [ 11 11 52]]
1000,0.6111,0.605437,0.828338,[[ 13 4 6]  [ 12 234 24]  [ 7 10 57]]
1125,0.5957,0.595068,0.831063,[[ 17 3 3]  [ 17 233 20]  [ 9 10 55]]
1250,0.5821,0.602446,0.825613,[[ 16 4 3]  [ 13 240 17]  [ 14 13 47]]


***** Running Evaluation *****
  Num examples = 367
  Batch size = 512
Trainer is attempting to log a value of "[[ 12   5   6]
 [ 26 228  16]
 [ 25  17  32]]" of type <class 'str'> for key "eval_confusion_matrix" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "[[ 12   5   6]
 [ 26 228  16]
 [ 25  17  32]]" of type <class 'str'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results/checkpoint-125
Configuration saved in ./results/checkpoint-125/config.json
Model weights saved in ./results/checkpoint-125/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 367
  Batch size = 512
Trainer is attempting to log a value of "[[ 16   5   2]
 [ 22 236  12]
 [ 22  19  33]]" of type <class 'str'> for key "eval_confusion_matrix" as a metric. MLflow's log_metric() only accepts

  Batch size = 512
Trainer is attempting to log a value of "[[ 14   4   5]
 [ 10 240  20]
 [ 11  13  50]]" of type <class 'str'> for key "eval_confusion_matrix" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "[[ 14   4   5]
 [ 10 240  20]
 [ 11  13  50]]" of type <class 'str'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 367
  Batch size = 512
Trainer is attempting to log a value of "[[ 15   4   4]
 [ 10 237  23]
 [  8  11  55]]" of type <class 'str'> for key "eval_confusion_matrix" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute

Trainer is attempting to log a value of "[[ 17   3   3]
 [ 15 233  22]
 [ 12  10  52]]" of type <class 'str'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results/checkpoint-2875
Configuration saved in ./results/checkpoint-2875/config.json
Model weights saved in ./results/checkpoint-2875/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 367
  Batch size = 512
Trainer is attempting to log a value of "[[ 14   5   4]
 [ 10 240  20]
 [ 12  15  47]]" of type <class 'str'> for key "eval_confusion_matrix" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "[[ 14   5   4]
 [ 10 240  20]
 [ 12  15  47]]" of type <class 'str'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving

Saving model checkpoint to ./results/checkpoint-4250
Configuration saved in ./results/checkpoint-4250/config.json
Model weights saved in ./results/checkpoint-4250/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 367
  Batch size = 512
Trainer is attempting to log a value of "[[ 14   4   5]
 [ 13 235  22]
 [ 11  14  49]]" of type <class 'str'> for key "eval_confusion_matrix" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "[[ 14   4   5]
 [ 13 235  22]
 [ 11  14  49]]" of type <class 'str'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results/checkpoint-4375
Configuration saved in ./results/checkpoint-4375/config.json
Model weights saved in ./results/checkpoint-4375/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 367
  Batch size = 512
Train

Configuration saved in ./results/checkpoint-5625/config.json
Model weights saved in ./results/checkpoint-5625/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 367
  Batch size = 512
Trainer is attempting to log a value of "[[ 15   3   5]
 [ 15 231  24]
 [ 10  13  51]]" of type <class 'str'> for key "eval_confusion_matrix" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "[[ 15   3   5]
 [ 15 231  24]
 [ 10  13  51]]" of type <class 'str'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results/checkpoint-5750
Configuration saved in ./results/checkpoint-5750/config.json
Model weights saved in ./results/checkpoint-5750/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 367
  Batch size = 512
Trainer is attempting to log a value of "[[ 14   4   5]
 [

Model weights saved in ./results/checkpoint-7000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 367
  Batch size = 512
Trainer is attempting to log a value of "[[ 14   4   5]
 [  8 244  18]
 [  8  18  48]]" of type <class 'str'> for key "eval_confusion_matrix" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "[[ 14   4   5]
 [  8 244  18]
 [  8  18  48]]" of type <class 'str'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results/checkpoint-7125
Configuration saved in ./results/checkpoint-7125/config.json
Model weights saved in ./results/checkpoint-7125/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 367
  Batch size = 512
Trainer is attempting to log a value of "[[ 14   4   5]
 [ 12 236  22]
 [ 10  16  48]]" of type <class 'str'> for key "

***** Running Evaluation *****
  Num examples = 367
  Batch size = 512
Trainer is attempting to log a value of "[[ 15   3   5]
 [ 12 238  20]
 [ 10  16  48]]" of type <class 'str'> for key "eval_confusion_matrix" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "[[ 15   3   5]
 [ 12 238  20]
 [ 10  16  48]]" of type <class 'str'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results/checkpoint-8500
Configuration saved in ./results/checkpoint-8500/config.json
Model weights saved in ./results/checkpoint-8500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 367
  Batch size = 512
Trainer is attempting to log a value of "[[ 15   3   5]
 [ 12 241  17]
 [ 12  17  45]]" of type <class 'str'> for key "eval_confusion_matrix" as a metric. MLflow's log_metric() only acce

  Num examples = 367
  Batch size = 512
Trainer is attempting to log a value of "[[ 13   4   6]
 [ 11 241  18]
 [  9  15  50]]" of type <class 'str'> for key "eval_confusion_matrix" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "[[ 13   4   6]
 [ 11 241  18]
 [  9  15  50]]" of type <class 'str'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results/checkpoint-9875
Configuration saved in ./results/checkpoint-9875/config.json
Model weights saved in ./results/checkpoint-9875/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 367
  Batch size = 512
Trainer is attempting to log a value of "[[ 14   3   6]
 [ 12 235  23]
 [  9  13  52]]" of type <class 'str'> for key "eval_confusion_matrix" as a metric. MLflow's log_metric() only accepts float and int types so we d

  Num examples = 367
  Batch size = 512
Trainer is attempting to log a value of "[[ 15   3   5]
 [ 12 241  17]
 [ 11  15  48]]" of type <class 'str'> for key "eval_confusion_matrix" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "[[ 15   3   5]
 [ 12 241  17]
 [ 11  15  48]]" of type <class 'str'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results/checkpoint-11250
Configuration saved in ./results/checkpoint-11250/config.json
Model weights saved in ./results/checkpoint-11250/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 367
  Batch size = 512
Trainer is attempting to log a value of "[[ 14   3   6]
 [ 12 241  17]
 [ 10  15  49]]" of type <class 'str'> for key "eval_confusion_matrix" as a metric. MLflow's log_metric() only accepts float and int types so w

  Num examples = 367
  Batch size = 512
Trainer is attempting to log a value of "[[ 13   4   6]
 [ 11 242  17]
 [  9  15  50]]" of type <class 'str'> for key "eval_confusion_matrix" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "[[ 13   4   6]
 [ 11 242  17]
 [  9  15  50]]" of type <class 'str'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results/checkpoint-12625
Configuration saved in ./results/checkpoint-12625/config.json
Model weights saved in ./results/checkpoint-12625/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 367
  Batch size = 512
Trainer is attempting to log a value of "[[ 13   4   6]
 [ 12 241  17]
 [  9  15  50]]" of type <class 'str'> for key "eval_confusion_matrix" as a metric. MLflow's log_metric() only accepts float and int types so w

  Num examples = 367
  Batch size = 512
Trainer is attempting to log a value of "[[ 13   4   6]
 [ 12 241  17]
 [  9  15  50]]" of type <class 'str'> for key "eval_confusion_matrix" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "[[ 13   4   6]
 [ 12 241  17]
 [  9  15  50]]" of type <class 'str'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Saving model checkpoint to ./results/checkpoint-14000
Configuration saved in ./results/checkpoint-14000/config.json
Model weights saved in ./results/checkpoint-14000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 367
  Batch size = 512
Trainer is attempting to log a value of "[[ 13   4   6]
 [ 12 241  17]
 [  9  15  50]]" of type <class 'str'> for key "eval_confusion_matrix" as a metric. MLflow's log_metric() only accepts float and int types so w

TrainOutput(global_step=14400, training_loss=0.3720787928501765, metrics={'train_runtime': 5214.3091, 'train_samples_per_second': 87.759, 'train_steps_per_second': 2.762, 'total_flos': 1.219472917757952e+17, 'train_loss': 0.3720787928501765, 'epoch': 400.0})

In [8]:


# Run the trained model on the validation set, print the confusion matrix and
# every misclassified snippet, and persist the misclassifications as a CSV.
preds = trainer.predict(valid_dataset)
print(preds.metrics['test_confusion_matrix'])

# NOTE(review): the training cell above enables topics, features AND
# embeddings, yet the errors are collected in `multi_features_embeds` (the CSV
# name does mention topics). Confirm this container is not also meant to hold
# the features+embeddings run.
#
# Empty the shared container first: the list lives at notebook scope, so
# without this a re-run of the cell would append the same rows a second time
# and the CSV would contain duplicates.
multi_features_embeds.clear()

for index, (logits, label) in enumerate(zip(preds.predictions, preds.label_ids)):
    pred_label = logits.argmax()
    if pred_label != label:
        # Turn the token ids of this validation snippet back into readable text.
        text = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(valid_snippets_encodings['input_ids'][index])
        )
        # Raw string: "\[" in a normal literal is an invalid escape sequence.
        text = re.sub(r"\[PAD\]", "", text).strip()
        print(f"Predicted {class_labels[pred_label]} ({logits}) != real {class_labels[label]} for:\n{text}")
        # One record per error: (snippet text, predicted class, true class).
        multi_features_embeds.append((text, class_labels[pred_label], class_labels[label]))

# The records are a list of tuples, so use the plain constructor
# (`from_dict` is meant for dict input); the resulting frame is the same.
df = pd.DataFrame(multi_features_embeds)
df.to_csv("multi_features_embeds_topics.csv")

***** Running Prediction *****
  Num examples = 367
  Batch size = 512


[[ 15   3   5]
 [ 13 233  24]
 [ 10  11  53]]
Predicted oma narratiivi ([ 0.6529228 -3.0439527  1.8922771]) != real kritiikki for:
[CLS] lue aikaisemmat osat tasta : http : / / mvlehti. net / 2016 / 05 / 02 / suomea - johtaa - mafia - osa - 1 / http : / / mvlehti. net / 2016 / 05 / 02 / suomea - johtaa - mafia - osa - 2 / http : / / mvlehti. net / 2016 / 05 / 03 / suomea - johtaa - mafia - osa - 3 / http : / / mvlehti. net / 2016 / 05 / 04 / suomea - johtaa - mafia - osa - 4 / vaikka mielipidetuomioiden ja tyottomyyden uhka ovat merkittavia syita sille, ettei kansa uskalla nousta mafiaa vastaan, on ryovareiden aseista vaarallisin velan ohella media. edes kansanaanestyksilla ei ole valia, mikali mielipidemonopoli istuttaa jatkuvasti valheita kansalaisten tajuntaan. onneksi ihmiset ovat alkaneet siirtya vaihtoehtomedioiden lukijoiksi. ( kaavion lahde : yle. ) suomen maahanmuuttokriisi raiskauksineen ja kasvavine katujengeineen on niin nakyva epakohta, etta edes lehdisto ei ole kyennyt pi

In [None]:
### MISC 

In [7]:
# Persist whatever model is currently bound to `model` (configuration and
# weights) into the local directory "aboutness-bert-all_features".
model.save_pretrained("aboutness-bert-all_features")

Configuration saved in aboutness-bert-all_features/config.json
Model weights saved in aboutness-bert-all_features/pytorch_model.bin


In [None]:
import re

# Print the confusion matrix and every validation snippet the current
# `trainer` model misclassifies (nothing is stored, output only).
preds = trainer.predict(valid_dataset)
print(preds.metrics['test_confusion_matrix'])
for index, (logits, label) in enumerate(zip(preds.predictions, preds.label_ids)):
    pred_label = logits.argmax()
    if pred_label != label:
        # Turn the token ids of this validation snippet back into readable text.
        text = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(valid_snippets_encodings['input_ids'][index])
        )
        # Raw string: "\[" in a normal literal is an invalid escape sequence.
        text = re.sub(r"\[PAD\]", "", text).strip()
        print(f"Predicted {class_labels[pred_label]} ({logits}) != real {class_labels[label]} for:\n{text}")

In [None]:
## positive samples
# Print the confusion matrix and every validation snippet the current
# `trainer` model classifies correctly (nothing is stored, output only).

preds = trainer.predict(valid_dataset)
print(preds.metrics['test_confusion_matrix'])
for index, (logits, label) in enumerate(zip(preds.predictions, preds.label_ids)):
    pred_label = logits.argmax()
    if pred_label == label:
        # Turn the token ids of this validation snippet back into readable text.
        text = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(valid_snippets_encodings['input_ids'][index])
        )
        # Raw string: "\[" in a normal literal is an invalid escape sequence.
        text = re.sub(r"\[PAD\]", "", text).strip()
        print(f"Predicted {class_labels[pred_label]} ({logits}) == real {class_labels[label]} for:\n{text}")

In [None]:
# Show the inverse class counts (1 / count per class) as a plain list; this is
# the same expression the training cells assign to `class_weights`.
print((1/all_snippets_class_counts).tolist())

In [None]:
from transformers import Trainer, TrainingArguments
from transformers.trainer_utils import SchedulerType
import re

# Reload a previously saved model from the local directory
# "aboutness-bert-topics" and wrap it in a Trainer that is used for prediction
# only (no training arguments or training data are passed).
model = BertForWeightedSequenceClassification.from_pretrained("aboutness-bert-topics")
trainer = Trainer(model=model, compute_metrics=compute_metrics)

# NOTE(review): this rebinds the notebook-level `valid_dataset` to a dataset
# built WITHOUT the features argument that the training cells pass; cells run
# after this one will see this three-argument variant.
valid_dataset = Dataset(valid_snippets_encodings, valid_snippets_labels, valid_snippets_topics)

# Print the confusion matrix and every validation snippet the reloaded model
# classifies correctly.
preds = trainer.predict(valid_dataset)
print(preds.metrics['test_confusion_matrix'])
for index, (logits, label) in enumerate(zip(preds.predictions, preds.label_ids)):
    pred_label = logits.argmax()
    if pred_label == label:
        # Turn the token ids of this validation snippet back into readable text.
        text = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(valid_snippets_encodings['input_ids'][index])
        )
        # Raw string: "\[" in a normal literal is an invalid escape sequence.
        text = re.sub(r"\[PAD\]", "", text).strip()
        print(f"Predicted {class_labels[pred_label]} ({logits}) == real {class_labels[label]} for:\n{text}")
