In [1]:
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and pandas warnings that may point at real problems;
# consider restricting the filter to the specific categories that are noisy.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import os
import json

from transformers import AutoModel # For BERTs
from transformers import AutoModelForSequenceClassification
import torch

import time

from tqdm import tqdm

In [3]:
# Project directories, relative to the notebook's working directory:
# input data, model checkpoints and generated results.
data_dir, model_dir, output_dir = "data", "models", "output"

# Read general value list (classes)

In [4]:
# Collect the class (value) names used by the model
def load_values_from_json(filepath):
    """Return the value names listed in the json-file at `filepath`.

    The file is read with `load_json_file`; the `"name"` field of every entry
    under the top-level `"values"` key is returned as a list, in file order.
    """
    entries = load_json_file(filepath)["values"]
    return [entry["name"] for entry in entries]

In [5]:
# Read a json file (used for the list of classes in the model)
def load_json_file(filepath):
    """Load content of json-file from `filepath`.

    Parameters
    ----------
    filepath : str or path-like
        Location of the json-file to read.

    Returns
    -------
    The parsed JSON content, as returned by `json.load`.
    """
    # JSON text is UTF-8; without an explicit encoding `open` falls back to
    # the platform locale (e.g. cp1252 on Windows) and can mis-decode or
    # reject non-ASCII characters.
    with open(filepath, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)


In [6]:
# Load the value names to be predicted; the number of names is used as the
# number of labels when the classifier is loaded.
values_filepath = os.path.join(data_dir, 'values.json')
values = load_values_from_json(values_filepath)
num_labels = len(values)

In [7]:
values

['Be creative',
 'Be curious',
 'Have freedom of thought',
 'Be choosing own goals',
 'Be independent',
 'Have freedom of action',
 'Have privacy',
 'Have an exciting life',
 'Have a varied life',
 'Be daring',
 'Have pleasure',
 'Be ambitious',
 'Have success',
 'Be capable',
 'Be intellectual',
 'Be courageous',
 'Have influence',
 'Have the right to command',
 'Have wealth',
 'Have social recognition',
 'Have a good reputation',
 'Have a sense of belonging',
 'Have good health',
 'Have no debts',
 'Be neat and tidy',
 'Have a comfortable life',
 'Have a safe country',
 'Have a stable society',
 'Be respecting traditions',
 'Be holding religious faith',
 'Be compliant',
 'Be self-disciplined',
 'Be behaving properly',
 'Be polite',
 'Be honoring elders',
 'Be humble',
 'Have life accepted as is',
 'Be helpful',
 'Be honest',
 'Be forgiving',
 'Have the own family secured',
 'Be loving',
 'Be responsible',
 'Have loyalty towards friends',
 'Have equality',
 'Be just',
 'Have a world a

# Load fine-tuned model and define trainer

In [8]:

from transformers import TrainingArguments
from transformers import Trainer


In [9]:
# Load a sequence-classification checkpoint and pick the device
def load_model_from_data_dir(model_dir, num_labels):
    """Load a sequence-classification model from `model_dir`.

    The model is created with `num_labels` output labels. When CUDA is
    available the model is moved to the GPU before it is returned; otherwise
    it is returned as loaded. A short message reports which device is used.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_dir, num_labels=num_labels
    )

    # CPU fallback first, so the GPU path reads straight through.
    if not torch.cuda.is_available():
        print('No GPU available, using the CPU instead.')
        return classifier

    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print(f'We will use the GPU: {torch.cuda.get_device_name(0)}')
    return classifier.to('cuda')



In [10]:
'''
def predict_nn(trained_model, test_dataset):

    # Switch off dropout
    trained_model.eval()
    
    # Pass the required items from the dataset to the model    
    output = trained_model(attention_mask=torch.tensor(test_dataset["attention_mask"]), input_ids=torch.tensor(test_dataset["input_ids"]))
    #output = trained_model(attention_mask=test_dataset["attention_mask"], input_ids=test_dataset["input_ids"])    
                        
    # the output dictionary contains logits, which are the unnormalised scores for each class for each example:
    pred_labs = np.argmax(output["logits"].detach().numpy(), axis=1)

    return pred_labs
'''

from transformers import Trainer
class MultiLabelTrainer(Trainer):
    """
        A transformers `Trainer` with custom loss computation for multi-label
        classification (binary cross-entropy with logits over all labels).

        NOTE(review): the same class is defined again in a later cell; that
        later definition is the one in effect when predictions are run.

        Methods
        -------
        compute_loss(model, inputs, return_outputs=False, num_items_in_batch=None):
            Overrides loss computation from Trainer class
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Custom loss computation.

        Parameters
        ----------
        model
            The model to run; called as `model(**inputs)`.
        inputs : dict
            Model inputs including a "labels" entry. The labels are removed
            from `inputs` (in place) before the forward pass.
        return_outputs : bool
            When true, return `(loss, outputs)` instead of only the loss.
        num_items_in_batch
            Accepted for compatibility with newer `Trainer` versions, which
            pass this keyword to `compute_loss`; it is not used here.

        Returns
        -------
        The loss, or a `(loss, outputs)` tuple when `return_outputs` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        num_labels = self.model.config.num_labels
        loss_fct = torch.nn.BCEWithLogitsLoss()
        # Flatten to (-1, num_labels) so logits and float labels line up.
        loss = loss_fct(logits.view(-1, num_labels),
                        labels.float().view(-1, num_labels))
        return (loss, outputs) if return_outputs else loss

# prepare data for prediction (covid)

In [11]:
import yaml
# Resolve the file through `data_dir` so the path works on every OS; the
# previous literal used a Windows-only backslash separator that is also an
# invalid string escape sequence.
covid_yaml_path = os.path.join(data_dir, 'corona_motivations.yaml')
# Explicit encoding so the text is not decoded with the platform locale.
with open(covid_yaml_path, 'r', encoding='utf-8') as file:
    PVE_covid_yaml = yaml.safe_load(file)

In [12]:
PVE_covid_yaml.keys()

dict_keys(['english', 'dutch', 'project'])

In [13]:
# Project titles keyed by the numeric project code stored in the yaml file.
PROJECT_TITLES = {
    1: 'Nursing homes allow visitors again',
    2: 'Reopen companies (horeca and contact professions are still closed)',
    3: 'Workers in contact professions can work again',
    4: 'Young people do not need to maintain 1.5 meter distance among each others',
    5: 'All restrictions are lifted for persons who are immune',
    6: 'Restrictions are lifted in Friesland, Groningen and Drenthe',
    7: 'Direct family members do not need to maintain 1.5 meter distance',
    8: 'Horeca and entertainment re-open',
}

# One row per opinion, with the title of the project the opinion refers to.
# The earlier `PVE_covid['project'][mask] = ...` form was chained assignment:
# pandas warns about it and, with Copy-on-Write enabled, it does not write
# into the frame at all. A single mapped assignment avoids both problems.
project_codes = pd.Series(PVE_covid_yaml['project'])
PVE_covid = pd.DataFrame({'opinion': PVE_covid_yaml['english']})
# Codes without a title (and rows without a code) keep the empty-string default.
PVE_covid['project'] = (
    project_codes.map(PROJECT_TITLES).reindex(PVE_covid.index).fillna('')
)


# Input for Topic modelling

In [14]:
#import tokenizer used
from transformers import AutoTokenizer

# Use the tokenizer of "bert-base-uncased".
# NOTE(review): the tokenizer is loaded by base checkpoint name rather than
# from the fine-tuned model directory; presumably the classifier was
# fine-tuned from bert-base-uncased -- confirm the vocabularies match.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
from datasets import Dataset

In [15]:
# Convert the pandas frame into a `datasets.Dataset` for tokenization and prediction
PVE_input = Dataset.from_pandas(PVE_covid)


In [16]:
PVE_input

Dataset({
    features: ['opinion', 'project'],
    num_rows: 59461
})

In [17]:
# Tokenization callback applied to the dataset
def tokenize_function(dataset):
    """Tokenize opinion/project text pairs with the notebook-level `tokenizer`.

    The 'opinion' and 'project' entries are passed as a pair of inputs, so the
    tokenizer concatenates them with a [SEP] special token in between. Every
    sequence is padded to, and truncated at, 200 tokens.
    """
    encode_options = {
        "padding": "max_length",
        "max_length": 200,
        "truncation": 'longest_first',
    }
    return tokenizer(dataset['opinion'], dataset['project'], **encode_options)


In [18]:
# Tokenize in batches; the tokenizer outputs (e.g. "input_ids" and
# "attention_mask") are added to the dataset as new columns.
PVE_input = PVE_input.map(tokenize_function, batched=True)

Map:   0%|          | 0/59461 [00:00<?, ? examples/s]

In [19]:
PVE_input

Dataset({
    features: ['opinion', 'project', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 59461
})

In [20]:
#setting arguments
from transformers import TrainingArguments
# Inference-only configuration: training and evaluation are switched off and
# only prediction is enabled, using `batch_size` examples per device.
batch_size = 8
args = TrainingArguments(
    output_dir=model_dir,
    do_train=False,
    do_eval=False,
    do_predict=True,
    per_device_eval_batch_size=batch_size
    )


In [21]:
# Load the fine-tuned classifier from the configured model directory, reusing
# `model_dir` and `num_labels` instead of repeating the path and the count.
model = load_model_from_data_dir(os.path.join(model_dir, 'bert_train_level1'), num_labels=num_labels)

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce MX550


In [22]:
from transformers import Trainer
class MultiLabelTrainer(Trainer):
    """
        A transformers `Trainer` with custom loss computation for multi-label
        classification (binary cross-entropy with logits over all labels).

        NOTE(review): this repeats the class definition from an earlier cell;
        keeping a single definition would stop the two from drifting apart.

        Methods
        -------
        compute_loss(model, inputs, return_outputs=False, num_items_in_batch=None):
            Overrides loss computation from Trainer class
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Custom loss computation.

        Parameters
        ----------
        model
            The model to run; called as `model(**inputs)`.
        inputs : dict
            Model inputs including a "labels" entry. The labels are removed
            from `inputs` (in place) before the forward pass.
        return_outputs : bool
            When true, return `(loss, outputs)` instead of only the loss.
        num_items_in_batch
            Accepted for compatibility with newer `Trainer` versions, which
            pass this keyword to `compute_loss`; it is not used here.

        Returns
        -------
        The loss, or a `(loss, outputs)` tuple when `return_outputs` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        num_labels = self.model.config.num_labels
        loss_fct = torch.nn.BCEWithLogitsLoss()
        # Flatten to (-1, num_labels) so logits and float labels line up.
        loss = loss_fct(logits.view(-1, num_labels),
                        labels.float().view(-1, num_labels))
        return (loss, outputs) if return_outputs else loss

In [23]:
# Run prediction over the whole dataset and measure the elapsed wall-clock time
startTime = time.time()
multi_trainer = MultiLabelTrainer(
    model,
    args,
    tokenizer=tokenizer
    )

# Turn the model outputs into 0/1 indicators per value (the boolean mask is
# multiplied by 1 to get integers).
# NOTE(review): the trainer's loss is BCEWithLogitsLoss, so `.predictions`
# presumably holds raw logits; a cut-off of 0.5 on logits corresponds to a
# sigmoid probability of about 0.62, not 0.5. Confirm this threshold is
# intended (a probability cut-off of 0.5 would be `> 0` on logits).
prediction = 1 * (multi_trainer.predict(PVE_input ).predictions > 0.5)
# Elapsed seconds, displayed as the cell output
time.time()-startTime

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: project, opinion. If project, opinion are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 59461
  Batch size = 8
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


5224.204123258591

In [26]:
# Attach one 0/1 column per value to the input rows.
result_covid = pd.concat([PVE_covid, pd.DataFrame(prediction, columns=values)], axis=1)
# Create the output folder first, so that a long prediction run is not lost
# to a missing directory when the file is written.
os.makedirs(output_dir, exist_ok=True)
result_covid.to_csv(os.path.join(output_dir, 'result_covid.csv'), index=False)