In [1]:
import sys
sys.path.append("..")

from sklearn.model_selection import train_test_split
import pandas as pd
import torch
from sklearn.model_selection import GroupShuffleSplit 
import matplotlib.pyplot as plt
import nltk
from torch import optim
from nltk.corpus import stopwords
from models import bert
from transformers import BertTokenizer, BertForSequenceClassification, BertPreTrainedModel, BertModel
import json
from torch.utils.data import DataLoader
from ray import tune
from ray.air import Checkpoint, session
from ray.tune.schedulers import ASHAScheduler
from functools import partial
import os
os.environ["RAY_FUNCTION_SIZE_ERROR_THRESHOLD"] = "1000"  # Set a higher threshold value


In [2]:
train_df = pd.read_csv('train_new_agr.csv',delimiter=',', encoding='latin-1')
test_df = pd.read_csv('test_new_agr.csv', delimiter=',')


In [4]:
test_df = test_df[test_df['disagreement']==False]

In [5]:
test_df

Unnamed: 0,annotator_id,annotation_id,unit_id,text,annotation,is_age_related,unique_annotations,disagreement
18,12,10000484,830,Getting old isn't easy and the little things s...,1,Yes,1,False
35,28,10001142,1390,People in the younger of these two groups are ...,1,No,1,False
55,54,10002181,1390,People in the younger of these two groups are ...,1,Yes,1,False
58,61,10002442,830,Getting old isn't easy and the little things s...,1,Yes,1,False
92,86,10003461,1390,People in the younger of these two groups are ...,1,Yes,1,False
...,...,...,...,...,...,...,...,...
1242,1299,10051984,11915,That patronizing look of pity makes me want to...,1,Yes,1,False
1250,1307,10052275,11512,I have several friends who vehemently deny the...,2,Yes,1,False
1254,1310,10052395,11512,I have several friends who vehemently deny the...,2,Yes,1,False
1262,1316,10052660,11939,The younger adults' brain scans showed activit...,2,No,1,False


In [13]:
train_df = pd.read_csv('../data/train_older_adult_annotations.csv',delimiter=',', encoding='latin-1')
test_df = pd.read_csv('../data/test_annotations.csv', delimiter=',')
df = pd.concat([test_df, train_df])

age_anxiety_df = pd.read_csv('../data/age_anxiety_full_responses.csv', delimiter=',')
age_experience_df = pd.read_csv('../data/age_experience_responses.csv', delimiter=',')
demographics_df = pd.read_csv('../data/demographics_responses.csv', delimiter=',')
anxiety_score_df = pd.read_csv('../data/respondent_anxiety_table.csv', delimiter=',')

df1 = pd.merge(demographics_df, anxiety_score_df, on='respondent_id')
merged_df = pd.merge(df, df1, on='respondent_id')

sentiment_labels = ['Very negative','Somewhat negative','Neutral','Somewhat positive','Very positive']
total_annotator_ids = merged_df['respondent_id'].unique().tolist()

id2label = {index: row for (index, row) in enumerate(sentiment_labels)} 
label2id = {row: index for (index, row) in enumerate(sentiment_labels)}

id2annotator = {index: row for (index, row) in enumerate(total_annotator_ids)}
annotator2id = {row: index for (index, row) in enumerate(total_annotator_ids)}

merged_df["annotation"] = merged_df["annotation"].map(label2id)
merged_df["respondent_id"] = merged_df["respondent_id"].map(annotator2id)

merged_df.rename(columns = {'respondent_id':'annotator_id', 'unit_text':'text'}, inplace = True)


In [14]:
    splitter = GroupShuffleSplit(test_size=0.3, n_splits=2, random_state = 0)
    split = splitter.split(merged_df, groups=merged_df['unit_id'])
    train_inds, test_inds = next(split)
    train_df = merged_df.iloc[train_inds]
    test_val_df = merged_df.iloc[test_inds]
    splitter = GroupShuffleSplit(test_size=0.5, n_splits=2, random_state = 0)
    split = splitter.split(test_val_df, groups=test_val_df['unit_id'])
    val_inds, test_inds = next(split)
    val_df = test_val_df.iloc[val_inds]
    test_df = test_val_df.iloc[test_inds]
    train_df = train_df.sample(frac=1)
    test_df = test_df.sample(frac=1)    
    val_df = val_df.sample(frac=1)
    len(train_df["annotator_id"].unique()), len(val_df["annotator_id"].unique()), len(test_df["annotator_id"].unique())

(1481, 1477, 1481)

In [15]:
labels = merged_df['annotation'].unique()
#sort labels
labels.sort()

In [16]:
device = torch.device("cuda")

In [17]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(labels)).to(device)
for param in model.base_model.parameters():
    param.requires_grad = False

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [18]:
# Define batch size and number of workers for data loaders
batch_size = 8
num_workers = 2

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create training and testing datasets
train_dataset = bert.CustomDataset(train_df, tokenizer, labels)
val_dataset = bert.CustomDataset(val_df, tokenizer, labels)
test_dataset = bert.CustomDataset(test_df, tokenizer, labels)

# Create training and testing data loaders
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers)
val_data_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=num_workers)
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=num_workers)


In [19]:
config = {
    "lr": tune.loguniform(1e-7, 1e-1),
    "weight_decay": tune.choice([0.001, 0.01, 0.1, 0.2])
}

In [20]:
mode = "text"

In [21]:
def train(config):
    # Training loop
    num_epochs = 10
    optimizer = torch.optim.AdamW(model.parameters(), lr=config["lr"].sample(), weight_decay = config["weight_decay"].sample())
    loss_fn = torch.nn.CrossEntropyLoss().to(device)
    checkpoint = session.get_checkpoint()
    
    if checkpoint:
        checkpoint_state = checkpoint.to_dict()
        start_epoch = checkpoint_state["epoch"]
        model.load_state_dict(checkpoint_state["net_state_dict"])
        optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"])
    else:
        start_epoch = 0


    for epoch in range(start_epoch, num_epochs):
        print("epochh:", epoch)
        model.train()
        total_loss = 0
        for batch in train_data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            annotator_ids = batch['annotator_id'].to(device)
            labels = batch['label'].to(device)
            if mode=="groups" :
                w, log_p = model(annotator_ids = annotator_ids, input_ids =input_ids, attention_mask = attention_mask, labels =                   labels)
                loss = torch.zeros(input_ids.size(0))
                for i in range(input_ids.size(0)):
                    loss[i] = - (w[i].log_softmax(dim=1) + log_p[i].reshape(-1, 1)).logsumexp(dim=0)[labels[i]]
                # Backward pass and optimization
                loss = torch.mean(loss)
            elif mode=="annotators":
                # Forward pass
                outputs = model(annotator_ids = annotator_ids, input_ids =input_ids, attention_mask = attention_mask, labels =                     labels)
                loss = outputs[0]
            else:
                outputs = model(input_ids =input_ids, attention_mask = attention_mask, labels = labels)
                loss = outputs[0]
            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        average_loss = total_loss / len(train_data_loader)
        
        total_val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        for batch in val_data_loader:
            with torch.no_grad():
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                annotator_ids = batch['annotator_id'].to(device)
                labels = batch['label'].to(device)
                if mode=="groups" :
                    w, log_p = model(annotator_ids = annotator_ids, input_ids =input_ids, attention_mask = attention_mask, labels = labels)
                    loss = torch.zeros(input_ids.size(0))
                    for i in range(input_ids.size(0)):
                        loss[i] = - (w[i].log_softmax(dim=1) + log_p[i].reshape(-1, 1)).logsumexp(dim=0)[labels[i]]
                    loss = torch.mean(loss)
                    best_group = log_p.argmax(dim=1)
                    w = w[range(len(w)), best_group]
                    _, predicted = torch.max(w, 1)
                elif mode=="annotators":
                    # Forward pass
                    outputs = model(annotator_ids = annotator_ids, input_ids =input_ids, attention_mask = attention_mask, labels = labels)
                    loss = outputs[0]
                    logits = outputs[1]
                    # Get predicted labels
                    _, predicted = torch.max(logits, dim=1)
                else:
                    outputs = model(input_ids =input_ids, attention_mask = attention_mask, labels = labels)
                    print(outputs.shape, outputs)
                    loss = outputs[0]
                    logits = outputs[1]
                    # Get predicted labels
                    _, predicted = torch.max(logits, dim=1)
                    
#                 _, predicted = torch.max(outputs.data, 1)
#                 total += labels.size(0)
#                 correct += (predicted == labels).sum().item()


                total_val_loss += loss.item()
    
                correct_predictions += (predicted == labels).sum().item()
                total_predictions += labels.size(0)
                
        average_val_loss = total_val_loss / len(val_data_loader)
        accuracy = correct_predictions / total_predictions

        checkpoint_data = {
            "epoch": epoch,
            "net_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
        }
        checkpoint = Checkpoint.from_dict(checkpoint_data)

        session.report(
            {"loss": average_val_loss, "accuracy": accuracy},
            checkpoint=checkpoint,
        )

        print(f'Epoch {epoch + 1}/{num_epochs} - Loss: {average_loss:.4f}')


In [None]:
 scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=10,
        grace_period=1,
        reduction_factor=2,
    )
result = tune.run(
    partial(train),
    resources_per_trial={"cpu": 3, "gpu": 1},
    config=config,
    num_samples=10,
    scheduler=scheduler
)

0,1
Current time:,2023-07-04 11:38:09
Running for:,00:16:21.51
Memory:,17.9/62.4 GiB

Trial name,status,loc,lr,weight_decay
train_33617_00000,PENDING,,2.55557e-06,0.01
train_33617_00001,PENDING,,2.22465e-06,0.001
train_33617_00002,PENDING,,0.00863107,0.001
train_33617_00003,PENDING,,0.00743458,0.1
train_33617_00004,PENDING,,1.51521e-06,0.1
train_33617_00005,PENDING,,0.0013465,0.01
train_33617_00006,PENDING,,0.016575,0.001
train_33617_00007,PENDING,,8.59123e-05,0.2
train_33617_00008,PENDING,,1.59707e-05,0.2
train_33617_00009,PENDING,,0.000925694,0.001


In [30]:
# train(model, device, train_data_loader,val_data_loader, mode="text")

In neither tune session nor train session!


KeyboardInterrupt: 

In [None]:
torch.save(model.state_dict(), 'text.pth')

In [None]:
bert.test(model, device, test_data_loader, mode="text")

In [6]:
#!/usr/bin/env python
# coding: utf-8

# In[3]:

import os

# Set the CUDA_LAUNCH_BLOCKING environment variable
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
# os.environ['TORCH_USE_CUDA_DSA'] = '1'
#!/usr/bin/env python
# coding: utf-8

# In[3]:


import sys
sys.path.append("..")
from models import bert
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
from sklearn.model_selection import GroupShuffleSplit
import matplotlib.pyplot as plt
import nltk
from torch import optim
from torch.utils.data import Dataset
from nltk.corpus import stopwords
from transformers import BertTokenizer, DistilBertForSequenceClassification, BertPreTrainedModel, BertModel, BertForSequenceClassification, BertConfig, get_linear_schedule_with_warmup
import json
from torch.utils.data import DataLoader
import numpy as np


# In[4]:


train_df = pd.read_csv('train_new_agr.csv',delimiter=',', encoding='latin-1')
test_df = pd.read_csv('test_new_agr.csv', delimiter=',')

train_df = train_df[train_df['disagreement']==False]

total_annotator_ids = train_df['annotator_id'].unique().tolist()


train_labels = train_df['annotation'].unique()
test_labels = test_df['annotation'].unique()
labels = np.unique(np.concatenate((train_labels, test_labels), axis=0))
#sort labels
labels.sort()
num_labels_glob=len(labels)


# In[5]:


device = torch.device("cuda")


# In[6]:

configuration = BertConfig.from_pretrained("bert-base-uncased")
configuration.num_labels = len(labels)
configuration.num_annotators = len(total_annotator_ids)
configuration.annotator_embedding_dim = 512
configuration.hidden_size = 768 
model = bert.BertForSequenceClassificationWithAnnotators(configuration)

# configuration = BertConfig.from_pretrained("bert-base-uncased")
# configuration.num_labels = len(labels)
# configuration.hidden_size = 768 

# # model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels_glob)
# model = bert.BertForSequenceClassificationText(configuration).to(device)
# # model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels_glob)
# # for param in model.bert.parameters():
# #     param.requires_grad = False


# In[7]:


class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.data = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sentence = self.data.iloc[index]['text']
        annotator_id = self.data.iloc[index]['annotator_id']
        annotation = self.data.iloc[index]['annotation']
        disagreement = self.data.iloc[index]['disagreement']

        # Tokenize the sentence
        inputs = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=512,
            return_tensors='pt'
        )

        input_ids = inputs['input_ids'].squeeze()
        attention_mask = inputs['attention_mask'].squeeze()
        annotator_id = torch.tensor(annotator_id, dtype=torch.long)
        annotation = torch.tensor(annotation, dtype=torch.long)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'annotator_ids': annotator_id,
            'label': annotation,
            'disagreement': disagreement
        }


# In[8]:


# Define batch size and number of workers for data loaders
batch_size = 8
num_workers = 1

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create training and testing datasets
train_dataset = CustomDataset(train_df, tokenizer)
test_dataset = CustomDataset(test_df, tokenizer)

# Create training and testing data loaders
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers)
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=num_workers)


# In[9]:


from transformers import TrainingArguments
from sklearn.metrics import accuracy_score
from transformers import Trainer


class CustomTrainer_annotators_text(Trainer):
    def compute_loss(self, model, inputs, device=torch.device("cuda"), return_outputs=False):
        
        input_ids = inputs.get("input_ids").to(device)
        attention_mask = inputs.get("attention_mask").to(device)
        annotator_ids = inputs.get("annotator_ids").to(device)
        labels = inputs.get("labels").to(device)
        disagreement = inputs.get("disagreement").to(device)
        
        outputs = model(annotator_ids = annotator_ids, input_ids =input_ids, attention_mask = attention_mask, labels = labels, freeze = True)
        loss = outputs[0]
        
        if return_outputs:
#             return loss, outputs
            return loss, {"logits":outputs[1], "disagreement":disagreement} 
        return loss


# def compute_metrics(pred):
#     labels = pred.label_ids
#     preds = pred.predictions.argmax(-1)
#     acc = accuracy_score(labels, preds)
#     return {
#       'accuracy': acc,
#     }
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions[0].argmax(-1)
    acc = accuracy_score(labels, preds)
    disagreement = pred.predictions[1]
    labels_agreement = labels[~disagreement]
    labels_disagreement = labels[disagreement]
    predicted_agreement = preds[~disagreement]
    predicted_disagreement = preds[disagreement] 
    agreement_acc = accuracy_score(labels_agreement, predicted_agreement)
    disagreement_acc = accuracy_score(labels_disagreement, predicted_disagreement)
    return {
      'agreement_accuracy': agreement_acc,
      'disagreement_accuracy':disagreement_acc,
      'accuracy':acc
    }


# num_epochs = 10
# optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay = 0.01)
# schedule = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=500,             num_training_steps=len(train_dataset)*num_epochs)
# optimizers = optimizer, schedule

# training_args = TrainingArguments(
#     output_dir='./results',
#     num_train_epochs=num_epochs,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
# #     warmup_steps=500,
# #     weight_decay=0.01,
#     logging_dir='./logs',
#     logging_steps=250,
#     evaluation_strategy = "epoch",
#     logging_strategy="epoch",
#     remove_unused_columns=False,
# #     optim= "adamw_torch",
# #     learning_rate=0.01,
# )


training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=250,
    evaluation_strategy = "epoch",
    logging_strategy="epoch",
    remove_unused_columns=False,
    optim="adamw_torch",
#     learning_rate=1e-6
)


trainer = CustomTrainer_annotators_text(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,
#     eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)



trainer.train()








Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
