In [2]:
import json

# Load the annotated summaries: one JSON object per non-empty line (JSON Lines).
data = []
# Explicit encoding so decoding does not depend on the platform default.
with open('/content/model_annotations.aligned.paired.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. an extra newline at the end of the file);
        # json.loads would raise JSONDecodeError on them.
        if line.strip():
            data.append(json.loads(line))

print(len(data))


1600


In [None]:
# Inspect which fields the first loaded record provides.
data[0].keys()

dict_keys(['id', 'decoded', 'expert_annotations', 'turker_annotations', 'references', 'model_id', 'filepath', 'text'])

In [3]:
# Collapse each record's expert ratings into one mean score per quality dimension.
processed_data = []
for record in data:
    ratings = record['expert_annotations']
    if not ratings:
        # No expert ratings to average, so the record is left out.
        continue

    n_raters = len(ratings)
    # Key order (coherence, consistency, fluency, relevance) is kept as-is.
    mean_scores = {
        metric: sum([rating[metric] for rating in ratings]) / n_raters
        for metric in ('coherence', 'consistency', 'fluency', 'relevance')
    }

    processed_data.append({
        'X_decoded': record['decoded'],
        'X_text': record['text'],
        'Y': mean_scores,
    })

In [4]:

!pip install transformers

from transformers import AutoTokenizer, AutoModel


model_name = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(model_name)
transformer_decoded = AutoModel.from_pretrained(model_name)
transformer_text = AutoModel.from_pretrained(model_name)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [5]:
def tokenize_data_dual(data, tokenizer, max_length=128):
    """Tokenize the ``'X_decoded'`` and ``'X_text'`` fields of every entry.

    Args:
        data: Iterable of dicts, each providing ``'X_decoded'`` and ``'X_text'``.
        tokenizer: Callable invoked once per text with ``add_special_tokens``,
            ``max_length``, ``padding``, ``truncation``, ``return_attention_mask``
            and ``return_tensors`` keyword arguments. It must return a mapping
            with ``'input_ids'`` and ``'attention_mask'`` tensors that can be
            concatenated along dim 0.
        max_length: Length every sequence is padded / truncated to.

    Returns:
        Tuple ``(input_ids_1, attention_masks_1, input_ids_2, attention_masks_2)``
        where the ``_1`` tensors come from ``'X_decoded'`` and the ``_2`` tensors
        from ``'X_text'``; per-entry results are concatenated along dim 0.
        For empty ``data`` four empty ``(0, max_length)`` long tensors are
        returned instead of failing inside ``torch.cat``.
    """
    # Imported locally so the function does not rely on some other cell
    # having imported torch before it is called.
    import torch

    def _encode(text):
        """Run the tokenizer on a single text with the shared settings."""
        return tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

    # Buckets, in return order: ids_1, mask_1, ids_2, mask_2.
    buckets = ([], [], [], [])

    # Single pass over `data`, so one-shot iterables are handled too.
    for entry in data:
        encoded_decoded = _encode(entry['X_decoded'])
        encoded_text = _encode(entry['X_text'])
        pieces = (
            encoded_decoded['input_ids'],
            encoded_decoded['attention_mask'],
            encoded_text['input_ids'],
            encoded_text['attention_mask'],
        )
        for bucket, piece in zip(buckets, pieces):
            bucket.append(piece)

    if not buckets[0]:
        # torch.cat raises on an empty list; return well-formed empty tensors.
        return tuple(torch.empty((0, max_length), dtype=torch.long) for _ in buckets)

    return tuple(torch.cat(bucket, dim=0) for bucket in buckets)


In [7]:
import torch
# Tokenize both inputs separately
input_ids_1, attention_masks_1, input_ids_2, attention_masks_2 = tokenize_data_dual(processed_data, tokenizer)

# Prepare target labels. The column order is spelled out explicitly so it does
# not silently depend on the insertion order of each entry's 'Y' dict.
target_names = ('coherence', 'consistency', 'fluency', 'relevance')
Y = torch.tensor(
    [[entry['Y'][name] for name in target_names] for entry in processed_data],
    dtype=torch.float32,
)


In [8]:
import torch

# Per-column summary statistics of the targets, reduced over dim 0.
mean_Y = Y.mean(dim=0)
std_Y = Y.std(dim=0)
min_Y = Y.min(dim=0).values
max_Y = Y.max(dim=0).values

# Print the statistics, one per line.
print(
    "Statistics for Y:",
    f"Mean: {mean_Y}",
    f"Standard Deviation: {std_Y}",
    f"Minimum: {min_Y}",
    f"Maximum: {max_Y}",
    sep="\n",
)

Statistics for Y:
Mean: tensor([3.4125, 4.6604, 4.6729, 3.7771])
Standard Deviation: tensor([1.0373, 0.9177, 0.7290, 0.7977])
Minimum: tensor([1., 1., 1., 1.])
Maximum: tensor([5., 5., 5., 5.])


In [9]:
from torch.utils.data import TensorDataset

# Bundle the four tokenized input tensors and the targets into one dataset.
# NOTE(review): `dataset` does not appear to be referenced again in the visible
# cells (train/val datasets are built from index subsets) — confirm it is needed.
dataset = TensorDataset(input_ids_1, attention_masks_1, input_ids_2, attention_masks_2, Y)


In [10]:
from sklearn.model_selection import train_test_split

# Split sample indices 80/20 into train and validation; the fixed seed keeps
# the split reproducible across runs.
indices = torch.arange(Y.size(0))
train_idx, val_idx = train_test_split(
    indices,
    test_size=0.2,
    random_state=42,
)

def subset(tensors, idxs):
    """Index every item of ``tensors`` with ``idxs`` and return the results as a list."""
    selected = []
    for item in tensors:
        selected.append(item[idxs])
    return selected

from torch.utils.data import TensorDataset, DataLoader

# Tensor order: two (input_ids, attention_mask) pairs followed by the targets.
all_tensors = [input_ids_1, attention_masks_1, input_ids_2, attention_masks_2, Y]
train_tensors = subset(all_tensors, train_idx)
val_tensors = subset(all_tensors, val_idx)

train_dataset = TensorDataset(*train_tensors)
val_dataset = TensorDataset(*val_tensors)

# Only the training batches are shuffled; validation uses the default order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)
val_loader = DataLoader(val_dataset, batch_size=64)


In [17]:
import torch
import torch.nn as nn

class CombinedModel(nn.Module):
    """Two transformer encoders whose pooled outputs feed an MLP regressor.

    Each ``(input_ids, attention_mask)`` pair is encoded by its own transformer,
    one embedding per sequence is taken (first-token or masked mean), the two
    embeddings are concatenated and an MLP maps them to ``mlp_output_dim`` values.

    Args:
        transformer_abstract: Encoder applied to the FIRST input pair
            (``input_ids_1`` / ``attention_mask_1``). Called with ``input_ids=``
            and ``attention_mask=`` and must return an object exposing
            ``last_hidden_state`` of shape (batch, seq, hidden).
        transformer_pitch: Encoder applied to the SECOND input pair
            (``input_ids_2`` / ``attention_mask_2``); same contract.
        mlp_input_dim: Width of the concatenated embedding, i.e. the sum of the
            two encoders' hidden sizes.
        mlp_output_dim: Number of regression outputs.
        use_mean_pooling: If True, use attention-masked mean pooling over the
            sequence; otherwise use the hidden state at position 0.
        freeze_transformers: If True (default, the original behavior) the
            encoder weights are frozen AND the encoders are kept in eval mode
            even after ``model.train()``, so their dropout layers cannot inject
            noise into what are meant to be fixed feature extractors.
    """

    def __init__(self, transformer_abstract, transformer_pitch, mlp_input_dim, mlp_output_dim,
                 use_mean_pooling=False, freeze_transformers=True):
        super().__init__()
        self.transformer_abstract = transformer_abstract  # Encodes input pair 1
        self.transformer_pitch = transformer_pitch        # Encodes input pair 2
        self.use_mean_pooling = use_mean_pooling
        self.freeze_transformers = freeze_transformers

        if self.freeze_transformers:
            for encoder in (self.transformer_abstract, self.transformer_pitch):
                for param in encoder.parameters():
                    param.requires_grad = False
                # Frozen encoders act as deterministic feature extractors.
                encoder.eval()

        self.mlp = nn.Sequential(
            nn.Linear(mlp_input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, mlp_output_dim)
        )

    def train(self, mode=True):
        """Set training mode, keeping frozen encoders in eval mode.

        ``nn.Module.train`` recursively switches every child (including the
        encoders) to training mode, which would re-enable their dropout.
        ``eval()`` delegates to ``train(False)``, so it is covered as well.
        """
        super().train(mode)
        if self.freeze_transformers:
            self.transformer_abstract.eval()
            self.transformer_pitch.eval()
        return self

    def mean_pool(self, hidden_state, attention_mask):
        """Average ``hidden_state`` over the sequence dim, ignoring masked positions."""
        mask_expanded = attention_mask.unsqueeze(-1).expand(hidden_state.size()).float()
        # The clamp avoids division by zero for rows whose mask is all zeros.
        return (hidden_state * mask_expanded).sum(1) / mask_expanded.sum(1).clamp(min=1e-9)

    def forward(self, input_ids_1, attention_mask_1, input_ids_2, attention_mask_2):
        """Encode both inputs and return the MLP output of shape (batch, mlp_output_dim)."""
        # Run both transformer encoders
        outputs_1 = self.transformer_abstract(input_ids=input_ids_1, attention_mask=attention_mask_1)
        outputs_2 = self.transformer_pitch(input_ids=input_ids_2, attention_mask=attention_mask_2)

        if self.use_mean_pooling:
            emb_1 = self.mean_pool(outputs_1.last_hidden_state, attention_mask_1)
            emb_2 = self.mean_pool(outputs_2.last_hidden_state, attention_mask_2)
        else:
            # Use the hidden state of the first token ([CLS] for BERT-style encoders)
            emb_1 = outputs_1.last_hidden_state[:, 0, :]
            emb_2 = outputs_2.last_hidden_state[:, 0, :]

        # Concatenate embeddings and pass through MLP
        combined = torch.cat((emb_1, emb_2), dim=1)
        return self.mlp(combined)


In [21]:
# The MLP consumes both encoder embeddings concatenated, hence twice the hidden size.
mlp_input_dim = 2 * transformer_text.config.hidden_size  # 768 * 2 = 1536 for BERT-base
mlp_output_dim = 4  # For coherence, consistency, fluency, relevance

model = CombinedModel(
    mlp_input_dim=mlp_input_dim,
    mlp_output_dim=mlp_output_dim,
    transformer_abstract=transformer_text,
    transformer_pitch=transformer_decoded,
    use_mean_pooling=False,
)


In [22]:
import torch.nn as nn


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
criterion = nn.MSELoss()
# Hand only trainable parameters to the optimizer; parameters with
# requires_grad=False never receive gradients and would just be carried along.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(trainable_params, lr=1e-4)

# Single source of truth for the epoch count, used by both the loop and the
# progress message (previously the message hard-coded "/10" for a 20-epoch run).
num_epochs = 20

train_losses = []
val_losses = []
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for batch in train_loader:
        ids1, mask1, ids2, mask2, y = [x.to(device) for x in batch]

        optimizer.zero_grad()
        pred = model(ids1, mask1, ids2, mask2)
        loss = criterion(pred, y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_train_loss = train_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            ids1, mask1, ids2, mask2, y = [x.to(device) for x in batch]
            pred = model(ids1, mask1, ids2, mask2)
            loss = criterion(pred, y)
            val_loss += loss.item()
    avg_val_loss = val_loss / len(val_loader)
    val_losses.append(avg_val_loss)

    print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {avg_train_loss:.4f} - Val Loss: {avg_val_loss:.4f}")


Epoch 1/10 - Train Loss: 15.6062 - Val Loss: 11.8064
Epoch 2/10 - Train Loss: 8.1830 - Val Loss: 2.8926
Epoch 3/10 - Train Loss: 1.6413 - Val Loss: 1.4632
Epoch 4/10 - Train Loss: 1.0167 - Val Loss: 0.9425
Epoch 5/10 - Train Loss: 0.8686 - Val Loss: 0.8988
Epoch 6/10 - Train Loss: 0.8277 - Val Loss: 0.9130
Epoch 7/10 - Train Loss: 0.8143 - Val Loss: 0.8784
Epoch 8/10 - Train Loss: 0.7997 - Val Loss: 0.8772
Epoch 9/10 - Train Loss: 0.7771 - Val Loss: 0.8529
Epoch 10/10 - Train Loss: 0.7737 - Val Loss: 0.8158
Epoch 11/10 - Train Loss: 0.7518 - Val Loss: 0.8325
Epoch 12/10 - Train Loss: 0.7313 - Val Loss: 0.8338
Epoch 13/10 - Train Loss: 0.7299 - Val Loss: 0.7999
Epoch 14/10 - Train Loss: 0.7231 - Val Loss: 0.8066
Epoch 15/10 - Train Loss: 0.7128 - Val Loss: 0.8377
Epoch 16/10 - Train Loss: 0.7063 - Val Loss: 0.7702
Epoch 17/10 - Train Loss: 0.6863 - Val Loss: 0.7835
Epoch 18/10 - Train Loss: 0.6770 - Val Loss: 0.7635
Epoch 19/10 - Train Loss: 0.6766 - Val Loss: 0.7915
Epoch 20/10 - Train