In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import torch
import os
import re
import numpy as np
import pickle as cPickle
import copy
import nltk
from bs4 import BeautifulSoup




In [None]:
# Checkpoint name shared by the tokenizer and the classifier
pretrained_name = 'bert-base-uncased'

# Tokenizer matching the checkpoint's vocabulary
tokenizer = BertTokenizer.from_pretrained(pretrained_name)

# Pretrained BERT with an 8-way sequence-classification head
# (entity_type_list, defined in a later cell, has 8 entries)
model = BertForSequenceClassification.from_pretrained(pretrained_name, num_labels=8)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Load document lists.
# Every non-blank line of the split file is unpacked into four whitespace-separated
# fields: <filename> <genre> <index> <train_test>.
train_list = []
test_list = []
file_path = "" # include the file path for "train_test_split" -- it is in training_data/MASC_Wikipedia of the repo
with open(file_path, 'r') as f:
    # Iterate the file lazily instead of materialising it with readlines()
    for line in f:
        fields = line.split()
        if not fields:
            # A blank line (e.g. a trailing newline) would otherwise break the unpacking below
            continue
        filename, genre, index, train_test = fields
        # Drop the last four characters (presumably a 3-letter extension plus the dot)
        filename = filename[:-4]
        if train_test == 'train':
            train_list.append(filename)
        elif train_test == 'test':
            test_list.append(filename)


In [None]:
# Load the stored pos/ner parses and the explicit connective list
file_path1 = "" # include the file path for "masc_sentence_pos_ner_dict.pkl" -- it is in training_data
file_path2 = "" # include the file path for "explicit_connective.txt" -- it is in training_data

# NOTE: unpickling can execute arbitrary code; only load a file you trust.
with open(file_path1, 'rb') as pkl_file:
    sentence_pos_ner_dict = cPickle.load(pkl_file)

# One connective per line, surrounding whitespace removed, frozen into a tuple
with open(file_path2, 'r') as txt_file:
    connective_list = tuple(entry.strip() for entry in txt_file)

# Label names; a name's position in this list is its integer class id.
entity_type_list = [
    'STATE',
    'EVENT',
    'REPORT',
    'GENERIC_SENTENCE',
    'GENERALIZING_SENTENCE',
    'QUESTION',
    'IMPERATIVE',
    "CANNOT_DECIDE",
]


def process_entity_type_label(entity_type):
    """Map a label name to its integer id.

    Args:
        entity_type: One of the names in ``entity_type_list``.

    Returns:
        The position of ``entity_type`` in ``entity_type_list``.

    Raises:
        ValueError: If ``entity_type`` is not in ``entity_type_list``.
    """
    label_id = entity_type_list.index(entity_type)
    return label_id





In [None]:
path1 =  "" # this should be the path to the annotations_xml folder inside MASC_Wikipedia
path2 = "" #this should be the path to the raw_text folder inside MASC_Wikipedia

class ClauseDataset(Dataset):
    """Clause-level classification dataset built from annotated XML documents.

    For every document name, the ``<name>.xml`` annotation file and the
    ``<name>.txt`` raw text file are read. Each ``segment`` element becomes one
    example: its text is the input and its gold ``seType`` (or
    ``'CANNOT_DECIDE'`` when absent) is the label, stored as an index into the
    notebook-level ``entity_type_list``.

    Args:
        doc_list: Iterable of document names (without extension).
        tokenizer: Callable tokenizer, invoked as ``tokenizer(text, **options)``.
        max_length: Length every example is padded/truncated to.
        annotations_dir: Folder holding the ``.xml`` files; defaults to the
            notebook-level ``path1``.
        raw_text_dir: Folder holding the ``.txt`` files; defaults to the
            notebook-level ``path2``.
    """

    def __init__(self, doc_list, tokenizer, max_length=128,
                 annotations_dir=None, raw_text_dir=None):
        self.clauses = []
        self.labels = []
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Fall back to the notebook-level paths so existing two-argument calls keep working
        self.annotations_dir = path1 if annotations_dir is None else annotations_dir
        self.raw_text_dir = path2 if raw_text_dir is None else raw_text_dir
        self.process_docs(doc_list)
        print(f"Total clauses processed: {len(self.clauses)}")
        print(f"Total labels processed: {len(self.labels)}")

    def process_docs(self, doc_list):
        """Parse every document and append its clauses/labels to the dataset.

        Clauses whose label is not in ``entity_type_list`` are dropped. A
        running total is printed after each document.
        """
        for doc_name in doc_list:
            doc_path = os.path.join(self.annotations_dir, doc_name + '.xml')
            raw_doc_path = os.path.join(self.raw_text_dir, doc_name + '.txt')
            doc_clause_list = self.process_doc(doc_path)
            paras_clause_list = self.process_paragraph(doc_clause_list, raw_doc_path)
            for para_clause_list in paras_clause_list:
                for clause in para_clause_list:
                    clause_text, entity_type = clause[0], clause[1]
                    if entity_type in entity_type_list:
                        self.clauses.append(clause_text)
                        self.labels.append(process_entity_type_label(entity_type))
            print(f"Processed {len(self.clauses)} clauses so far.")

    def process_doc(self, doc_path):
        """Extract ``(text, label, end_offset)`` for every ``segment`` in one XML file.

        The label is the ``seType`` attribute of the segment's gold annotation;
        segments without a gold annotation, or whose gold annotation has no
        ``seType``, are labelled ``'CANNOT_DECIDE'``.
        """
        with open(doc_path, 'r') as doc:
            xml = BeautifulSoup(doc, "lxml-xml")  # Use lxml parser for XML

        clause_list = []
        for clause in xml.find_all('segment'):
            end = int(clause.attrs['end'])
            clause_text = clause.find('text').string
            annotation = clause.find('annotation', attrs={"annotator": "gold"})
            # find() returns None when the segment has no gold annotation
            if annotation is not None and annotation.has_attr('seType'):
                label = annotation.attrs['seType']
            else:
                label = 'CANNOT_DECIDE'
            clause_list.append((clause_text, label, end))
        return clause_list

    def process_paragraph(self, doc_clause_list, raw_doc_path):
        """Group a document's clauses by paragraph of the raw text.

        Paragraph boundaries are the positions of blank lines (``'\\n\\n'``)
        in the raw text. Clauses are consumed in order, and a clause belongs to
        the first paragraph whose boundary is at or after its end offset.
        Paragraphs that receive no clause are omitted.

        Returns:
            List of paragraphs, each a non-empty list of clause tuples.
        """
        with open(raw_doc_path, 'r') as raw_doc:
            raw_text = raw_doc.read()

        raw_para_boundary_list = [m.start() + 1 for m in re.finditer('\n\n', raw_text)]
        # Sentinel so clauses after the last blank line form the final paragraph
        raw_para_boundary_list.append(float('inf'))

        paras_clause_list = []
        index = 0
        for raw_para_boundary in raw_para_boundary_list:
            para_clause_list = []
            while index < len(doc_clause_list) and doc_clause_list[index][2] <= raw_para_boundary:
                para_clause_list.append(doc_clause_list[index])
                index += 1
            if para_clause_list:
                paras_clause_list.append(para_clause_list)
        return paras_clause_list

    def __len__(self):
        """Number of clauses in the dataset."""
        return len(self.clauses)

    def __getitem__(self, idx):
        """Tokenize clause ``idx`` and return it with its label.

        Returns:
            Dict with flattened ``input_ids`` and ``attention_mask`` tensors and
            a ``labels`` tensor of dtype ``torch.long``.
        """
        clause = self.clauses[idx]
        label = self.labels[idx]
        # Call the tokenizer directly; encode_plus is deprecated in favour of __call__
        encoding = self.tokenizer(
            clause,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [12]:
# Build the clause datasets for both splits (the constructor prints progress per document)
train_dataset = ClauseDataset(train_list, tokenizer)
test_dataset = ClauseDataset(test_list, tokenizer)

# NOTE(review): these DataLoaders are not passed to the Trainer in the training cell,
# which receives train_dataset/test_dataset directly.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

Processed 85 clauses so far.
Processed 179 clauses so far.
Processed 283 clauses so far.
Processed 986 clauses so far.
Processed 1253 clauses so far.
Processed 1541 clauses so far.
Processed 1628 clauses so far.
Processed 1711 clauses so far.
Processed 1815 clauses so far.
Processed 1922 clauses so far.
Processed 2050 clauses so far.
Processed 2175 clauses so far.
Processed 2271 clauses so far.
Processed 2692 clauses so far.
Processed 2880 clauses so far.
Processed 2917 clauses so far.
Processed 2938 clauses so far.
Processed 3119 clauses so far.
Processed 3211 clauses so far.
Processed 3235 clauses so far.
Processed 3265 clauses so far.
Processed 3309 clauses so far.
Processed 3344 clauses so far.
Processed 3372 clauses so far.
Processed 3758 clauses so far.
Processed 3803 clauses so far.
Processed 3826 clauses so far.
Processed 3839 clauses so far.
Processed 3873 clauses so far.
Processed 3919 clauses so far.
Processed 3938 clauses so far.
Processed 3963 clauses so far.
Processed 401

In [None]:
import numpy as np

def calculate_label_distribution(dataset, entity_type_list):
    """Count how many examples carry each label id.

    Args:
        dataset: Either an object exposing a ``labels`` sequence of integer
            label ids (as ``ClauseDataset`` does), or any iterable of items
            whose ``item['labels']`` converts to ``int`` (a 0-dim tensor or a
            plain integer).
        entity_type_list: Sequence of label names; only its length is used, to
            size the result.

    Returns:
        Integer numpy array where position ``i`` holds the number of examples
        with label id ``i``.
    """
    label_counts = np.zeros(len(entity_type_list), dtype=int)

    stored_labels = getattr(dataset, 'labels', None)
    if stored_labels is not None:
        # Read the stored ids directly: iterating the dataset would tokenize
        # every clause only to get the label back.
        labels = stored_labels
    else:
        labels = (item['labels'] for item in dataset)

    for label in labels:
        label_counts[int(label)] += 1
    return label_counts

# Tally the label ids of both splits
train_label_counts = calculate_label_distribution(train_dataset, entity_type_list)
test_label_counts = calculate_label_distribution(test_dataset, entity_type_list)

# Report one "<label name>: <count>" line per class, training split first
print("Training Set Label Distribution:")
for name, count in zip(entity_type_list, train_label_counts):
    print(f"{name}: {count}")

print("\nTest Set Label Distribution:")
for name, count in zip(entity_type_list, test_label_counts):
    print(f"{name}: {count}")


Training Set Label Distribution:
STATE: 18337
EVENT: 9688
REPORT: 1617
GENERIC_SENTENCE: 7582
GENERALIZING_SENTENCE: 1466
QUESTION: 1056
IMPERATIVE: 1046
CANNOT_DECIDE: 8737

Test Set Label Distribution:
STATE: 3544
EVENT: 1733
REPORT: 323
GENERIC_SENTENCE: 1578
GENERALIZING_SENTENCE: 316
QUESTION: 207
IMPERATIVE: 236
CANNOT_DECIDE: 1765


In [13]:
import torch

# Report the PyTorch build and, when CUDA is usable, details of device 0
cuda_ready = torch.cuda.is_available()

print("PyTorch version:", torch.__version__)
print("CUDA available:", cuda_ready)
if cuda_ready:
    print("CUDA version:", torch.version.cuda)
    print("Number of GPUs:", torch.cuda.device_count())
    print("GPU Name:", torch.cuda.get_device_name(0))


PyTorch version: 2.3.1+cu121
CUDA available: True
CUDA version: 12.1
Number of GPUs: 1
GPU Name: Tesla T4


training

In [19]:
import json

from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def compute_metrics(pred):
    """Compute accuracy and support-weighted precision/recall/F1 for the Trainer.

    Defined in this cell so that the Trainer below can be built on a fresh
    kernel when cells are run top to bottom.

    Args:
        pred: Prediction object exposing ``label_ids`` and ``predictions``;
            the predicted class is the argmax over the last axis of
            ``predictions``.

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy='epoch'  # Updated parameter name
)

# NOTE(review): the test split doubles as the per-epoch validation set here, and the
# recorded run shows validation loss rising every epoch (0.64 -> 0.71 -> 0.98).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Save the fine-tuned weights and the tokenizer files side by side
model_save_path = "/content/drive/MyDrive/Stanford Summer 23-34/children book project/trained_model_new"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

# Save the training arguments next to the model as JSON
training_args_dict = training_args.to_dict()
with open(os.path.join(model_save_path, 'training_args.json'), 'w') as f:
    json.dump(training_args_dict, f)

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2734,0.64218,0.800454,0.797439,0.800454,0.796676
2,0.3394,0.711653,0.815296,0.81288,0.815296,0.811989
3,0.2675,0.976719,0.817666,0.816907,0.817666,0.816593


In [20]:
# Evaluate on the Trainer's configured eval_dataset and print the returned metric dict
results = trainer.evaluate()
print(results)


{'eval_loss': 0.9767186045646667, 'eval_accuracy': 0.8176664605236034, 'eval_precision': 0.8169065481118306, 'eval_recall': 0.8176664605236034, 'eval_f1': 0.8165929589822221, 'eval_runtime': 69.0489, 'eval_samples_per_second': 140.509, 'eval_steps_per_second': 17.567, 'epoch': 3.0}


In [18]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Score predictions with accuracy and support-weighted precision/recall/F1.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (scores whose argmax over the last axis is the
            predicted class id).

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    scores = {'accuracy': accuracy_score(y_true, y_pred)}
    # The fourth value returned is unused here
    weighted_p, weighted_r, weighted_f1, _unused = precision_recall_fscore_support(
        y_true, y_pred, average='weighted'
    )
    scores['precision'] = weighted_p
    scores['recall'] = weighted_r
    scores['f1'] = weighted_f1
    return scores


In [None]:
# Re-create the Trainer around the current `model`, using the same arguments and
# datasets as the training cell.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics  # Pass the compute_metrics function here
)


In [None]:
# Evaluate the model
# (eval_dataset is passed explicitly; it is the same test_dataset the Trainer was built with)
results = trainer.evaluate(eval_dataset=test_dataset)
print(results)


{'eval_loss': 0.07353400439023972, 'eval_accuracy': 0.9787672644815502, 'eval_precision': 0.9787760006984491, 'eval_recall': 0.9787672644815502, 'eval_f1': 0.9787390533902349, 'eval_runtime': 65.6505, 'eval_samples_per_second': 147.783, 'eval_steps_per_second': 9.246}


In [None]:
from collections import Counter

# Copy the stored label ids of each split
train_labels = list(train_dataset.labels)
test_labels = list(test_dataset.labels)

# Count occurrences per label id
train_label_distribution = Counter(train_labels)
test_label_distribution = Counter(test_labels)

print(f"Training label distribution: {train_label_distribution}")
print(f"Test label distribution: {test_label_distribution}")


Training label distribution: Counter({0: 18337, 1: 9688, 7: 8737, 3: 7582, 2: 1617, 4: 1466, 5: 1056, 6: 1046})
Test label distribution: Counter({0: 3544, 7: 1765, 1: 1733, 3: 1578, 2: 323, 4: 316, 6: 236, 5: 207})


In [None]:
from sklearn.model_selection import StratifiedKFold

def cross_validate_model(model, dataset, n_splits=5):
    """Run stratified k-fold cross-validation and average the evaluation metrics.

    Every fold fine-tunes its own deep copy of ``model``, so each fold starts
    from the same weights. Training one shared instance across folds would let
    later folds validate on examples the model had already been trained on in
    earlier folds. The ``model`` passed in is left unmodified.

    Uses the notebook-level ``training_args`` and ``compute_metrics``.

    Args:
        model: Starting model for every fold.
        dataset: Dataset exposing ``clauses`` and ``labels`` sequences of equal
            length; ``labels`` drives the stratification.
        n_splits: Number of folds.

    Returns:
        Dict with the mean over folds of ``eval_loss``, ``eval_accuracy``,
        ``eval_precision``, ``eval_recall`` and ``eval_f1``.
    """
    skf = StratifiedKFold(n_splits=n_splits)
    metrics_list = []

    for train_index, val_index in skf.split(dataset.clauses, dataset.labels):
        train_subset = torch.utils.data.Subset(dataset, train_index.tolist())
        val_subset = torch.utils.data.Subset(dataset, val_index.tolist())

        # Fresh copy per fold so no weights leak from one fold into the next
        fold_model = copy.deepcopy(model)
        trainer = Trainer(
            model=fold_model,
            args=training_args,
            train_dataset=train_subset,
            eval_dataset=val_subset,
            compute_metrics=compute_metrics
        )

        trainer.train()
        metrics_list.append(trainer.evaluate(eval_dataset=val_subset))

        # Release this fold's model before the next copy is created
        del trainer, fold_model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    metric_names = ('eval_loss', 'eval_accuracy', 'eval_precision', 'eval_recall', 'eval_f1')
    avg_metrics = {
        name: np.mean([m[name] for m in metrics_list])
        for name in metric_names
    }

    return avg_metrics

# Perform cross-validation
# NOTE(review): when cells run in file order, `model` has already been fine-tuned on all of
# train_dataset by trainer.train() in the training cell, so every validation fold was seen
# during training; pass a freshly loaded pretrained model for an unbiased estimate.
cross_val_metrics = cross_validate_model(model, train_dataset)
print(f"Cross-validation metrics: {cross_val_metrics}")
