In [6]:
import pandas as pd
import re
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score

# --- 1. Helper function to extract raw sentences from PCFG ---
def extract_raw_sentence(parse):
    """Recover the plain sentence from a bracketed parse string.

    Works for labelled parses such as ``(ROOT (S (NP (DT A) (NN man))))``
    and for label-free binary parses such as ``( ( A man ) ( walks . ) )``.

    Args:
        parse: Bracketed parse of one sentence.

    Returns:
        The tokens of the sentence joined by single spaces.
    """
    # A label is the text glued directly to an opening bracket ("(NP").
    # Removing "(label" as a unit keeps the words, which are always
    # separated from the bracket/label by whitespace.
    without_labels = re.sub(r'\([^\s()]+', ' ', parse)
    # Any bracket still present carries no text of its own.
    without_brackets = re.sub(r'[()]', ' ', without_labels)
    # split()/join collapses the runs of spaces left behind.
    return ' '.join(without_brackets.split())

# --- 2. Load and preprocess data ---
def load_and_preprocess_data(file_path):
    """Read a tab-separated NLI split and return sentences with integer labels.

    The file must provide the columns ``Sent1_parse``, ``Sent2_parse`` and
    ``Label``.

    Args:
        file_path: Path of the tab-separated file to read.

    Returns:
        A DataFrame with the columns ``sent1``, ``sent2`` (plain sentences)
        and ``label`` (0 = entailment, 1 = contradiction, 2 = neutral).

    Raises:
        ValueError: If ``Label`` holds a value outside the known label set.
    """
    data = pd.read_csv(file_path, sep='\t')
    data['sent1'] = data['Sent1_parse'].apply(extract_raw_sentence)
    data['sent2'] = data['Sent2_parse'].apply(extract_raw_sentence)
    label_mapping = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
    labels = data['Label'].map(label_mapping)

    # Series.map yields NaN for anything missing from the mapping. Left alone
    # that turns the column into floats and feeds NaN class ids to the model,
    # so stop here with a message that names the offending values.
    unmapped = labels.isna()
    if unmapped.any():
        unknown = sorted(data.loc[unmapped, 'Label'].astype(str).unique())
        raise ValueError(
            f"{file_path}: {int(unmapped.sum())} row(s) have a label outside "
            f"{sorted(label_mapping)}: {unknown}"
        )

    data['label'] = labels.astype('int64')
    return data[['sent1', 'sent2', 'label']]

# Load the three splits in a fixed order: train, dev, test.
train_data, dev_data, test_data = map(
    load_and_preprocess_data,
    ('train.tsv', 'dev.tsv', 'test.tsv'),
)

# --- 3. Model and Tokenizer Initialization ---
# Abort early with installation instructions when sentencepiece is missing.
# NOTE(review): this looks like a leftover from a sentencepiece-based
# checkpoint; confirm whether the model chosen below actually needs it.
try:
    import sentencepiece
except ImportError:
    raise ImportError(
        "SentencePiece library not found. Install it with:\n\n"
        "pip install sentencepiece\n"
    )
else:
    print(f"SentencePiece version: {sentencepiece.__version__}")

# Use a suitable model for NLI tasks
# NOTE(review): the checkpoint ships with its own class order; confirm it is
# compatible with the label ids produced by load_and_preprocess_data.
model_name = "roberta-large-mnli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
)

# --- 4. Tokenize the dataset ---
def tokenize_data(data):
    """Encode the ``sent1``/``sent2`` columns of ``data`` as sentence pairs.

    All pairs are encoded in one call, padded to a common length, truncated
    to at most 512 tokens and returned as PyTorch tensors.
    """
    first_sentences = list(data['sent1'])
    second_sentences = list(data['sent2'])
    encoding_options = dict(
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=512,
    )
    return tokenizer(first_sentences, second_sentences, **encoding_options)

# Encode each split, keeping the train, dev, test order.
train_encodings, dev_encodings, test_encodings = map(
    tokenize_data,
    (train_data, dev_data, test_data),
)

# --- 5. Define Dataset Class ---
class NLIDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing pre-computed encodings with class labels.

    ``encodings`` is a mapping from field name to an indexable batch of
    values; ``labels`` is an indexable sequence of integer class ids.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label sequence defines how many examples there are.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return every encoded field of example ``idx`` plus its ``labels``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Wrap each split's encodings and its integer labels in an NLIDataset.
train_dataset, dev_dataset, test_dataset = (
    NLIDataset(encodings, frame['label'].tolist())
    for encodings, frame in (
        (train_encodings, train_data),
        (dev_encodings, dev_data),
        (test_encodings, test_data),
    )
)

# --- 6. Define Evaluation Metrics ---
def compute_metrics(pred):
    """Score predictions with accuracy and macro-averaged F1.

    ``pred`` must expose ``label_ids`` (reference class ids) and
    ``predictions`` (per-class scores); the predicted class is the argmax
    over the last axis.
    """
    reference = pred.label_ids
    predicted = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(reference, predicted),
        'macro_f1': f1_score(reference, predicted, average='macro'),
    }

# --- 7. Training Arguments ---
# Newer transformers releases renamed `evaluation_strategy` to
# `eval_strategy`. Ask the installed TrainingArguments which spelling it
# accepts so this cell runs on either side of the rename.
import inspect

_eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',
    save_strategy='epoch',  # must match the evaluation schedule for load_best_model_at_end
    logging_dir='./logs',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=50,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='macro_f1',  # key returned by compute_metrics
    **{_eval_strategy_key: 'epoch'},
)

# Train on the training split; the dev split is scored after every epoch.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics
)

# --- 8. Fine-tune the Model ---
trainer.train()

# --- 9. Evaluate on Validation and Test Sets ---
# Both splits are scored (dev first, then test) before anything is printed.
val_results, test_results = (
    trainer.evaluate(split) for split in (dev_dataset, test_dataset)
)
print(f"Validation Results: {val_results}")
print(f"Test Results: {test_results}")

# --- 10. Generate Prediction Files ---
def generate_predictions(dataset, file_name):
    """Write a copy of ``file_name`` with an added ``Prediction`` column.

    The model's predicted class id (argmax over the last axis) for every
    example in ``dataset`` is stored in the new column. The result is written
    tab-separated next to the input: ``name.tsv`` becomes
    ``name_predictions.tsv``.

    Args:
        dataset: Dataset to run the trainer's prediction on; it must have one
            example per row of ``file_name``, in the same order.
        file_name: Path of the tab-separated source file.
    """
    predictions = trainer.predict(dataset).predictions.argmax(-1)
    dataset_df = pd.read_csv(file_name, sep='\t')
    dataset_df['Prediction'] = predictions

    # Only rewrite a trailing ".tsv". A plain str.replace would return the
    # input path unchanged when the suffix is absent (overwriting the source
    # file) and would also touch ".tsv" occurring earlier in the path.
    suffix = '.tsv'
    stem = file_name[:-len(suffix)] if file_name.endswith(suffix) else file_name
    dataset_df.to_csv(f'{stem}_predictions{suffix}', sep='\t', index=False)

# Write a predictions file for the dev split, then for the test split.
for _split_dataset, _split_file in (
    (dev_dataset, 'dev.tsv'),
    (test_dataset, 'test.tsv'),
):
    generate_predictions(_split_dataset, _split_file)

# --- 11. Save Model and Tokenizer ---
# Model weights and tokenizer files share one output directory.
for _artifact in (model, tokenizer):
    _artifact.save_pretrained('./nli_model')

SentencePiece version: 0.2.0


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`