In [1]:
!pip install transformers
!pip install datasets
!pip install SentencePiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [1]:
import torch
from transformers import XLMProphetNetTokenizer, XLMProphetNetForCausalLM
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset

In [2]:
# Define your custom dataset
class CustomDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask
        }

In [3]:
# Load the XLM-ProphetNet model and tokenizer
model_name = 'microsoft/xprophetnet-large-wiki100-cased'
model = XLMProphetNetForCausalLM.from_pretrained(model_name)
tokenizer = XLMProphetNetTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at microsoft/xprophetnet-large-wiki100-cased were not used when initializing XLMProphetNetForCausalLM: ['prophetnet.encoder.layers.11.feed_forward_layer_norm.bias', 'prophetnet.encoder.layers.8.self_attn.key_proj.weight', 'prophetnet.encoder.layers.9.self_attn.query_proj.weight', 'prophetnet.encoder.layers.2.self_attn.query_proj.weight', 'prophetnet.encoder.layers.7.self_attn.out_proj.weight', 'prophetnet.encoder.layers.0.self_attn_layer_norm.bias', 'prophetnet.encoder.layers.6.feed_forward.intermediate.bias', 'prophetnet.encoder.layers.9.self_attn.key_proj.weight', 'prophetnet.encoder.layers.2.feed_forward_layer_norm.bias', 'prophetnet.encoder.layers.10.feed_forward.output.weight', 'prophetnet.encoder.layers.7.feed_forward.intermediate.weight', 'prophetnet.encoder.layers.2.self_attn.value_proj.weight', 'prophetnet.encoder.layers.1.feed_forward.output.bias', 'prophetnet.encoder.layers.10.self_attn.value_proj.weight', 'prophetnet.encoder.layers.1.sel

In [4]:
# Set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

XLMProphetNetForCausalLM(
  (prophetnet): XLMProphetNetDecoderWrapper(
    (decoder): XLMProphetNetDecoder(
      (word_embeddings): Embedding(250012, 1024, padding_idx=0)
      (position_embeddings): XLMProphetNetPositionalEmbeddings(512, 1024, padding_idx=0)
      (ngram_embeddings): Embedding(2, 1024)
      (layers): ModuleList(
        (0-11): 12 x XLMProphetNetDecoderLayer(
          (self_attn): XLMProphetNetNgramSelfAttention(
            (key_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (value_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (query_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (relative_pos_embeddings): Linear(in_features=1024, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (cross_attn): XLMProphetNetAttention(


In [5]:
# Define hyperparameters
batch_size = 1
max_length = 128
num_epochs = 3
learning_rate = 2e-5

In [6]:
# Load the dataset
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train[:10]')  # Load the first 100 examples from the WikiText-2 dataset



In [7]:
dataset

Dataset({
    features: ['text'],
    num_rows: 10
})

In [8]:
# Extract texts from the dataset
texts = dataset['text']

In [9]:
# Create the custom dataset
custom_dataset = CustomDataset(texts, tokenizer, max_length)

# Create the data loader
data_loader = DataLoader(custom_dataset, batch_size=batch_size, shuffle=True)

In [10]:
# Define the optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

In [11]:
# Training loop
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(data_loader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}')

Epoch 1/3, Loss: 11.2526
Epoch 2/3, Loss: 8.2574
Epoch 3/3, Loss: 6.6093


In [12]:
# Inference
model.eval()
example_prompt = 'Once upon a time'
input_ids = tokenizer.encode(example_prompt, return_tensors='pt').to(device)

with torch.no_grad():
    output_ids = model.generate(input_ids, max_length=50)
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    print(f'Example Prompt: {example_prompt}')
    print(f'Generated Text: {generated_text}')

Example Prompt: Once upon a time
Generated Text: Once upon a time
