In [4]:
import pandas as pd
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer
from torch.utils.data import TensorDataset

In [5]:
# Read the train / validation / test splits from CSV files in the working
# directory, in that order.
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'validation.csv', 'test.csv')
)

In [6]:
# GPT-2 tokenizer loaded from the 'gpt2' pretrained checkpoint; shared by every
# call to tokenize_data below.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')


def tokenize_data(data):
    """Tokenize the 'article' and 'highlights' columns of one dataset split.

    Args:
        data: Object indexable by the 'article' and 'highlights' keys whose
            columns support ``.astype(str).tolist()`` (the notebook passes
            pandas DataFrames).

    Returns:
        A pair ``(input_encodings, target_encodings)``: the tokenizer output
        for the articles and for the highlights, both requested as PyTorch
        tensors with truncation, padding and ``max_length=512``.
    """
    # Reuse the end-of-sequence token as the padding token before encoding.
    tokenizer.pad_token = tokenizer.eos_token

    # Both columns are encoded with exactly the same tokenizer options.
    encode_options = dict(truncation=True, padding=True, return_tensors='pt', max_length=512)

    # Cast to str first so non-string cells (e.g. NaN) still tokenize.
    articles = data['article'].astype(str).tolist()
    summaries = data['highlights'].astype(str).tolist()

    return tokenizer(articles, **encode_options), tokenizer(summaries, **encode_options)

# Tokenize every split; each call yields (article encodings, highlight encodings).
(
    (train_inputs, train_targets),
    (val_inputs, val_targets),
    (test_inputs, test_targets),
) = (tokenize_data(split_df) for split_df in (train_df, val_df, test_df))

# Write each encoding object to its own file so it can be reloaded later
# without re-running the tokenizer.
for _encodings, _filename in (
    (train_inputs, 'train_inputs.pt'),
    (train_targets, 'train_targets.pt'),
    (val_inputs, 'val_inputs.pt'),
    (val_targets, 'val_targets.pt'),
    (test_inputs, 'test_inputs.pt'),
    (test_targets, 'test_targets.pt'),
):
    torch.save(_encodings, _filename)

In [7]:
from torch.utils.data import TensorDataset
# Reload the cached tokenizer outputs from disk.
#
# These files hold pickled tokenizer output objects, not bare tensors, so they
# need full unpickling. Recent PyTorch releases default `torch.load` to
# `weights_only=True`, which refuses to rebuild such objects, hence the
# explicit `weights_only=False`.
# NOTE: full unpickling can execute arbitrary code; only load files that this
# notebook wrote itself.
train_inputs = torch.load('train_inputs.pt', weights_only=False)
train_targets = torch.load('train_targets.pt', weights_only=False)
val_inputs = torch.load('val_inputs.pt', weights_only=False)
val_targets = torch.load('val_targets.pt', weights_only=False)
test_inputs = torch.load('test_inputs.pt', weights_only=False)
test_targets = torch.load('test_targets.pt', weights_only=False)

# Pair article token ids with highlight token ids row for row. Each dataset
# item is therefore a 2-tuple (article ids, highlight ids); only 'input_ids'
# is kept, so attention masks are not available downstream.
train_dataset = TensorDataset(train_inputs['input_ids'], train_targets['input_ids'])
val_dataset = TensorDataset(val_inputs['input_ids'], val_targets['input_ids'])
test_dataset = TensorDataset(test_inputs['input_ids'], test_targets['input_ids'])


In [8]:
class CustomEncoder(nn.Module):
    """Token embedding followed by a batch-first LSTM."""

    def __init__(self, input_dim, embedding_dim, hidden_dim):
        """Build the embedding table and the LSTM.

        Args:
            input_dim: Number of entries in the embedding table.
            embedding_dim: Size of each embedding vector (and LSTM input size).
            hidden_dim: Size of the LSTM hidden state.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)

    def forward(self, x):
        """Embed the token ids in ``x`` (batch dimension first) and run the LSTM.

        Returns:
            A 3-tuple ``(output, hidden, cell)``: the LSTM's per-step outputs
            followed by its final hidden state and final cell state.
        """
        token_vectors = self.embedding(x)
        per_step, final_state = self.lstm(token_vectors)
        # final_state is the (hidden, cell) pair; flatten it into the result.
        return (per_step, *final_state)

class CustomDecoder(nn.Module):
    """Token embedding, batch-first LSTM, and a linear layer over each LSTM output."""

    def __init__(self, output_dim, embedding_dim, hidden_dim):
        """Build the embedding table, the LSTM and the output projection.

        Args:
            output_dim: Number of entries in the embedding table and number of
                scores produced per step by the projection.
            embedding_dim: Size of each embedding vector (and LSTM input size).
            hidden_dim: Size of the LSTM hidden state (and projection input).
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc_out = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x, hidden, cell):
        """Run the decoder over ``x`` starting from the given LSTM state.

        Args:
            x: Token ids, batch dimension first.
            hidden: Initial LSTM hidden state.
            cell: Initial LSTM cell state.

        Returns:
            A 3-tuple ``(prediction, hidden, cell)``: the projected per-step
            scores plus the LSTM's final hidden and cell states.
        """
        token_vectors = self.embedding(x)
        per_step, (next_hidden, next_cell) = self.lstm(token_vectors, (hidden, cell))
        return self.fc_out(per_step), next_hidden, next_cell


In [None]:
# Encoder and decoder are both sized from the same tokenizer vocabulary.
input_dim = output_dim = len(tokenizer.get_vocab())
embedding_dim, hidden_dim = 256, 512

encoder = CustomEncoder(input_dim, embedding_dim, hidden_dim)
decoder = CustomDecoder(output_dim, embedding_dim, hidden_dim)

batch_size = 8

# Only the training split is shuffled; validation and test keep dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, num_workers=4)
val_loader, test_loader = (
    DataLoader(eval_dataset, shuffle=False, batch_size=batch_size, num_workers=4)
    for eval_dataset in (val_dataset, test_dataset)
)

# Put the encoder in evaluation mode and run it once, with gradient tracking
# disabled, over the training article token ids.
# NOTE(review): this is a single forward pass over the entire training tensor,
# so memory use grows with the dataset size -- confirm it fits, or batch it.
encoder.eval()

with torch.no_grad():
    _encoder_results = encoder(train_inputs['input_ids'])
encoder_output, encoder_hidden, encoder_cell = _encoder_results

# Only the per-step outputs are written to disk; the final states stay in memory.
torch.save(encoder_output, 'encoder_output.pt')




In [None]:
# One AdamW optimizer updates the encoder and decoder parameters together.
optimizer = optim.AdamW(list(encoder.parameters()) + list(decoder.parameters()), lr=5e-5)
# NOTE(review): no ignore_index is set, so padded label positions contribute
# to the loss as well -- confirm that is intended.
criterion = nn.CrossEntropyLoss()

num_epochs = 3

for epoch in range(num_epochs):
    encoder.train()
    decoder.train()
    total_loss = 0.0

    for batch in train_loader:
        optimizer.zero_grad()

        # train_loader wraps a TensorDataset, so every batch is a pair of
        # tensors (article ids, highlight ids). It is not a dict; indexing it
        # with 'input_ids' / 'labels' raises TypeError.
        input_ids, labels = batch

        # The encoder's final LSTM state initialises the decoder.
        encoder_output, encoder_hidden, encoder_cell = encoder(input_ids)

        # Teacher forcing: the decoder reads labels[:, :-1] and is scored
        # against the same sequence shifted one step left, labels[:, 1:].
        decoder_input = labels[:, :-1]
        decoder_output, _, _ = decoder(decoder_input, encoder_hidden, encoder_cell)

        # Flatten (batch, steps, vocab) scores and (batch, steps) targets so
        # CrossEntropyLoss sees one row per predicted token.
        loss = criterion(
            decoder_output.reshape(-1, decoder_output.shape[-1]),
            labels[:, 1:].reshape(-1),
        )
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Mean of the per-batch losses for this epoch.
    print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {total_loss / len(train_loader)}')

# Validation pass: both modules in evaluation mode, gradients disabled.
encoder.eval()
decoder.eval()
total_val_loss = 0.0

with torch.no_grad():
    for batch in val_loader:
        # val_loader wraps a TensorDataset, so every batch is a pair of
        # tensors (article ids, highlight ids). It is not a dict; indexing it
        # with 'input_ids' / 'labels' raises TypeError.
        input_ids, labels = batch

        # The encoder's final LSTM state initialises the decoder.
        encoder_output, encoder_hidden, encoder_cell = encoder(input_ids)

        # Teacher forcing: the decoder reads labels[:, :-1] and is scored
        # against the same sequence shifted one step left, labels[:, 1:].
        decoder_input = labels[:, :-1]
        decoder_output, _, _ = decoder(decoder_input, encoder_hidden, encoder_cell)

        # Flatten scores and targets to one row per predicted token.
        val_loss = criterion(
            decoder_output.reshape(-1, decoder_output.shape[-1]),
            labels[:, 1:].reshape(-1),
        )
        total_val_loss += val_loss.item()

# Mean of the per-batch validation losses.
print(f'Validation Loss: {total_val_loss / len(val_loader)}')


In [None]:
# Loader over the test split in dataset order (no shuffling).
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

# Test pass: both modules in evaluation mode, gradients disabled.
encoder.eval()
decoder.eval()
total_test_loss = 0.0

with torch.no_grad():
    for batch in test_loader:
        # test_loader wraps a TensorDataset, so every batch is a pair of
        # tensors (article ids, highlight ids). It is not a dict; indexing it
        # with 'input_ids' / 'labels' raises TypeError.
        input_ids, labels = batch

        # The encoder's final LSTM state initialises the decoder.
        encoder_output, encoder_hidden, encoder_cell = encoder(input_ids)

        # Teacher forcing: the decoder reads labels[:, :-1] and is scored
        # against the same sequence shifted one step left, labels[:, 1:].
        decoder_input = labels[:, :-1]
        decoder_output, _, _ = decoder(decoder_input, encoder_hidden, encoder_cell)

        # Flatten scores and targets to one row per predicted token.
        test_loss = criterion(
            decoder_output.reshape(-1, decoder_output.shape[-1]),
            labels[:, 1:].reshape(-1),
        )
        total_test_loss += test_loss.item()

# Mean of the per-batch test losses.
print(f'Test Loss: {total_test_loss / len(test_loader)}')
