In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from transformers import get_linear_schedule_with_warmup
from transformers import Trainer, TrainingArguments
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import torch.nn.functional as F
import numpy as np

In [None]:
# Use the first CUDA device when one is present; otherwise run on the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda:0" if use_gpu else "cpu")
print("GPU is available and being used" if use_gpu else "GPU is not available, using CPU instead")

In [None]:
import os

# Expose only GPU 0 to this process, then show how many CUDA devices torch reports.
# NOTE(review): an earlier cell already queried CUDA availability; confirm the
# variable still takes effect when it is set this late.
os.environ.update({'CUDA_VISIBLE_DEVICES': "0"})
#model = nn.DataParallel(model, output_device=0)
torch.cuda.device_count()

In [None]:
# Read the competition tables: prompts, student summaries and the submission template.
prompts_train, prompts_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv",
        "/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv",
    )
)

summaries_train, summaries_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv",
        "/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv",
    )
)

sample_submission = pd.read_csv(
    "/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv"
)

# Cell output: the training prompts table.
prompts_train

In [None]:
# Left-join every summary row with its prompt on prompt_id, so each summary
# carries the matching prompt columns alongside it.
train = pd.merge(summaries_train, prompts_train, how="left", on="prompt_id")
test = pd.merge(summaries_test, prompts_test, how="left", on="prompt_id")

# Cell output: the merged training frame.
train

In [None]:
# Build one tagged input string per training row: the student summary followed
# by the prompt question, title and text, each introduced by its field name.
train['merged_text'] = (
    'text ' + train['text']
    + ' prompt_question ' + train['prompt_question']
    + ' prompt_title ' + train['prompt_title']
    + ' prompt_text ' + train['prompt_text']
)
train['merged_text'].head(2)

In [None]:
# Build one tagged input string per test row: the student summary followed by
# the prompt question, title and text, each introduced by its field name.
test['merged_text'] = (
    'text ' + test['text']
    + ' prompt_question ' + test['prompt_question']
    + ' prompt_title ' + test['prompt_title']
    + ' prompt_text ' + test['prompt_text']
)
test['merged_text'].head(2)

In [None]:
# Load the tokenizer stored under the local ELECTRA base-discriminator directory.
electra_dir = '/kaggle/input/electra/base-discriminator'
tokenizer = AutoTokenizer.from_pretrained(electra_dir)

In [None]:
def get_dataset(df, tok=None, max_length=512):
    """Tokenize ``df['merged_text']`` and pair it with the two score columns.

    Args:
        df: DataFrame with ``merged_text``, ``content`` and ``wording`` columns.
        tok: Callable tokenizer to use. Defaults to the module-level
            ``tokenizer`` when omitted.
        max_length: Maximum sequence length passed to the tokenizer; longer
            inputs are truncated.

    Returns:
        A ``TensorDataset`` yielding ``(input_ids, attention_mask, labels)``,
        where ``labels`` is a float32 tensor holding ``[content, wording]``.
    """
    if tok is None:
        tok = tokenizer

    encoding = tok(
        list(df['merged_text']),
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

    # Convert through NumPy so values are read by position. Handing the Series
    # itself to torch.tensor indexes it by label, which breaks as soon as the
    # frame's index is not 0..n-1 (for example after filtering rows).
    content = torch.tensor(df['content'].to_numpy(), dtype=torch.float32)
    wording = torch.tensor(df['wording'].to_numpy(), dtype=torch.float32)

    # Stack on a new trailing axis: the result is always (n_rows, 2). The
    # previous dstack(...).squeeze() dropped the row axis for a one-row frame.
    labels = torch.stack((content, wording), dim=1)

    return TensorDataset(encoding['input_ids'], encoding['attention_mask'], labels)

# Tokenize the merged training frame once and keep the resulting dataset for splitting.
ds = get_dataset(train)

In [None]:
# Disabled draft of a per-item Dataset class. It is wrapped in a bare string
# literal, so none of it executes; the notebook builds its data with
# get_dataset instead.
# NOTE(review): the draft reads `documents` without receiving it in __init__
# and never uses `idx`, so it would need fixes before being revived.
'''class ElectraDataset(torch.utils.data.Dataset) :
  def __init__(self, contents, wordings, tokenizer, max_length=512):
    self.documents = documents
    self.contents = contents
    self.wordings = wordings
    self.tokenizer = tokenizer
    self.max_length = max_length
    #self.encodings = encoding
    #self.labels = labels

  def __len__(self): return len(self.documents)

  def __getitem__(self, idx):
    document = self.documents
    content = self.contents
    wording = self.wordings

    # Tokenize the document
    encoding = self.tokenizer(str(document),
                   padding=True,
                   truncation=True,
                   max_length=self.max_length,
                   return_tensors="pt")
    
    input_ids = encoding['input_ids'].squeeze()
    attention_mask = encoding['attention_mask'].squeeze()

    content_tensor = torch.tensor(content, dtype=torch.float32)
    wording_tensor = torch.tensor(wording, dtype=torch.float32)
    print(content_tensor.size())
    labels = torch.dstack([content_tensor, wording_tensor]).squeeze()

    dataset = TensorDataset(encoding['input_ids'], encoding['attention_mask'], labels)

    return dataset
    
    {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }'''

In [None]:
from torch.utils.data import TensorDataset, DataLoader, random_split

In [None]:
# 80/20 train/validation split. The explicitly seeded generator makes the
# partition identical on every run, so validation losses from different runs
# are measured on the same rows.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(ds, [0.8, 0.2], generator=split_generator)
print(len(train_dataset))
print(len(val_dataset))

In [None]:
# Training batches are reshuffled every epoch so the model does not see the
# samples in the same order each time; validation keeps a fixed order.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size=16,
    shuffle=False,
)

In [None]:
from transformers import ElectraForSequenceClassification, ElectraTokenizer, ElectraModel

In [None]:
class ElectraFineTuning(nn.Module):
    """ELECTRA encoder followed by a small two-layer head with two outputs.

    The encoder's token states are averaged over the non-padded positions,
    passed through dropout, a 128-unit ReLU layer and a final 2-unit layer.

    Args:
        model_path: Directory or identifier handed to
            ``ElectraModel.from_pretrained``.
    """

    def __init__(self, model_path="/kaggle/input/electra/base-discriminator"):
        super().__init__()
        # Device placement is left to the caller's `.to(...)` so that the
        # encoder and the head are always moved together.
        self.electra = ElectraModel.from_pretrained(model_path)

        self.dropout = nn.Dropout(0.1)
        # Take the input width from the encoder config rather than hard-coding it.
        self.linear1 = nn.Linear(self.electra.config.hidden_size, 128)
        self.linear2 = nn.Linear(128, 2)

    def forward(self, input_ids, attention_mask):
        """Return one row of two outputs per input sequence.

        Args:
            input_ids: Token ids, forwarded to the encoder unchanged.
            attention_mask: Mask forwarded to the encoder and reused for
                pooling; expected to be 1 on real tokens and 0 on padding.

        Returns:
            Tensor with one row per sequence and two columns.
        """
        outputs = self.electra(input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state

        # Masked mean over the sequence axis: padded positions contribute
        # nothing, so the pooled vector does not depend on how much padding
        # the batch happens to contain.
        mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        token_sum = (hidden_states * mask).sum(dim=1)
        token_count = mask.sum(dim=1).clamp(min=1.0)  # guard against an all-zero mask
        pooled_output = token_sum / token_count

        pooled_output = self.dropout(pooled_output)
        output = torch.relu(self.linear1(pooled_output))
        output = self.linear2(output)
        return output

# Number of outputs; kept for reference only, the class above does not read it.
num_labels = 2  # Replace with the number of labels in your dataset


In [None]:
# Instantiate the network and move all of its parameters to the selected device.
model = ElectraFineTuning().to(device)

In [None]:
# Cell output: the module tree of the instantiated network.
model

In [None]:
# The two targets (content, wording) are real-valued scores, so this is a
# regression problem and is trained with mean squared error.
# CrossEntropyLoss would interpret the float targets as class probabilities,
# which they are not.
loss_fn = nn.MSELoss()

# Upper bound on the number of training epochs.
num_epochs = 40

# AdamW with a learning rate that decays linearly to zero over every optimizer
# step of the run, with no warmup phase.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Early-stopping bookkeeping: allowed epochs without a better validation loss,
# the running count of such epochs, and the best validation loss seen so far.
early_stopping_patience, no_improvement_count = 3, 0
min_val_loss = float("inf")

In [None]:
# Fine-tuning loop: one training pass and one validation pass per epoch, saving
# the weights whenever the validation loss improves and stopping early once it
# has failed to improve for `early_stopping_patience` consecutive epochs.
for epoch in range(num_epochs):
    # Training mode (dropout on); the validation pass below switches it off.
    model.train()
    running_loss = 0.0
    for step, (input_ids, attention_mask, labels) in enumerate(train_dataloader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        # The schedule is sized in optimizer steps, so it advances every batch.
        scheduler.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        if step % 100 == 0:
            print("Epoch {}, Step {}, Loss: {}".format(epoch+1, step, batch_loss))

    print(f"Epoch {epoch+1} Loss: {running_loss / len(train_dataloader)}")

    # Validation pass: no gradients, dropout disabled.
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for input_ids, attention_mask, labels in val_dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            val_outputs = model(input_ids, attention_mask)
            # Accumulate a plain float so the best-loss bookkeeping never
            # holds on to device tensors.
            val_loss += loss_fn(val_outputs, labels).item()
    avg_loss = val_loss / len(val_dataloader)
    print(f"Validation Loss: {avg_loss}")

    if avg_loss < min_val_loss:
        # New best: reset the counter and checkpoint the weights.
        no_improvement_count = 0
        min_val_loss = avg_loss
        torch.save(model.state_dict(), '/kaggle/working/model.pt')
        print(f'saving model with score: {avg_loss}')
    else:
        # Only epochs without improvement count towards early stopping.
        no_improvement_count += 1
        if no_improvement_count >= early_stopping_patience:
            print('Early Stopping triggered on epoch: {}'.format(epoch + 1))
            break

In [None]:
# Rebuild the network and restore the best weights saved during training.
# map_location lets the file load on whichever device is active now.
checkpoint = ElectraFineTuning().to(device)
checkpoint.load_state_dict(torch.load('/kaggle/working/model.pt', map_location=device))

# Inference mode (dropout off) on the restored network. Predictions must come
# from `checkpoint`, not from `model`, which still holds the last-epoch weights.
checkpoint.eval()

enc = tokenizer(list(test['merged_text']),
                   padding=True,
                   truncation=True,
                   max_length=512,
                   return_tensors="pt")

test_ds = TensorDataset(enc['input_ids'], enc['attention_mask'])
# Keep the original row order so predictions line up with the test frame.
test_dataloader = torch.utils.data.DataLoader(test_ds, batch_size=16, shuffle=False)


predictions = []
with torch.no_grad():
    for input_ids, attention_mask in test_dataloader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        outputs = checkpoint(input_ids, attention_mask)
        # One NumPy row of two values is appended per test sample.
        predictions.extend(outputs.cpu().numpy())

In [None]:
# Split each prediction row into its two values and key them by student id.
content_scores = [scores[0] for scores in predictions]
wording_scores = [scores[1] for scores in predictions]
submission_df = pd.DataFrame(
    {
        'student_id': test['student_id'],
        'content': content_scores,
        'wording': wording_scores,
    }
)

In [None]:
# Write the predictions to submission.csv in the working directory, without the
# DataFrame index column, then show the frame as the cell output.
submission_df.to_csv('submission.csv', index=False)
submission_df