In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel,AutoModelForSequenceClassification
import pandas as pd
import torch.nn.functional as F
import numpy as np

In [None]:
# Pick the compute device once. Later cells call `.to(device)` unconditionally,
# so `device` must be bound on CPU-only machines as well.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    print("GPU is available and being used", device)
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU instead")

In [None]:
import os

# Restrict this process to the first GPU.
# NOTE(review): torch has already been imported and `torch.cuda.is_available()` has already
# been called by the time this runs; the variable may need to be set before CUDA is first
# touched to have any effect — confirm.
os.environ['CUDA_VISIBLE_DEVICES']  = "0"
#model = nn.DataParallel(model, output_device=0)
# Cell output: the number of GPUs torch reports.
torch.cuda.device_count()

In [None]:
# Read the competition CSVs in the same order as before: prompt tables first,
# then the summary tables, then the sample submission.
prompts_train, prompts_test = map(pd.read_csv, (
    "/kaggle/input/commonlit-evaluate-student-summaries/prompts_train.csv",
    "/kaggle/input/commonlit-evaluate-student-summaries/prompts_test.csv",
))

summaries_train, summaries_test = map(pd.read_csv, (
    "/kaggle/input/commonlit-evaluate-student-summaries/summaries_train.csv",
    "/kaggle/input/commonlit-evaluate-student-summaries/summaries_test.csv",
))

sample_submission = pd.read_csv(
    "/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv"
)

# Cell output: the training prompts table.
prompts_train

In [None]:
# Merge the DataFrames: a left join on prompt_id attaches the matching
# prompts_train / prompts_test columns to every summary row.
train = pd.merge(summaries_train, prompts_train, how="left", on="prompt_id")
test = pd.merge(summaries_test, prompts_test, how="left", on="prompt_id")

# Cell output: the merged training frame.
train

In [None]:
# Build one tagged string per training row: the student's summary first, then the
# prompt question, title and text, each preceded by its field name.
train['merged_text'] = (
    'text ' + train['text']
    + ' prompt_question ' + train['prompt_question']
    + ' prompt_title ' + train['prompt_title']
    + ' prompt_text ' + train['prompt_text']
)
# Cell output: the first two merged strings.
train['merged_text'].head(2)

In [None]:
# Build one tagged string per test row: the student's summary first, then the
# prompt question, title and text, each preceded by its field name.
test['merged_text'] = (
    'text ' + test['text']
    + ' prompt_question ' + test['prompt_question']
    + ' prompt_title ' + test['prompt_title']
    + ' prompt_text ' + test['prompt_text']
)
# Cell output: the first two merged strings.
test['merged_text'].head(2)

In [None]:
class CFG:
    """Hyper-parameters and file paths shared by the cells of this notebook."""

    def __init__(self):
        # The model weights and the tokenizer files are read from the same directory.
        roberta_dir = '/kaggle/input/roberta-base'
        self.model_name = roberta_dir
        self.tokenizer_path = roberta_dir

        # Batching and training-length settings.
        self.batch_size, self.num_epochs = 16, 40
        # Passed to the tokenizer as `max_length`.
        self.token_max = 512
        # Passed to the optimizer as `lr`.
        self.learning_rate = 1e-5

        # Where the best model state dict is written during training.
        self.model_save_path = '/kaggle/working/model.pt'


cfg = CFG()

In [None]:
from itertools import chain
from torch.utils.data import TensorDataset, DataLoader, random_split

In [None]:
# Load the tokenizer and a bare encoder from the local paths configured in `cfg`.
tokenizer = AutoTokenizer.from_pretrained(cfg.tokenizer_path)
# NOTE(review): in the cells visible here `base_model` is only displayed, never trained or
# used for prediction (the regression model loads its own encoder), yet it is moved to
# `device` — confirm it is still needed.
base_model = AutoModel.from_pretrained(cfg.model_name).to(device)

In [None]:
def get_dataset(df):
    """Tokenize ``df['merged_text']`` and bundle it with the two regression targets.

    Args:
        df: DataFrame with a ``merged_text`` string column and numeric ``content``
            and ``wording`` columns.

    Returns:
        A ``TensorDataset`` yielding ``(input_ids, attention_mask, labels)`` per row,
        where ``labels`` is a float32 tensor ``[content, wording]``.
    """
    # Tokenize the frame that was passed in (not the global `train`), so the encoded
    # rows always line up with the labels built below.
    encoding = tokenizer(df['merged_text'].tolist(),
                         padding=True,
                         truncation=True,
                         max_length=cfg.token_max,
                         return_tensors="pt")
    content = torch.tensor(df['content'].to_numpy(), dtype=torch.float32)
    wording = torch.tensor(df['wording'].to_numpy(), dtype=torch.float32)
    # Stack along dim=1 to get shape (n_rows, 2) for every n_rows, including a
    # single-row frame (a squeeze-based construction would collapse that to (2,)).
    labels = torch.stack((content, wording), dim=1)
    return TensorDataset(encoding['input_ids'], encoding['attention_mask'], labels)

# Build the full labelled dataset from the merged training frame.
ds = get_dataset(train)

In [None]:
class RobertaModel(nn.Module):
    """Pretrained encoder followed by a small two-layer regression head.

    Args:
        model_name: Path or identifier handed to ``AutoModel.from_pretrained``.
        hidden_dim: Width of the intermediate linear layer.
        num_targets: Number of regression outputs per example.
        dropout: Dropout probability applied to the pooled encoder output.

    The defaults reproduce the original fixed architecture. Submodule names
    (``RobertaModel``, ``linear1``, ``linear2``) are kept so previously saved
    state dicts still load.
    """

    def __init__(self, model_name='/kaggle/input/roberta-base', hidden_dim=128,
                 num_targets=2, dropout=0.1):
        super().__init__()
        self.RobertaModel = AutoModel.from_pretrained(model_name)

        self.dropout = nn.Dropout(dropout)
        # Take the head's input width from the loaded encoder instead of assuming 768.
        self.linear1 = nn.Linear(self.RobertaModel.config.hidden_size, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, num_targets)

    def forward(self, input_ids, attention_mask):
        """Return a ``(batch, num_targets)`` tensor of predictions.

        Args:
            input_ids: Token-id tensor for the batch.
            attention_mask: Attention-mask tensor matching ``input_ids``.
        """
        # No token_type_ids are passed: RoBERTa does not use them.
        outputs = self.RobertaModel(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self.dropout(outputs.pooler_output)
        # Functional ReLU avoids constructing a new nn.ReLU module on every call.
        hidden = F.relu(self.linear1(pooled_output))
        return self.linear2(hidden)

In [None]:
# 80/20 train/validation split. A seeded generator makes the split (and therefore the
# validation loss used for checkpoint selection) identical on every run.
split_generator = torch.Generator().manual_seed(42)
train_ds, val_ds = random_split(ds, [0.8, 0.2], generator=split_generator)
print(len(train_ds))
print(len(val_ds))

In [None]:
# Training batches are reshuffled every epoch so the model does not see the same
# batch composition and order each time.
train_dataloader = DataLoader(
            train_ds,
            batch_size = cfg.batch_size,
            shuffle = True
)

# Validation order does not affect the averaged loss, so it is left unshuffled.
val_dataloader = DataLoader(
            val_ds,
            batch_size = cfg.batch_size
)

In [None]:
# Cell output: the architecture of the bare encoder loaded together with the tokenizer.
base_model

In [None]:
# Best validation loss seen so far; starts at infinity so the first epoch always improves on it.
min_val_loss = np.inf
# Mean squared error (note: plain MSE, no square root is taken).
loss_fn = nn.MSELoss()

# Regression model on the selected device, optimised with AdamW at the configured learning rate.
model = RobertaModel()
model = model.to(device)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=cfg.learning_rate)

In [None]:
# Cell output: the architecture of the regression model (encoder plus linear head).
model

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Training loop: one pass over the training batches per epoch, followed by a validation
# pass, best-checkpoint saving and early stopping.
EARLY_STOP_PATIENCE = 10  # consecutive epochs without validation improvement before stopping
LOG_EVERY = 100           # print the batch loss every this many steps

# Number of consecutive epochs in which the validation loss did not improve.
patience = 0

# Per-epoch average losses, stored as plain Python floats so they can be plotted directly.
train_losses = []
val_losses = []

for epoch in range(cfg.num_epochs):
    model.train()
    running_loss = 0.0
    for step, (input_ids, attention_mask, labels) in enumerate(train_dataloader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        if step % LOG_EVERY == 0:
            print("Epoch {}, Step {}, Loss: {}".format(epoch+1, step, loss.item()))

        running_loss += loss.item()
    # Average training loss over the batches of this epoch.
    avg_train_loss = running_loss / len(train_dataloader)
    train_losses.append(avg_train_loss)
    print(f"Epoch {epoch+1} Loss: {avg_train_loss}")

    # Validation pass (dropout disabled, no gradients).
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for input_ids, attention_mask, labels in val_dataloader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            val_outputs = model(input_ids, attention_mask)
            # .item() keeps the running sum a Python float instead of a device tensor,
            # so `val_losses` and `min_val_loss` never hold tensors.
            val_loss += loss_fn(val_outputs, labels).item()
    avg_loss = val_loss / len(val_dataloader)
    val_losses.append(avg_loss)
    print(f"Validation Loss: {avg_loss}")

    if avg_loss < min_val_loss:
        # New best: reset the counter and persist the weights.
        patience = 0
        min_val_loss = avg_loss
        torch.save(model.state_dict(), cfg.model_save_path)
        print(f'saving model with score: {avg_loss}')
    else:
        # Count only non-improving epochs, so the threshold really means
        # EARLY_STOP_PATIENCE epochs without improvement.
        patience += 1
        if patience >= EARLY_STOP_PATIENCE:
            print(f'Early Stopping triggered on epoch: {epoch+1}')
            break

In [None]:
# Move the NumPy arrays to the CPU
#train_losses_cpu = [t.cpu().numpy() for t in train_losses]
#val_losses_cpu = [v.cpu().numpy() for v in val_losses]

In [None]:
# Plot the loss values
# float() accepts both plain numbers and 0-dim tensors (on any device), so the plot
# works regardless of how the loss histories were collected.
train_curve = [float(loss) for loss in train_losses]
val_curve = [float(loss) for loss in val_losses]

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(range(1, len(train_curve) + 1), train_curve, label='Training Loss')
ax.plot(range(1, len(val_curve) + 1), val_curve, label='Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
ax.set_title('Training and Validation Loss Over Epochs')
plt.show()

In [None]:
# Inference on the test set using the best weights saved during training.
checkpoint = RobertaModel().to(device)
# map_location lets the saved state dict load on whichever device is in use now.
checkpoint.load_state_dict(torch.load(cfg.model_save_path, map_location=device))
# Switch the restored model (the one actually used below) to evaluation mode.
checkpoint.eval()

enc = tokenizer(list(test['merged_text']),
                   padding=True,
                   truncation=True,
                   max_length=cfg.token_max,
                   return_tensors="pt")

test_ds = TensorDataset(enc['input_ids'], enc['attention_mask'])
# shuffle=False keeps predictions in the same order as the rows of `test`.
test_dataloader = torch.utils.data.DataLoader(test_ds, batch_size=cfg.batch_size, shuffle=False)


# One array [content, wording] per test row, in row order.
predictions = []
with torch.no_grad():
    for input_ids, attention_mask in test_dataloader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        # Predict with the restored best checkpoint rather than the last-epoch `model`.
        outputs = checkpoint(input_ids, attention_mask)
        predictions.extend(outputs.cpu().numpy())

In [None]:
# Split each per-row prediction into its two scores, then assemble the submission table.
content_scores = [row[0] for row in predictions]
wording_scores = [row[1] for row in predictions]

submission_df = pd.DataFrame({
    'student_id': test['student_id'],
    'content': content_scores,
    'wording': wording_scores,
})

In [None]:
# Write the predictions to submission.csv in the current working directory (without the
# index column), then display the table as the cell output.
submission_df.to_csv('submission.csv', index=False)
submission_df