In [55]:
from utils import *
from rouge_score import *
#from ROUGE_Score_Model import Model
from evaluators import METEOR

import wandb
import numpy as np
from torch import nn, Tensor, flatten
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

___

In [56]:
meteor = METEOR()
meteor('The cat ate the mouse', 'The cat ate the mouse The cat ate the mouse')

tensor([0.9055])

In [57]:
from torch import nn, Tensor, cat, flatten
from rouge_score import *


class Model(nn.Module):
    def __init__(self, hidden_dim, output_dim, summary_len_mean, summary_len_std):
        super().__init__()

        self.summary_len_mean = summary_len_mean
        self.summary_len_std = summary_len_std

        input_dim = 2*3 + 1
        input_dim = 2

        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.non_lin = nn.ReLU()
        self.layer2 = nn.Linear(hidden_dim, output_dim)

        self.device = 'cpu'
        self.to(self.device)

    def forward(self, prompt_text, summary_text, scores=None):
        if scores is None:
            scores = self._calculate_rouge_scores(prompt_text, summary_text)
            scores = flatten(scores)
        summary_len_norm = self._summary_len_norm(summary_text)

        result = cat((scores, summary_len_norm))
        result = self.layer1(result)
        result = self.non_lin(result)
        result = self.layer2(result)

        return result

    def _calculate_rouge_scores(self, prompt_text, summary_text):

        #rouge_cos_1 = rouge_cos_n(prompt_text, summary_text, 1)
        #rouge_cos_2 = rouge_cos_n(prompt_text, summary_text, 2)
        meteor = METEOR()(prompt_text, summary_text)

        scores = [
            #(rouge_cos_1.precision, rouge_cos_1.recall, rouge_cos_1.fmeasure),
            #(rouge_cos_2.precision, rouge_cos_2.recall, rouge_cos_2.fmeasure),
            meteor
        ]

        return Tensor(scores).to(self.device)

    def _summary_len_norm(self, summary_text):
        zscore = (len(summary_text) - self.summary_len_mean) / self.summary_len_std
        return Tensor((zscore,)).to(self.device)


    def to(self, device, *args, **kwargs):
        super().to(device, *args, **kwargs)
        self.device = device
        return self

____

In [58]:
def get_prompt(summary, prompts_df):
    return prompts_df.loc[prompts_df.prompt_id == summary.prompt_id].iloc[0]

In [59]:
def preprocess(summaries, prompts):
    tqdm.pandas()

    prompt_columns = ['prompt_text', 'prompt_title', 'prompt_question']

    merged_df = summaries.merge(prompts, 'inner', 'prompt_id')

    print("ROUGE-Scores are being calculated. Please stand by...")
    print('1/4', end="\r")
    merged_df[['rouge1_precision', 'rouge1_recall', 'rouge1_fmeasure']] = merged_df[['text', 'prompt_text']].progress_apply(lambda row: rouge_n(row.prompt_text, row.text, 1), axis=1, result_type='expand')
    print('2/4', end="\r")
    merged_df[['rouge2_precision', 'rouge2_recall', 'rouge2_fmeasure']] = merged_df[['text', 'prompt_text']].progress_apply(lambda row: rouge_n(row.prompt_text, row.text, 2), axis=1, result_type='expand')
    print('3/4', end="\r")
    merged_df[['rougeL_precision', 'rougeL_recall', 'rougeL_fmeasure']] = merged_df[['text', 'prompt_text']].progress_apply(lambda row: rouge_l(row.prompt_text, row.text), axis=1, result_type='expand')
    print('4/4', end="\r")
    merged_df[['rougeLsum_precision', 'rougeLsum_recall', 'rougeLsum_fmeasure']] = merged_df[['text', 'prompt_text']].progress_apply(lambda row: rouge_lsum(row.prompt_text, row.text), axis=1, result_type='expand')
    print("Done")

    summaries = merged_df.drop(prompt_columns, axis=1)
    return summaries, prompts

In [60]:
def get_scores(row):
    scores = []
    for score in ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']:
        scores.append((row[f'{score}_precision'],
                       row[f'{score}_recall'],
                       row[f'{score}_fmeasure']))

    scores = torch.Tensor(scores)
    scores = flatten(scores)
    return scores

In [61]:
def predict(model, target_score, summary, prompts_df, scores=None):
    prompt = get_prompt(summary, prompts_df)

    if target_score == 'content':
        target = Tensor([summary.content]).to(device)
    elif target_score == 'wording':
        target = Tensor([summary.wording]).to(device)
    else:
        target = Tensor([summary.content, summary.wording]).to(device)

    predictions = model(prompt.prompt_text, summary.text, scores)
    return predictions, target

In [62]:
def mcrmse(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError("Shapes of y_true and y_pred must be the same.")
    rmse_values = np.sqrt(np.mean((y_true - y_pred)**2, axis=0))
    mcrmse = np.mean(rmse_values)

    return mcrmse

## Setup

In [63]:
KEY = None

In [64]:
device, path = setup(wandb_key=KEY)
summaries_df, prompts_df = get_data('../kaggle/input/commonlit-evaluate-student-summaries')

[0;34mGPU not available. CPU used.[0m


## Set up Data (1)
**Stratified Split**

In [65]:
train_df, test_df = train_test_split(summaries_df, test_size=0.2, stratify=summaries_df["prompt_id"], random_state=42)

In [66]:
text_len_mean = train_df.text.apply(len).mean()
text_len_std  = train_df.text.apply(len).std()

In [67]:
epochs = 10
learning_rate = 0.01
hidden_dim = 64
target_score = 'content'
#target_score = 'wording'
#target_score = 'both'
output_dim = 2 if target_score == 'both' else 1

In [68]:
model = Model(hidden_dim, output_dim, text_len_mean, text_len_std).to(device)

In [69]:
criterion = nn.MSELoss()
optimizer=torch.optim.Adam

In [70]:
optimizer = optimizer(list(model.parameters()))
optimizer.lr = learning_rate

## Training Model

In [71]:
#project = "ESS_4"
name = f"ROUGE_{target_score}"
#notes = "Trained on all four topics"

#architecture = "ROUGE"

In [72]:
"""wandb.init(
    project=project,
    name=name,
    notes=notes,

    # track hyperparameters and run metadata
    config={
        "architecture": architecture,
        "learning_rate": learning_rate,
        "epochs": epochs,
        "loss_function":type(criterion),
        "optimizer":type(optimizer),
        "hidden_dim":hidden_dim,
        "non-lin":model.non_lin
        },
)"""

'wandb.init(\n    project=project,\n    name=name,\n    notes=notes,\n\n    # track hyperparameters and run metadata\n    config={\n        "architecture": architecture,\n        "learning_rate": learning_rate,\n        "epochs": epochs,\n        "loss_function":type(criterion),\n        "optimizer":type(optimizer),\n        "hidden_dim":hidden_dim,\n        "non-lin":model.non_lin\n        },\n)'

### Training

In [73]:
#train_df, prompts_df = preprocess(train_df, prompts_df)
train_df.columns

Index(['student_id', 'prompt_id', 'text', 'content', 'wording'], dtype='object')

In [74]:
for epoch in tqdm(range(epochs)):

    train_df = train_df.sample(frac=1).reset_index(drop=True)

    y_true = []
    y_pred = []

    for index, summary in train_df.iterrows():
        #scores = get_scores(summary)

        predictions, target = predict(model, target_score, summary, prompts_df)
        loss = criterion(predictions, target)
        optimizer.zero_grad()

        loss.backward()
        optimizer.step()

        y_true.append([*(float(x) for x in target)])
        y_pred.append([*(float(x) for x in predictions)])

  0%|          | 0/10 [00:00<?, ?it/s]

### Testing

In [75]:
y_true = []
y_pred = []

for index, summary in tqdm(test_df.iterrows(), total=len(test_df)):

    predictions, target = predict(model, target_score, summary, prompts_df)

    y_true.append([*(float(x) for x in target)])
    y_pred.append([*(float(x) for x in predictions)])

  0%|          | 0/1147 [00:00<?, ?it/s]

In [76]:
mse = mean_squared_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)

print(f"MCRMSE: {mcrmse(y_true, y_pred):.4f}")
print(f"MSE: {mse:.4f}")
print(f"R2:  {r2:.4f}")

MCRMSE: 0.5125
MSE: 0.2627
R2:  0.7774


In [77]:
torch.save(model.state_dict(), f"models/ROUGE/{name}.pt")