# Josef

In [44]:
import os
import torch
import collections
import pandas as pd
import numpy as np
from torch import Tensor, nn, cat, flatten
from nltk.util import ngrams
from nltk import word_tokenize, sent_tokenize
from sklearn.metrics import mean_squared_error, r2_score
from tqdm.notebook import tqdm

In [2]:
_PATH = '/kaggle/input/commonlit-evaluate-student-summaries'

In [3]:
def _get_path():
    if os.name == 'nt':
        return f'.{_PATH}'
    elif os.name == 'posix':
        return _PATH

In [4]:
def setup():

    if torch.cuda.is_available():
        device = torch.device("cuda")
        print('\x1b[0;32mGPU is available.\x1b[0m')
    else:
        device = torch.device("cpu")
        print('\x1b[0;34mGPU not available. CPU used.\x1b[0m')

    return device, _get_path()

In [5]:
def get_data(path=None):
    if path is None:
        path = _get_path()

    summaries_df = pd.read_csv(f'{path}/summaries_train.csv')
    prompts_df = pd.read_csv(f'{path}/prompts_train.csv')
    return summaries_df, prompts_df

In [6]:
def get_prompt(summary, prompts_df):
    prompt = prompts_df.loc[prompts_df.prompt_id == summary.prompt_id]
    return prompt.iloc[0]

In [7]:
def mcrmse(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError("Shapes of y_true and y_pred must be the same.")
    rmse_values = np.sqrt(np.mean((y_true - y_pred)**2, axis=0))
    mcrmse = np.mean(rmse_values)

    return mcrmse

### ROUGE-Scorers

In [8]:
Score = collections.namedtuple('Score', ['precision', 'recall', 'fmeasure'])

In [9]:
def tokenize(text):
    tokens = word_tokenize(text.lower())
    return tokens

In [10]:
def get_sents(text):
    sents = sent_tokenize(text)
    sents = [x for x in sents if len(x)]
    return sents

In [11]:
def _lcs_table(ref, can):
    """Create 2-d LCS score table."""
    rows = len(ref)
    cols = len(can)
    lcs_table = np.zeros((rows + 1, cols + 1))
    for i in range(1, rows + 1):
        for j in range(1, cols + 1):
            if ref[i - 1] == can[j - 1]:
                lcs_table[i][j] = lcs_table[i - 1][j - 1] + 1
            else:
                lcs_table[i][j] = max(lcs_table[i - 1][j], lcs_table[i][j - 1])
    return lcs_table

In [12]:
def _backtrack_norec(t, ref, can):
    """Read out LCS."""
    i = len(ref)
    j = len(can)
    lcs = []
    while i > 0 and j > 0:
        if ref[i - 1] == can[j - 1]:
            lcs.insert(0, i - 1)
            i -= 1
            j -= 1
        elif t[i][j - 1] > t[i - 1][j]:
            j -= 1
        else:
            i -= 1
    return lcs

In [13]:
def lcs_ind(ref, can):
    """Returns one of the longest lcs."""
    t = _lcs_table(ref, can)
    return _backtrack_norec(t, ref, can)

In [14]:
def _find_union(lcs_list):
    """Finds union LCS given a list of LCS."""
    return sorted(list(set().union(*lcs_list)))

In [15]:
def _union_lcs(ref, c_list):
    lcs_list = [lcs_ind(ref, c) for c in c_list]
    return [ref[i] for i in _find_union(lcs_list)]

In [16]:
def rouge_n(target, prediction, n):
    target_tokens = tokenize(target)
    prediction_tokens = tokenize(prediction)

    target_ngrams = collections.Counter(ngrams(target_tokens, n))
    prediction_ngrams = collections.Counter(ngrams(prediction_tokens, n))

    intersection_ngrams_count = 0
    for ngram in target_ngrams:
        intersection_ngrams_count += min(target_ngrams[ngram],
                                         prediction_ngrams[ngram])
    target_ngrams_count = sum(target_ngrams.values())
    prediction_ngrams_count = sum(prediction_ngrams.values())

    precision = intersection_ngrams_count / max(prediction_ngrams_count, 1)
    recall = intersection_ngrams_count / max(target_ngrams_count, 1)

    if precision + recall > 0:
        fmeasure = 2 * precision * recall / (precision + recall)
    else:
        fmeasure = 0.0

    return pd.Series({'precision': precision, 'recall': recall, 'fmeasure': fmeasure})

In [17]:
def rouge_l(target, prediction):
    target_tokens = tokenize(target)
    prediction_tokens = tokenize(prediction)

    if not target_tokens or not prediction_tokens:
        return pd.Series({'precision': 0, 'recall': 0, 'fmeasure': 0})

    lcs_table = _lcs_table(target_tokens, prediction_tokens)

    lcs_length = lcs_table[-1][-1]

    precision = lcs_length / len(prediction_tokens)
    recall = lcs_length / len(target_tokens)
    if precision + recall > 0:
        fmeasure = 2 * precision * recall / (precision + recall)
    else:
        fmeasure = 0.0

    return pd.Series({'precision': precision, 'recall': recall, 'fmeasure': fmeasure})

In [18]:
def rouge_lsum(target, prediction):
    target_tokens_list = [
        tokenize(s) for s in get_sents(target)]
    prediction_tokens_list = [
        tokenize(s) for s in get_sents(prediction)]

    if not target_tokens_list or not prediction_tokens_list:
        return pd.Series({'precision': 0, 'recall': 0, 'fmeasure': 0})

    m = sum(map(len, target_tokens_list))
    n = sum(map(len, prediction_tokens_list))
    if not n or not m:
        return pd.Series({'precision': 0, 'recall': 0, 'fmeasure': 0})

    # get token counts to prevent double counting
    token_cnts_r = collections.Counter()
    token_cnts_c = collections.Counter()
    for s in target_tokens_list:
        # s is a list of tokens
        token_cnts_r.update(s)
    for s in prediction_tokens_list:
        token_cnts_c.update(s)

    hits = 0
    for r in target_tokens_list:
        lcs = _union_lcs(r, prediction_tokens_list)
        # Prevent double-counting:
        # The paper describes just computing hits += len(_union_lcs()),
        # but the implementation prevents double counting
        for t in lcs:
            if token_cnts_c[t] > 0 and token_cnts_r[t] > 0:
                hits += 1
                token_cnts_c[t] -= 1
                token_cnts_r[t] -= 1

    recall = hits / m
    precision = hits / n
    if precision + recall > 0:
        fmeasure = 2 * precision * recall / (precision + recall)
    else:
        fmeasure = 0.0
    return pd.Series({'precision': precision, 'recall': recall, 'fmeasure': fmeasure})

In [19]:
def preprocess(summaries, prompts):
    tqdm.pandas()

    prompt_columns = ['prompt_text', 'prompt_title', 'prompt_question']

    merged_df = summaries.merge(prompts, 'inner', 'prompt_id')

    print("Scores are being calculated. Please stand by...")
    print('rouge_1')
    merged_df[['rouge1_recall', 'rouge1_precision', 'rouge1_fmeasure']] = merged_df[['text', 'prompt_text']].progress_apply(lambda row: rouge_n(row.prompt_text, row.text, 1), axis=1, result_type='expand')
    print('rouge_2')
    merged_df[['rouge2_recall', 'rouge2_precision', 'rouge2_fmeasure']] = merged_df[['text', 'prompt_text']].progress_apply(lambda row: rouge_n(row.prompt_text, row.text, 2), axis=1, result_type='expand')
    print('rouge_l')
    merged_df[['rougeL_recall', 'rougeL_precision', 'rougeL_fmeasure']] = merged_df[['text', 'prompt_text']].progress_apply(lambda row: rouge_l(row.prompt_text, row.text), axis=1, result_type='expand')
    print('rouge_l_sum')
    merged_df[['rougeLsum_recall', 'rougeLsum_precision', 'rougeLsum_fmeasure']] = merged_df[['text', 'prompt_text']].progress_apply(lambda row: rouge_lsum(row.prompt_text, row.text), axis=1, result_type='expand')

    summaries = merged_df.drop(prompt_columns, axis=1)
    return summaries, prompts

In [20]:
def get_scores(row):
    scores = []
    for score in ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']:
        scores.append((row[f'{score}_precision'],
                       row[f'{score}_recall'],
                       row[f'{score}_fmeasure']))

    scores = torch.Tensor(scores)
    scores = flatten(scores)
    return scores

### Model

In [21]:
class Model(nn.Module):
    def __init__(self, hidden_dim, output_dim, summary_len_mean, summary_len_std):
        super().__init__()

        self.summary_len_mean = summary_len_mean
        self.summary_len_std = summary_len_std

        self.scores = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
        input_dim = 4*3 + 1

        self.layer1 = nn.Linear(input_dim, hidden_dim)
        self.non_lin = nn.ReLU()
        self.layer2 = nn.Linear(hidden_dim, output_dim)

        self.device = 'cpu'
        self.to(self.device)

    def forward(self, prompt_text, summary_text, scores):
        #scores = self._calculate_rouge_scores(prompt_text, summary_text)
        #scores = flatten(scores)

        summary_len_norm = self._summary_len_norm(summary_text)

        result = cat((scores, summary_len_norm))

        result = self.layer1(result)
        result = self.non_lin(result)
        result = self.layer2(result)

        return result

    def _calculate_rouge_scores(self, prompt_text, summary_text):

        scores = [
            rouge_n(prompt_text, summary_text, 1),
            rouge_n(prompt_text, summary_text, 2),
            rouge_l(prompt_text, summary_text),
            rouge_lsum(prompt_text, summary_text)
        ]

        return Tensor(scores).to(self.device)

    def _summary_len_norm(self, summary_text):
        zscore = (len(summary_text) - self.summary_len_mean) / self.summary_len_std
        return Tensor((zscore,)).to(self.device)


    def to(self, device, *args, **kwargs):
        super().to(device, *args, **kwargs)
        self.device = device
        return self

### Setup

In [22]:
device, path = setup()
summaries_df, prompts_df = get_data(path)

[0;34mGPU not available. CPU used.[0m


In [23]:
summaries_df, prompts_df = preprocess(summaries_df, prompts_df)
summaries_df.columns

Scores are being calculated. Please stand by...
rouge_1


  0%|          | 0/5732 [00:00<?, ?it/s]

rouge_2


  0%|          | 0/5732 [00:00<?, ?it/s]

rouge_l


  0%|          | 0/5732 [00:00<?, ?it/s]

rouge_l_sum


  0%|          | 0/5732 [00:00<?, ?it/s]

Index(['student_id', 'prompt_id', 'text', 'content', 'wording',
       'rouge1_recall', 'rouge1_precision', 'rouge1_fmeasure', 'rouge2_recall',
       'rouge2_precision', 'rouge2_fmeasure', 'rougeL_recall',
       'rougeL_precision', 'rougeL_fmeasure', 'rougeLsum_recall',
       'rougeLsum_precision', 'rougeLsum_fmeasure'],
      dtype='object')

In [24]:
epochs = 10
learning_rate = 0.01
hidden_dim = 64
#output_dim = 2
output_dim = 1

In [25]:
text_len_mean = summaries_df.text.apply(len).mean()
text_len_std  = summaries_df.text.apply(len).std()

In [26]:
model = Model(hidden_dim, output_dim, text_len_mean, text_len_std)

In [27]:
criterion = nn.MSELoss()
optimizer=torch.optim.Adam

In [28]:
optimizer = optimizer(list(model.parameters()))
optimizer.lr = learning_rate

### Training

In [29]:
print("Training...")

Training...


In [30]:
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}", end="\r")

    summaries_df = summaries_df.sample(frac=1).reset_index(drop=True)

    y_true = []
    y_pred = []

    for index, summary in summaries_df.iterrows():

        prompt = get_prompt(summary, prompts_df)
        scores = get_scores(summary)

        #target = Tensor([summary.content, summary.wording]).to(device)
        target = Tensor([summary.content]).to(device)

        predictions = model(prompt.prompt_text, summary.text, scores)
        loss = criterion(predictions, target)
        optimizer.zero_grad()

        loss.backward()
        optimizer.step()

        y_true.append([*(float(x) for x in target)])
        y_pred.append([*(float(x) for x in predictions)])

Epoch 10/10

In [31]:
print("Done")

Done


In [32]:
mse = mean_squared_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)

print(f"MCRMSE: {mcrmse(y_true, y_pred):.4f}")
print(f"MSE: {mse:.4f}")
print(f"R2:  {r2:.4f}")

MCRMSE: 0.4581
MSE: 0.2098
R2:  0.8069


In [33]:
test_prompts_df = pd.read_csv(f'{path}/prompts_test.csv')
test_summaries_df = pd.read_csv(f'{path}/summaries_test.csv')

In [34]:
test_summaries_df, test_prompts_df = preprocess(test_summaries_df, test_prompts_df)

Scores are being calculated. Please stand by...
rouge_1


  0%|          | 0/4 [00:00<?, ?it/s]

rouge_2


  0%|          | 0/4 [00:00<?, ?it/s]

rouge_l


  0%|          | 0/4 [00:00<?, ?it/s]

rouge_l_sum


  0%|          | 0/4 [00:00<?, ?it/s]

In [35]:
prediction_list = []

for index, summary in test_summaries_df.iterrows():

        prompt = get_prompt(summary, test_prompts_df)
        scores = get_scores(summary)

        predictions = model(prompt.prompt_text, summary.text, scores)
        content = float(predictions[0])
        #wording = float(predictions[1])

        prediction_list.append((summary.student_id, content))#, wording))

In [36]:
predictions_josef = pd.DataFrame(prediction_list, columns=['student_id', 'content'])

# Tharrmeehan

In [37]:
# import stuff

In [38]:
#
predictions_tharrmeehan = test_summaries_df.copy()
predictions_tharrmeehan['wording'] = 0
# TODO

# Combining

In [39]:
predictions_josef.head()

Unnamed: 0,student_id,content
0,000000ffffff,-2.728357
1,222222cccccc,-2.728357
2,111111eeeeee,-2.728357
3,333333dddddd,-2.728357


In [40]:
predictions_tharrmeehan.head()

Unnamed: 0,student_id,prompt_id,text,rouge1_recall,rouge1_precision,rouge1_fmeasure,rouge2_recall,rouge2_precision,rouge2_fmeasure,rougeL_recall,rougeL_precision,rougeL_fmeasure,rougeLsum_recall,rougeLsum_precision,rougeLsum_fmeasure,wording
0,000000ffffff,abc123,Example text 1,0.333333,0.333333,0.333333,0.0,0.0,0.0,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0
1,222222cccccc,abc123,Example text 3,0.333333,0.333333,0.333333,0.0,0.0,0.0,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0
2,111111eeeeee,def789,Example text 2,0.333333,0.333333,0.333333,0.0,0.0,0.0,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0
3,333333dddddd,def789,Example text 4,0.333333,0.333333,0.333333,0.0,0.0,0.0,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0


In [41]:
predictions_josef = predictions_josef[['student_id', 'content']]
predictions_tharrmeehan = predictions_tharrmeehan[['student_id', 'wording']]

In [42]:
predictions = pd.merge(predictions_josef, predictions_tharrmeehan, on='student_id', how='inner')
#predictions = pd.merge(predictions_josef, predictions_tharrmeehan, on='student_id', how='outer')

In [43]:
predictions.to_csv('submission.csv',index=False)
display(pd.read_csv('submission.csv'))

Unnamed: 0,student_id,content,wording
0,000000ffffff,-2.728357,0
1,222222cccccc,-2.728357,0
2,111111eeeeee,-2.728357,0
3,333333dddddd,-2.728357,0
