In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
!pip install textstat rouge pyphen --no-index --find-links=file:///kaggle/input/external/ 

Looking in links: file:///kaggle/input/external/


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# progressbar
from tqdm.notebook import tqdm

# preprocessing
from nltk.corpus import stopwords
from string import punctuation
from transformers import AutoTokenizer
from textstat import textstat

In [5]:
prompts_test = pd.read_csv("data/prompts_test.csv")
summaries_test = pd.read_csv("data/summaries_test.csv")

summaries_train = pd.read_csv("data/summaries_train.csv")
prompts_train = pd.read_csv("data/prompts_train.csv")

## Tokenization etc.

In [6]:
PATH = "/kaggle/input/"

In [7]:
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

stopwords_list = set(stopwords.words("english"))
punctuation = list(punctuation)

## Feature Engineering

In [8]:
merged_train = summaries_train.merge(prompts_train, on="prompt_id", how="inner")
merged_test = summaries_test.merge(prompts_test, on="prompt_id", how="inner")

In [9]:
def word_overlap_count(row):
    """intersection(prompt_text, text)"""

    def check_is_stop_word(word):
        return word in stopwords_list

    prompt_words = row["prompt_tokens"]
    summary_words = row["summary_tokens"]

    if stopwords_list:
        prompt_words = list(filter(check_is_stop_word, prompt_words))
        summary_words = list(filter(check_is_stop_word, summary_words))
    return len(set(prompt_words).intersection(set(summary_words)))

In [10]:
def compute_mcrmse(eval_pred):
    """
    Calculates mean columnwise root mean squared error
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation
    """
    preds, labels = eval_pred

    col_rmse = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(col_rmse)

    return {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
        "mcrmse": mcrmse,
    }

In [11]:
def create_tokens(dataframe):
    tqdm.pandas()
    dataframe["summary_tokens"] = dataframe["text"].progress_apply(
        lambda x: tokenizer.tokenize(x, truncation=True, padding=True, max_length=1024)
    )

    dataframe["prompt_tokens"] = dataframe["prompt_text"].progress_apply(
        lambda x: tokenizer.tokenize(x, truncation=True, padding=True, max_length=1024)
    )
    
create_tokens(merged_train)
create_tokens(merged_test)

  0%|          | 0/7165 [00:00<?, ?it/s]

  0%|          | 0/7165 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

In [12]:
def ngrams(token, n):
        # Use the zip function to help us generate n-grams
        # Concatentate the tokens into ngrams and return
        ngrams = zip(*[token[i:] for i in range(n)])
        return [" ".join(ngram) for ngram in ngrams]


In [13]:
def ngram_co_occurrence(row, n: int) -> int:
    # Tokenize the original text and summary into words
    original_tokens = row["prompt_tokens"]
    summary_tokens = row["summary_tokens"]

    # Generate n-grams for the original text and summary
    original_ngrams = set(ngrams(original_tokens, n))
    summary_ngrams = set(ngrams(summary_tokens, n))

    # Calculate the number of common n-grams
    common_ngrams = original_ngrams.intersection(summary_ngrams)

    return len(common_ngrams)

In [14]:
from rouge import Rouge

rouge = Rouge()

def get_rouge_1_score(row):
    scores = rouge.get_scores(row["text"], row["prompt_text"])
    return scores[0]["rouge-1"]["f"]

def get_rouge_2_score(row):
    scores = rouge.get_scores(row["text"], row["prompt_text"])
    return scores[0]["rouge-2"]["f"]

def get_rouge_l_score(row):
    scores = rouge.get_scores(row["text"], row["prompt_text"])
    return scores[0]["rouge-l"]["f"]

In [15]:
def preprocess(dataframe):
    tqdm.pandas()
    dataframe["flesch_reading_ease"] = dataframe["text"].progress_apply(
        textstat.flesch_reading_ease
    )
    dataframe["difficult_words"] = dataframe["text"].progress_apply(textstat.difficult_words)
    dataframe["bigrams_overlap_count"] = dataframe.progress_apply(
        ngram_co_occurrence, args=(2,), axis=1
    )
    dataframe["automated_readability_index"] = dataframe["text"].progress_apply(
        textstat.automated_readability_index
    )
    dataframe["coleman_liau_index"] = dataframe["text"].progress_apply(
        textstat.coleman_liau_index
    )
    dataframe["linsear_write_formula"] = dataframe["text"].progress_apply(
        textstat.linsear_write_formula
    )
    dataframe["gunning_fog"] = dataframe["text"].progress_apply(textstat.gunning_fog)
    dataframe["smog_index"] = dataframe["text"].progress_apply(textstat.smog_index)
    dataframe["word_overlap_count"] = dataframe.progress_apply(word_overlap_count, axis=1)
    dataframe["dale_chall_readability_score"] = dataframe["text"].progress_apply(
    textstat.dale_chall_readability_score)
    dataframe["rouge_1"] = dataframe.progress_apply(get_rouge_1_score, axis=1)
    dataframe["rouge_2"] = dataframe.progress_apply(get_rouge_2_score, axis=1)
    dataframe["rouge_l"] = dataframe.progress_apply(get_rouge_l_score, axis=1)

    
preprocess(merged_train)
preprocess(merged_test)

  0%|          | 0/7165 [00:00<?, ?it/s]

  0%|          | 0/7165 [00:00<?, ?it/s]

  0%|          | 0/7165 [00:00<?, ?it/s]

  0%|          | 0/7165 [00:00<?, ?it/s]

  0%|          | 0/7165 [00:00<?, ?it/s]

  0%|          | 0/7165 [00:00<?, ?it/s]

  0%|          | 0/7165 [00:00<?, ?it/s]

  0%|          | 0/7165 [00:00<?, ?it/s]

  0%|          | 0/7165 [00:00<?, ?it/s]

  0%|          | 0/7165 [00:00<?, ?it/s]

  0%|          | 0/7165 [00:00<?, ?it/s]

  0%|          | 0/7165 [00:00<?, ?it/s]

  0%|          | 0/7165 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

In [13]:
merged_train = merged_train.drop(["student_id", "text", "prompt_question", "prompt_title", "prompt_text", "summary_tokens", "prompt_tokens"], axis=1)
merged_train

Unnamed: 0,prompt_id,content,wording,flesch_reading_ease,difficult_words,bigrams_overlap_count,automated_readability_index,coleman_liau_index,linsear_write_formula,gunning_fog,smog_index,word_overlap_count,dale_chall_readability_score,rouge_1,rouge_2,rouge_l
0,814d6b,0.205683,0.380538,64.41,11,10,8.3,9.04,8.375000,9.40,10.7,16,7.76,0.127168,0.006768,0.127168
1,814d6b,3.272894,3.219757,65.22,34,28,9.5,10.43,6.625000,9.15,10.1,28,8.24,0.227907,0.058091,0.227907
2,814d6b,0.205683,0.380538,76.22,11,21,7.0,8.63,6.200000,8.13,9.4,15,8.71,0.168605,0.044143,0.151163
3,814d6b,0.567975,0.969062,54.26,12,27,14.5,10.97,16.333333,14.33,14.1,18,8.84,0.231638,0.066116,0.197740
4,814d6b,-0.910596,-0.081769,74.69,5,7,6.1,7.24,6.750000,8.36,0.0,9,7.81,0.093750,0.014363,0.087500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7160,39c16e,-0.981265,-1.548900,54.90,8,7,16.2,9.12,20.500000,18.05,0.0,12,10.54,0.127660,0.010292,0.109422
7161,39c16e,-0.511077,-1.589115,66.41,6,6,14.6,8.66,16.000000,13.33,0.0,12,9.34,0.110429,0.006897,0.098160
7162,39c16e,-0.834946,-0.593749,95.47,2,15,3.2,5.02,3.833333,3.88,3.1,9,6.30,0.130435,0.024096,0.118012
7163,39c16e,-0.157460,-0.165811,71.85,12,24,10.6,11.08,8.500000,9.79,9.7,11,9.92,0.131737,0.050336,0.131737


In [14]:
merged_test = merged_test.drop(["text", "prompt_question", "prompt_title", "prompt_text", "summary_tokens", "prompt_tokens"], axis=1)
merged_test

Unnamed: 0,student_id,prompt_id,flesch_reading_ease,difficult_words,bigrams_overlap_count,automated_readability_index,coleman_liau_index,linsear_write_formula,gunning_fog,smog_index,word_overlap_count,dale_chall_readability_score,rouge_1,rouge_2,rouge_l
0,000000ffffff,abc123,59.97,1,0,-1.2,-2.38,1.5,14.53,0.0,0,19.58,0.0,0.0,0.0
1,222222cccccc,abc123,59.97,1,0,-1.2,-2.38,1.5,14.53,0.0,0,19.58,0.0,0.0,0.0
2,111111eeeeee,def789,59.97,1,0,-1.2,-2.38,1.5,14.53,0.0,0,19.58,0.0,0.0,0.0
3,333333dddddd,def789,59.97,1,0,-1.2,-2.38,1.5,14.53,0.0,0,19.58,0.0,0.0,0.0


In [15]:
X_train, X_val, y_train, y_val = train_test_split(
    merged_train.drop(["wording", "content", "prompt_id"], axis=1),
    merged_train[["wording", "content"]],
    test_size=0.2,
    random_state=42,
    stratify=merged_train["prompt_id"],
)

In [16]:
import lightgbm as lgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error

In [17]:
import optuna

def objective(trial):
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-8, 1.0, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 1, 1000),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "max_depth": trial.suggest_int("max_depth", 1, 32),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 100),
        "subsample": trial.suggest_float("subsample", 0.1, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1.0, log=True),
        "random_state": 42,
        "n_jobs": -1,
    }

    lgb_model = lgb.LGBMRegressor(**params)
    model = MultiOutputRegressor(lgb_model)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    score = mean_squared_error(y_val, y_pred, squared=False)
    return score

study_lgb = optuna.create_study(direction="minimize")
study_lgb.optimize(objective, n_trials=100)

[I 2023-11-02 14:04:52,827] A new study created in memory with name: no-name-21b0f496-9d75-4f86-a29e-0405473d8701
[I 2023-11-02 14:05:08,740] Trial 0 finished with value: 0.6198880730568503 and parameters: {'learning_rate': 0.2718694377761657, 'n_estimators': 923, 'num_leaves': 96, 'max_depth': 20, 'min_child_samples': 49, 'subsample': 0.8509636459591278, 'colsample_bytree': 0.49569670848601954, 'reg_alpha': 2.8271837018426556e-07, 'reg_lambda': 8.645547409123926e-05}. Best is trial 0 with value: 0.6198880730568503.
[I 2023-11-02 14:05:09,145] Trial 1 finished with value: 1.0492221097331935 and parameters: {'learning_rate': 2.744061427318628e-07, 'n_estimators': 143, 'num_leaves': 50, 'max_depth': 3, 'min_child_samples': 36, 'subsample': 0.25626028850186333, 'colsample_bytree': 0.6410651636201727, 'reg_alpha': 0.20695169538681307, 'reg_lambda': 2.4193174215976123e-07}. Best is trial 0 with value: 0.6198880730568503.
[I 2023-11-02 14:05:10,857] Trial 2 finished with value: 0.74822126179

In [18]:
model = MultiOutputRegressor(lgb.LGBMRegressor(**study_lgb.best_params))
model.fit(X_train, y_train)
y_pred = model.predict(X_val)
eval_pred = (y_pred, y_val)
compute_mcrmse(eval_pred)

{'content_rmse': 0.6395283759714654,
 'wording_rmse': 0.49026515591159103,
 'mcrmse': 0.5648967659415283}

In [19]:
merged_test

Unnamed: 0,student_id,prompt_id,flesch_reading_ease,difficult_words,bigrams_overlap_count,automated_readability_index,coleman_liau_index,linsear_write_formula,gunning_fog,smog_index,word_overlap_count,dale_chall_readability_score,rouge_1,rouge_2,rouge_l
0,000000ffffff,abc123,59.97,1,0,-1.2,-2.38,1.5,14.53,0.0,0,19.58,0.0,0.0,0.0
1,222222cccccc,abc123,59.97,1,0,-1.2,-2.38,1.5,14.53,0.0,0,19.58,0.0,0.0,0.0
2,111111eeeeee,def789,59.97,1,0,-1.2,-2.38,1.5,14.53,0.0,0,19.58,0.0,0.0,0.0
3,333333dddddd,def789,59.97,1,0,-1.2,-2.38,1.5,14.53,0.0,0,19.58,0.0,0.0,0.0


In [20]:
y_pred_final = model.predict(merged_test.drop(["student_id", "prompt_id"], axis=1))

In [21]:
y_pred_final_df = pd.DataFrame(y_pred_final, columns=["wording", "content"])
y_pred_final_df

Unnamed: 0,wording,content
0,-1.340299,-1.746087
1,-1.340299,-1.746087
2,-1.340299,-1.746087
3,-1.340299,-1.746087


In [22]:
merged_test_final = merged_test.merge(y_pred_final_df, left_index=True, right_index=True)

In [23]:
merged_test_final

Unnamed: 0,student_id,prompt_id,flesch_reading_ease,difficult_words,bigrams_overlap_count,automated_readability_index,coleman_liau_index,linsear_write_formula,gunning_fog,smog_index,word_overlap_count,dale_chall_readability_score,rouge_1,rouge_2,rouge_l,wording,content
0,000000ffffff,abc123,59.97,1,0,-1.2,-2.38,1.5,14.53,0.0,0,19.58,0.0,0.0,0.0,-1.340299,-1.746087
1,222222cccccc,abc123,59.97,1,0,-1.2,-2.38,1.5,14.53,0.0,0,19.58,0.0,0.0,0.0,-1.340299,-1.746087
2,111111eeeeee,def789,59.97,1,0,-1.2,-2.38,1.5,14.53,0.0,0,19.58,0.0,0.0,0.0,-1.340299,-1.746087
3,333333dddddd,def789,59.97,1,0,-1.2,-2.38,1.5,14.53,0.0,0,19.58,0.0,0.0,0.0,-1.340299,-1.746087


In [24]:
out = merged_test_final[["student_id", "content", "wording"]]

In [25]:
out.to_csv("submission.csv", index=False)