In [None]:
# !pip install -U sentence-transformers
# !pip install xgboost

In [None]:
# Option A (Google Colab): mount Google Drive and read the dataset from there.
# Uncomment the three lines below and comment out the local path to use it.
# from google.colab import drive
# drive.mount('/content/drive')
# data_path = "/content/drive/MyDrive/data/cs7641/train_tokenized.csv"

# Option B (active): read the dataset from a path relative to this notebook.
data_path = "../../data/train_tokenized.csv"

In [1]:
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, util



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Sentence-embedding encoder, loaded by model name. The same object is handed
# to every grader built in the experiment loop further down.
smodel = SentenceTransformer('all-distilroberta-v1')

In [3]:
# The six scored dimensions; their row-wise mean is used as a holistic score.
tasks = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]

# Palash's file. "Unnamed: 0" is dropped straight away (presumably an index
# column that was saved into the CSV -- confirm against the file).
data = pd.read_csv(data_path).drop(columns=["Unnamed: 0"])
data = data.assign(
    # Trim leading/trailing whitespace from every essay.
    full_text=data["full_text"].apply(lambda essay: essay.strip()),
    holistic_score=data[tasks].mean(axis=1),
)
data.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions,word_token_nltk,sent_token,word_token_manual,clean_text,lemm_text,freq_dist,most_common_words,distinct_words_cnt,holistic_score
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0,"['I', 'think', 'that', 'students', 'would', 'b...",['I think that students would benefit from lea...,"['I', 'think', 'that', 'students', 'would', 'b...","['think', 'students', 'would', 'benefit', 'lea...","['think', 'student', 'would', 'benefit', 'lear...",<FreqDist with 81 samples and 129 outcomes>,"[('student', 5), ('class', 5), ('go', 5)]",81,3.333333
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5,"['When', 'a', 'problem', 'is', 'a', 'change', ...",['When a problem is a change you have to let i...,"['When', 'a', 'problem', 'is', 'a', 'change', ...","['problem', 'change', 'let', 'best', 'matter',...","['problem', 'change', 'let', 'best', 'matter',...",<FreqDist with 80 samples and 215 outcomes>,"[('change', 16), ('different', 12), ('problem'...",80,2.416667
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5,"['Dear', ',', 'Principal', 'If', 'u', 'change'...","['Dear, Principal\n\nIf u change the school po...","['Dear,', 'Principal\n\nIf', 'u', 'change', 't...","['dear', 'principal', 'u', 'change', 'school',...","['dear', 'principal', 'u', 'change', 'school',...",<FreqDist with 58 samples and 133 outcomes>,"[('school', 9), ('average', 9), ('sport', 8)]",58,3.0
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0,"['The', 'best', 'time', 'in', 'life', 'is', 'w...",['The best time in life is when you become you...,"['The', 'best', 'time', 'in', 'life', 'is', 'w...","['best', 'time', 'life', 'become', 'agree', 'g...","['best', 'time', 'life', 'become', 'agree', 'g...",<FreqDist with 132 samples and 282 outcomes>,"[('make', 16), ('choice', 10), ('others', 8)]",132,4.5
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5,"['Small', 'act', 'of', 'kindness', 'can', 'imp...",['Small act of kindness can impact in other pe...,"['Small', 'act', 'of', 'kindness', 'can', 'imp...","['small', 'act', 'kindness', 'impact', 'people...","['small', 'act', 'kindness', 'impact', 'people...",<FreqDist with 67 samples and 112 outcomes>,"[('people', 6), ('person', 6), ('act', 5)]",67,2.75


In [4]:
# Create cross validation set
from sklearn.model_selection import cross_validate
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import StratifiedKFold, train_test_split

def mcrmse(y_trues, y_preds):
    """
    Mean column-wise root mean squared error (MCRMSE).
    Input:
        y_trues: Ground-truth scores. Shape: (n_samples, n_tasks) or (n_samples,)
        y_preds: Predicted scores. Shape: (n_samples, n_tasks) or (n_samples,)
    Output:
        mcrmse_score: Mean of the per-task RMSEs (float).
        scores: Per-task RMSEs (list of floats, one entry per column).
    Raises:
        ValueError: If the inputs are empty or their shapes do not match.
    """
    # np.asarray lets callers pass DataFrames or nested lists, not only ndarrays.
    y_trues = np.asarray(y_trues, dtype=float)
    y_preds = np.asarray(y_preds, dtype=float)
    # A single task given as a flat vector is treated as one column.
    if y_trues.ndim == 1:
        y_trues = y_trues.reshape(-1, 1)
    if y_preds.ndim == 1:
        y_preds = y_preds.reshape(-1, 1)
    # Checked explicitly so NumPy broadcasting cannot hide a shape mismatch.
    if y_trues.shape != y_preds.shape:
        raise ValueError(
            f"Shape mismatch: y_trues {y_trues.shape} vs y_preds {y_preds.shape}"
        )
    if y_trues.shape[0] == 0:
        raise ValueError("mcrmse needs at least one sample")
    # RMSE is computed directly with NumPy. The `squared=False` flag of
    # sklearn's mean_squared_error is deprecated (scikit-learn 1.4) and is not
    # accepted by newer releases.
    scores = np.sqrt(np.mean((y_trues - y_preds) ** 2, axis=0)).tolist()
    mcrmse_score = float(np.mean(scores))
    return mcrmse_score, scores

def mcrmse_error(y_trues, y_preds, **kwargs):
    """
    Scalar MCRMSE in plain metric form (y_true, y_pred) -> number.
    Input:
        y_trues: Ground-truth scores. Shape: (n_samples, n_tasks)
        y_preds: Predicted scores. Shape: (n_samples, n_tasks)
        **kwargs: Accepted and ignored.
    Output:
        The mean column-wise RMSE; the per-task breakdown is discarded.
    """
    # mcrmse returns (mean, per-task list); only the mean is needed here.
    return mcrmse(y_trues, y_preds)[0]


# Scorer form of the metric. greater_is_better=False marks it as a loss, so the
# scorer reports the NEGATED MCRMSE (values closer to 0 are better).
mcrmse_scorer = make_scorer(mcrmse_error, greater_is_better=False)

In [5]:

# 10-fold stratified splitter, shuffled with a fixed seed for repeatable folds.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# Stratification labels are the holistic score cast to int. split() is lazy:
# this call only builds a generator (displayed as the cell output); no folds
# are computed or stored here.
skf.split(X=data["full_text"], y=data["holistic_score"].astype(int))

<generator object _BaseKFold.split at 0x159b89690>

In [6]:
class SimpleSentenceEmbeddingGrader(BaseEstimator):
    """
    Essay grader: a sentence-embedding encoder followed by one independent
    regressor ("decoder") per scoring task.

    Parameters:
        encoder: Object exposing `encode(X, show_progress_bar=...)`; expected to
            return one embedding row per essay.
        decoder_cls: Regressor class with `fit(X, y)` / `predict(X)`. A fresh
            instance is created for every task.
        decoder_kwargs: Keyword arguments for `decoder_cls(...)`. None is
            treated as "no arguments".
    """
    def __init__(self, encoder, decoder_cls, decoder_kwargs):
        # Stored untouched so that get_params()/clone() can round-trip them.
        self.encoder = encoder
        self.decoder_cls = decoder_cls
        self.decoder_kwargs = decoder_kwargs

    def fit(self, X, y):
        """
        Fit the model to the data.
        Input:
            X: Essay text (list of strings). Shape: (n_samples,)
            y: Scores (array-like of floats). Shape: (n_samples,n_tasks);
               a 1-D vector is treated as a single task.
        Output:
            self
        """
        # Accept DataFrames and nested lists too; `y[:, i]` needs an ndarray.
        y = np.asarray(y)
        if y.ndim == 1:
            y = y.reshape(-1, 1)
        self.n_tasks = y.shape[1]
        # Encode the essays once; every task's decoder shares the embeddings.
        X_embeddings = self.encoder.encode(X, show_progress_bar=True)
        # Fit one decoder per task
        decoder_kwargs = self.decoder_kwargs or {}
        self.decoders = []
        for i in range(self.n_tasks):
            decoder = self.decoder_cls(**decoder_kwargs)
            decoder.fit(X_embeddings, y[:, i])
            self.decoders.append(decoder)
        return self

    def predict(self, X):
        """
        Predict the scores for the essays.
        Input:
            X: Essay text (list of strings). Shape: (n_samples,)
        Output:
            y_pred: Predicted scores (array of floats). Shape: (n_samples,n_tasks)
        """
        # Encode the essays
        X_embeddings = self.encoder.encode(X, show_progress_bar=True)
        # Predict the scores, one column per task
        y_pred = np.zeros((len(X), self.n_tasks))
        for i in range(self.n_tasks):
            # ravel() tolerates decoders that return (n_samples, 1) predictions.
            y_pred[:, i] = np.ravel(self.decoders[i].predict(X_embeddings))
        return y_pred

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

class XGBRegressorEarlyStopping(XGBRegressor):
    """
    XGBRegressor whose fit() carves out its own validation set.

    fit() holds out 10% of the data it receives (fixed seed) and passes it as
    `eval_set`, so the booster is trained on the remaining 90%. The early
    stopping criteria themselves (e.g. `early_stopping_rounds`, `eval_metric`)
    are whatever was given to the constructor.
    """
    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)

    def fit(self, X, y):
        """
        Fit on a 90/10 train/validation split of (X, y).
        Input:
            X: Feature matrix. Shape: (n_samples, n_features)
            y: Targets. Shape: (n_samples,)
        Output:
            self, as the scikit-learn estimator contract requires (the
            original returned None, which breaks chaining and pipelines).
        """
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)
        eval_set = [(X_val, y_val)]
        super().fit(X_train, y_train, eval_set=eval_set, verbose=True)
        return self

# Decoder configurations to compare. Each entry maps an experiment name to the
# regressor class ("decoder_cls") and the constructor arguments
# ("decoder_kwargs") used to build the decoders.
# RandomForest names follow the pattern
#   RandomForest<n_estimators>_<max_depth>_<min_samples_split>_<min_samples_leaf>_<max_features>
# Note: no random_state is set on the forests, so their results can vary
# between runs.
experiments = {
    "LinearRegression": dict(
        decoder_cls=LinearRegression,
        decoder_kwargs=dict(),
    ),
    "RandomForest300_10_7_3_sqrt": dict(
        decoder_cls=RandomForestRegressor,
        decoder_kwargs=dict(
            n_estimators=300,
            max_depth=10,
            min_samples_split=7,
            min_samples_leaf=3,
            max_features="sqrt",
        ),
    ),
    "RandomForest300_20_7_3_0.2": dict(
        decoder_cls=RandomForestRegressor,
        decoder_kwargs=dict(
            n_estimators=300,
            max_depth=20,
            min_samples_split=7,
            min_samples_leaf=3,
            max_features=0.2,
        ),
    ),
    "RandomForest300_10_11_5_sqrt": dict(
        decoder_cls=RandomForestRegressor,
        decoder_kwargs=dict(
            n_estimators=300,
            max_depth=10,
            min_samples_split=11,
            min_samples_leaf=5,
            max_features="sqrt",
        ),
    ),
    "RandomForest300_10_3_1_0.1": dict(
        decoder_cls=RandomForestRegressor,
        decoder_kwargs=dict(
            n_estimators=300,
            max_depth=10,
            min_samples_split=3,
            min_samples_leaf=1,
            max_features=0.1,
        ),
    ),
    # The XGBoost wrapper needs an eval metric and a patience value for its
    # internal validation split.
    "XGBRegressor": dict(
        decoder_cls=XGBRegressorEarlyStopping,
        decoder_kwargs=dict(
            n_estimators=500,
            eval_metric=mean_squared_error,
            early_stopping_rounds=10,
            objective="reg:squarederror",
        ),
    ),
}

In [9]:
# Materialise the folds ONCE as a list. skf.split() returns a one-shot
# generator: handing it directly to cross_validate lets the first experiment
# consume it, leaving zero folds for every experiment after that. A list of
# (train_idx, test_idx) pairs can be reused, and also guarantees that all
# experiments are evaluated on identical folds.
# Folds are stratified on the holistic score cast to int.
cv_iterators = list(skf.split(X=data["full_text"], y=data["holistic_score"].astype(int)))

# experiment name -> raw cross_validate() result dict (scores left untouched).
experiment_results = {}
for experiment_name, experiment in experiments.items():
    print(f"Running experiment: {experiment_name}")
    model = SimpleSentenceEmbeddingGrader(smodel, experiment["decoder_cls"], experiment["decoder_kwargs"])
    # NOTE: the grader re-encodes the essays inside every fit() and predict(),
    # so each fold pays the full embedding cost again.
    scores = cross_validate(
        model,
        X=data["full_text"].values,
        y=data[tasks].values,
        cv=cv_iterators,
        scoring=mcrmse_scorer,
        n_jobs=1,
        error_score='raise'
        )
    experiment_results[experiment_name] = scores
    # mcrmse_scorer is built with greater_is_better=False, so 'test_score'
    # holds the negated MCRMSE; flip the sign back for a readable report.
    mean_mcrmse = -np.mean(scores['test_score'])
    print(f"Experiment {experiment_name} finished. Mean MCRMSE: {mean_mcrmse}")

Running experiment: LinearRegression


Batches: 100%|██████████| 110/110 [07:34<00:00,  4.13s/it]
Batches: 100%|██████████| 13/13 [00:51<00:00,  3.95s/it]
Batches:  51%|█████     | 56/110 [04:40<04:30,  5.01s/it]


KeyboardInterrupt: 

In [10]:
# Encode every essay in a single pass (presumably a check of raw encoding
# speed -- confirm). The embeddings are not assigned to a variable, so the
# result is only displayed, not kept.
smodel.encode(data["full_text"].values, show_progress_bar=True)

Batches:   1%|          | 1/123 [00:08<17:04,  8.40s/it]


KeyboardInterrupt: 