## Installing Packages
___

In [14]:
!pip install rouge --quiet
!pip install textstat --quiet
!pip install lightgbm --quiet
!pip install optuna --quiet
!pip install joblib --quiet
!pip install torch --quiet
!pip install pandas --quiet
!pip install tqdm --quiet
!pip install scikit-learn==1.2.2 --quiet
!pip install transformers --quiet
!pip install nltk --quiet

## Functions
___

In [15]:
def train_ensemble(ensemble_model, train_df, epochs, criterion, optimizer, verbose=True):
    """Train the ensemble one row at a time and report metrics after each epoch.

    Every epoch reshuffles ``train_df`` and performs one optimizer step per row.
    Relies on the notebook-level names ``device``, ``tqdm``, ``torch``,
    ``mean_squared_error`` and ``r2_score``.

    Args:
        ensemble_model: Callable invoked as ``ensemble_model(text, prompt_title,
            prompt_question, prompt_text, transformer_preds, rouge_preds,
            lgbm_preds)``; its result is handed to ``criterion`` with the target.
        train_df: DataFrame providing the columns ``text``, ``prompt_title``,
            ``prompt_question``, ``prompt_text``, ``content``, ``wording`` and
            the ``{transformer,rouge,lgbm}_{content,wording}`` prediction columns.
        epochs: Number of passes over ``train_df``.
        criterion: Loss callable, called as ``criterion(predictions, target)``.
        optimizer: Object exposing ``zero_grad()`` and ``step()``.
        verbose: When true, show progress bars and print RMSE / R2 / MSE after
            every epoch.

    Returns:
        ``(y_true, y_pred)``: per-row ``[content, wording]`` float lists gathered
        during the last epoch. Both are empty lists when ``epochs`` is 0.
    """
    # Bound before the loop so the final `return` is valid even when the loop
    # body never runs (epochs == 0).
    y_true = []
    y_pred = []

    for epoch in tqdm(range(epochs), desc='Training', disable=not verbose):
        # Fresh row order for every epoch.
        train_df = train_df.sample(frac=1).reset_index(drop=True)

        # Only the current epoch's values are kept.
        y_true = []
        y_pred = []

        for index, summary in tqdm(train_df.iterrows(), total=len(train_df), leave=False, disable=not verbose):
            # Pre-computed (content, wording) predictions of the three base models.
            # NOTE(review): these stay on the CPU while `target` is moved to
            # `device`; presumably the ensemble moves them itself - confirm.
            transformer_preds = torch.Tensor([summary.transformer_content, summary.transformer_wording])
            rouge_preds = torch.Tensor([summary.rouge_content, summary.rouge_wording])
            lgbm_preds = torch.Tensor([summary.lgbm_content, summary.lgbm_wording])

            target = torch.Tensor([summary.content, summary.wording]).to(device)
            predictions = ensemble_model(
                summary.text,
                summary.prompt_title,
                summary.prompt_question,
                summary.prompt_text,
                transformer_preds,
                rouge_preds,
                lgbm_preds
            )

            loss = criterion(predictions, target)

            # Clear stale gradients, then take one step for this single row.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            y_true.append([float(x) for x in target])
            y_pred.append([float(x) for x in predictions])

        if verbose:
            # RMSE and MSE are computed separately on purpose: with two outputs,
            # the averaged RMSE is not the square root of the averaged MSE.
            rmse = mean_squared_error(y_true, y_pred, squared=False)
            mse = mean_squared_error(y_true, y_pred)
            r2 = r2_score(y_true, y_pred)

            performance = {'RMSE': rmse, 'R2': r2, 'MSE': mse}

            print("\nEpoch", epoch+1)
            print(performance)

    return y_true, y_pred

## Imports
___

In [4]:
import os
import joblib
import torch
from torch import nn
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.metrics import mean_squared_error, r2_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from ROUGE_Model_Loader import ROUGEModelLoader
from Ensemble import EnsembleNN
from AIOLightGBM import AIO

## Device Settings
___

In [17]:
# Choose the compute device once: CUDA when PyTorch can see a GPU, else the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report the choice; the escape sequences colour the text (green / blue).
print(
    '\x1b[0;32mGPU is available.\x1b[0m'
    if device.type == "cuda"
    else '\x1b[0;34mGPU not available. CPU used.\x1b[0m'
)

[0;34mGPU not available. CPU used.[0m


## Loading Data
___

In [18]:
# Root directories (relative to the notebook) for input CSVs, saved models
# and saved tokenizers.
DATA_PATH, MODEL_PATH, TOKENIZER_PATH = 'data', 'models', 'tokenizers'

In [19]:
print("Loading Data...", end="\r")

# Read the four CSV files from DATA_PATH (prompts first, then summaries).
prompts_test_df, prompts_train_df, summaries_test_df, summaries_train_df = (
    pd.read_csv(f'{DATA_PATH}/{stem}.csv')
    for stem in ('prompts_test', 'prompts_train', 'summaries_test', 'summaries_train')
)

# Attach each summary's prompt columns by joining on the shared prompt id.
merged_test_df = summaries_test_df.merge(prompts_test_df, on='prompt_id')
merged_train_df = summaries_train_df.merge(prompts_train_df, on='prompt_id')

print("Loading Data - ok")

Loading Data - ok


## Preparing Models
___

In [20]:
print("Loading Transformer...", end="\r")
# Replace the paths for the transformer and tokenizer you want to run
TRANSFORMER_PATH = f'{MODEL_PATH}/deberta-v3-base/checkpoint-4012'
# Kept under its own name: overwriting TOKENIZER_PATH with a value derived from
# itself would nest the directory one level deeper on every re-run of this cell.
DEBERTA_TOKENIZER_PATH = f'{TOKENIZER_PATH}/deberta-v3-base-tokenizer'
# Adjust max length to fitted model
MAX_LENGTH = 1024

# num_labels=2: the classification head has two outputs.
transformer = AutoModelForSequenceClassification.from_pretrained(TRANSFORMER_PATH, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(DEBERTA_TOKENIZER_PATH)

print("Loading Transformer - ok")

Loading Transformer - ok


In [21]:
print("Loading ROUGE model...", end="\r")

# Hyper-parameters; they are also baked into the checkpoint file name below.
target_score = 'both'
hidden_dim = 64
epochs = 10
lr = 0.01

ROUGE_MODEL_PATH = f'{MODEL_PATH}/rouge_based_models/ROUGE_Based_Model_{target_score}_{hidden_dim}_{epochs}_{lr}.pt'

model_loader = ROUGEModelLoader(merged_train_df, hidden_dim, target_score)
if os.path.exists(ROUGE_MODEL_PATH):
    # Reuse the saved weights. map_location lets a checkpoint written on a GPU
    # machine load on a CPU-only one, matching how the ensemble checkpoint is loaded.
    rouge_model = model_loader.model
    rouge_model.load_state_dict(torch.load(ROUGE_MODEL_PATH, map_location=device))
else:
    print("Loading ROUGE model...")
    print("Specified ROUGE Based Model doesn't exist. Training it now.")
    # Point at the same data directory as the CSV inputs rather than an
    # absolute '/data' path at the filesystem root.
    model_loader.data_path = f'{DATA_PATH}/rouge_preprocessed_data.csv'
    print(model_loader.train(epochs, lr))
    rouge_model = model_loader.model

print("Loading ROUGE model - ok")

Loading ROUGE model - ok


In [22]:
print("Loading LGBM model...", end='\r')

LGBM_MODEL_PATH = f'{MODEL_PATH}/lgbm_models/lgbm_model.joblib'  # Adjust the file type if needed
if os.path.exists(LGBM_MODEL_PATH):
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load model files from a trusted source.
    lgbm_model = joblib.load(LGBM_MODEL_PATH)
    print("Loading LGBM model - ok")
else:
    # No saved model: build one from the training frame (only the first rows of
    # the test frame are handed to AIO) and run it.
    lgbm_model = AIO(merged_train_df, merged_test_df.head())
    lgbm_model.run()
    print("Loading LGBM model - ok")

    # Save the model to the exact path checked above so the next run finds it;
    # create the directory first so the dump cannot fail on a fresh checkout.
    print("Saving LGBM model...", end='\r')
    os.makedirs(os.path.dirname(LGBM_MODEL_PATH), exist_ok=True)
    joblib.dump(lgbm_model, LGBM_MODEL_PATH)
    print("Saving LGBM model - ok")

Loading LGBM model - ok


In [23]:
print("Preparing Ensemble model...", end="\r")

# hidden_layers and hidden_dim are only used for the checkpoint file name in
# this cell; epochs and lr also drive the training below.
hidden_layers = 1
hidden_dim = 64
epochs = 10
lr = 0.01

ENSEMBLE_MODEL_PATH = f'{MODEL_PATH}/ensembles/ensemble_{hidden_layers}_{hidden_dim}_{epochs}_{lr}.pt'

model = EnsembleNN(
        transformer,
        tokenizer,
        rouge_model,
        lgbm_model,
        # Adjust max length to fitted model
        MAX_LENGTH,
        device=device
    )

if os.path.exists(ENSEMBLE_MODEL_PATH):
    # Reuse the saved weights instead of retraining.
    model.load_state_dict(torch.load(ENSEMBLE_MODEL_PATH, map_location=device))
else:
    # Get the base-model predictions once; train_ensemble reads them per row.
    pre_predicted_train_df = model.predict(merged_train_df, ensemble=False)

    criterion = nn.MSELoss()
    # The learning rate must be given to the constructor: assigning
    # `optimizer.lr` afterwards only creates an unused attribute and Adam keeps
    # its default rate.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    y_true, y_pred = train_ensemble(model, pre_predicted_train_df, epochs, criterion, optimizer, verbose=True)

    # Position (1-based) of each score inside the per-row [content, wording] lists.
    score_map = {1: 'content', 2: 'wording'}

    # Report metrics per score; skipped when training returned no rows.
    n_scores = len(y_true[0]) if y_true else 0
    for i in range(n_scores):
        y_true_i = [y[i] for y in y_true]
        y_pred_i = [y[i] for y in y_pred]

        rmse = mean_squared_error(y_true_i, y_pred_i, squared=False)
        mse = mean_squared_error(y_true_i, y_pred_i)
        r2 = r2_score(y_true_i, y_pred_i)

        score_name = score_map[i + 1]
        performance = {f'RMSE_{score_name}': rmse, f'R2_{score_name}': r2, f'MSE_{score_name}': mse}

        print(performance)



print("Preparing Ensemble model - ok")

Preparing Ensemble model - ok


## Predicting Test Set
___

In [24]:
# Run the ensemble over the merged test set. The submission cell reads the
# `student_id`, `ensemble_content` and `ensemble_wording` columns of the result.
predicted_test_df = model.predict(merged_test_df)

Transformer:  25%|██▌       | 1/4 [00:00<00:00, 72.36it/s]
Transformer:   0%|          | 0/4 [00:00<?, ?it/s][A
Transformer:  50%|█████     | 2/4 [00:11<00:11,  5.54s/it][A
Transformer:  75%|███████▌  | 3/4 [00:18<00:06,  6.37s/it][A
Transformer: 100%|██████████| 4/4 [00:26<00:00,  6.76s/it][A
ROUGE:  50%|█████     | 2/4 [00:32<00:32, 16.47s/it]      [A
ROUGE:   0%|          | 0/4 [00:00<?, ?it/s][A
Ensemble:  75%|███████▌  | 3/4 [00:32<00:16, 16.47s/it]
Ensemble:   0%|          | 0/4 [00:00<?, ?it/s][A
100%|██████████| 4/4 [00:33<00:00,  8.25s/it]          


In [25]:
# Keep the student id plus the two ensemble scores, renamed to the submission headers.
submission = predicted_test_df[['student_id', 'ensemble_content', 'ensemble_wording']].rename(
    columns={'ensemble_content': 'content', 'ensemble_wording': 'wording'}
)
submission.to_csv('submission.csv', index=False)

# Read the file back so the table shown is exactly what was written to disk.
display(pd.read_csv('submission.csv'))

Unnamed: 0,student_id,content,wording
0,000000ffffff,-1.236509,-1.484966
1,222222cccccc,-1.249943,-1.501786
2,111111eeeeee,-1.244861,-1.495345
3,333333dddddd,-1.251526,-1.50466


In [26]:
# Save the ensemble weights at the path the "Preparing Ensemble model" cell
# checks, so the next run loads them instead of retraining. The previous target
# ('/ensemble_...pt') was at the filesystem root and never matched that path.
os.makedirs(os.path.dirname(ENSEMBLE_MODEL_PATH), exist_ok=True)
torch.save(model.state_dict(), ENSEMBLE_MODEL_PATH)