In [1]:
import numpy as np
import pandas as pd
import os
import random
import torch
from PAES.configs import PAESConfig
from models.transfomer_enc import FeatureModel
from dvrl.predictor_model import MLP
from transformers import AutoConfig
from utils.create_embedding_feautres import create_embedding_features, normalize_scores
from utils.dvrl_utils import remove_top_p_sample, fit_func, pred_func, calc_qwk, random_remove_sample, get_dev_sample
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import japanize_matplotlib
import seaborn as sns
from sklearn.model_selection import train_test_split
import pickle
from utils.read_data import get_readability_features, get_linguistic_features, get_features_by_id, scale_features, get_normalized_features
from utils.general_utils import get_min_max_scores

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Experiment selection -------------------------------------------------
# Prompt id that parameterises both the output directory and the data directory.
test_prompt_id = 1
output_path = f'outputs/DVRL_DomainAdaptation_FeatureModel{test_prompt_id}_devsize40/'

# --- Reproducibility ------------------------------------------------------
seed = 12
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it here
# should only affect child processes, not this kernel's own hashing — confirm.
os.environ['PYTHONHASHSEED'] = str(seed)
# Seed every RNG the notebook has imported with the same value.
for seed_fn in (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(seed)

# --- Runtime parameters ---------------------------------------------------
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)
attribute_name = 'score'

# --- Data / model locations -----------------------------------------------
data_path = f'data/cross_prompt_attributes/{test_prompt_id}/'
model_name = 'microsoft/deberta-v3-large'

cpu


In [3]:
def reseed_all(value):
    """Seed NumPy, `random` and PyTorch (CPU and CUDA) with `value`, and export it as PYTHONHASHSEED."""
    np.random.seed(value)
    random.seed(value)
    os.environ['PYTHONHASHSEED'] = str(value)
    torch.manual_seed(value)
    torch.cuda.manual_seed(value)


# Re-apply the seed so the RNG state is reset right before the next cell runs.
seed = 12
reseed_all(seed)

In [4]:
# Load the train / val / test dictionaries for the selected prompt.
train_data, val_data, test_data = create_embedding_features(
    data_path, attribute_name, model_name, device
)

# Pick 40 test essays to act as the dev set; only the selected indices are kept here.
_, _, _, _, dev_idx, _ = get_dev_sample(
    test_data['essay'], test_data['normalized_label'], dev_size=40
)

# Essay ids per split: source = train + val, dev = sampled test essays,
# target = the remaining test essays.
# Note: np.setdiff1d returns sorted unique ids, so target_ids is not in test_data order.
source_ids = np.concatenate([train_data['essay_id'], val_data['essay_id']])
dev_ids = test_data['essay_id'][dev_idx]
target_ids = np.setdiff1d(test_data['essay_id'], dev_ids)

# Hand-crafted feature tables, looked up by essay id below.
readability_features = get_readability_features('data/allreadability.pickle')
linguistic_features = get_linguistic_features('data/hand_crafted_v3.csv')


def build_split(essay_ids):
    """Assemble features and labels for one set of essay ids.

    Returns, in order: the readability matrix, the raw linguistic frame, the
    normalized scores, the prompt ids, the scaled linguistic matrix and the
    column-wise concatenation of the readability and scaled linguistic matrices.
    """
    readability = (
        get_features_by_id(readability_features, essay_ids, 'dim1')
        .drop(columns='dim1')
        .to_numpy()
    )
    linguistic = get_features_by_id(linguistic_features, essay_ids, 'item_id')

    raw_scores = linguistic['score'].to_numpy()
    prompts = linguistic['prompt_id'].to_numpy()
    labels = normalize_scores(raw_scores, prompts, 'score')

    # NOTE(review): scaling is applied to each split on its own — confirm that
    # scale_features is meant to be fitted per split rather than on source only.
    linguistic_scaled = (
        scale_features(linguistic)
        .drop(columns=['item_id', 'prompt_id', 'score'])
        .to_numpy()
    )
    combined = np.concatenate([readability, linguistic_scaled], axis=1)
    return readability, linguistic, labels, prompts, linguistic_scaled, combined


(x_source_readability, x_source_linguistic, y_source, y_source_prompt,
 x_source_linguistic_scaled, x_source) = build_split(source_ids)

(x_dev_readability, x_dev_linguistic, y_dev, y_dev_prompt,
 x_dev_linguistic_scaled, x_dev) = build_split(dev_ids)

(x_target_readability, x_target_linguistic, y_target, y_target_prompt,
 x_target_linguistic_scaled, x_target) = build_split(target_ids)

load data from data/cross_prompt_attributes/1/...
Loading embedding from cache...
Selected 40 samples.
Selected sample indices: [1691, 446, 689, 653, 945, 1548, 1244, 452, 1413, 916, 1447, 871, 1645, 476, 1368, 878, 746, 282, 1509, 1347, 151, 1577, 622, 1549, 1570, 915, 1348, 690, 794, 1359, 1299, 195, 908, 305, 1461, 410, 77, 939, 1183, 573]


In [7]:
# Report the feature-matrix and label shapes of each split, framed by separator lines.
separator = '================================'
shape_report = (
    ('X_source: ', x_source, 'Y_source: ', y_source),
    ('X_dev: ', x_dev, 'Y_dev: ', y_dev),
    ('X_target: ', x_target, 'Y_target: ', y_target),
)
for x_label, x_values, y_label, y_values in shape_report:
    print(separator)
    print(x_label, x_values.shape)
    print(y_label, y_values.shape)
print(separator)

X_source:  (11193, 86)
Y_source:  (11193,)
X_dev:  (40, 86)
Y_dev:  (40,)
X_target:  (1743, 86)
Y_target:  (1743,)
