In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression
import random
import pickle
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from sklearn.manifold import TSNE

## Part I: WANDS Dataset

In [5]:
# get GPT prediction results
# Each pickle is unpacked as a 4-tuple; going by the variable names these are
# (sample ids, ground-truth labels, GPT-predicted labels, <ignored 4th item>).
# The files are opened with a context manager so the handles are closed right after loading.
# NOTE: pickle can execute arbitrary code while loading -- only load trusted result files.
with open("../data/relevance_gpt_rslt/rslt_gpt_instruct_20240424_train.p", "rb") as fh:
    list_id_gpt_instruct_train, list_truth_gpt_instruct_train_bool, list_pred_gpt_instruct_train_bool, _ = pickle.load(fh)
with open("../data/relevance_gpt_rslt/rslt_gpt_instruct_20240424_test.p", "rb") as fh:
    list_id_gpt_instruct_test, list_truth_gpt_instruct_test_bool, list_pred_gpt_instruct_test_bool, _ = pickle.load(fh)

In [7]:
# we use GPT to predict the first 20000 samples in the training set
list_id_llm_train_20k = list_id_gpt_instruct_train[:20000]

# load all the data for machine learning model
# (context manager closes the file handle; only unpickle files from a trusted source)
with open("../data/relevance_dataset/dict_data_20240424.p", "rb") as fh:
    dict_data_all, dict_id = pickle.load(fh)
list_id_train = dict_id['project_1']['list_id_train']
list_id_test = dict_id['project_1']['list_id_test']
dict_id2info = dict_data_all['dict_id2info']

# split the dataset, x_train_llm_20k are those with llm prediction
# features: one 'embedding_concat' entry per sample id
x_train = np.array([dict_id2info[index]['embedding_concat'] for index in list_id_train])
x_test = np.array([dict_id2info[index]['embedding_concat'] for index in list_id_test])
x_train_llm_20k = np.array([dict_id2info[index]['embedding_concat'] for index in list_id_llm_train_20k])

# targets: the 'label_truth_bool' entry of the same ids, in the same order as the features
y_train = np.array([dict_id2info[index]['label_truth_bool'] for index in list_id_train])
y_train_llm_20k = np.array([dict_id2info[index]['label_truth_bool'] for index in list_id_llm_train_20k])
y_test = np.array([dict_id2info[index]['label_truth_bool'] for index in list_id_test])

In [8]:
# fit the ml model by using the logistic regression for embedded word vectors
# NOTE(review): the recorded output of this cell shows the solver stopping at its iteration
# limit ("STOP: TOTAL NO. of ITERATIONS REACHED LIMIT"), i.e. the fit did not fully converge.
# The scikit-learn message suggests raising max_iter or scaling the features. Either change
# would alter the fitted model and therefore every accuracy recorded below, so the default
# configuration is kept here -- revisit if the numbers are regenerated.
model_ml = LogisticRegression()
model_ml.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [9]:
# Scores (column 1 of predict_proba) and hard labels produced by the fitted ML model.

# -- full train / test splits
y_train_ml_score = model_ml.predict_proba(x_train)[:, 1]
y_train_ml_label = model_ml.predict(x_train)
y_test_ml_score = model_ml.predict_proba(x_test)[:, 1]
y_test_ml_label = model_ml.predict(x_test)

# -- the 20k training samples that also have an LLM prediction
# NOTE: despite the "llm" in their names, these two arrays are outputs of the *ML* model;
# the name only says which subset of the training data they were computed on.
y_train_llm_20k_score = model_ml.predict_proba(x_train_llm_20k)[:, 1]
y_train_llm_20k_label = model_ml.predict(x_train_llm_20k)

In [10]:
# linear combination of ml and llm estimators in section 3

class model_linear_combine:
    """Piecewise linear blend of an ML score and an LLM prediction.

    The combined score is ``alpha_i * y_ml + (1 - alpha_i) * y_llm`` where ``i`` is the
    piece that ``y_ml`` falls into. The unit interval is split into ``num_of_piece``
    equal-width pieces and one weight ``alpha_i`` is fitted per piece by ordinary least
    squares (minimising the squared error against the true label).

    Attributes
    ----------
    n : int
        Number of pieces.
    alpha : np.ndarray of shape (n,)
        Fitted weight of the ML score in each piece (0 for pieces without usable data).
        The OLS solution is not constrained, so a value may fall outside [0, 1].
    group : np.ndarray
        Piece index of every training sample seen by the last ``fit`` call.
    """

    def __init__(self, num_of_piece=1):
        self.n = num_of_piece
        self.alpha = np.zeros((num_of_piece))

    def _assign_group(self, y_ml):
        # piece i covers [i/n, (i+1)/n); the index is clipped so that y_ml == 1.0 lands in
        # the last piece (instead of the non-existent piece n) and negative scores land in
        # piece 0 (instead of wrapping around through negative indexing).
        return np.clip(np.floor(y_ml * self.n).astype(int), 0, self.n - 1)

    def fit(self, y_ml, y_llm, y_true):
        """Fit one blending weight per piece.

        Parameters
        ----------
        y_ml : array-like
            Scores of the ML model; should be on the same scale as ``y_llm``.
        y_llm : array-like
            Predictions of the LLM (numeric or boolean).
        y_true : array-like
            Binary ground-truth labels (numeric or boolean).
        """
        # float conversion keeps the arithmetic valid for boolean label arrays
        # (numpy refuses `-` between two boolean arrays)
        y_ml = np.asarray(y_ml, dtype=float)
        y_llm = np.asarray(y_llm, dtype=float)
        y_true = np.asarray(y_true, dtype=float)

        self.group = self._assign_group(y_ml)
        # start from a clean slate so that re-fitting never keeps weights of a previous fit
        self.alpha = np.zeros(self.n)

        diff = y_ml - y_llm      # regressor of the one-parameter OLS problem
        target = y_true - y_llm  # response of the one-parameter OLS problem
        for i in range(self.n):
            mask = self.group == i
            denom = np.sum(diff[mask] ** 2)
            # denom == 0 means the piece is empty or both models agree on every sample in
            # it; alpha is then not identifiable and is left at 0 rather than becoming NaN
            if denom > 0:
                self.alpha[i] = np.sum(diff[mask] * target[mask]) / denom

    def predict(self, y_ml, y_llm):
        """Return the blended prediction score for every sample.

        Parameters
        ----------
        y_ml : array-like
            Scores of the ML model.
        y_llm : array-like
            Predictions of the LLM, aligned with ``y_ml``.

        Returns
        -------
        np.ndarray
            ``alpha * y_ml + (1 - alpha) * y_llm`` using the weight of each sample's piece.
        """
        y_ml = np.asarray(y_ml, dtype=float)
        y_llm = np.asarray(y_llm, dtype=float)
        alpha = self.alpha[self._assign_group(y_ml)]

        # return the prediction score
        return y_ml * alpha + y_llm * (1 - alpha)


In [11]:
# calibration method in section 4
class model_calibration:
    """Bin-wise additive correction of an ML score, conditioned on the LLM prediction.

    The ML score range [0, 1] is split into ``m`` equal-width bins. For every pair
    (bin ``i``, LLM label ``l`` in {0, 1}) the mean residual ``y_true - y_ml`` of the
    training samples in that cell is stored in ``delta[i, l]`` and added to the ML score
    at prediction time.
    """

    def __init__(self, m):
        self.m = m

        # delta (m*2) are \hat{\Delta} in the paper; here we use delta_{i,l} for if the ml prediction is in [i/m, (i+1)/m) and llm prediction is l for simplicity,
        # which is slightly different from that in the paper.
        self.delta = np.zeros((m,2))

    def _bin_index(self, y_ml):
        # bin i covers [i/m, (i+1)/m); the index is clipped so that y_ml == 1.0 belongs to
        # the last bin -- without the clip such samples match no bin at all
        return np.clip(np.floor(y_ml * self.m), 0, self.m - 1).astype(int)

    def fit(self, y_ml, y_llm, y_true):
        """Estimate ``delta`` as the per-cell sample mean of ``y_true - y_ml``.

        Parameters
        ----------
        y_ml : array-like
            Scores of the ML model.
        y_llm : array-like
            LLM predictions; only values equal to 0 or 1 (or False / True) are used.
        y_true : array-like
            Ground-truth labels.
        """
        y_ml = np.asarray(y_ml, dtype=float)
        y_llm = np.asarray(y_llm)
        y_true = np.asarray(y_true, dtype=float)

        bins = self._bin_index(y_ml)
        residual = y_true - y_ml
        # reset so that a second fit never keeps corrections of a previous one
        self.delta = np.zeros((self.m, 2))

        # fit delta for each group i, l through the sample mean
        for i in range(self.m):
            for l in range(2):
                mask = (bins == i) & (y_llm == l)
                if mask.any():
                    self.delta[i, l] = residual[mask].mean()

    def predict(self, y_ml, y_llm):
        """Return the calibrated score ``y_ml + delta[bin(y_ml), y_llm]`` for every sample.

        Samples whose LLM prediction is neither 0 nor 1 match no cell and keep their
        uncorrected ML score.
        """
        y_ml = np.asarray(y_ml, dtype=float)
        y_llm = np.asarray(y_llm)
        bins = self._bin_index(y_ml)

        # start from the ML score so that unmatched samples fall back to it
        y_pred = y_ml.copy()
        for i in range(self.m):
            for l in range(2):
                mask = (bins == i) & (y_llm == l)
                y_pred[mask] = y_ml[mask] + self.delta[i, l]

        # return the prediction score
        return y_pred


In [13]:
def train_test_acc(model, model_name_string, y_train_ml, y_train_llm, y_train_true, y_test_ml, y_test_llm, y_test_true):
    """Fit a combiner on the training triple and print its train / test accuracy.

    `model` must provide fit(y_ml, y_llm, y_true) and predict(y_ml, y_llm). The scores
    returned by predict are turned into hard labels with a strict > 0.5 threshold and
    compared with the true labels. Nothing is returned; the result is only printed.
    """
    model.fit(y_train_ml, y_train_llm, y_train_true)

    def _accuracy(scores_ml, preds_llm, labels_true):
        # fraction of samples whose thresholded combined score equals the true label
        hard_labels = (model.predict(scores_ml, preds_llm) > 0.5).astype(int)
        return np.mean(hard_labels == labels_true)

    acc_train = _accuracy(y_train_ml, y_train_llm, y_train_true)
    acc_test = _accuracy(y_test_ml, y_test_llm, y_test_true)
    print('method: '+model_name_string+' \n train acc/test acc = ',(acc_train,acc_test))

In [15]:
# Baseline accuracy of each estimator on its own.
# The "train" figures are computed on the 20k training samples that have an LLM prediction.

# ML model alone
acc_train_ml = (y_train_llm_20k_label == y_train_llm_20k).mean()
acc_test_ml = (y_test_ml_label == y_test).mean()

# LLM alone (predictions are compared position-by-position with the true labels)
acc_train_llm = (y_train_llm_20k == np.asarray(list_pred_gpt_instruct_train_bool)).mean()
acc_test_llm = (y_test == np.asarray(list_pred_gpt_instruct_test_bool)).mean()

print('method: ml \n train acc/test acc = ',(acc_train_ml,acc_test_ml))
print('method: llm \n train acc/test acc = ',(acc_train_llm,acc_test_llm))

method: ml 
 train acc/test acc =  (0.8271, 0.8027333333333333)
method: llm 
 train acc/test acc =  (0.7407, 0.7750666666666667)


In [16]:
# Section 3.1 of the paper: the naive combiner, i.e. a single piece with one global weight.
num_of_piece = 1
model_linear_naive = model_linear_combine(num_of_piece=num_of_piece)

train_test_acc(
    model_linear_naive,
    'linear combine with '+str(num_of_piece)+' piece',
    y_train_llm_20k_score,                         # ML score on the 20k LLM-labelled train samples
    np.array(list_pred_gpt_instruct_train_bool),   # LLM prediction on the train samples
    y_train_llm_20k,                               # true labels of the train samples
    y_test_ml_score,                               # ML score on the test set
    np.array(list_pred_gpt_instruct_test_bool),    # LLM prediction on the test set
    y_test,                                        # true labels of the test set
)

method: linear combine with 1 piece 
 train acc/test acc =  (0.84575, 0.8383333333333334)


In [17]:
# Section 3.2 of the paper: piecewise combiner, one weight per equal-width piece of the ML score.

# number of pieces -- tune as you want
num_of_piece = 4
model_linear_piecewise = model_linear_combine(num_of_piece=num_of_piece)

train_test_acc(
    model_linear_piecewise,
    'linear combine with '+str(num_of_piece)+' piece',
    y_train_llm_20k_score,                         # ML score on the 20k LLM-labelled train samples
    np.array(list_pred_gpt_instruct_train_bool),   # LLM prediction on the train samples
    y_train_llm_20k,                               # true labels of the train samples
    y_test_ml_score,                               # ML score on the test set
    np.array(list_pred_gpt_instruct_test_bool),    # LLM prediction on the test set
    y_test,                                        # true labels of the test set
)

method: linear combine with 4 piece 
 train acc/test acc =  (0.8451, 0.8461333333333333)


In [18]:
# Section 4 of the paper: calibration of the ML score conditioned on the LLM prediction.

# number of calibration bins -- tune as you want
calibration_m = 10
model_calibrate = model_calibration(calibration_m)

train_test_acc(
    model_calibrate,
    'calibration with '+str(calibration_m)+' bins',
    y_train_llm_20k_score,                         # ML score on the 20k LLM-labelled train samples
    np.array(list_pred_gpt_instruct_train_bool),   # LLM prediction on the train samples
    y_train_llm_20k,                               # true labels of the train samples
    y_test_ml_score,                               # ML score on the test set
    np.array(list_pred_gpt_instruct_test_bool),    # LLM prediction on the test set
    y_test,                                        # true labels of the test set
)

method: calibration with 10 bins 
 train acc/test acc =  (0.8483, 0.8404666666666667)


## Part II: Other Datasets

In [132]:
# Do the same thing for other datasets

def validate_data(path_gpt_train, path_gpt_test, path_data, **kargs):
    """Compare the ML model, the LLM and the combined estimators on one pickled dataset.

    A logistic regression is fitted on the training embeddings, then the accuracy of the
    ML model, of the GPT predictions and of the three combiners (1-piece linear,
    piecewise linear, calibration) is printed for the train and test split.

    Parameters
    ----------
    path_gpt_train, path_gpt_test : str
        Pickle files unpacked as 4-tuples; only the third item (the GPT-predicted labels
        of the split) is used.
    path_data : str
        Pickle file unpacked as a 3-tuple; the second item maps a sample id to an info dict
        (keys 'embedding' and 'label_bool' are read) and the third item holds the id lists
        'list_id_train' / 'list_id_test'.
    **kargs
        Optional tuning parameters: ``num_of_piece`` (default 4) for the piecewise linear
        combiner and ``calibration_m`` (default 20) for the number of calibration bins.
        Other keys are ignored.

    Raises
    ------
    ValueError
        If the number of GPT predictions differs from the number of samples of a split.
    """
    def _load_pickle(path):
        # context manager so the file handle is closed right after loading
        # NOTE: pickle can execute arbitrary code while loading -- only pass trusted files
        with open(path, "rb") as fh:
            return pickle.load(fh)

    _, _, list_gpt_pred_bool_train, _ = _load_pickle(path_gpt_train)
    _, _, list_gpt_pred_bool_test, _ = _load_pickle(path_gpt_test)
    _, dict_id2info, dict_id = _load_pickle(path_data)

    list_id_train = dict_id['list_id_train']
    list_id_test = dict_id['list_id_test']

    x_train = np.array([dict_id2info[index]['embedding'] for index in list_id_train])
    x_test = np.array([dict_id2info[index]['embedding'] for index in list_id_test])

    y_train = np.array([dict_id2info[index]['label_bool'] for index in list_id_train])
    y_test = np.array([dict_id2info[index]['label_bool'] for index in list_id_test])

    y_train_llm = np.array(list_gpt_pred_bool_train)
    y_test_llm = np.array(list_gpt_pred_bool_test)

    # The GPT predictions are compared with the labels position by position. A length
    # mismatch would otherwise surface as an obscure numpy error or a silently wrong
    # accuracy, so fail early with a clear message.
    # NOTE(review): the ids stored in the GPT result files are not used to verify that both
    # sources share the same ordering -- confirm that the files are aligned.
    if len(y_train_llm) != len(y_train) or len(y_test_llm) != len(y_test):
        raise ValueError(
            'GPT predictions and dataset splits differ in length: '
            'train %d vs %d, test %d vs %d'
            % (len(y_train_llm), len(y_train), len(y_test_llm), len(y_test))
        )

    # ml model
    model_ml = LogisticRegression()
    model_ml.fit(x_train, y_train)

    y_train_ml_score = model_ml.predict_proba(x_train)[:, 1]
    y_test_ml_score = model_ml.predict_proba(x_test)[:, 1]
    y_train_ml_label = model_ml.predict(x_train)
    y_test_ml_label = model_ml.predict(x_test)

    acc_train_ml = np.mean(y_train_ml_label==y_train)
    acc_test_ml = np.mean(y_test_ml_label==y_test)
    acc_train_llm = np.mean(y_train==y_train_llm)
    acc_test_llm = np.mean(y_test==y_test_llm)
    print('method: ml \n train acc/test acc = ',(acc_train_ml,acc_test_ml))
    print('method: llm \n train acc/test acc = ',(acc_train_llm,acc_test_llm))

    # linear combination with 1 piece
    num_of_piece = 1
    model_linear_naive = model_linear_combine(num_of_piece=num_of_piece)
    train_test_acc(model_linear_naive, 'linear combine with '+str(num_of_piece)+' piece', y_train_ml_score, y_train_llm, y_train,
    y_test_ml_score, y_test_llm, y_test)

    # linear combination with several pieces
    num_of_piece = kargs.get('num_of_piece', 4)
    model_linear_piecewise = model_linear_combine(num_of_piece=num_of_piece)
    train_test_acc(model_linear_piecewise, 'linear combine with '+str(num_of_piece)+' piece', y_train_ml_score, y_train_llm, y_train,
    y_test_ml_score, y_test_llm, y_test)

    # calibration method
    calibration_m = kargs.get('calibration_m', 20)
    model_calibrate = model_calibration(calibration_m)

    train_test_acc(model_calibrate, 'calibration with '+str(calibration_m)+' bins', y_train_ml_score, y_train_llm, y_train,
    y_test_ml_score, y_test_llm, y_test)

In [96]:
#########################
# Yelp dataset

# pickled inputs: the embedded dataset and the GPT predictions for its train / test split
path_data = "../data/other_dataset/yelp/data/dict_data_20240429_yelp.p"
path_gpt_train = "../data/other_gpt_rslt/rslt_gpt_20240429_yelp_train.p"
path_gpt_test = "../data/other_gpt_rslt/rslt_gpt_20240429_yelp_test.p"

print("\n\nValidating Yelp Data")
# tune the parameters here as you want
validate_data(
    path_gpt_train,
    path_gpt_test,
    path_data,
    num_of_piece=4,
    calibration_m=20,
)



Validating Yelp Data
method: ml 
 train acc/test acc =  (0.7344285714285714, 0.691)
method: llm 
 train acc/test acc =  (0.7295714285714285, 0.724)
method: linear combine with 1 piece 
 train acc/test acc =  (0.7608571428571429, 0.739)
method: linear combine with 4 piece 
 train acc/test acc =  (0.7674285714285715, 0.742)
method: calibration with 20 bins 
 train acc/test acc =  (0.7711428571428571, 0.743)


In [129]:
#########################
# Emotion dataset

# pickled inputs: the embedded dataset and the GPT predictions for its train / test split
path_data = "../data/other_dataset/emotion/data/dict_data_20240429_emotion.p"
path_gpt_train = "../data/other_gpt_rslt/rslt_gpt_20240429_emotion_train.p"
path_gpt_test = "../data/other_gpt_rslt/rslt_gpt_20240429_emotion_test.p"

print("\n\nValidating Emotion Data")
# tune the parameters here as you want
validate_data(
    path_gpt_train,
    path_gpt_test,
    path_data,
    num_of_piece=10,
    calibration_m=10,
)



Validating Emotion Data
method: ml 
 train acc/test acc =  (0.8669907202828104, 0.7993827160493827)
method: llm 
 train acc/test acc =  (0.7445868316394167, 0.7592592592592593)
method: linear combine with 1 piece 
 train acc/test acc =  (0.8727353071144498, 0.8055555555555556)
method: linear combine with 10 piece 
 train acc/test acc =  (0.8740609809986744, 0.8117283950617284)
method: calibration with 10 bins 
 train acc/test acc =  (0.8718515245249668, 0.8055555555555556)


In [153]:
#########################
# Hate dataset

# pickled inputs: the embedded dataset and the GPT predictions for its train / test split
path_data = "../data/other_dataset/hate/data/dict_data_20240429_hate.p"
path_gpt_train = "../data/other_gpt_rslt/rslt_gpt_20240429_hate_train.p"
path_gpt_test = "../data/other_gpt_rslt/rslt_gpt_20240429_hate_test.p"

print("\n\nValidating Hate Data")
# tune the parameters here as you want
validate_data(
    path_gpt_train,
    path_gpt_test,
    path_data,
    num_of_piece=4,
    calibration_m=12,
)



Validating Hate Data
method: ml 
 train acc/test acc =  (0.7384228790235731, 0.7169179229480737)
method: llm 
 train acc/test acc =  (0.6544214431015914, 0.6716917922948074)
method: linear combine with 1 piece 
 train acc/test acc =  (0.7371066172071318, 0.7202680067001676)
method: linear combine with 4 piece 
 train acc/test acc =  (0.7387818595189661, 0.7236180904522613)
method: calibration with 12 bins 
 train acc/test acc =  (0.7414143831518487, 0.7311557788944724)


## Part III: Transfer learning with LLM

In [2]:
# load data for transfer learning part, see section 5 in the paper
# Files are opened with context managers so the handles are closed right after loading.
# NOTE: pickle can execute arbitrary code while loading -- only load trusted files.

with open("../data/relevance_gpt_rslt/rslt_gpt_instruct_20240424_project2_bed_test.p", "rb") as fh:
    list_id_gpt_instruct_bed_test, list_truth_gpt_instruct_bed_test_bool, list_pred_gpt_instruct_bed_bool, _ = pickle.load(fh)
with open("../data/relevance_dataset/dict_data_20240424.p", "rb") as fh:
    dict_data_all, dict_id = pickle.load(fh)
with open("../data/relevance_dataset/dict_id_cat_20240424.p", "rb") as fh:
    dict_id_cat = pickle.load(fh)

In [3]:
dict_id2info = dict_data_all['dict_id2info']

# id lists of the splits used below
ids_table_train = dict_id_cat['table']['train']
ids_table_test = dict_id_cat['table']['test']
ids_bed_test = dict_id_cat['bed']['test']

# reproducible subsample (without replacement) of the table training split
np.random.seed(2024)
num_train_samples = 1000
idx_train = np.random.choice(len(ids_table_train), num_train_samples, replace=False)

# table category: subsampled training set and full test set
x_train_table = np.array([dict_id2info[index]['embedding_concat'] for index in ids_table_train])[idx_train, :]
y_train_table = np.array([dict_id2info[index]['label_truth_bool'] for index in ids_table_train])[idx_train]
x_test_table = np.array([dict_id2info[index]['embedding_concat'] for index in ids_table_test])
y_test_table = np.array([dict_id2info[index]['label_truth_bool'] for index in ids_table_test])

# bed category: only the test split is materialised here (the bed training split is not loaded)
x_test_bed = np.array([dict_id2info[index]['embedding_concat'] for index in ids_bed_test])
y_test_bed = np.array([dict_id2info[index]['label_truth_bool'] for index in ids_bed_test])

In [9]:
# logistic regression trained only on the subsampled, labelled table data
model_table = LogisticRegression().fit(X=x_train_table, y=y_train_table)

In [10]:
# Direct transfer: apply the table-only model, unchanged, to the bed test data and compare
# it with the GPT predictions on the same bed samples.
pred_table_on_table_train = model_table.predict(x_train_table)
pred_table_on_table_test = model_table.predict(x_test_table)
pred_table_on_bed_test = model_table.predict(x_test_bed)

acc_table_train = np.mean(pred_table_on_table_train == y_train_table)
acc_table_test = np.mean(pred_table_on_table_test == y_test_table)
acc_table_model_on_bed = np.mean(pred_table_on_bed_test == y_test_bed)
# GPT predictions are compared position-by-position with the bed test labels
acc_gpt_instruct_bed = np.mean(y_test_bed == np.asarray(list_pred_gpt_instruct_bed_bool))
print(f' Table model for table train/test acc: {acc_table_train,acc_table_test}\n Table model for bed test acc: {acc_table_model_on_bed}\n GPT for bed test acc: {acc_gpt_instruct_bed}')

 Table model for table train/test acc: (0.888, 0.8873091100579252)
 Table model for bed test acc: 0.7199256850905713
 GPT for bed test acc: 0.7570831398049234


In [34]:
# Carve a reproducible random subset out of the bed test data: the selected rows become the
# "transfer" training part, all remaining rows the held-out evaluation part.
np.random.seed(2024)
num_transfer_samples = 500
idx_transfer = np.random.choice(len(x_test_bed), num_transfer_samples, replace=False)

# selected rows
x_train_bed_transfer = x_test_bed[idx_transfer]
y_train_bed_transfer = y_test_bed[idx_transfer]

# everything that was not selected
x_test_bed_transfer = np.delete(x_test_bed, idx_transfer, axis=0)
y_test_bed_transfer = np.delete(y_test_bed, idx_transfer)

In [35]:
# Section 5 of the paper: train on the labelled table data plus bed samples whose labels
# come from GPT (not from the ground truth).
y_train_bed_gpt = np.array(list_pred_gpt_instruct_bed_bool)[idx_transfer]
x_train_transfer = np.concatenate((x_train_table, x_train_bed_transfer), axis=0)
y_train_transfer = np.concatenate((y_train_table, y_train_bed_gpt), axis=0)
model_transfer = LogisticRegression().fit(x_train_transfer, y_train_transfer)

# table data: accuracy against the true labels
acc_transfer_table_train = np.mean(model_transfer.predict(x_train_table) == y_train_table)
acc_transfer_table_test = np.mean(model_transfer.predict(x_test_table) == y_test_table)
# bed data: accuracy against the true labels, on the GPT-labelled training rows and on the held-out rows
acc_transfer_bed_train = np.mean(model_transfer.predict(x_train_bed_transfer) == y_train_bed_transfer)
acc_transfer_bed_test = np.mean(model_transfer.predict(x_test_bed_transfer) == y_test_bed_transfer)
print(f' Transfer model for table train/test acc: {acc_transfer_table_train,acc_transfer_table_test}\n Transfer model for bed train/test acc: {acc_transfer_bed_train,acc_transfer_bed_test}')

 Transfer model for table train/test acc: (0.892, 0.889942074776198)
 Transfer model for bed train/test acc: (0.76, 0.7652752571082879)
