In [None]:
import subprocess
import os

# Proxy for downloads on the cloud server.
# A bash subprocess sources /etc/network_turbo and prints every environment
# entry whose line contains "proxy" (case-sensitive grep); those entries are
# then copied into this process so later downloads see them.
# NOTE(review): presumably /etc/network_turbo exports the proxy variables —
# if the file is missing, stdout is empty and nothing is changed.
result = subprocess.run(
    'bash -c "source /etc/network_turbo && env | grep proxy"',
    shell=True,
    capture_output=True,
    text=True,
)
output = result.stdout
# Each usable line has the form NAME=value; split only on the first "=" so
# values that themselves contain "=" stay intact.
os.environ.update(
    entry.split('=', 1) for entry in output.splitlines() if '=' in entry
)

In [None]:
# Install the third-party packages used by this notebook via the %pip magic.
# NOTE(review): no versions are pinned, so a fresh install may pull releases
# that differ from the ones this notebook was written against.
%pip install datasets sentence_transformers setfit pandas nlpcda openpyxl optuna gpustat

In [None]:
from datasets import load_dataset
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer, sample_dataset
import pandas as pd
from datasets import Dataset, ClassLabel, Features, Value
from sklearn.model_selection import train_test_split
import pandas as pd
import joblib
import os
import datetime

In [None]:
def model_init(params):
    """Create a new SetFitModel from the local pretrained checkpoint.

    Args:
        params: Mapping of hyperparameters, or a falsy value (such as None)
            to fall back to the defaults. Only the keys "max_iter"
            (default 100) and "solver" (default "liblinear") are read.

    Returns:
        Whatever ``SetFitModel.from_pretrained`` returns for
        "./DMetaSoul_sbert-chinese-general-v2" with the selected values
        forwarded as the ``head_params`` keyword argument.
    """
    chosen = params if params else {}
    head_params = {
        "max_iter": chosen.get("max_iter", 100),
        "solver": chosen.get("solver", "liblinear"),
    }
    return SetFitModel.from_pretrained(
        "./DMetaSoul_sbert-chinese-general-v2", head_params=head_params
    )


def hp_space(trial):  # Training parameters just for demonstration
    """Describe the hyperparameter search space for one trial.

    Args:
        trial: Object exposing ``suggest_float``, ``suggest_int`` and
            ``suggest_categorical`` (the interface used below).

    Returns:
        dict mapping each hyperparameter name to the value suggested by
        ``trial``. "max_iter" and "solver" are the keys that ``model_init``
        reads; the remaining keys are training settings.
    """
    # Suggestions are requested in a fixed order, one entry at a time.
    space = {}
    space["learning_rate"] = trial.suggest_float(
        "learning_rate", 1e-6, 1e-4, log=True
    )
    space["num_epochs"] = trial.suggest_int("num_epochs", 1, 2)
    space["batch_size"] = trial.suggest_categorical("batch_size", [16, 32])
    space["seed"] = trial.suggest_int("seed", 1, 40)
    space["num_iterations"] = trial.suggest_categorical(
        "num_iterations", [5, 10, 20]
    )
    space["max_iter"] = trial.suggest_int("max_iter", 50, 300)
    solver_choices = ["newton-cg", "lbfgs", "liblinear"]
    space["solver"] = trial.suggest_categorical("solver", solver_choices)
    return space

In [None]:
from sklearn.metrics import f1_score

def compute_f1(y_pred, y_test):
    """Score predictions with binary-averaged F1.

    Args:
        y_pred: Predicted labels.
        y_test: Ground-truth labels.

    Returns:
        dict with the single key "f1" holding the score.
    """
    # f1_score takes the ground truth first and the predictions second;
    # zero_division=1.0 is the value reported when the score is undefined.
    score = f1_score(y_test, y_pred, average="binary", zero_division=1.0)
    return {"f1": score}
from sklearn.metrics import f1_score

def compute_f1_a(y_pred, y_test):
    """Score predictions with macro-averaged F1.

    Args:
        y_pred: Predicted labels.
        y_test: Ground-truth labels.

    Returns:
        dict with the single key "f1" holding the score.
    """
    # f1_score takes the ground truth first and the predictions second;
    # zero_division=1.0 is the value reported when the score is undefined.
    score = f1_score(y_test, y_pred, average="macro", zero_division=1.0)
    return {"f1": score}

In [None]:
# Label columns to model. Each entry is used verbatim to select a column from
# the train/test frames, so the spelling (including the double space in
# "Social norms  cues to action (+)") has to match the CSV headers.
col_list = [
    "Practical barriers to vaccination (-)",
    "Perceived barriers to accepting vaccines (-)",
    "Perceived benefits (+)",
    "Misinformation (-)",
    "Perceived Disease Risk (+)",
    "Social norms  cues to action (+)",
    "Attitude",
]

# Load the training and evaluation splits from the working directory.
train_all = pd.read_csv('train1.csv')
test_all = pd.read_csv('test1.csv')

In [None]:
# model training
# For every label column: build the datasets, search hyperparameters, retrain
# with the best ones, evaluate, save the model, and append the metrics to
# results.txt.
for col in col_list:
    # Keep only the text column and the label column of the current run.
    train_data = train_all.loc[:, ['content', col]]
    test_data = test_all.loc[:, ['content', col]]
    train_dataset = Dataset.from_pandas(train_data)
    test_dataset = Dataset.from_pandas(test_data)

    # The attitude column is scored with macro-F1, every other column with
    # binary F1. The comparison is case-insensitive because col_list spells
    # the column "Attitude"; an exact match against 'attitude' never fired,
    # so the macro metric was silently never used.
    # NOTE(review): presumably "Attitude" has more than two classes — confirm.
    metric = compute_f1_a if col.lower() == 'attitude' else compute_f1

    trainer = SetFitTrainer(
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        model_init=model_init,
        metric=metric,
        column_mapping={"content": "text", col: "label"},
    )

    # search for the best model
    best_run = trainer.hyperparameter_search(
        direction="maximize", hp_space=hp_space, n_trials=3
    )

    # Retrain with the best hyperparameters found by the search.
    trainer.apply_hyperparameters(best_run.hyperparameters, final_model=True)
    trainer.train()

    metrics = trainer.evaluate()

    trainer.model.save_pretrained(f'./model/{col}')

    # Append (never overwrite) so results of earlier runs are kept.
    with open("results.txt", "a") as file:
        file.write("---" * 20)
        file.write(f"\n{datetime.datetime.now()}")
        file.write(f"\nlabel: {col}\n")
        file.write(f"metrics: {metrics}\n")