In [None]:
pip install multimodal-transformers

In [None]:
pip install --user datasets
pip install --user openpyxl

In [None]:
pip install --user tensorboard
pip install --user tf-keras

In [None]:
from dataclasses import dataclass, field
import json
import logging
import os
from typing import Optional

import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoConfig, Trainer, EvalPrediction, set_seed
from transformers.training_args import TrainingArguments

import sys

from multimodal_transformers.data import load_data_from_folder
from multimodal_transformers.model import TabularConfig
from multimodal_transformers.model import AutoModelWithTabular
from multimodal_transformers.multimodal_arguments import (
    ModelArguments,
    MultimodalDataTrainingArguments,
)

# Configure the root logger at INFO level so library log records are shown.
logging.basicConfig(level=logging.INFO)
# Sets COMET_MODE to "DISABLED"; presumably this switches off Comet ML
# experiment logging during training — TODO confirm.
os.environ["COMET_MODE"] = "DISABLED"
# Leftover debugging line (kept disabled):
# print(multimodal_transformers.getsitepackages())

In [None]:
# --- Column roles in the input tables ---------------------------------------
text_cols = ["GPT4O-Contribution"]
cat_cols = []
numerical_cols = [
    "Hit_1pct", "Hit_5pct", "Hit_10pct", "Atyp_10pct_Z", "Atyp_Median_Z",
    "Atyp_Pairs", "C10", "C5", "C_f", "Citation_Count", "NCT_Count", "NSF_Count",
    "Newsfeed_Count", "Patent_Count", "Reference_Count", "SB_B", "SB_T", "Team_Size",
    "Tweet_Count", "WSB_Cinf", "WSB_sigma", "cit_d", "important_cit_per", "ref_5_per",
    "ref_avg_age", "ref_cit_mean", "ref_d", "ref_median_age",
]

column_info_dict = {
    "text_cols": text_cols,
    "num_cols": numerical_cols,
    "cat_cols": cat_cols,
    "label_col": "B/NB",
    "label_list": ["NB", "IB", "B"],
}

# --- Model ------------------------------------------------------------------
model_args = ModelArguments(model_name_or_path="roberta-base")

# --- Data / feature-combination settings ------------------------------------
# Other combine_feat_method values that were tried in this notebook:
#   weighted_feature_sum_on_transformer_cat_and_numerical_feats
#   attention_on_cat_and_numerical_feats
#   gating_on_cat_and_num_feats_then_sum
data_args = MultimodalDataTrainingArguments(
    data_path="./only_text_robert",
    combine_feat_method="text_only",
    column_info=column_info_dict,
    task="classification",
    categorical_encode_type=None,
    categorical_handle_na=True,
    categorical_na_value="Unknown",
    ohe_handle_unknown="error",
    numerical_transformer_method='none',
    numerical_handle_na=True,
    # Fixed typo: the original value "medium" is not a recognised strategy;
    # the library's median-imputation option is spelled "median".
    numerical_how_handle_na="median",
)

# --- Training loop settings -------------------------------------------------
# NOTE(review): output_dir is the same folder as data_args.data_path, so
# checkpoints are written next to the input data — confirm this is intended.
training_args = TrainingArguments(
    output_dir="./only_text_robert",
    logging_dir="./only_text_robert/log",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    seed=42,
    per_device_train_batch_size=64,
    num_train_epochs=100,
    evaluation_strategy="epoch",  # evaluate once per epoch
    save_strategy="epoch",  # checkpoint on the same schedule as evaluation
    load_best_model_at_end=True,  # reload the best checkpoint when training ends
    metric_for_best_model='f1',  # metric used to rank checkpoints
    logging_steps=25,
    # NOTE(review): eval_steps is presumably ignored while the evaluation
    # strategy is "epoch" — confirm against the installed transformers version.
    eval_steps=250,
    greater_is_better=True,  # a larger metric value is better
)

# Seed the RNGs with the same seed the Trainer will use.
print(training_args.seed)
set_seed(training_args.seed)

In [None]:
# Use the explicitly configured tokenizer when one is given; otherwise fall
# back to the model checkpoint name.
tokenizer_path_or_name = model_args.tokenizer_name or model_args.model_name_or_path
print('Specified tokenizer: ', tokenizer_path_or_name)

# NOTE(review): truncation / max_length / padding are normally call-time
# tokenization options; passing them to from_pretrained may not configure
# truncation or padding. The length limit that is visibly applied in this
# notebook is the max_token_length given to load_data_from_folder — confirm.
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_path_or_name,
    cache_dir=model_args.cache_dir,
    max_length=512,
    padding='max_length',
    truncation=True,
)

In [None]:
# Build the train / validation / test datasets from the files found under
# data_args.data_path.
# NOTE(review): the NA-handling options set on data_args (for example
# numerical_handle_na and categorical_handle_na) are not forwarded by this
# call — confirm that relying on the loader's defaults is intended.
train_dataset, val_dataset, test_dataset = load_data_from_folder(
    data_args.data_path,
    data_args.column_info["text_cols"],
    tokenizer,
    # label definition
    label_list=data_args.column_info["label_list"],
    label_col=data_args.column_info["label_col"],
    # tabular columns and how they are preprocessed
    numerical_cols=data_args.column_info["num_cols"],
    categorical_cols=data_args.column_info["cat_cols"],
    numerical_transformer_method=data_args.numerical_transformer_method,
    categorical_encode_type=data_args.categorical_encode_type,
    # text assembly
    max_token_length=512,
    sep_text_token_str=tokenizer.sep_token,
)

In [None]:
# Number of distinct label values present in the training split.
# NOTE(review): this counts observed labels only; it can differ from the
# length of column_info["label_list"] if a class is absent from the training
# data — confirm the two agree.
num_labels = np.unique(train_dataset.labels).size
num_labels

In [6]:
# Load the base transformer config: an explicit config name wins, otherwise
# the model checkpoint name is used.
config = AutoConfig.from_pretrained(
    model_args.config_name or model_args.model_name_or_path,
    cache_dir=model_args.cache_dir,
)

# Describe the tabular side of the model. Every field of data_args is
# forwarded as a keyword argument alongside the two explicit dimensions.
tabular_config = TabularConfig(
    numerical_feat_dim=train_dataset.numerical_feats.shape[1],
    num_labels=num_labels,
    # cat_feat_dim=train_dataset.cat_feats.shape[1],  (disabled: cat_cols is empty)
    **vars(data_args),
)

# Attach the tabular settings to the transformer config.
config.tabular_config = tabular_config

In [None]:
# Instantiate the text+tabular model from the pretrained checkpoint, using the
# config prepared above (which carries the tabular settings).
# The checkpoint is looked up under config_name when it is set, otherwise
# under model_name_or_path.
model = AutoModelWithTabular.from_pretrained(
    model_args.config_name or model_args.model_name_or_path,
    cache_dir=model_args.cache_dir,
    config=config,
)

In [8]:
import numpy as np
from scipy.special import softmax
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import (
    auc,
    precision_recall_curve,
    roc_auc_score,
    f1_score,
    confusion_matrix,
    matthews_corrcoef,
)


def calc_classification_metrics(p: EvalPrediction):
    """Compute classification metrics for one evaluation pass.

    Args:
        p: Evaluation output. ``p.predictions`` is either the logits array or
            a tuple/list whose first element is the logits array (one row per
            example, one column per class). ``p.label_ids`` holds the integer
            class labels.

    Returns:
        dict: For a two-class model whose evaluation labels contain both
        classes: ``roc_auc``, ``pr_auc``, the ``threshold`` / ``precision`` /
        ``recall`` / ``f1`` at the F1-maximising point of the precision-recall
        curve, and ``tn`` / ``fp`` / ``fn`` / ``tp`` from the argmax
        predictions. In every other case: ``acc``, weighted ``f1``,
        ``acc_and_f1`` and ``mcc``.
    """
    # The model may return several outputs; the logits come first in that case.
    raw_predictions = p.predictions
    if isinstance(raw_predictions, (tuple, list)):
        raw_predictions = raw_predictions[0]
    logits = np.asarray(raw_predictions)
    labels = np.asarray(p.label_ids)
    pred_labels = np.argmax(logits, axis=1)

    # Decide "binary" from the width of the model output, not only from the
    # labels that happen to occur in this split: a multi-class model evaluated
    # on a split containing just two classes must still use the multi-class
    # metrics. Both classes must be present for ROC/PR curves to be defined.
    is_binary = logits.shape[1] == 2 and len(np.unique(labels)) == 2

    if is_binary:
        # Probability assigned to the positive class (column 1).
        pred_scores = softmax(logits, axis=1)[:, 1]
        roc_auc_pred_score = roc_auc_score(labels, pred_scores)
        precisions, recalls, thresholds = precision_recall_curve(labels, pred_scores)

        # F1 along the PR curve; points where precision + recall == 0 get 0
        # instead of NaN (and no divide-by-zero warning).
        denominator = precisions + recalls
        fscore = np.divide(
            2 * precisions * recalls,
            denominator,
            out=np.zeros_like(denominator, dtype=float),
            where=denominator > 0,
        )
        ix = np.argmax(fscore)
        threshold = thresholds[ix].item()
        pr_auc = auc(recalls, precisions)
        tn, fp, fn, tp = confusion_matrix(labels, pred_labels, labels=[0, 1]).ravel()
        result = {
            "roc_auc": roc_auc_pred_score,
            "threshold": threshold,
            "pr_auc": pr_auc,
            "recall": recalls[ix].item(),
            "precision": precisions[ix].item(),
            "f1": fscore[ix].item(),
            "tn": tn.item(),
            "fp": fp.item(),
            "fn": fn.item(),
            "tp": tp.item(),
        }
    else:
        # Possible `average` values: [None, 'micro', 'macro', 'weighted']
        acc = (pred_labels == labels).mean()
        f1 = f1_score(y_true=labels, y_pred=pred_labels, average='weighted')
        result = {
            "acc": acc,
            "f1": f1,
            "acc_and_f1": (acc + f1) / 2,
            "mcc": matthews_corrcoef(labels, pred_labels),
        }

    return result

In [9]:
from transformers import EarlyStoppingCallback

# Stop training once the checkpoint-selection metric has failed to improve by
# more than 0.0001 for 10 consecutive evaluations.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_threshold=0.0001,
    early_stopping_patience=10,
)

# Trainer wiring: the validation split drives per-epoch evaluation, and
# calc_classification_metrics supplies the reported metrics.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
    callbacks=[early_stopping_callback],
    compute_metrics=calc_classification_metrics,
)

In [None]:
%%time
# Run fine-tuning with the Trainer built earlier in the notebook; %%time
# reports how long the cell took.
trainer.train()

In [None]:
%%time
# Score the model held by the trainer on the validation split; the returned
# metrics are displayed as the cell output.
trainer.evaluate(eval_dataset=val_dataset)

In [None]:
from torch.utils.data import DataLoader
from sklearn.metrics import precision_score, recall_score, f1_score

# Batched, order-preserving pass over the held-out test split.
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Run inference on the first GPU when available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

all_logits = []
all_labels = []

model.eval()
with torch.no_grad():
    for test_batch in test_loader:
        # Move every tensor of the batch to the model's device once.
        test_batch = {k: v.to(device) for k, v in test_batch.items()}
        # No labels are passed, so the first output (the loss) is not used.
        # It is bound to a named throwaway rather than `_`, which IPython
        # uses for the last cell output.
        _loss, logits, classifier_outputs = model(
            test_batch["input_ids"],
            attention_mask=test_batch["attention_mask"],
            # token_type_ids=test_batch["token_type_ids"],
            cat_feats=test_batch["cat_feats"],
            numerical_feats=test_batch["numerical_feats"],
        )
        # Collect results on the CPU so accelerator memory is released per
        # batch and the tensors can be handed to scikit-learn directly.
        all_logits.append(logits.cpu())
        all_labels.append(test_batch["labels"].cpu())

# Concatenate the per-batch results into single tensors.
all_logits = torch.cat(all_logits, dim=0)
all_labels = torch.cat(all_labels, dim=0)

# Predicted class = index of the largest logit.
predictions = all_logits.argmax(dim=1)

# Metrics; precision / recall / F1 are weighted by class support.
accuracy = (predictions == all_labels).float().mean().item()
precision = precision_score(all_labels.numpy(), predictions.numpy(), average='weighted')
recall = recall_score(all_labels.numpy(), predictions.numpy(), average='weighted')
f1 = f1_score(all_labels.numpy(), predictions.numpy(), average='weighted')

# Report.
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")