In [1]:
pip install multimodal-transformers

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting multimodal-transformers
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/2f/ab/7c839a1da28bd7dd74ef5a8b87d1e3e41225269bb4c7b05a68575809df7c/multimodal_transformers-0.4.0-py3-none-any.whl (29 kB)
Installing collected packages: multimodal-transformers
Successfully installed multimodal-transformers-0.4.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install --user datasets

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting datasets
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/be/3e/e58d4db4cfe71e3ed07d169af24db30cfd582e16f977378bd43fd7ec1998/datasets-3.0.1-py3-none-any.whl (471 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m194.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/4c/21/9ca93b84b92ef927814cb7ba37f0774a484c849d58f0b692b16af8eebcfb/pyarrow-17.0.0-cp311-cp311-manylinux_2_28_x86_64.whl (39.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m304.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/c9/7a/cef76fd8438a42f96db64ddaa85280485a9c395e7df3db8158cfec1eee34/dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
pip install --user openpyxl

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install --user tensorboard


In [None]:
pip install --user tf-keras

In [1]:
from dataclasses import dataclass, field
import json
import logging
import os
from typing import Optional

import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoConfig, Trainer, EvalPrediction, set_seed
from transformers.training_args import TrainingArguments

import sys

from multimodal_transformers.data import load_data_from_folder
from multimodal_transformers.model import TabularConfig
from multimodal_transformers.model import AutoModelWithTabular
from multimodal_transformers.multimodal_arguments import (
    ModelArguments,
    MultimodalDataTrainingArguments,
)

# Show INFO-level log records in the notebook (the data-loading step logs at this level).
logging.basicConfig(level=logging.INFO)
# Switch off the Comet integration.
# NOTE(review): transformers warns that this variable is deprecated (removal announced
# for v5) and recommends the `report_to` training argument instead.
os.environ["COMET_MODE"] = "DISABLED"

  from .autonotebook import tqdm as notebook_tqdm
2024-10-22 02:56:33.714305: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-22 02:56:33.724636: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-22 02:56:33.736761: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-22 02:56:33.740510: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-22 02:56:33.7

In [2]:
# --- Column roles handed to the multimodal data loader ------------------------
# Free-text column fed to the transformer.
text_cols = ["GPT4O-Contribution"]
# No categorical feature columns in this experiment.
cat_cols = []
# Numerical feature columns loaded alongside the text.
numerical_cols = [
    "Hit_1pct", "Hit_5pct", "Hit_10pct",
    "Atyp_10pct_Z", "Atyp_Median_Z", "Atyp_Pairs",
    "C10", "C5", "C_f",
    "Citation_Count", "NCT_Count", "NSF_Count", "Newsfeed_Count",
    "Patent_Count", "Reference_Count",
    "SB_B", "SB_T", "Team_Size", "Tweet_Count",
    "WSB_Cinf", "WSB_sigma",
    "cit_d", "important_cit_per",
    "ref_5_per", "ref_avg_age", "ref_cit_mean", "ref_d", "ref_median_age",
]
column_info_dict = {
    "text_cols": text_cols,
    "num_cols": numerical_cols,
    "cat_cols": cat_cols,
    "label_col": "B/NB",
    # Order matters: it is the label list passed to the data loader.
    "label_list": ["NB", "IB", "B"],
}

# --- Model --------------------------------------------------------------------
model_args = ModelArguments(model_name_or_path="bert-base-uncased")

# --- Data / feature-combination settings --------------------------------------
data_args = MultimodalDataTrainingArguments(
    data_path="./only_text",
    # Text-only baseline. An alternative combine method tried for this notebook:
    #   weighted_feature_sum_on_transformer_cat_and_numerical_feats
    combine_feat_method="text_only",
    column_info=column_info_dict,
    task="classification",
    categorical_encode_type=None,
    categorical_handle_na=True,
    categorical_na_value="Unknown",
    ohe_handle_unknown="error",
    numerical_transformer_method='none',
    numerical_handle_na=True,
    numerical_how_handle_na="zero",
)

# --- Training loop settings ---------------------------------------------------
training_args = TrainingArguments(
    output_dir="./only_text",
    logging_dir="./only_text/log",
    overwrite_output_dir=True,
    seed=1234,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=64,
    num_train_epochs=100,
    # Evaluate and checkpoint once per epoch; the two strategies must match for
    # load_best_model_at_end to work. (A step-based `eval_steps` value would be
    # ignored under the "epoch" strategy, so none is set.)
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Keep disk usage bounded: without a limit, one full checkpoint is written
    # per epoch for up to 100 epochs. The best checkpoint is always retained
    # when load_best_model_at_end is enabled.
    save_total_limit=2,
    load_best_model_at_end=True,  # restore the best checkpoint when training ends
    metric_for_best_model='f1',  # metric used to rank checkpoints
    greater_is_better=True,  # a larger value of that metric is better
    logging_steps=25,
)
print(training_args.seed)
set_seed(training_args.seed)

Using the `COMET_MODE=DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


1234


In [3]:
# Prefer an explicitly configured tokenizer; otherwise use the one that belongs
# to the model checkpoint.
tokenizer_path_or_name = model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path
print('Specified tokenizer: ', tokenizer_path_or_name)
# `truncation`, `padding` and `max_length` are arguments of the tokenizer *call*;
# passed to `from_pretrained` they have no effect on encoding. The tokenizer's own
# length limit is `model_max_length`. The 512-token cap applied to the data is the
# `max_token_length` argument given to `load_data_from_folder`.
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_path_or_name,
    cache_dir=model_args.cache_dir,
    model_max_length=512,
)

Specified tokenizer:  bert-base-uncased


In [4]:
# Get Datasets
# Load the train / validation / test datasets from the folder data_args.data_path,
# using the column roles and label list from data_args.column_info and the tokenizer
# created earlier in the notebook.
# NOTE(review): the NA-handling options configured on data_args
# (categorical_handle_na, categorical_na_value, numerical_handle_na,
# numerical_how_handle_na, ohe_handle_unknown) are NOT forwarded to this call, so
# the loader's own defaults apply — confirm that this is intended.
train_dataset, val_dataset, test_dataset = load_data_from_folder(
    data_args.data_path,
    data_args.column_info["text_cols"],
    tokenizer,
    label_col=data_args.column_info["label_col"],
    label_list=data_args.column_info["label_list"],
    categorical_encode_type = data_args.categorical_encode_type,
    numerical_transformer_method = data_args.numerical_transformer_method,
    categorical_cols=data_args.column_info["cat_cols"],
    numerical_cols=data_args.column_info["num_cols"],
    # Separator string for the text columns (only one text column is configured here).
    sep_text_token_str=tokenizer.sep_token,
    # Presumably the maximum tokenized length per example — TODO confirm against the loader.
    max_token_length=512,
)

INFO:multimodal_transformers.data.load_data:Text columns: ['GPT4O-Contribution']
INFO:multimodal_transformers.data.load_data:Raw text example: The paper titled "Inheritance of Fruit Shape and Seed Size of Watermelon" presents significant insights into the genetic mechanisms governing fruit morphology and seed dimensions in watermelon cultivars. Key contributions of this study include the following:

1. **Genetic Control**: The study identifies the inheritance patterns of fruit shape (spherical and oval) and seed size (short and medium) in two different watermelon cultivars. It highlights the role of a single allele with incomplete dominance affecting the fruit shape and differentiates the genetic control of seed size through a dominant short seed gene (Ti) and a recessive medium seed gene (ti). This nuanced understanding aids in elucidating the complexities of watermelon genetics.

2. **Methodological Advances**: The authors employ a quantitative approach to measure seed size, providin

In [5]:
# Size the classification head from the configured label list rather than from
# the classes that happen to occur in the training split: a class missing from
# the split would otherwise shrink the head and make its label index invalid.
num_labels = len(data_args.column_info["label_list"])
# Sanity check: every label seen in training must be a valid class index.
assert set(np.unique(train_dataset.labels).tolist()) <= set(range(num_labels)), (
    "training labels fall outside the configured label_list"
)
num_labels

3

In [6]:
# Load the base transformer configuration from an explicit config name when one
# is set on model_args, otherwise from the model checkpoint name.
config = AutoConfig.from_pretrained(
    model_args.config_name if model_args.config_name else model_args.model_name_or_path,
    cache_dir=model_args.cache_dir,
)
# Settings for the tabular side of the model: number of output classes, width of
# the numerical feature matrix, plus every field of data_args as keyword arguments.
tabular_config = TabularConfig(
    num_labels=num_labels,
    # NOTE(review): cat_feat_dim is left unset — presumably because cat_cols is empty; confirm.
    #cat_feat_dim=train_dataset.cat_feats.shape[1],
    numerical_feat_dim=train_dataset.numerical_feats.shape[1],
    **vars(data_args)
)
# Attach the tabular settings to the transformer config that is passed to the model.
config.tabular_config = tabular_config

In [7]:
# Load pretrained weights from the model checkpoint itself. `config_name` only
# identifies where the configuration comes from (already resolved into `config`),
# so it must not be used as the weights location.
model = AutoModelWithTabular.from_pretrained(
    model_args.model_name_or_path,
    config=config,
    cache_dir=model_args.cache_dir,
)

Some weights of BertWithTabular were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'tabular_classifier.bias', 'tabular_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
import numpy as np
from scipy.special import softmax
from sklearn.metrics import (
    auc,
    precision_recall_curve,
    roc_auc_score,
    f1_score,
    confusion_matrix,
    matthews_corrcoef,
)


def calc_classification_metrics(p: EvalPrediction):
    """Compute classification metrics for the Trainer's ``compute_metrics`` hook.

    Args:
        p: Prediction container. ``p.predictions`` is either the logits array
            or a tuple/list whose first element is the logits array, indexed as
            ``(sample, class)``. ``p.label_ids`` holds the true class indices.

    Returns:
        dict: For a two-class model evaluated on data containing both classes:
        ``roc_auc``, ``pr_auc``, the F1-maximising ``threshold`` with its
        ``recall``/``precision``/``f1``, and the confusion counts
        ``tn``/``fp``/``fn``/``tp`` at the argmax decision. Otherwise:
        ``acc``, weighted ``f1``, their mean ``acc_and_f1``, and ``mcc``.
    """
    raw_predictions = p.predictions
    # The model may return several outputs; the logits are the first one.
    if isinstance(raw_predictions, (tuple, list)):
        raw_predictions = raw_predictions[0]
    logits = np.asarray(raw_predictions)
    labels = np.asarray(p.label_ids)
    pred_labels = np.argmax(logits, axis=1)

    # Use the binary path only for a genuinely two-class model. Looking at the
    # labels alone would misroute a multi-class model whenever the evaluation
    # split happens to contain just two of its classes.
    is_binary = logits.shape[1] == 2 and len(np.unique(labels)) == 2

    if is_binary:
        pred_scores = softmax(logits, axis=1)[:, 1]  # probability of class 1
        roc_auc_pred_score = roc_auc_score(labels, pred_scores)
        precisions, recalls, thresholds = precision_recall_curve(labels, pred_scores)
        # F1 at every operating point; define 0/0 as 0 without emitting warnings.
        denominator = precisions + recalls
        fscore = np.divide(
            2 * precisions * recalls,
            denominator,
            out=np.zeros_like(denominator, dtype=float),
            where=denominator > 0,
        )
        ix = np.argmax(fscore)
        threshold = thresholds[ix].item()
        pr_auc = auc(recalls, precisions)
        tn, fp, fn, tp = confusion_matrix(labels, pred_labels, labels=[0, 1]).ravel()
        result = {
            "roc_auc": roc_auc_pred_score,
            "threshold": threshold,
            "pr_auc": pr_auc,
            "recall": recalls[ix].item(),
            "precision": precisions[ix].item(),
            "f1": fscore[ix].item(),
            "tn": tn.item(),
            "fp": fp.item(),
            "fn": fn.item(),
            "tp": tp.item(),
        }
    else:
        acc = float((pred_labels == labels).mean())
        # Weighted average: per-class F1 weighted by class support.
        f1 = float(f1_score(y_true=labels, y_pred=pred_labels, average='weighted'))
        result = {
            "acc": acc,
            "f1": f1,
            "acc_and_f1": (acc + f1) / 2,
            "mcc": matthews_corrcoef(labels, pred_labels),
        }

    return result

In [9]:
from transformers import EarlyStoppingCallback

# Early stopping on the metric selected in the training arguments.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=10,  # stop after 10 evaluations without improvement
    early_stopping_threshold=0.0001  # an improvement must exceed 0.0001 to count
)

# Trainer wired with the training/validation datasets, the custom metric
# function and the early-stopping callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=calc_classification_metrics,
    callbacks=[early_stopping_callback]
)

In [10]:
%%time
# Run fine-tuning; the cell magic on the first line reports the time taken by the cell.
trainer.train()

Epoch,Training Loss,Validation Loss,Acc,F1,Acc And F1,Mcc
1,No log,0.574391,0.8,0.764756,0.782378,0.177183
2,0.670500,0.556956,0.807143,0.743084,0.775113,-0.00317
3,0.491800,0.521002,0.814286,0.756362,0.785324,0.083594
4,0.491800,0.539887,0.8,0.765004,0.782502,0.176996
5,0.401900,0.602466,0.8,0.757148,0.778574,0.120472
6,0.288800,0.801611,0.821429,0.750281,0.785855,0.03776
7,0.233400,0.738897,0.821429,0.777675,0.799552,0.211829
8,0.233400,0.847829,0.807143,0.749029,0.778086,0.066122
9,0.173700,0.788418,0.792857,0.767053,0.779955,0.18108
10,0.149400,0.965663,0.828571,0.770612,0.799592,0.166421


CPU times: user 7min 15s, sys: 40.7 s, total: 7min 56s
Wall time: 8min 33s


TrainOutput(global_step=684, training_loss=0.09754397057756585, metrics={'train_runtime': 512.937, 'train_samples_per_second': 217.961, 'train_steps_per_second': 3.509, 'total_flos': 1.1178311164440576e+16, 'train_loss': 0.09754397057756585, 'epoch': 38.0})

In [11]:
%%time
# Score the validation split with the model currently held by the trainer.
# NOTE(review): with load_best_model_at_end=True this is expected to be the best
# checkpoint rather than the last epoch — confirm.
trainer.evaluate(eval_dataset=val_dataset)

CPU times: user 498 ms, sys: 8.5 ms, total: 507 ms
Wall time: 504 ms


{'eval_loss': 1.3413046598434448,
 'eval_acc': 0.8357142857142857,
 'eval_f1': 0.8197843858573471,
 'eval_acc_and_f1': 0.8277493357858164,
 'eval_mcc': 0.4509716964670214,
 'eval_runtime': 0.4983,
 'eval_samples_per_second': 280.959,
 'eval_steps_per_second': 36.123,
 'epoch': 38.0}

In [13]:
from torch.utils.data import DataLoader
from sklearn.metrics import precision_score, recall_score, f1_score

# Iterate over the held-out test split in its stored order.
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Run inference on the first GPU when available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

all_logits = []
all_labels = []

model.eval()
with torch.no_grad():
    for test_batch in test_loader:
        # Every tensor in the batch must live on the same device as the model.
        test_batch = {k: v.to(device) for k, v in test_batch.items()}
        # The model returns three values; only the logits (second) are needed.
        # token_type_ids is not passed and is left at the model's default.
        _, logits, _ = model(
            test_batch["input_ids"],
            attention_mask=test_batch["attention_mask"],
            cat_feats=test_batch["cat_feats"],
            numerical_feats=test_batch["numerical_feats"],
        )
        # Collect results on the CPU so the accumulated outputs do not occupy
        # accelerator memory for the duration of the loop.
        all_logits.append(logits.cpu())
        all_labels.append(test_batch["labels"].cpu())

# Merge the per-batch results into single tensors.
all_logits = torch.cat(all_logits, dim=0)
all_labels = torch.cat(all_labels, dim=0)

# Predicted class = index of the largest logit.
predictions = all_logits.argmax(dim=1)

# Metrics (precision / recall / F1 are support-weighted averages over classes).
accuracy = (predictions == all_labels).float().mean().item()
y_true = all_labels.numpy()
y_pred = predictions.numpy()
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')

# Report
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.7929
Precision: 0.8344
Recall: 0.7929
F1 Score: 0.8102
