In [1]:
import os
import sys
import json
import torch
import numpy as np
import sys
import warnings
import torch.nn.functional as F
import torch.nn as nn
import random
import ast
from torch.utils.tensorboard import SummaryWriter
from typing import Dict, Any, Optional
import numpy as np
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
#from numpyencoder import NumpyEncoder
import datetime
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score,
    roc_curve, auc, precision_recall_curve, average_precision_score,
    confusion_matrix
)
from tqdm import tqdm, trange

from transformers.optimization import get_linear_schedule_with_warmup       # AdamW seems no longer available here
from torch.optim import AdamW

from sklearn.model_selection import train_test_split
from datasets import load_dataset

warnings.filterwarnings('ignore')

project_path = os.path.abspath('')
if project_path not in sys.path:
    sys.path.append(project_path)
#import backend_model_info

from dataloader import DataManager, DataManagerTest
from model_4 import MultiModalConcatLineFocalBMESBinaryClassifier

from sklearn.metrics import roc_curve, precision_recall_curve, auc, classification_report

from torch.utils.data import Subset, DataLoader
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

In [2]:
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.use_deterministic_algorithms(True)
    os.environ["PYTHONHASHSEED"] = str(seed)
    os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

import re
from collections import Counter, defaultdict
with open('./pylint.txt','r') as f:
    error_list = f.read()
    error_codes = re.findall(r"\((\w\d{4})\)", error_list)
    
def analyze_pylint_output(eval_result: str) -> Counter:
    analysis = [0]*len(error_codes)
    error_pattern = re.compile(r"\d:\d+:\s(\w\d{4}):\s")
    errors = error_pattern.findall(eval_result)

    error_counts = Counter(errors)
    
    analysis = [error_counts[e] for e in error_codes]

    return analysis


def analyze_pylint_output_line(eval_result: str, total_lines: int):
    error_pattern = re.compile(r"(\d+):\d+:\s(\w\d{4}):\s")
    errors = error_pattern.findall(eval_result)
    
    line_error_counts = defaultdict(Counter)

    for line, code in errors:
        line_error_counts[int(line)][code] += 1
    
    analysis = [[0]*len(error_codes) for _ in range(total_lines)]
    
    # 각 줄별 에러 코드 카운트를 분석 결과 리스트에 저장
    for line in range(total_lines):
        if line in line_error_counts:
            analysis[line] = [line_error_counts[line][code] for code in error_codes]
    
    return analysis

def split_code_sentence(code, use_sp=False):
        import re
        pattern = re.compile(
        r'"""|\'\'\'|"|\'|#|==|'
        r'\n|'
        r'[^\S\n]+|'
        r'\w+|[.,()\[\]{};:\=\_\+\-\*\/\~\!\%\^\&\<\>\?]')
        
        tokens = pattern.findall(code)
        return tokens

def ccfeature_line_to_token_level(code):
    code_tokens = split_code_sentence(code)
    count = 0
    line_num_list = []
    for token in code_tokens:
        line_num_list.append(count)
        if token == '\n':
            count += 1
    return line_num_list[:1024]

class NpEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NpEncoder, self).default(obj)

class CustomDataset(Dataset):
    def __init__(self, original_dataset, indices):
        self.original_dataset = original_dataset
        self.indices = [int(idx) for idx in indices]

    def __getitem__(self, index):
        real_idx = self.indices[index]
        return self.original_dataset[int(real_idx)]

    def __len__(self):
        return len(self.indices)
    

def get_roc_metrics(true_labels, pred_labels):
    fpr, tpr, thresholds = roc_curve(true_labels, pred_labels)
    roc_auc = auc(fpr, tpr)
    J = tpr - fpr
    ix = np.argmax(J)
    best_thresh = thresholds[ix]
    print('Best Threshold=%f, sensitivity = %.3f, specificity = %.3f, J=%.3f' % (best_thresh, tpr[ix], 1-fpr[ix], J[ix]))
    return float(roc_auc)

class SupervisedTrainer:
    def __init__(self, data, model, en_labels, id2label, args):
        self.data = data
        self.model = model
        self.en_labels = en_labels
        self.id2label = id2label

        self.seq_len = args.seq_len
        self.num_train_epochs = args.num_train_epochs
        self.weight_decay = args.weight_decay
        self.lr = args.lr
        self.warm_up_ratio = args.warm_up_ratio

        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)
        self._create_optimizer_and_scheduler()
        
        self.best_val_loss = float('inf')
        self.best_f1_score = 0.0
        self.best_model_path = None
        self.writer = None
        self.loss_function = nn.CrossEntropyLoss(ignore_index=-1)
        self.threshold = 0.5

    def _create_optimizer_and_scheduler(self):
        num_training_steps = len(
            self.data.train_dataloader) * self.num_train_epochs
        no_decay = ["bias", "LayerNorm.weight"]

        named_parameters = self.model.named_parameters()
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in named_parameters
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                self.weight_decay,
            },
            {
                "params": [
                    p for n, p in named_parameters
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0,
            },
        ]
        self.optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=self.lr,
            betas=(0.9, 0.98),
            eps=1e-8,
        )
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=int(self.warm_up_ratio * num_training_steps),
            num_training_steps=num_training_steps)

    def train(self, ckpt_name='linear_en.pt', prediction_method="most_common"):
        
        for epoch in trange(int(self.num_train_epochs), desc="Epoch"):
            self.model.train()
            tr_loss = 0
            nb_tr_steps = 0
            # train
            for step, inputs in enumerate(
                    tqdm(self.data.train_dataloader, desc="Iteration")):
                # send batch data to GPU
                for k, v in inputs.items():
                    if isinstance(v, torch.Tensor):
                        inputs[k] = v.to(self.device)
                with torch.set_grad_enabled(True):
                    labels = inputs['labels']
                    output = self.model(inputs['features'], inputs['labels'], inputs['ccfeatures'])#, inputs['line_indices'])
                    logits = output['logits']
                    loss = output['loss']
                    self.optimizer.zero_grad()
                    loss.backward()
                    
                    # print("KSY =======================")
                    # for name, p in self.model.named_parameters():
                    #     if 'feature_encoder' in name:
                    #         print(name)
                    #         print(p.grad)
                    #         exit()
                            
                    self.optimizer.step()
                    self.scheduler.step()

                    tr_loss += loss.item()
                    nb_tr_steps += 1
            
                if step % 50 == 0:
                    self.writer.add_scalar('Training Loss', loss.item(), epoch * len(self.data.train_dataloader) + step)
            
            
            avg_train_loss = tr_loss / nb_tr_steps
            print(f'epoch {epoch+1}: train_loss {avg_train_loss}')
            self.writer.add_scalar('Average Training Loss', avg_train_loss, epoch)

            # Validate data at the end of every epoch
            val_loss, sent_result = self.valid(prediction_method=prediction_method)
            self.writer.add_scalar('Validation Loss', val_loss, epoch)

            # save the best model
            if val_loss < self.best_val_loss:
                self.best_val_loss = val_loss
                self.best_model_path = f"{ckpt_name}"
                self.writer.add_scalar('Best Validation Loss', self.best_val_loss, epoch)
                torch.save(self.model.cpu(), self.best_model_path)
                self.model.to(self.device)

        # then reload the best model in the end
        if self.best_model_path:
            print(f"Reloading best model from {self.best_model_path}")
            self.model.load_state_dict(torch.load(self.best_model_path, weights_only=False).state_dict())
            self.model.to(self.device)
        
        self.writer.close()
        return
    
    def valid(self, content_level_eval=False, prediction_method="most_common"):
        self.model.eval()
        texts = []
        true_labels = []
        pred_labels = []
        total_logits = []
        total_probs = []
        total_loss = 0.0
        total_steps = 0
        
        for step, inputs in enumerate(
                tqdm(self.data.val_dataloader, desc="Iteration")):
            for k, v in inputs.items():
                if isinstance(v, torch.Tensor):
                    inputs[k] = v.to(self.device)
            with torch.no_grad():
                labels_ = inputs['labels']
                output = self.model(inputs['features'], inputs['labels'], inputs['ccfeatures'])
                preds = output['preds']
    
                logits_ = output['logits']
                
                probabilities = F.softmax(logits_, dim=-1)
                
                logits = logits_.view(-1, logits_.size(-1))
                labels = labels_.view(-1)
                loss = self.loss_function(logits, labels)
                total_loss += loss.item()
                total_steps += 1

                texts.extend(inputs['text'])
                pred_labels.extend(preds.cpu().tolist())
                true_labels.extend(labels_.cpu().tolist())
                total_probs.extend(probabilities)

        avg_val_loss = total_loss / total_steps
        print(f"Validation Loss: {avg_val_loss}")
        
        print("*" * 8, "Sentence Level Evalation", "*" * 8)
        #word_result, sent_result = self.sent_level_eval(texts, true_labels, pred_labels, total_probs, prediction_method)
        sent_result = self.sent_level_eval(texts, true_labels, pred_labels, total_probs, prediction_method)
        
        return avg_val_loss, sent_result
    
    def test(self, test_dataloader, content_level_eval=False, prediction_method="most_common"):
        self.model.eval()
        texts = []
        true_labels = []
        pred_labels = []
        total_logits = []
        total_probs = []
        problem_ids = []
        user_ids = []
        
        for step, inputs in enumerate(
                tqdm(test_dataloader, desc="Iteration")):
            for k, v in inputs.items():
                if isinstance(v, torch.Tensor):
                    inputs[k] = v.to(self.device)
            with torch.no_grad():
                labels = inputs['labels']
                output = self.model(inputs['features'], inputs['labels'], inputs['ccfeatures'])#, inputs['line_indices'])
                logits = output['logits']
                preds = output['preds']
                problem_id = inputs['problem_id']
                user_id = inputs['user_id']
                
                probabilities = F.softmax(logits, dim=-1)

                texts.extend(inputs['text'])
                pred_labels.extend(preds.cpu().tolist())
                true_labels.extend(labels.cpu().tolist())
                problem_ids.extend(problem_id)
                user_ids.extend(user_id)
                total_logits.extend(logits.cpu().tolist())
                total_probs.extend(probabilities)
        
        line_counts = [len(text.split('\n')) for text in texts]
        
        if content_level_eval:
            # content level evaluation
            print("*" * 8, "Content Level Evalation", "*" * 8)
            content_result = self.content_level_eval(texts, true_labels, pred_labels, total_probs, prediction_method)
        else:
            content_result = None
        print("*" * 8, "Sentence Level Evalation", "*" * 8)
        #word_result, sent_result = self.sent_level_eval(texts, true_labels, pred_labels, total_probs, prediction_method)
        sent_result = self.sent_level_eval(texts, true_labels, pred_labels, total_probs, prediction_method)
            
        # return sent_result, content_result, {'text':texts,'pred':pred_labels, 'true':true_labels, 'problem_id':problem_ids, 'user_id': user_ids}
        return sent_result, content_result, {'text': texts, 'pred': pred_labels, 'true': true_labels, 'problem_id':problem_ids, 'user_id':user_ids, 'line_count':line_counts}

    
    def content_level_eval(self, texts, true_labels, pred_labels, pred_probs, prediction_method='most_common'):
        if prediction_method =='threshold':
            threshold = self.threshold
        else:
            threshold = None
            pred_labels_threshold = pred_labels
        
        true_content_labels = []
        pred_content_labels = []
        pred_content_probs = []
        
        for text, true_label, pred_label, pred_prob in zip(texts, true_labels, pred_labels_threshold, pred_probs):
            true_label = np.array(true_label)
            pred_label = np.array(pred_label)
            pred_prob = np.array(pred_prob.cpu())
            
            mask = true_label != -1
            true_label = true_label[mask].tolist()
            pred_label = pred_label[mask].tolist()
            
            pred_prob = torch.tensor(pred_prob[mask])
            true_common_tag = self._get_most_common_tag(true_label)
            true_content_labels.append(true_common_tag[0])
            
            pred_common_tag = self._get_most_common_tag(pred_label)
            pred_content_labels.append(pred_common_tag[0])
            
            cont_prob = pred_prob[:, 4:8].sum(dim=1)
            pred_content_prob = torch.mean(cont_prob, dim=0)
            pred_content_probs.append(pred_content_prob.item())
            
        true_content_labels = [self.en_labels[label] for label in true_content_labels]
        pred_content_labels = [self.en_labels[label] for label in pred_content_labels]
        
        result = self._get_precision_recall_acc_f1(true_content_labels, pred_content_labels, pred_content_probs)
        
        return result

    def sent_level_eval(self, texts, true_labels, pred_labels, pred_probs, prediction_method='most_common'):
        if prediction_method =='threshold':
            threshold = self.threshold
        else:
            threshold = None
            pred_labels_threshold = pred_labels
        
        # For line-wise labeling
        true_sent_labels = []
        pred_sent_labels = []
        pred_sent_probs = []
        for text, true_label, pred_label, pred_prob in zip(texts, true_labels, pred_labels_threshold, pred_probs):
            true_label = np.array(true_label)
            pred_label = np.array(pred_label)
            pred_prob = np.array(pred_prob.cpu())
            mask = true_label != -1
            true_label = true_label[mask].tolist()
            pred_label = pred_label[mask].tolist()
            pred_prob = torch.tensor(pred_prob[mask])
            sents = text.split('\n')
            for true_label_idx in range(len(true_label)):
                if sents[true_label_idx] == '' or sents[true_label_idx].isspace():  # 빈 문장일 경우 처리하지 않음
                    continue
                true_sent_label = self.id2label[true_label[true_label_idx]]
                pred_sent_label = self.id2label[pred_label[true_label_idx]]
                
                true_sent_labels.append(true_sent_label.split('-')[-1])
                pred_sent_prob = pred_prob[true_label_idx, 4:8].sum()
                pred_sent_probs.append(pred_sent_prob.item())
                pred_sent_labels.append(pred_sent_label.split('-')[-1])
            
        true_sent_labels = [self.en_labels[label] for label in true_sent_labels]
        pred_sent_labels = [self.en_labels[label] for label in pred_sent_labels]
        
        sent_result = self._get_precision_recall_acc_f1(true_sent_labels, pred_sent_labels, pred_sent_probs)
        return sent_result
    
    
    def _get_threshold_tag(self, logits, machine_threshold=0.5):
        human_logits = logits[:, :, :4]  # Human Classes
        machine_logits = logits[:, :, 4:] # Machine Classes
        human_scores = torch.sum(human_logits, dim=-1)  # Shape: [batch_size, seq_len]
        machine_scores = torch.sum(machine_logits, dim=-1)        # Shape: [batch_size, seq_len]
        pred_labels = torch.where(machine_scores >= machine_threshold, 4, 0)  # 0 for Human, 4 for AI
        
        return pred_labels.cpu().tolist()
    
    def _get_most_common_tag(self, tags):
        """most_common_tag is a tuple: (tag, times)"""
        from collections import Counter
        tags = [self.id2label[tag] for tag in tags]
        tags = [tag.split('-')[-1] for tag in tags]
        tag_counts = Counter(tags)
        most_common_tag = tag_counts.most_common(1)[0]
        return most_common_tag
    
    def _get_precision_recall_acc_f1(self, true_labels, pred_labels, pred_probs=None, pos_label: int = 1) -> Dict[str, Any]:
        """
        true_labels: [0/1]
        pred_labels: 이미 threshold가 적용된 0/1 예측
        pred_probs : 선택. 점수(양성=pos_label의 확률/로짓 등). 있으면 ROC/AUPRC과 임계값 탐색 리포트 추가.
        pos_label  : 양성 클래스(기본 1)
        """
        y_true = np.asarray(true_labels).astype(int)
        y_pred = np.asarray(pred_labels).astype(int)

        # --- 기본 리포트(주어진 라벨 기준) ---
        acc  = accuracy_score(y_true, y_pred)
        mF1  = f1_score(y_true, y_pred, average='macro', zero_division=0)
        bF1  = f1_score(y_true, y_pred, average='binary', pos_label=pos_label, zero_division=0)
        prec = precision_score(y_true, y_pred, average=None, zero_division=0)
        rec  = recall_score(y_true, y_pred, average=None, zero_division=0)
        cm   = confusion_matrix(y_true, y_pred, labels=[0,1])

        print("=== Given labels (as-is) ===")
        print("Accuracy: {:.3f}".format(acc*100))
        print("Macro F1 Score: {:.3f}".format(mF1*100))
        print("Binary F1 Score (pos): {:.3f}".format(bF1*100))
        print("Precision/Recall per class:")
        print("{:.1f},{:.1f},{:.1f},{:.1f}".format(prec[0]*100, rec[0]*100, prec[1]*100, rec[1]*100))
        print(f"CM [[TN FP],[FN TP]] = {cm.tolist()}")

        # 결과 dict 시작
        result: Dict[str, Any] = {
            "given_labels": {
                "accuracy": acc, "macro_f1": mF1, "binary_f1": bF1,
                "precision": prec, "recall": rec, "cm": cm
            },
            "roc_auc": None,
            "auprc": None,
            "thresholds": {}
        }

        # --- 점수 기반 추가 리포트 ---
        if pred_probs is not None:
            y_score = np.asarray(pred_probs, dtype=float)

            # ROC / AUPRC
            try:
                fpr, tpr, thr_roc = roc_curve(y_true, y_score, pos_label=pos_label)
                roc_auc = float(auc(fpr, tpr))
            except Exception:
                roc_auc = None

            try:
                auprc = float(average_precision_score(y_true, y_score, pos_label=pos_label))
            except Exception:
                auprc = None

            print(f"ROC_AUC (fpr-tpr): {roc_auc:.3f}" if roc_auc is not None else "ROC_AUC: N/A")
            print(f"AUPRC: {auprc:.3f}" if auprc is not None else "AUPRC: N/A")

            # Helper: 특정 threshold에서 평가
            def eval_at(thr: float, tag: str) -> Dict[str, Any]:
                y_hat = (y_score > thr).astype(int)
                acc_  = accuracy_score(y_true, y_hat)
                mF1_  = f1_score(y_true, y_hat, average='macro', zero_division=0)
                bF1_  = f1_score(y_true, y_hat, average='binary', pos_label=pos_label, zero_division=0)
                pr_   = precision_score(y_true, y_hat, average=None, zero_division=0)
                rc_   = recall_score(y_true, y_hat, average=None, zero_division=0)
                cm_   = confusion_matrix(y_true, y_hat, labels=[0,1])
                print(f"[{tag}] thr={thr:.3f} | Acc={acc_*100:.1f}  MacroF1={mF1_*100:.1f}  BinF1(pos)={bF1_*100:.1f}")
                print(" P/R per class -> 0(H): {:.1f}/{:.1f} , 1(AI): {:.1f}/{:.1f}".format(pr_[0]*100, rc_[0]*100, pr_[1]*100, rc_[1]*100))
                print(f" CM [[TN FP],[FN TP]] = {cm_.tolist()}")
                return {"thr": float(thr), "accuracy": acc_, "macro_f1": mF1_, "binary_f1": bF1_, "precision": pr_, "recall": rc_, "cm": cm_}

            # Youden J (TPR - FPR) 최대
            def best_thr_youden() -> float:
                if roc_auc is None or len(thr_roc) == 0:
                    return 0.5
                J = tpr - fpr
                i = int(np.argmax(J))
                return float(thr_roc[i])

            # 양성 F1 최대(PR 기반)
            def best_thr_posF1() -> float:
                prec_curve, rec_curve, thr_pr = precision_recall_curve(y_true, y_score, pos_label=pos_label)
                if len(thr_pr) == 0:
                    return 0.5
                f1_curve = (2 * prec_curve * rec_curve) / (prec_curve + rec_curve + 1e-12)
                i = int(np.nanargmax(f1_curve[:-1]))  # 마지막 점은 threshold 없음
                return float(thr_pr[i])

            thr05     = 0.5
            thrJ      = best_thr_youden()
            thrBestF1 = best_thr_posF1()

            print("=== Threshold sweeps on scores ===")
            res05  = eval_at(thr05, "thr=0.5")
            resJ   = eval_at(thrJ, "thr=YoudenJ")
            resF1  = eval_at(thrBestF1, "thr=bestPosF1")

            result.update({
                "roc_auc": roc_auc,
                "auprc": auprc,
                "thresholds": {
                    "thr@0.5": res05,
                    "thr@youden": resJ,
                    "thr@best_posF1": resF1
                }
            })
        else:
            print("ROC_AUC (fpr-tpr): N/A (pred_probs is None)")
            print("AUPRC: N/A (pred_probs is None)")

        # CSV 한 줄 요약(기존 포맷과 유사)
        pr_line = "{:.1f},{:.1f},{:.1f},{:.1f}".format(prec[0]*100, rec[0]*100, prec[1]*100, rec[1]*100)
        print("{:.1f},{:.1f},{:.1f},{},{:.3f},{}".format(
            acc*100, mF1*100, bF1*100, pr_line, result["roc_auc"] if result["roc_auc"] is not None else float("nan"),
            f"{result['auprc']:.3f}" if result["auprc"] is not None else "N/A"
        ))

        return result
    # def _get_precision_recall_acc_f1(self, true_labels, pred_labels, pred_probs = None):
    #     accuracy = accuracy_score(true_labels, pred_labels)
    #     macro_f1 = f1_score(true_labels, pred_labels, average='macro')
    #     binary_f1 = f1_score(true_labels, pred_labels, average='binary')
    #     print("Accuracy: {:.3f}".format(accuracy*100))
    #     print("Macro F1 Score: {:.3f}".format(macro_f1*100))
    #     print("Binary F1 Score: {:.3f}".format(binary_f1*100))

    #     precision = precision_score(true_labels, pred_labels, average=None)
    #     recall = recall_score(true_labels, pred_labels, average=None)
    #     print("Precision/Recall per class: ")
    #     precision_recall = ','.join(["{:.1f},{:.1f}".format(p*100, r*100) for p, r in zip(precision, recall)])
    #     print(precision_recall)
    #     roc_auc = get_roc_metrics(true_labels, pred_probs)
    #     print(f"ROC_AUC (fpr-tpr): {roc_auc:.3f}")
        
    #     if pred_probs is not None:
    #         pred_probs = np.array(pred_probs)
    #         ai_probs = pred_probs
    #         auprc = average_precision_score(true_labels, ai_probs)
    #         print(f"AUPRC: {auprc:.3f}")
    #     else:
    #         auprc = None
    #         print("AUPRC: N/A (pred_probs in None)")
    #     result = {"precision":precision, "recall":recall, "accuracy":accuracy, "macro_f1":macro_f1, "binary_f1": binary_f1, "roc_auc":roc_auc, "auprc": auprc}
    #     print("{:.1f},{:.1f},{:.1f},{},{:.3f},{:.3f}".format(accuracy*100, macro_f1*100, binary_f1*100,precision_recall,roc_auc, auprc))
    #     return result


def construct_bmes_labels(labels):
    prefix = ['B-', 'M-', 'E-', 'S-']
    id2label = {}
    counter = 0

    for label, id in labels.items():
        for pre in prefix:
            id2label[counter] = pre + label
            counter += 1
    
    return id2label

def remove_duplicates(prob_dict):
    total_p = 0
    total = 0
    for problem_id, entries in prob_dict.items():
        n = 0
        unique_texts = set()
        unique_entries = []
        
        for entry in entries:
            if entry['text'] not in unique_texts:
                unique_entries.append(entry)
                unique_texts.add(entry['text'])
            else:
                n += 1
        if n != 0:
            total_p += 1
        total += n
        
        prob_dict[problem_id] = unique_entries     

# def split_dataset(data_path, dataset):
#     with open(data_path+f"/codenet(python)_{dataset}_features_aigcodeset.jsonl", 'r') as f:
#         full_samples = [json.loads(line) for line in f]
    
#     seed_everything(42)
    
#     labels = [sample['label'] for sample in full_samples]
    
#     train_samples, test_samples = train_test_split(
#         full_samples, test_size=0.2, stratify=labels, random_state=42
#     )

#     # Validation set을 train에서 10%만 샘플링 (stratified 유지)
#     train_samples, val_samples = train_test_split(
#         train_samples, test_size=0.1, stratify=[s['label'] for s in train_samples], random_state=42
#     )
        
#     print(f"Train: {len(train_samples)}, Validation: {len(val_samples)}, Test: {len(test_samples)}")
    
#     return [train_samples, val_samples, test_samples]
    

# def split_dataset(data_path, dataset):
#     # Train (full set)
#     with open(data_path+f"/codenet(python)_{dataset}_features_all_models.jsonl", 'r') as f:
#         full_train_set = [json.loads(line) for line in f]
    
#     seed_everything(42)

#     for sample in full_train_set:
#             if 'line' in dataset:
#                 ccfeature_line = analyze_pylint_output_line(sample['eval'], len(sample['text'].split('\n')))
#                 sample['ccfeature'] = ccfeature_line
#             else:
#                 sample['ccfeature'] = analyze_pylint_output(sample['eval'])

#     labels = [sample['label'] for sample in full_train_set]
#     train_set, test_set = train_test_split(full_train_set, test_size=0.2, random_state=42, stratify=labels)

#     train_set, val_set = train_test_split(train_set, test_size=0.1, random_state=42, stratify=[s['label'] for s in train_set])
#     print(f"Train: {len(train_set)}, Validation: {len(val_set)}, Test: {len(test_set)}")
#     # ai = 0
#     # filtered_test_set = []
#     # for sample in test_set:
#     #     if sample['label'] == 'AI' and sample.get('status_in_folder') == "Wrong":
#     #         filtered_test_set.append(sample)
#     #         ai += 1
#     #     elif sample['label'] == 'human':
#     #         filtered_test_set.append(sample)
    
#     # print(f"Train: {len(train_set)}, Validation: {len(val_set)}, Test: {len(filtered_test_set)}")
#     # print(f"the number of AI code: {ai}")
#     # return [train_set, val_set, filtered_test_set]
    
#     return [train_set, val_set, test_set]
    # # data filtering for test_set
    # filtered_test_set = []
    # for sample in test_set:
    #     if sample['label'] == 'human' and sample.get('status_in_folder') == 'Accepted':
    #         filtered_test_set.append(sample)
    #     elif sample['label'] == 'AI' and sample.get('LLM') == 'GEMINI':
    #         filtered_test_set.append(sample)
    
        
    # print(f"Train: {len(train_set)}, Validation: {len(val_set)}, Test: {len(filtered_test_set)}")

    # return [train_set, val_set, filtered_test_set] # 여기도 수정!
    # ====================================================================================

In [3]:
from sklearn.model_selection import GroupShuffleSplit
import numpy as np
from collections import Counter

def warn_group_overlap(groups_arr, idx_a, idx_b, name_a="A", name_b="B"):
    ga = set(groups_arr[idx_a])
    gb = set(groups_arr[idx_b])
    inter = ga & gb
    if inter:
        print(f"[WARN] {name_a} and {name_b} share {len(inter)} problem_ids (leak risk).")
    else:
        print(f"[OK] No problem_id overlap between {name_a} and {name_b}.")

def split_dataset(data_path, dataset, seed=42, test_size=0.2, val_size=0.1):
    # 1) Load full set
    with open(os.path.join(data_path, f"codenet(python)_{dataset}_features.jsonl"), "r", encoding="utf-8") as f:
        #full_train_set = [json.loads(line) for line in f]

        full_train_set = []
        for line in f:
            dumped_line = json.loads(line)
            dumped_line["user_id"] = ""
            if dumped_line["LLM"] == "Human":
                dumped_line["label_int"] = 0
            else:
                dumped_line["label_int"] = 1

            full_train_set.append(dumped_line)



    # full_train_set = [x for x in full_train_set if x.get("LLM") != "GPT3.5" and x.get("LLM") != "GEMINI"]
    seed_everything(seed)

    # 2) Build features (pylint 기반)
    for i, sample in enumerate(full_train_set):
        # problem_id가 없을 수도 있으니 안전하게 기본값
        if sample.get("problem_id") is None:
            sample["problem_id"] = f"__none__#{i}"

        if 'line' in dataset:
            n_lines = len(sample.get('text', '').split('\n'))
            ccfeature_line = analyze_pylint_output_line(sample.get('eval', ''), n_lines)
            sample['ccfeature'] = ccfeature_line
        else:
            sample['ccfeature'] = analyze_pylint_output(sample.get('eval', ''))

    # 3) Arrays for splitting
    labels = np.array([sample['label'] for sample in full_train_set])
    groups = np.array([sample['problem_id'] for sample in full_train_set])

    # 4) Group-aware Train/Test split
    gss = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=seed)
    train_full_idx, test_idx = next(
        gss.split(
            np.zeros(len(full_train_set)),
            labels,
            groups=groups
        )
    )

    # 5) Group-aware Train/Val split (within train_full)
    gss_val = GroupShuffleSplit(n_splits=1, test_size=val_size, random_state=seed)
    train_idx, val_idx = next(
        gss_val.split(
            np.zeros(len(train_full_idx)),
            labels[train_full_idx],
            groups=groups[train_full_idx]
        )
    )
    # 인덱스를 원본 기준으로 변환
    train_idx = train_full_idx[train_idx]
    val_idx   = train_full_idx[val_idx]

    # 6) 누수(그룹 겹침) 점검
    warn_group_overlap(groups, train_idx, val_idx, "Train", "Val")
    warn_group_overlap(groups, train_idx, test_idx, "Train", "Test")
    warn_group_overlap(groups, val_idx,   test_idx, "Val",   "Test")

    # 7) 실제 세트 구성
    train_set = [full_train_set[i] for i in train_idx]
    val_set   = [full_train_set[i] for i in val_idx]
    test_set  = [full_train_set[i] for i in test_idx]

    # 8) 라벨 분포 확인(옵션이지만 유용)
    def distrib(name, arr):
        c = Counter([s['label'] for s in arr])
        total = len(arr)
        print(f"{name}: {total}  | human={c.get('human',0)} ({c.get('human',0)/total:.2%}), AI={c.get('AI',0)} ({c.get('AI',0)/total:.2%})")

    print(f"Train: {len(train_set)}, Validation: {len(val_set)}, Test: {len(test_set)}")
    distrib("Train", train_set)
    distrib("Val",   val_set)
    distrib("Test",  test_set)
    
    
    # ai = 0
    # filtered_test_set = []
    # for sample in test_set:
    #     if sample['LLM'] == 'GPT3.5':
    #         filtered_test_set.append(sample)
    #         ai += 1
    # print(f"Train: {len(train_set)}, Validation: {len(val_set)}, Test: {len(filtered_test_set)}")
    
    
    # test_save_path = "./codenet(python)_python_document_level_merged_file_test_0820.jsonl"
    # with open(test_save_path, "w", encoding="utf-8") as f:
    #     for sample in test_set:
    #         f.write(json.dumps(sample, ensure_ascii=False) + "\n")
    # print(f"Test set saved to {test_save_path}")
    # exit
    return [train_set, val_set, test_set]

# import json
# import random
# from collections import defaultdict

# def split_dataset(data_path, dataset):
#     # 데이터 로딩
#     with open(data_path + f"/codenet(python)_{dataset}_features.jsonl", 'r') as f:
#         total_samples = [json.loads(line) for line in f]

#     # seed 고정
#     seed_everything(42)

#     # 문제 ID 기준 그룹화 + ccfeature 추가
#     prob_dict = defaultdict(list)
#     for i in total_samples:
#         if 'line' in dataset:
#             ccfeature_line = analyze_pylint_output_line(i['eval'], len(i['text'].split('\n')))
#             i['ccfeature'] = ccfeature_line
#         else:
#             i['ccfeature'] = analyze_pylint_output(i['eval'])
#         prob_dict[i['problem_id']].append(i)

#     prob_list = list(prob_dict.keys())
    
#     # revised인 경우 전체 반환
#     if 'revised' in dataset:
#         df_test = []
#         for p in prob_list:
#             df_test.extend(prob_dict[p])
#         return {0: (df_test, df_test, df_test)}

#     # fold split: 문제 ID 단위로 겹치지 않게 나눔
#     random.shuffle(prob_list)
#     fold_size = len(prob_list) // 3
#     folds = [prob_list[i * fold_size: (i + 1) * fold_size] for i in range(3)]

#     df_ = {}
#     for fold in range(3):
#         print(f"\n[Fold {fold}]")
#         test_ids = folds[fold]
#         val_ids = folds[(fold + 1) % 3]
#         train_ids = [pid for i in range(3) if i != fold and i != (fold + 1) % 3 for pid in folds[i]]

#         df_train = [s for pid in train_ids for s in prob_dict[pid]]
#         df_valid = [s for pid in val_ids for s in prob_dict[pid]]
#         df_test  = [s for pid in test_ids for s in prob_dict[pid]]

#         print(f"Train: {len(df_train)} / Valid: {len(df_valid)} / Test: {len(df_test)}")
#         df_[fold] = (df_train, df_valid, df_test)

#     return df_

In [4]:
import argparse
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, default='Transformer')
    parser.add_argument('--gpu', type=str, default='0')
    parser.add_argument('--train_mode', type=str, default='classify')
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--seq_len', type=int, default=1024)
    parser.add_argument('--dataset', type=str, default="")
    parser.add_argument('--method', type=str, default="focalbmesbinary_embedconcat_transformer256")
    
    parser.add_argument('--train_ratio', type=float, default=0.9)
    parser.add_argument('--split_dataset', action='store_true')
    parser.add_argument('--data_path', type=str, default='')
    parser.add_argument('--train_path', type=str, default='')
    parser.add_argument('--valid_path', type=str, default='')
    parser.add_argument('--test_path', type=str, default='')

    parser.add_argument('--num_train_epochs', type=int, default=20)
    parser.add_argument('--weight_decay', type=float, default=0.1)
    parser.add_argument('--lr', type=float, default=5e-5)
    parser.add_argument('--warm_up_ratio', type=float, default=0.1)
    parser.add_argument('--seed', type=int, default=42, required=True)
    parser.add_argument('--do_test', action='store_true')
    parser.add_argument('--test_content', action='store_true')
    
    parser.add_argument('--ckpt_name', type=str, default='')
    parser.add_argument('--alpha', type=float, default=0.5)
    parser.add_argument('--testbed', type=str, required=True)
    
    return parser.parse_args()

In [5]:
if __name__ == "__main__":

    sys.argv = [
        "train.py",
        "--dataset", "gemini_hybrid_line",
        "--data_path", "./data",
        "--seed", "42",
        "--testbed", "toplevel",
        "--ckpt_name", "gemini_hybrid_line"
    ]

    args = parse_args()
    
    print("Log INFO: split dataset...")
    df_ = split_dataset(data_path=args.data_path, seed=args.seed, dataset=args.dataset)  # [train, val, test]

    en_labels = {
        'human': 0,
        'AI': 1
    }
    
    id2label = construct_bmes_labels(en_labels)
    label2id = {v: k for k, v in id2label.items()}

    prediction_method = 'most_common'

    experiment_results = []

    if 'revised' in args.dataset:
        datas = DataManagerTest(datas=df_, batch_size=args.batch_size, max_len=args.seq_len, human_label='human', id2label=id2label)
    else:
        datas = DataManager(datas=df_, batch_size=args.batch_size, max_len=args.seq_len, human_label='human', id2label=id2label)

    # classifier 선택
    if args.method == 'focalbmesbinary_embedconcat_transformer256':
        if args.testbed == 'toplevel':
            if 'gemini' in args.dataset or 'gpt4' in args.dataset:
                classifier = MultiModalConcatLineFocalBMESBinaryClassifier(id2labels=id2label, seq_len=args.seq_len, alpha=args.alpha)

    ckpt_name = f'ckpt/{args.ckpt_name}_best_f1.pt'

    trainer = SupervisedTrainer(datas, classifier, en_labels, id2label, args)
    trainer.writer = SummaryWriter(log_dir=f"runs/python_{args.ckpt_name}")

    experiment_result = {}

    if args.do_test:
        print("Log INFO: do test...")
        saved_model = torch.load(ckpt_name)
        trainer.model.load_state_dict(saved_model.state_dict())
        if 'hybrid' in args.dataset or 'revised' in args.dataset:
            test_sent_result, _, test_raw_results = trainer.test(datas.test_dataloader, content_level_eval=False, prediction_method=prediction_method)
            experiment_result['test_result'] = {'line': test_sent_result, 'raw': test_raw_results}
        else:
            test_sent_result, test_content_result, test_raw_results = trainer.test(datas.test_dataloader, content_level_eval=True, prediction_method=prediction_method)
            experiment_result['test_result'] = {'line': test_sent_result, 'document': test_content_result, 'raw': test_raw_results}
    else:
        print("Log INFO: do train...")
        trainer.train(ckpt_name=ckpt_name, prediction_method=prediction_method)

        if 'hybrid' in args.dataset or 'revised' in args.dataset:
            test_sent_result, _, test_raw_results = trainer.test(datas.test_dataloader, content_level_eval=False, prediction_method=prediction_method)
            experiment_result['test_result'] = {'line': test_sent_result, 'raw': test_raw_results}
        else:
            test_sent_result, test_content_result, test_raw_results = trainer.test(datas.test_dataloader, content_level_eval=True, prediction_method=prediction_method)
            experiment_result['test_result'] = {'line': test_sent_result, 'document': test_content_result, 'raw': test_raw_results}

    experiment_results.append(experiment_result)

    with open(f'result/experiment_results_{args.ckpt_name}.json', 'w') as file:
        json.dump(experiment_results, file, ensure_ascii=False, cls=NpEncoder)

Log INFO: split dataset...
[OK] No problem_id overlap between Train and Val.
[OK] No problem_id overlap between Train and Test.
[OK] No problem_id overlap between Val and Test.
Train: 1992, Validation: 231, Test: 564
Train: 1992  | human=0 (0.00%), AI=1992 (100.00%)
Val: 231  | human=0 (0.00%), AI=231 (100.00%)
Test: 564  | human=0 (0.00%), AI=564 (100.00%)


100%|██████████| 1992/1992 [00:00<00:00, 4235.24it/s]
100%|██████████| 231/231 [00:00<00:00, 21684.96it/s]
100%|██████████| 564/564 [00:00<00:00, 21888.59it/s]


Log INFO: do train...


Iteration: 100%|██████████| 63/63 [00:33<00:00,  1.87it/s]


epoch 1: train_loss 0.04996196831029559


Iteration: 100%|██████████| 8/8 [00:06<00:00,  1.31it/s]


Validation Loss: 1.3829176872968674
******** Sentence Level Evalation ********
=== Given labels (as-is) ===
Accuracy: 67.122
Macro F1 Score: 40.163
Binary F1 Score (pos): 0.000
Precision/Recall per class:
67.1,100.0,0.0,0.0
CM [[TN FP],[FN TP]] = [[3148, 0], [1542, 0]]
ROC_AUC (fpr-tpr): 0.492
AUPRC: 0.311
=== Threshold sweeps on scores ===
[thr=0.5] thr=0.500 | Acc=67.1  MacroF1=40.2  BinF1(pos)=0.0
 P/R per class -> 0(H): 67.1/100.0 , 1(AI): 0.0/0.0
 CM [[TN FP],[FN TP]] = [[3148, 0], [1542, 0]]
[thr=YoudenJ] thr=0.200 | Acc=52.0  MacroF1=50.5  BinF1(pos)=41.8
 P/R per class -> 0(H): 69.0/51.8 , 1(AI): 34.8/52.4
 CM [[TN FP],[FN TP]] = [[1631, 1517], [734, 808]]
[thr=bestPosF1] thr=0.174 | Acc=35.5  MacroF1=29.5  BinF1(pos)=50.0
 P/R per class -> 0(H): 84.2/4.7 , 1(AI): 33.5/98.2
 CM [[TN FP],[FN TP]] = [[149, 2999], [28, 1514]]
67.1,40.2,0.0,67.1,100.0,0.0,0.0,0.492,0.311


Iteration: 100%|██████████| 63/63 [00:33<00:00,  1.87it/s]


epoch 2: train_loss 0.036352718396792334


Iteration: 100%|██████████| 8/8 [00:05<00:00,  1.41it/s]


Validation Loss: 1.246462494134903
******** Sentence Level Evalation ********
=== Given labels (as-is) ===
Accuracy: 67.122
Macro F1 Score: 40.163
Binary F1 Score (pos): 0.000
Precision/Recall per class:
67.1,100.0,0.0,0.0
CM [[TN FP],[FN TP]] = [[3148, 0], [1542, 0]]
ROC_AUC (fpr-tpr): 0.707
AUPRC: 0.523
=== Threshold sweeps on scores ===
[thr=0.5] thr=0.500 | Acc=67.1  MacroF1=40.2  BinF1(pos)=0.0
 P/R per class -> 0(H): 67.1/100.0 , 1(AI): 0.0/0.0
 CM [[TN FP],[FN TP]] = [[3148, 0], [1542, 0]]
[thr=YoudenJ] thr=0.171 | Acc=57.3  MacroF1=57.3  BinF1(pos)=56.3
 P/R per class -> 0(H): 84.8/44.3 , 1(AI): 42.4/83.7
 CM [[TN FP],[FN TP]] = [[1396, 1752], [251, 1291]]
[thr=bestPosF1] thr=0.171 | Acc=57.3  MacroF1=57.3  BinF1(pos)=56.3
 P/R per class -> 0(H): 84.8/44.3 , 1(AI): 42.4/83.7
 CM [[TN FP],[FN TP]] = [[1396, 1752], [251, 1291]]
67.1,40.2,0.0,67.1,100.0,0.0,0.0,0.707,0.523


Iteration: 100%|██████████| 63/63 [00:32<00:00,  1.93it/s]


epoch 3: train_loss 0.03397092716916213


Iteration: 100%|██████████| 8/8 [00:05<00:00,  1.39it/s]


Validation Loss: 1.032549187541008
******** Sentence Level Evalation ********
=== Given labels (as-is) ===
Accuracy: 74.520
Macro F1 Score: 65.733
Binary F1 Score (pos): 48.380
Precision/Recall per class:
74.9,93.2,72.4,36.3
CM [[TN FP],[FN TP]] = [[2935, 213], [982, 560]]
ROC_AUC (fpr-tpr): 0.790
AUPRC: 0.643
=== Threshold sweeps on scores ===
[thr=0.5] thr=0.500 | Acc=73.8  MacroF1=63.2  BinF1(pos)=43.5
 P/R per class -> 0(H): 73.7/94.9 , 1(AI): 74.6/30.7
 CM [[TN FP],[FN TP]] = [[2987, 161], [1068, 474]]
[thr=YoudenJ] thr=0.281 | Acc=73.0  MacroF1=70.5  BinF1(pos)=61.9
 P/R per class -> 0(H): 82.3/76.1 , 1(AI): 57.8/66.7
 CM [[TN FP],[FN TP]] = [[2396, 752], [514, 1028]]
[thr=bestPosF1] thr=0.279 | Acc=72.9  MacroF1=70.4  BinF1(pos)=61.9
 P/R per class -> 0(H): 82.4/75.8 , 1(AI): 57.5/67.0
 CM [[TN FP],[FN TP]] = [[2385, 763], [509, 1033]]
74.5,65.7,48.4,74.9,93.2,72.4,36.3,0.790,0.643


Iteration: 100%|██████████| 63/63 [00:33<00:00,  1.91it/s]


epoch 4: train_loss 0.031550832004064604


Iteration: 100%|██████████| 8/8 [00:05<00:00,  1.42it/s]


Validation Loss: 0.953242652118206
******** Sentence Level Evalation ********
=== Given labels (as-is) ===
Accuracy: 75.330
Macro F1 Score: 65.373
Binary F1 Score (pos): 46.805
Precision/Recall per class:
74.5,96.1,80.4,33.0
CM [[TN FP],[FN TP]] = [[3024, 124], [1033, 509]]
ROC_AUC (fpr-tpr): 0.796
AUPRC: 0.659
=== Threshold sweeps on scores ===
[thr=0.5] thr=0.500 | Acc=73.8  MacroF1=62.6  BinF1(pos)=42.1
 P/R per class -> 0(H): 73.4/95.8 , 1(AI): 77.3/28.9
 CM [[TN FP],[FN TP]] = [[3017, 131], [1096, 446]]
[thr=YoudenJ] thr=0.209 | Acc=71.9  MacroF1=69.7  BinF1(pos)=61.5
 P/R per class -> 0(H): 82.6/73.7 , 1(AI): 56.0/68.4
 CM [[TN FP],[FN TP]] = [[2319, 829], [488, 1054]]
[thr=bestPosF1] thr=0.165 | Acc=68.2  MacroF1=67.3  BinF1(pos)=61.9
 P/R per class -> 0(H): 85.8/63.0 , 1(AI): 51.0/78.8
 CM [[TN FP],[FN TP]] = [[1982, 1166], [327, 1215]]
75.3,65.4,46.8,74.5,96.1,80.4,33.0,0.796,0.659


Iteration: 100%|██████████| 63/63 [00:32<00:00,  1.91it/s]


epoch 5: train_loss 0.03004901051994354


Iteration: 100%|██████████| 8/8 [00:06<00:00,  1.24it/s]


Validation Loss: 0.8525397256016731
******** Sentence Level Evalation ********
=== Given labels (as-is) ===
Accuracy: 76.652
Macro F1 Score: 68.509
Binary F1 Score (pos): 52.495
Precision/Recall per class:
76.1,95.0,79.3,39.2
CM [[TN FP],[FN TP]] = [[2990, 158], [937, 605]]
ROC_AUC (fpr-tpr): 0.819
AUPRC: 0.687
=== Threshold sweeps on scores ===
[thr=0.5] thr=0.500 | Acc=74.9  MacroF1=65.7  BinF1(pos)=47.9
 P/R per class -> 0(H): 74.8/94.5 , 1(AI): 75.7/35.1
 CM [[TN FP],[FN TP]] = [[2974, 174], [1001, 541]]
[thr=YoudenJ] thr=0.248 | Acc=73.6  MacroF1=71.8  BinF1(pos)=64.7
 P/R per class -> 0(H): 85.0/73.6 , 1(AI): 57.7/73.5
 CM [[TN FP],[FN TP]] = [[2316, 832], [408, 1134]]
[thr=bestPosF1] thr=0.248 | Acc=73.6  MacroF1=71.8  BinF1(pos)=64.7
 P/R per class -> 0(H): 85.0/73.6 , 1(AI): 57.7/73.5
 CM [[TN FP],[FN TP]] = [[2316, 832], [408, 1134]]
76.7,68.5,52.5,76.1,95.0,79.3,39.2,0.819,0.687


Iteration: 100%|██████████| 63/63 [00:32<00:00,  1.93it/s]


epoch 6: train_loss 0.029299531545903947


Iteration: 100%|██████████| 8/8 [00:05<00:00,  1.44it/s]


Validation Loss: 0.8280503004789352
******** Sentence Level Evalation ********
=== Given labels (as-is) ===
Accuracy: 76.013
Macro F1 Score: 66.608
Binary F1 Score (pos): 48.887
Precision/Recall per class:
75.1,96.2,81.6,34.9
CM [[TN FP],[FN TP]] = [[3027, 121], [1004, 538]]
ROC_AUC (fpr-tpr): 0.831
AUPRC: 0.706
=== Threshold sweeps on scores ===
[thr=0.5] thr=0.500 | Acc=75.4  MacroF1=65.3  BinF1(pos)=46.6
 P/R per class -> 0(H): 74.5/96.3 , 1(AI): 81.3/32.7
 CM [[TN FP],[FN TP]] = [[3032, 116], [1038, 504]]
[thr=YoudenJ] thr=0.192 | Acc=74.6  MacroF1=72.8  BinF1(pos)=65.9
 P/R per class -> 0(H): 85.8/74.6 , 1(AI): 59.0/74.7
 CM [[TN FP],[FN TP]] = [[2347, 801], [390, 1152]]
[thr=bestPosF1] thr=0.192 | Acc=74.6  MacroF1=72.8  BinF1(pos)=65.9
 P/R per class -> 0(H): 85.8/74.6 , 1(AI): 59.0/74.7
 CM [[TN FP],[FN TP]] = [[2347, 801], [390, 1152]]
76.0,66.6,48.9,75.1,96.2,81.6,34.9,0.831,0.706


Iteration: 100%|██████████| 63/63 [00:32<00:00,  1.96it/s]


epoch 7: train_loss 0.028878397234375516


Iteration: 100%|██████████| 8/8 [00:05<00:00,  1.42it/s]


Validation Loss: 0.8255745619535446
******** Sentence Level Evalation ********
=== Given labels (as-is) ===
Accuracy: 75.842
Macro F1 Score: 66.004
Binary F1 Score (pos): 47.716
Precision/Recall per class:
74.8,96.6,82.7,33.5
CM [[TN FP],[FN TP]] = [[3040, 108], [1025, 517]]
ROC_AUC (fpr-tpr): 0.832
AUPRC: 0.712
=== Threshold sweeps on scores ===
[thr=0.5] thr=0.500 | Acc=74.9  MacroF1=63.8  BinF1(pos)=43.7
 P/R per class -> 0(H): 73.8/97.1 , 1(AI): 83.5/29.6
 CM [[TN FP],[FN TP]] = [[3058, 90], [1086, 456]]
[thr=YoudenJ] thr=0.159 | Acc=74.3  MacroF1=72.6  BinF1(pos)=65.6
 P/R per class -> 0(H): 85.6/74.2 , 1(AI): 58.6/74.5
 CM [[TN FP],[FN TP]] = [[2337, 811], [393, 1149]]
[thr=bestPosF1] thr=0.159 | Acc=74.3  MacroF1=72.6  BinF1(pos)=65.6
 P/R per class -> 0(H): 85.6/74.2 , 1(AI): 58.6/74.5
 CM [[TN FP],[FN TP]] = [[2337, 811], [393, 1149]]
75.8,66.0,47.7,74.8,96.6,82.7,33.5,0.832,0.712


Iteration: 100%|██████████| 63/63 [00:32<00:00,  1.96it/s]


epoch 8: train_loss 0.028617681905863775


Iteration: 100%|██████████| 8/8 [00:05<00:00,  1.45it/s]


Validation Loss: 0.7775556221604347
******** Sentence Level Evalation ********
=== Given labels (as-is) ===
Accuracy: 76.866
Macro F1 Score: 68.075
Binary F1 Score (pos): 51.323
Precision/Recall per class:
75.8,96.3,83.3,37.1
CM [[TN FP],[FN TP]] = [[3033, 115], [970, 572]]
ROC_AUC (fpr-tpr): 0.845
AUPRC: 0.732
=== Threshold sweeps on scores ===
[thr=0.5] thr=0.500 | Acc=75.9  MacroF1=66.2  BinF1(pos)=48.0
 P/R per class -> 0(H): 74.9/96.6 , 1(AI): 83.0/33.8
 CM [[TN FP],[FN TP]] = [[3041, 107], [1021, 521]]
[thr=YoudenJ] thr=0.192 | Acc=76.2  MacroF1=74.3  BinF1(pos)=67.3
 P/R per class -> 0(H): 86.1/76.9 , 1(AI): 61.3/74.6
 CM [[TN FP],[FN TP]] = [[2422, 726], [391, 1151]]
[thr=bestPosF1] thr=0.197 | Acc=76.4  MacroF1=74.5  BinF1(pos)=67.3
 P/R per class -> 0(H): 85.9/77.7 , 1(AI): 61.9/73.9
 CM [[TN FP],[FN TP]] = [[2446, 702], [403, 1139]]
76.9,68.1,51.3,75.8,96.3,83.3,37.1,0.845,0.732


Iteration: 100%|██████████| 63/63 [00:33<00:00,  1.90it/s]


epoch 9: train_loss 0.028052987678656504


Iteration: 100%|██████████| 8/8 [00:05<00:00,  1.37it/s]


Validation Loss: 0.7292047441005707
******** Sentence Level Evalation ********
=== Given labels (as-is) ===
Accuracy: 79.168
Macro F1 Score: 74.325
Binary F1 Score (pos): 63.174
Precision/Recall per class:
80.3,91.3,75.4,54.3
CM [[TN FP],[FN TP]] = [[2875, 273], [704, 838]]
ROC_AUC (fpr-tpr): 0.853
AUPRC: 0.746
=== Threshold sweeps on scores ===
[thr=0.5] thr=0.500 | Acc=78.8  MacroF1=73.4  BinF1(pos)=61.5
 P/R per class -> 0(H): 79.5/92.1 , 1(AI): 76.2/51.6
 CM [[TN FP],[FN TP]] = [[2899, 249], [746, 796]]
[thr=YoudenJ] thr=0.343 | Acc=78.4  MacroF1=75.9  BinF1(pos)=68.2
 P/R per class -> 0(H): 85.0/82.4 , 1(AI): 66.1/70.3
 CM [[TN FP],[FN TP]] = [[2593, 555], [458, 1084]]
[thr=bestPosF1] thr=0.343 | Acc=78.4  MacroF1=75.9  BinF1(pos)=68.2
 P/R per class -> 0(H): 85.0/82.4 , 1(AI): 66.1/70.3
 CM [[TN FP],[FN TP]] = [[2593, 555], [458, 1084]]
79.2,74.3,63.2,80.3,91.3,75.4,54.3,0.853,0.746


Iteration: 100%|██████████| 63/63 [00:33<00:00,  1.89it/s]


epoch 10: train_loss 0.027808418733969567


Iteration: 100%|██████████| 8/8 [00:05<00:00,  1.40it/s]


Validation Loss: 0.7218530997633934
******** Sentence Level Evalation ********
=== Given labels (as-is) ===
Accuracy: 78.145
Macro F1 Score: 71.943
Binary F1 Score (pos): 58.753
Precision/Recall per class:
78.3,93.2,77.4,47.3
CM [[TN FP],[FN TP]] = [[2935, 213], [812, 730]]
ROC_AUC (fpr-tpr): 0.851
AUPRC: 0.745
=== Threshold sweeps on scores ===
[thr=0.5] thr=0.500 | Acc=78.1  MacroF1=71.8  BinF1(pos)=58.5
 P/R per class -> 0(H): 78.2/93.5 , 1(AI): 77.9/46.8
 CM [[TN FP],[FN TP]] = [[2944, 204], [821, 721]]
[thr=YoudenJ] thr=0.291 | Acc=77.3  MacroF1=75.2  BinF1(pos)=68.0
 P/R per class -> 0(H): 85.9/79.3 , 1(AI): 63.4/73.3
 CM [[TN FP],[FN TP]] = [[2495, 653], [411, 1131]]
[thr=bestPosF1] thr=0.291 | Acc=77.3  MacroF1=75.2  BinF1(pos)=68.0
 P/R per class -> 0(H): 85.9/79.3 , 1(AI): 63.4/73.3
 CM [[TN FP],[FN TP]] = [[2495, 653], [411, 1131]]
78.1,71.9,58.8,78.3,93.2,77.4,47.3,0.851,0.745


Iteration: 100%|██████████| 63/63 [00:31<00:00,  1.97it/s]


epoch 11: train_loss 0.02756033536224138


Iteration: 100%|██████████| 8/8 [00:05<00:00,  1.41it/s]
Epoch:  55%|█████▌    | 11/20 [07:12<05:50, 38.96s/it]

Validation Loss: 0.7234265506267548
******** Sentence Level Evalation ********
=== Given labels (as-is) ===
Accuracy: 77.569
Macro F1 Score: 70.302
Binary F1 Score (pos): 55.612
Precision/Recall per class:
77.1,94.6,79.6,42.7
CM [[TN FP],[FN TP]] = [[2979, 169], [883, 659]]
ROC_AUC (fpr-tpr): 0.857
AUPRC: 0.754
=== Threshold sweeps on scores ===
[thr=0.5] thr=0.500 | Acc=77.8  MacroF1=70.4  BinF1(pos)=55.5
 P/R per class -> 0(H): 77.1/95.4 , 1(AI): 81.6/42.1
 CM [[TN FP],[FN TP]] = [[3002, 146], [893, 649]]
[thr=YoudenJ] thr=0.211 | Acc=76.4  MacroF1=74.8  BinF1(pos)=68.5
 P/R per class -> 0(H): 87.6/75.5 , 1(AI): 61.0/78.1
 CM [[TN FP],[FN TP]] = [[2378, 770], [337, 1205]]
[thr=bestPosF1] thr=0.255 | Acc=78.3  MacroF1=76.0  BinF1(pos)=68.5
 P/R per class -> 0(H): 85.5/81.5 , 1(AI): 65.5/71.8
 CM [[TN FP],[FN TP]] = [[2566, 582], [435, 1107]]
77.6,70.3,55.6,77.1,94.6,79.6,42.7,0.857,0.754


Iteration: 100%|██████████| 63/63 [00:32<00:00,  1.95it/s]


epoch 12: train_loss 0.027326575849973965


Iteration: 100%|██████████| 8/8 [00:05<00:00,  1.43it/s]


Validation Loss: 0.7016537934541702
******** Sentence Level Evalation ********
=== Given labels (as-is) ===
Accuracy: 78.891
Macro F1 Score: 72.880
Binary F1 Score (pos): 60.113
Precision/Recall per class:
78.8,93.8,79.4,48.4
CM [[TN FP],[FN TP]] = [[2954, 194], [796, 746]]
ROC_AUC (fpr-tpr): 0.858
AUPRC: 0.757
=== Threshold sweeps on scores ===
[thr=0.5] thr=0.500 | Acc=78.7  MacroF1=72.4  BinF1(pos)=59.2
 P/R per class -> 0(H): 78.4/94.2 , 1(AI): 79.8/47.0
 CM [[TN FP],[FN TP]] = [[2965, 183], [817, 725]]
[thr=YoudenJ] thr=0.248 | Acc=76.2  MacroF1=74.6  BinF1(pos)=68.2
 P/R per class -> 0(H): 87.3/75.6 , 1(AI): 60.9/77.6
 CM [[TN FP],[FN TP]] = [[2380, 768], [346, 1196]]
[thr=bestPosF1] thr=0.314 | Acc=79.0  MacroF1=76.4  BinF1(pos)=68.5
 P/R per class -> 0(H): 84.9/83.6 , 1(AI): 67.5/69.6
 CM [[TN FP],[FN TP]] = [[2631, 517], [469, 1073]]
78.9,72.9,60.1,78.8,93.8,79.4,48.4,0.858,0.757


Iteration: 100%|██████████| 63/63 [00:33<00:00,  1.85it/s]


epoch 13: train_loss 0.027245092457012524


Iteration: 100%|██████████| 8/8 [00:05<00:00,  1.40it/s]
Epoch:  65%|██████▌   | 13/20 [08:30<04:34, 39.14s/it]

Validation Loss: 0.7034503370523453
******** Sentence Level Evalation ********
=== Given labels (as-is) ===
Accuracy: 78.081
Macro F1 Score: 71.172
Binary F1 Score (pos): 57.059
Precision/Recall per class:
77.6,94.6,80.2,44.3
CM [[TN FP],[FN TP]] = [[2979, 169], [859, 683]]
ROC_AUC (fpr-tpr): 0.864
AUPRC: 0.763
=== Threshold sweeps on scores ===
[thr=0.5] thr=0.500 | Acc=78.5  MacroF1=71.5  BinF1(pos)=57.3
 P/R per class -> 0(H): 77.6/95.4 , 1(AI): 82.5/43.9
 CM [[TN FP],[FN TP]] = [[3004, 144], [865, 677]]
[thr=YoudenJ] thr=0.187 | Acc=75.9  MacroF1=74.6  BinF1(pos)=68.8
 P/R per class -> 0(H): 88.7/73.4 , 1(AI): 59.8/80.9
 CM [[TN FP],[FN TP]] = [[2310, 838], [294, 1248]]
[thr=bestPosF1] thr=0.260 | Acc=79.1  MacroF1=76.7  BinF1(pos)=69.2
 P/R per class -> 0(H): 85.5/82.8 , 1(AI): 67.1/71.4
 CM [[TN FP],[FN TP]] = [[2607, 541], [441, 1101]]
78.1,71.2,57.1,77.6,94.6,80.2,44.3,0.864,0.763


Iteration: 100%|██████████| 63/63 [00:33<00:00,  1.87it/s]


epoch 14: train_loss 0.027111405477164282


Iteration: 100%|██████████| 8/8 [00:05<00:00,  1.34it/s]


Validation Loss: 0.6848049014806747
******** Sentence Level Evalation ********
=== Given labels (as-is) ===
Accuracy: 79.595
Macro F1 Score: 74.336
Binary F1 Score (pos): 62.719
Precision/Recall per class:
79.9,93.0,78.5,52.2
CM [[TN FP],[FN TP]] = [[2928, 220], [737, 805]]
ROC_AUC (fpr-tpr): 0.863
AUPRC: 0.763
=== Threshold sweeps on scores ===
[thr=0.5] thr=0.500 | Acc=79.5  MacroF1=74.0  BinF1(pos)=61.9
 P/R per class -> 0(H): 79.5/93.7 , 1(AI): 79.7/50.6
 CM [[TN FP],[FN TP]] = [[2949, 199], [761, 781]]
[thr=YoudenJ] thr=0.237 | Acc=76.2  MacroF1=74.8  BinF1(pos)=68.8
 P/R per class -> 0(H): 88.3/74.5 , 1(AI): 60.5/79.8
 CM [[TN FP],[FN TP]] = [[2346, 802], [312, 1230]]
[thr=bestPosF1] thr=0.238 | Acc=76.3  MacroF1=74.9  BinF1(pos)=68.8
 P/R per class -> 0(H): 88.2/74.7 , 1(AI): 60.7/79.6
 CM [[TN FP],[FN TP]] = [[2352, 796], [315, 1227]]
79.6,74.3,62.7,79.9,93.0,78.5,52.2,0.863,0.763


Iteration: 100%|██████████| 63/63 [00:34<00:00,  1.85it/s]


epoch 15: train_loss 0.027114815241287626


Iteration: 100%|██████████| 8/8 [00:05<00:00,  1.37it/s]


Validation Loss: 0.6831010282039642
******** Sentence Level Evalation ********
=== Given labels (as-is) ===
Accuracy: 78.955
Macro F1 Score: 73.389
Binary F1 Score (pos): 61.218
Precision/Recall per class:
79.3,92.9,77.7,50.5
CM [[TN FP],[FN TP]] = [[2924, 224], [763, 779]]
ROC_AUC (fpr-tpr): 0.865
AUPRC: 0.765
=== Threshold sweeps on scores ===
[thr=0.5] thr=0.500 | Acc=79.3  MacroF1=73.4  BinF1(pos)=60.8
 P/R per class -> 0(H): 79.0/94.2 , 1(AI): 80.4/48.9
 CM [[TN FP],[FN TP]] = [[2964, 184], [788, 754]]
[thr=YoudenJ] thr=0.211 | Acc=75.9  MacroF1=74.6  BinF1(pos)=69.0
 P/R per class -> 0(H): 88.9/73.2 , 1(AI): 59.8/81.4
 CM [[TN FP],[FN TP]] = [[2305, 843], [287, 1255]]
[thr=bestPosF1] thr=0.261 | Acc=78.1  MacroF1=76.1  BinF1(pos)=69.1
 P/R per class -> 0(H): 86.5/79.8 , 1(AI): 64.4/74.6
 CM [[TN FP],[FN TP]] = [[2511, 637], [391, 1151]]
79.0,73.4,61.2,79.3,92.9,77.7,50.5,0.865,0.765


Iteration: 100%|██████████| 63/63 [00:32<00:00,  1.94it/s]


epoch 16: train_loss 0.026888944918201083


Iteration: 100%|██████████| 8/8 [00:05<00:00,  1.38it/s]
Epoch:  80%|████████  | 16/20 [10:30<02:37, 39.37s/it]

Validation Loss: 0.6855407729744911
******** Sentence Level Evalation ********
=== Given labels (as-is) ===
Accuracy: 78.443
Macro F1 Score: 72.006
Binary F1 Score (pos): 58.583
Precision/Recall per class:
78.2,94.2,79.5,46.4
CM [[TN FP],[FN TP]] = [[2964, 184], [827, 715]]
ROC_AUC (fpr-tpr): 0.866
AUPRC: 0.768
=== Threshold sweeps on scores ===
[thr=0.5] thr=0.500 | Acc=78.8  MacroF1=72.1  BinF1(pos)=58.5
 P/R per class -> 0(H): 78.1/95.1 , 1(AI): 82.1/45.4
 CM [[TN FP],[FN TP]] = [[2995, 153], [842, 700]]
[thr=YoudenJ] thr=0.215 | Acc=77.0  MacroF1=75.5  BinF1(pos)=69.3
 P/R per class -> 0(H): 88.1/76.0 , 1(AI): 61.7/79.1
 CM [[TN FP],[FN TP]] = [[2392, 756], [323, 1219]]
[thr=bestPosF1] thr=0.247 | Acc=78.4  MacroF1=76.4  BinF1(pos)=69.4
 P/R per class -> 0(H): 86.6/80.2 , 1(AI): 64.9/74.7
 CM [[TN FP],[FN TP]] = [[2524, 624], [390, 1152]]
78.4,72.0,58.6,78.2,94.2,79.5,46.4,0.866,0.768


Iteration: 100%|██████████| 63/63 [00:32<00:00,  1.97it/s]


epoch 17: train_loss 0.02688767287939314


Iteration: 100%|██████████| 8/8 [00:06<00:00,  1.32it/s]


Validation Loss: 0.6771578267216682
******** Sentence Level Evalation ********
=== Given labels (as-is) ===
Accuracy: 79.488
Macro F1 Score: 74.018
Binary F1 Score (pos): 62.096
Precision/Recall per class:
79.6,93.4,79.1,51.1
CM [[TN FP],[FN TP]] = [[2940, 208], [754, 788]]
ROC_AUC (fpr-tpr): 0.868
AUPRC: 0.770
=== Threshold sweeps on scores ===
[thr=0.5] thr=0.500 | Acc=79.5  MacroF1=73.7  BinF1(pos)=61.3
 P/R per class -> 0(H): 79.2/94.3 , 1(AI): 80.8/49.4
 CM [[TN FP],[FN TP]] = [[2967, 181], [781, 761]]
[thr=YoudenJ] thr=0.222 | Acc=76.8  MacroF1=75.4  BinF1(pos)=69.6
 P/R per class -> 0(H): 88.8/74.9 , 1(AI): 61.1/80.7
 CM [[TN FP],[FN TP]] = [[2357, 791], [298, 1244]]
[thr=bestPosF1] thr=0.222 | Acc=76.8  MacroF1=75.4  BinF1(pos)=69.6
 P/R per class -> 0(H): 88.8/74.9 , 1(AI): 61.1/80.7
 CM [[TN FP],[FN TP]] = [[2357, 791], [298, 1244]]
79.5,74.0,62.1,79.6,93.4,79.1,51.1,0.868,0.770


Iteration: 100%|██████████| 63/63 [00:31<00:00,  1.99it/s]


epoch 18: train_loss 0.02679358600150971


Iteration: 100%|██████████| 8/8 [00:05<00:00,  1.35it/s]


Validation Loss: 0.6744687110185623
******** Sentence Level Evalation ********
=== Given labels (as-is) ===
Accuracy: 79.296
Macro F1 Score: 73.716
Binary F1 Score (pos): 61.605
Precision/Recall per class:
79.4,93.4,78.9,50.5
CM [[TN FP],[FN TP]] = [[2940, 208], [763, 779]]
ROC_AUC (fpr-tpr): 0.869
AUPRC: 0.771
=== Threshold sweeps on scores ===
[thr=0.5] thr=0.500 | Acc=79.7  MacroF1=73.9  BinF1(pos)=61.6
 P/R per class -> 0(H): 79.3/94.4 , 1(AI): 81.4/49.5
 CM [[TN FP],[FN TP]] = [[2973, 175], [778, 764]]
[thr=YoudenJ] thr=0.222 | Acc=76.7  MacroF1=75.3  BinF1(pos)=69.5
 P/R per class -> 0(H): 88.8/74.7 , 1(AI): 61.0/80.8
 CM [[TN FP],[FN TP]] = [[2350, 798], [296, 1246]]
[thr=bestPosF1] thr=0.249 | Acc=77.8  MacroF1=76.0  BinF1(pos)=69.5
 P/R per class -> 0(H): 87.4/78.2 , 1(AI): 63.4/77.0
 CM [[TN FP],[FN TP]] = [[2461, 687], [354, 1188]]
79.3,73.7,61.6,79.4,93.4,78.9,50.5,0.869,0.771


Iteration: 100%|██████████| 63/63 [00:31<00:00,  1.97it/s]


epoch 19: train_loss 0.026752432246529866


Iteration: 100%|██████████| 8/8 [00:05<00:00,  1.34it/s]


Validation Loss: 0.6726795211434364
******** Sentence Level Evalation ********
=== Given labels (as-is) ===
Accuracy: 79.616
Macro F1 Score: 74.231
Binary F1 Score (pos): 62.451
Precision/Recall per class:
79.7,93.4,79.2,51.6
CM [[TN FP],[FN TP]] = [[2939, 209], [747, 795]]
ROC_AUC (fpr-tpr): 0.869
AUPRC: 0.772
=== Threshold sweeps on scores ===
[thr=0.5] thr=0.500 | Acc=79.9  MacroF1=74.2  BinF1(pos)=62.2
 P/R per class -> 0(H): 79.5/94.3 , 1(AI): 81.1/50.5
 CM [[TN FP],[FN TP]] = [[2967, 181], [764, 778]]
[thr=YoudenJ] thr=0.226 | Acc=76.8  MacroF1=75.4  BinF1(pos)=69.5
 P/R per class -> 0(H): 88.7/75.0 , 1(AI): 61.2/80.4
 CM [[TN FP],[FN TP]] = [[2361, 787], [302, 1240]]
[thr=bestPosF1] thr=0.258 | Acc=78.1  MacroF1=76.3  BinF1(pos)=69.6
 P/R per class -> 0(H): 87.1/79.2 , 1(AI): 64.1/76.1
 CM [[TN FP],[FN TP]] = [[2492, 656], [369, 1173]]
79.6,74.2,62.5,79.7,93.4,79.2,51.6,0.869,0.772


Iteration: 100%|██████████| 63/63 [00:32<00:00,  1.96it/s]


epoch 20: train_loss 0.026741520782548284


Iteration: 100%|██████████| 8/8 [00:05<00:00,  1.41it/s]
Epoch: 100%|██████████| 20/20 [13:03<00:00, 39.19s/it]


Validation Loss: 0.6735561937093735
******** Sentence Level Evalation ********
=== Given labels (as-is) ===
Accuracy: 79.190
Macro F1 Score: 73.548
Binary F1 Score (pos): 61.331
Precision/Recall per class:
79.3,93.4,78.8,50.2
CM [[TN FP],[FN TP]] = [[2940, 208], [768, 774]]
ROC_AUC (fpr-tpr): 0.869
AUPRC: 0.772
=== Threshold sweeps on scores ===
[thr=0.5] thr=0.500 | Acc=79.7  MacroF1=73.9  BinF1(pos)=61.6
 P/R per class -> 0(H): 79.2/94.6 , 1(AI): 81.7/49.4
 CM [[TN FP],[FN TP]] = [[2977, 171], [780, 762]]
[thr=YoudenJ] thr=0.225 | Acc=76.9  MacroF1=75.5  BinF1(pos)=69.6
 P/R per class -> 0(H): 88.6/75.3 , 1(AI): 61.4/80.2
 CM [[TN FP],[FN TP]] = [[2371, 777], [305, 1237]]
[thr=bestPosF1] thr=0.258 | Acc=78.3  MacroF1=76.4  BinF1(pos)=69.6
 P/R per class -> 0(H): 87.0/79.6 , 1(AI): 64.5/75.6
 CM [[TN FP],[FN TP]] = [[2506, 642], [376, 1166]]
79.2,73.5,61.3,79.3,93.4,78.8,50.2,0.869,0.772
Reloading best model from ckpt/gemini_hybrid_line_best_f1.pt


Iteration: 100%|██████████| 18/18 [00:16<00:00,  1.12it/s]


******** Sentence Level Evalation ********
=== Given labels (as-is) ===
Accuracy: 77.471
Macro F1 Score: 71.051
Binary F1 Score (pos): 57.418
Precision/Recall per class:
75.8,95.9,85.0,43.4
CM [[TN FP],[FN TP]] = [[7934, 342], [2528, 1935]]
ROC_AUC (fpr-tpr): 0.848
AUPRC: 0.765
=== Threshold sweeps on scores ===
[thr=0.5] thr=0.500 | Acc=76.6  MacroF1=69.8  BinF1(pos)=55.6
 P/R per class -> 0(H): 75.2/95.4 , 1(AI): 83.0/41.8
 CM [[TN FP],[FN TP]] = [[7894, 382], [2599, 1864]]
[thr=YoudenJ] thr=0.244 | Acc=76.3  MacroF1=74.9  BinF1(pos)=69.1
 P/R per class -> 0(H): 85.3/76.7 , 1(AI): 63.6/75.6
 CM [[TN FP],[FN TP]] = [[6348, 1928], [1090, 3373]]
[thr=bestPosF1] thr=0.244 | Acc=76.3  MacroF1=74.9  BinF1(pos)=69.1
 P/R per class -> 0(H): 85.3/76.7 , 1(AI): 63.6/75.6
 CM [[TN FP],[FN TP]] = [[6348, 1928], [1090, 3373]]
77.5,71.1,57.4,75.8,95.9,85.0,43.4,0.848,0.765
