# 最初から機械学習でやってみる

In [7]:
import os
import json
import time
import glob
from typing import TypeVar, Type, List, Optional
from dataclasses import dataclass
from pydantic import BaseModel
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from openai import AzureOpenAI
from tqdm import tqdm
from typing import List, Dict, Optional, Any, Tuple

In [8]:
csv_files = glob.glob('*.csv')
print(csv_files)

['extracted_500_first_20241130_検索結果1204.csv', 'pubmed_true_data.csv']


In [9]:
df = pd.read_csv(csv_files[0])
# Assuming your dataframe is named 'df'
df['included'] = df['notes'].astype(str).str.contains('Included')

print(df['included'].value_counts())
print(f'不一致の数: {df[df["included"] == True]["notes"].str.contains("Excluded").sum()}')
#予測の元となる変数

import json

def create_json_text(row):
    return json.dumps({
        'title': str(row['title']),
        'abstract': str(row['abstract'])
    })

df['tiab'] = df.apply(create_json_text, axis=1)

included
False    493
True       7
Name: count, dtype: int64
不一致の数: 0


In [14]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report
import optuna
from lightgbm import LGBMClassifier

def prepare_text_data(df, text_column='tiab', label_column='included'):
    """テキストデータの前処理とTF-IDF変換"""
    texts = df[text_column].fillna('')

    vectorizer = TfidfVectorizer(
        max_features=10000,
        min_df=2,
        max_df=0.95,
        ngram_range=(1, 2)
    )

    X_vec = vectorizer.fit_transform(texts)
    y = df[label_column]

    class_weights = dict(zip(
        y.unique(),
        [1 / (len(y) * (y == label).mean()) for label in y.unique()]
    ))

    return X_vec, y, vectorizer, class_weights

def fbeta_score_custom(y_true, y_pred, beta=1):
    """カスタムF-betaスコアの計算"""
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0,1]).ravel()
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    if precision == 0.0 and recall == 0.0:
        return 0.0
    fbeta = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)
    return fbeta

def calculate_detailed_metrics(y_true, y_pred):
    """詳細な評価指標の計算と表示"""
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0,1]).ravel()

    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    f1 = 2 * (precision * sensitivity) / (precision + sensitivity) if (precision + sensitivity) > 0 else 0

    print("\n=== 詳細な評価指標 ===")
    print(f"感度 (Sensitivity/Recall): {sensitivity:.3f}")
    print(f"特異度 (Specificity): {specificity:.3f}")
    print(f"適合率 (Precision): {precision:.3f}")
    print(f"F1スコア: {f1:.3f}")

    print("\n=== 混同行列 ===")
    print("                  Predicted")
    print("                  Negative  Positive")
    print(f"Actual Negative    {tn:^8} {fp:^8}")
    print(f"      Positive    {fn:^8} {tp:^8}")

    return {
        'sensitivity': sensitivity,
        'specificity': specificity,
        'precision': precision,
        'f1': f1,
        'confusion_matrix': {'tn': tn, 'fp': fp, 'fn': fn, 'tp': tp}
    }

def evaluate_model(X_vec, y, params, threshold, class_weights, beta=2):
    """モデルの評価（交差検証）"""
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []
    all_predictions = []
    all_true_values = []

    for train_idx, val_idx in skf.split(X_vec, y):
        X_tr = X_vec[train_idx]
        X_val = X_vec[val_idx]
        y_tr = y.iloc[train_idx]
        y_val = y.iloc[val_idx]

        model = LGBMClassifier(
            **params,
            random_state=42,
            class_weight=class_weights
        )

        model.fit(X_tr, y_tr)
        y_prob = model.predict_proba(X_val)[:, 1]
        y_pred = (y_prob >= threshold).astype(int)

        score = fbeta_score_custom(y_val, y_pred, beta=beta)
        scores.append(score)

        all_predictions.extend(y_pred)
        all_true_values.extend(y_val)

    if len(scores) == 5:
        print("\n=== 交差検証全体の評価 ===")
        metrics = calculate_detailed_metrics(all_true_values, all_predictions)
        print("\nClassification Report:")
        print(classification_report(all_true_values, all_predictions))

    return np.mean(scores)

def objective(trial, X_vec, y, class_weights, beta=2):
    """Optunaの目的関数"""
    params = {
        "num_leaves": trial.suggest_int("num_leaves", 8, 128),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "verbosity": -1
    }
    threshold = trial.suggest_float("threshold", 0.1, 0.9)

    score = evaluate_model(X_vec, y, params, threshold, class_weights, beta=beta)
    return score

def train_final_model(df, beta_value=2, n_trials=50):
    """完全なトレーニングパイプライン"""
    # データの準備
    X_vec, y, vectorizer, class_weights = prepare_text_data(df)

    print(f"データセットの形状: {X_vec.shape}")
    print(f"クラスの分布:\n{y.value_counts(normalize=True)}")
    print(f"クラスの重み: {class_weights}")

    # ハイパーパラメータの最適化
    study = optuna.create_study(direction="maximize")
    study.optimize(
        lambda trial: objective(trial, X_vec, y, class_weights, beta=beta_value),
        n_trials=n_trials
    )

    best_params = study.best_trial.params
    threshold = best_params.pop("threshold")

    print("\nBest parameters:", best_params)
    print("Best threshold:", threshold)

    final_score = evaluate_model(X_vec, y, best_params, threshold, class_weights, beta=beta_value)
    print(f"\nFinal Mean F{beta_value} Score:", final_score)

    # 最終モデルの学習
    final_model = LGBMClassifier(
        **best_params,
        random_state=42,
        class_weight=class_weights
    )
    final_model.fit(X_vec, y)

    # 最終評価
    print("\n=== 最終モデルの評価（全データ） ===")
    y_prob = final_model.predict_proba(X_vec)[:, 1]
    y_pred = (y_prob >= threshold).astype(int)
    final_metrics = calculate_detailed_metrics(y, y_pred)

    return final_model, vectorizer, threshold, best_params

def predict_new_text(text, model, vectorizer, threshold):
    """新しいテキストの予測"""
    X_new = vectorizer.transform([text])
    prob = model.predict_proba(X_new)[0, 1]
    prediction = prob >= threshold
    return prediction, prob



In [15]:
# 使用例
if __name__ == "__main__":
    # データの読み込み
    #df = pd.read_csv('your_data.csv')  # tiab列とincluded列を含むデータ

    # モデルのトレーニング
    model, vectorizer, threshold, best_params = train_final_model(
        df,
        beta_value=4,  # より高い値で再現率を重視
        n_trials=50
    )


[I 2024-12-07 08:43:28,467] A new study created in memory with name: no-name-e1d0eb7e-c3f2-46ee-9206-27933ba53b0f


データセットの形状: (500, 10000)
クラスの分布:
included
False    0.986
True     0.014
Name: proportion, dtype: float64
クラスの重み: {np.False_: np.float64(0.002028397565922921), np.True_: np.float64(0.14285714285714285)}


[I 2024-12-07 08:43:31,733] Trial 0 finished with value: 0.1619047619047619 and parameters: {'num_leaves': 76, 'learning_rate': 0.0251301969404423, 'n_estimators': 221, 'reg_lambda': 0.004093897052729685, 'min_child_samples': 13, 'threshold': 0.11723547712465986}. Best is trial 0 with value: 0.1619047619047619.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.143
特異度 (Specificity): 0.980
適合率 (Precision): 0.091
F1スコア: 0.111

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      483       10   
      Positive       6        1    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.98      0.98       493
        True       0.09      0.14      0.11         7

    accuracy                           0.97       500
   macro avg       0.54      0.56      0.55       500
weighted avg       0.98      0.97      0.97       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2024-12-07 08:43:33,236] Trial 1 finished with value: 0.19096133751306166 and parameters: {'num_leaves': 80, 'learning_rate': 0.0029227290249230985, 'n_estimators': 301, 'reg_lambda': 4.01762225160193, 'min_child_samples': 58, 'threshold': 0.3892638318263474}. Best is trial 1 with value: 0.19096133751306166.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 1.000
特異度 (Specificity): 0.000
適合率 (Precision): 0.014
F1スコア: 0.028

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative       0       493   
      Positive       0        7    

Classification Report:
              precision    recall  f1-score   support

       False       0.00      0.00      0.00       493
        True       0.01      1.00      0.03         7

    accuracy                           0.01       500
   macro avg       0.01      0.50      0.01       500
weighted avg       0.00      0.01      0.00       500



[I 2024-12-07 08:43:34,659] Trial 2 finished with value: 0.0 and parameters: {'num_leaves': 47, 'learning_rate': 0.004723800251722145, 'n_estimators': 216, 'reg_lambda': 1.7259404683849873, 'min_child_samples': 37, 'threshold': 0.5299693383273998}. Best is trial 1 with value: 0.19096133751306166.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.000
特異度 (Specificity): 0.959
適合率 (Precision): 0.000
F1スコア: 0.000

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      473       20   
      Positive       7        0    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.96      0.97       493
        True       0.00      0.00      0.00         7

    accuracy                           0.95       500
   macro avg       0.49      0.48      0.49       500
weighted avg       0.97      0.95      0.96       500



[I 2024-12-07 08:43:35,886] Trial 3 finished with value: 0.28190410199556537 and parameters: {'num_leaves': 64, 'learning_rate': 0.0030301292486102807, 'n_estimators': 336, 'reg_lambda': 0.01705486589753941, 'min_child_samples': 98, 'threshold': 0.28280577223852565}. Best is trial 3 with value: 0.28190410199556537.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.857
特異度 (Specificity): 0.535
適合率 (Precision): 0.026
F1スコア: 0.050

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      264      229   
      Positive       1        6    

Classification Report:
              precision    recall  f1-score   support

       False       1.00      0.54      0.70       493
        True       0.03      0.86      0.05         7

    accuracy                           0.54       500
   macro avg       0.51      0.70      0.37       500
weighted avg       0.98      0.54      0.69       500



[I 2024-12-07 08:43:37,125] Trial 4 finished with value: 0.0 and parameters: {'num_leaves': 94, 'learning_rate': 0.021184195232166844, 'n_estimators': 60, 'reg_lambda': 0.003334401005889195, 'min_child_samples': 29, 'threshold': 0.7891492208977606}. Best is trial 3 with value: 0.28190410199556537.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.000
特異度 (Specificity): 0.990
適合率 (Precision): 0.000
F1スコア: 0.000

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      488       5    
      Positive       7        0    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.99      0.99       493
        True       0.00      0.00      0.00         7

    accuracy                           0.98       500
   macro avg       0.49      0.49      0.49       500
weighted avg       0.97      0.98      0.97       500



[I 2024-12-07 08:43:38,371] Trial 5 finished with value: 0.1 and parameters: {'num_leaves': 23, 'learning_rate': 0.24290976919723986, 'n_estimators': 466, 'reg_lambda': 0.0035589570563803762, 'min_child_samples': 85, 'threshold': 0.16770469788484546}. Best is trial 3 with value: 0.28190410199556537.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.143
特異度 (Specificity): 0.996
適合率 (Precision): 0.333
F1スコア: 0.200

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      491       2    
      Positive       6        1    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      1.00      0.99       493
        True       0.33      0.14      0.20         7

    accuracy                           0.98       500
   macro avg       0.66      0.57      0.60       500
weighted avg       0.98      0.98      0.98       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2024-12-07 08:43:39,793] Trial 6 finished with value: 0.19096133751306166 and parameters: {'num_leaves': 114, 'learning_rate': 0.0015677756771254733, 'n_estimators': 436, 'reg_lambda': 0.09173890983588573, 'min_child_samples': 63, 'threshold': 0.12014987813222673}. Best is trial 3 with value: 0.28190410199556537.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 1.000
特異度 (Specificity): 0.000
適合率 (Precision): 0.014
F1スコア: 0.028

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative       0       493   
      Positive       0        7    

Classification Report:
              precision    recall  f1-score   support

       False       0.00      0.00      0.00       493
        True       0.01      1.00      0.03         7

    accuracy                           0.01       500
   macro avg       0.01      0.50      0.01       500
weighted avg       0.00      0.01      0.00       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2024-12-07 08:43:42,352] Trial 7 finished with value: 0.0 and parameters: {'num_leaves': 45, 'learning_rate': 0.0017679950751297254, 'n_estimators': 374, 'reg_lambda': 0.07367192354830651, 'min_child_samples': 27, 'threshold': 0.7827385567858356}. Best is trial 3 with value: 0.28190410199556537.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.000
特異度 (Specificity): 1.000
適合率 (Precision): 0.000
F1スコア: 0.000

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      493       0    
      Positive       7        0    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      1.00      0.99       493
        True       0.00      0.00      0.00         7

    accuracy                           0.99       500
   macro avg       0.49      0.50      0.50       500
weighted avg       0.97      0.99      0.98       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2024-12-07 08:43:43,298] Trial 8 finished with value: 0.0 and parameters: {'num_leaves': 92, 'learning_rate': 0.012367194740297746, 'n_estimators': 156, 'reg_lambda': 3.261685195449988, 'min_child_samples': 48, 'threshold': 0.8430578916417784}. Best is trial 3 with value: 0.28190410199556537.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.000
特異度 (Specificity): 1.000
適合率 (Precision): 0.000
F1スコア: 0.000

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      493       0    
      Positive       7        0    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      1.00      0.99       493
        True       0.00      0.00      0.00         7

    accuracy                           0.99       500
   macro avg       0.49      0.50      0.50       500
weighted avg       0.97      0.99      0.98       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2024-12-07 08:43:44,204] Trial 9 finished with value: 0.19096133751306166 and parameters: {'num_leaves': 10, 'learning_rate': 0.0041725922440056795, 'n_estimators': 244, 'reg_lambda': 0.743672950804456, 'min_child_samples': 96, 'threshold': 0.2351704378209866}. Best is trial 3 with value: 0.28190410199556537.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 1.000
特異度 (Specificity): 0.000
適合率 (Precision): 0.014
F1スコア: 0.028

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative       0       493   
      Positive       0        7    

Classification Report:
              precision    recall  f1-score   support

       False       0.00      0.00      0.00       493
        True       0.01      1.00      0.03         7

    accuracy                           0.01       500
   macro avg       0.01      0.50      0.01       500
weighted avg       0.00      0.01      0.00       500



[I 2024-12-07 08:43:45,635] Trial 10 finished with value: 0.0 and parameters: {'num_leaves': 128, 'learning_rate': 0.08520637998887129, 'n_estimators': 353, 'reg_lambda': 0.019437819561725, 'min_child_samples': 77, 'threshold': 0.5003600520525294}. Best is trial 3 with value: 0.28190410199556537.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.000
特異度 (Specificity): 0.998
適合率 (Precision): 0.000
F1スコア: 0.000

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      492       1    
      Positive       7        0    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      1.00      0.99       493
        True       0.00      0.00      0.00         7

    accuracy                           0.98       500
   macro avg       0.49      0.50      0.50       500
weighted avg       0.97      0.98      0.98       500



[I 2024-12-07 08:43:46,790] Trial 11 finished with value: 0.2705855288647811 and parameters: {'num_leaves': 59, 'learning_rate': 0.004852020551742182, 'n_estimators': 323, 'reg_lambda': 0.4463991979689878, 'min_child_samples': 65, 'threshold': 0.34279233244333485}. Best is trial 3 with value: 0.28190410199556537.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 1.000
特異度 (Specificity): 0.225
適合率 (Precision): 0.018
F1スコア: 0.035

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      111      382   
      Positive       0        7    

Classification Report:
              precision    recall  f1-score   support

       False       1.00      0.23      0.37       493
        True       0.02      1.00      0.04         7

    accuracy                           0.24       500
   macro avg       0.51      0.61      0.20       500
weighted avg       0.99      0.24      0.36       500



[I 2024-12-07 08:43:48,096] Trial 12 finished with value: 0.3136284973022002 and parameters: {'num_leaves': 57, 'learning_rate': 0.00948725055624355, 'n_estimators': 339, 'reg_lambda': 0.28458344184834894, 'min_child_samples': 70, 'threshold': 0.3366346637118037}. Best is trial 12 with value: 0.3136284973022002.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.857
特異度 (Specificity): 0.606
適合率 (Precision): 0.030
F1スコア: 0.058

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      299      194   
      Positive       1        6    

Classification Report:
              precision    recall  f1-score   support

       False       1.00      0.61      0.75       493
        True       0.03      0.86      0.06         7

    accuracy                           0.61       500
   macro avg       0.51      0.73      0.41       500
weighted avg       0.98      0.61      0.74       500



[I 2024-12-07 08:43:49,308] Trial 13 finished with value: 0.17435897435897435 and parameters: {'num_leaves': 38, 'learning_rate': 0.013950897312353175, 'n_estimators': 408, 'reg_lambda': 0.02268864499345206, 'min_child_samples': 82, 'threshold': 0.3235777525407526}. Best is trial 12 with value: 0.3136284973022002.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.286
特異度 (Specificity): 0.943
適合率 (Precision): 0.067
F1スコア: 0.108

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      465       28   
      Positive       5        2    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.94      0.97       493
        True       0.07      0.29      0.11         7

    accuracy                           0.93       500
   macro avg       0.53      0.61      0.54       500
weighted avg       0.98      0.93      0.95       500



[I 2024-12-07 08:43:50,900] Trial 14 finished with value: 0.08095238095238096 and parameters: {'num_leaves': 60, 'learning_rate': 0.009829801962202688, 'n_estimators': 496, 'reg_lambda': 0.2974765709015301, 'min_child_samples': 98, 'threshold': 0.5889061339761256}. Best is trial 12 with value: 0.3136284973022002.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.143
特異度 (Specificity): 0.907
適合率 (Precision): 0.021
F1スコア: 0.037

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      447       46   
      Positive       6        1    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.91      0.95       493
        True       0.02      0.14      0.04         7

    accuracy                           0.90       500
   macro avg       0.50      0.52      0.49       500
weighted avg       0.97      0.90      0.93       500



[I 2024-12-07 08:43:52,568] Trial 15 finished with value: 0.0 and parameters: {'num_leaves': 33, 'learning_rate': 0.03596782671471688, 'n_estimators': 294, 'reg_lambda': 0.01554980614511453, 'min_child_samples': 72, 'threshold': 0.25905505548564756}. Best is trial 12 with value: 0.3136284973022002.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.000
特異度 (Specificity): 0.972
適合率 (Precision): 0.000
F1スコア: 0.000

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      479       14   
      Positive       7        0    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.97      0.98       493
        True       0.00      0.00      0.00         7

    accuracy                           0.96       500
   macro avg       0.49      0.49      0.49       500
weighted avg       0.97      0.96      0.96       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2024-12-07 08:43:53,967] Trial 16 finished with value: 0.19096133751306166 and parameters: {'num_leaves': 65, 'learning_rate': 0.0011780932940602888, 'n_estimators': 386, 'reg_lambda': 9.748418056648548, 'min_child_samples': 92, 'threshold': 0.44456512252221037}. Best is trial 12 with value: 0.3136284973022002.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 1.000
特異度 (Specificity): 0.000
適合率 (Precision): 0.014
F1スコア: 0.028

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative       0       493   
      Positive       0        7    

Classification Report:
              precision    recall  f1-score   support

       False       0.00      0.00      0.00       493
        True       0.01      1.00      0.03         7

    accuracy                           0.01       500
   macro avg       0.01      0.50      0.01       500
weighted avg       0.00      0.01      0.00       500



[I 2024-12-07 08:43:55,586] Trial 17 finished with value: 0.0 and parameters: {'num_leaves': 92, 'learning_rate': 0.00863160082408215, 'n_estimators': 158, 'reg_lambda': 0.20905867297892994, 'min_child_samples': 52, 'threshold': 0.6448945433648662}. Best is trial 12 with value: 0.3136284973022002.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.000
特異度 (Specificity): 0.994
適合率 (Precision): 0.000
F1スコア: 0.000

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      490       3    
      Positive       7        0    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.99      0.99       493
        True       0.00      0.00      0.00         7

    accuracy                           0.98       500
   macro avg       0.49      0.50      0.49       500
weighted avg       0.97      0.98      0.98       500



[I 2024-12-07 08:43:56,909] Trial 18 finished with value: 0.0 and parameters: {'num_leaves': 53, 'learning_rate': 0.07392032402386872, 'n_estimators': 339, 'reg_lambda': 0.0010974430255517888, 'min_child_samples': 71, 'threshold': 0.24410334150548146}. Best is trial 12 with value: 0.3136284973022002.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.000
特異度 (Specificity): 0.996
適合率 (Precision): 0.000
F1スコア: 0.000

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      491       2    
      Positive       7        0    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      1.00      0.99       493
        True       0.00      0.00      0.00         7

    accuracy                           0.98       500
   macro avg       0.49      0.50      0.50       500
weighted avg       0.97      0.98      0.98       500



[I 2024-12-07 08:43:57,917] Trial 19 finished with value: 0.2794865547119424 and parameters: {'num_leaves': 76, 'learning_rate': 0.007601866063584542, 'n_estimators': 261, 'reg_lambda': 0.045466124209956216, 'min_child_samples': 87, 'threshold': 0.40640267650728895}. Best is trial 12 with value: 0.3136284973022002.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.571
特異度 (Specificity): 0.785
適合率 (Precision): 0.036
F1スコア: 0.068

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      387      106   
      Positive       3        4    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.78      0.88       493
        True       0.04      0.57      0.07         7

    accuracy                           0.78       500
   macro avg       0.51      0.68      0.47       500
weighted avg       0.98      0.78      0.87       500



[I 2024-12-07 08:43:59,584] Trial 20 finished with value: 0.0 and parameters: {'num_leaves': 25, 'learning_rate': 0.0025221369965576653, 'n_estimators': 423, 'reg_lambda': 0.008943414228586077, 'min_child_samples': 44, 'threshold': 0.6628640827028814}. Best is trial 12 with value: 0.3136284973022002.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.000
特異度 (Specificity): 0.963
適合率 (Precision): 0.000
F1スコア: 0.000

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      475       18   
      Positive       7        0    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.96      0.97       493
        True       0.00      0.00      0.00         7

    accuracy                           0.95       500
   macro avg       0.49      0.48      0.49       500
weighted avg       0.97      0.95      0.96       500



[I 2024-12-07 08:44:00,642] Trial 21 finished with value: 0.2592328109201213 and parameters: {'num_leaves': 77, 'learning_rate': 0.006871953968599803, 'n_estimators': 260, 'reg_lambda': 0.0485929504598478, 'min_child_samples': 89, 'threshold': 0.4160842746480555}. Best is trial 12 with value: 0.3136284973022002.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.571
特異度 (Specificity): 0.773
適合率 (Precision): 0.034
F1スコア: 0.065

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      381      112   
      Positive       3        4    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.77      0.87       493
        True       0.03      0.57      0.07         7

    accuracy                           0.77       500
   macro avg       0.51      0.67      0.47       500
weighted avg       0.98      0.77      0.86       500



[I 2024-12-07 08:44:01,442] Trial 22 finished with value: 0.28190410199556537 and parameters: {'num_leaves': 73, 'learning_rate': 0.006329781954977487, 'n_estimators': 175, 'reg_lambda': 0.03971313050356087, 'min_child_samples': 100, 'threshold': 0.31552672424070793}. Best is trial 12 with value: 0.3136284973022002.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.857
特異度 (Specificity): 0.542
適合率 (Precision): 0.026
F1スコア: 0.050

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      267      226   
      Positive       1        6    

Classification Report:
              precision    recall  f1-score   support

       False       1.00      0.54      0.70       493
        True       0.03      0.86      0.05         7

    accuracy                           0.55       500
   macro avg       0.51      0.70      0.38       500
weighted avg       0.98      0.55      0.69       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2024-12-07 08:44:02,334] Trial 23 finished with value: 0.19096133751306166 and parameters: {'num_leaves': 69, 'learning_rate': 0.0028737947959449816, 'n_estimators': 165, 'reg_lambda': 0.16320640646872733, 'min_child_samples': 99, 'threshold': 0.303104389910844}. Best is trial 12 with value: 0.3136284973022002.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 1.000
特異度 (Specificity): 0.000
適合率 (Precision): 0.014
F1スコア: 0.028

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative       0       493   
      Positive       0        7    

Classification Report:
              precision    recall  f1-score   support

       False       0.00      0.00      0.00       493
        True       0.01      1.00      0.03         7

    accuracy                           0.01       500
   macro avg       0.01      0.50      0.01       500
weighted avg       0.00      0.01      0.00       500



[I 2024-12-07 08:44:03,208] Trial 24 finished with value: 0.2113596800752764 and parameters: {'num_leaves': 109, 'learning_rate': 0.03549800635148116, 'n_estimators': 191, 'reg_lambda': 0.780740750788975, 'min_child_samples': 77, 'threshold': 0.19778206564089207}. Best is trial 12 with value: 0.3136284973022002.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.857
特異度 (Specificity): 0.349
適合率 (Precision): 0.018
F1スコア: 0.036

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      172      321   
      Positive       1        6    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.35      0.52       493
        True       0.02      0.86      0.04         7

    accuracy                           0.36       500
   macro avg       0.51      0.60      0.28       500
weighted avg       0.98      0.36      0.51       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2024-12-07 08:44:03,875] Trial 25 finished with value: 0.19096133751306166 and parameters: {'num_leaves': 53, 'learning_rate': 0.005273465936267655, 'n_estimators': 87, 'reg_lambda': 0.008938510487937949, 'min_child_samples': 99, 'threshold': 0.29121045157852843}. Best is trial 12 with value: 0.3136284973022002.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 1.000
特異度 (Specificity): 0.000
適合率 (Precision): 0.014
F1スコア: 0.028

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative       0       493   
      Positive       0        7    

Classification Report:
              precision    recall  f1-score   support

       False       0.00      0.00      0.00       493
        True       0.01      1.00      0.03         7

    accuracy                           0.01       500
   macro avg       0.01      0.50      0.01       500
weighted avg       0.00      0.01      0.00       500



[I 2024-12-07 08:44:04,817] Trial 26 finished with value: 0.3319798598470154 and parameters: {'num_leaves': 87, 'learning_rate': 0.015557878258663196, 'n_estimators': 97, 'reg_lambda': 0.037316974565052, 'min_child_samples': 81, 'threshold': 0.3538510720869471}. Best is trial 26 with value: 0.3319798598470154.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.857
特異度 (Specificity): 0.645
適合率 (Precision): 0.033
F1スコア: 0.064

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      318      175   
      Positive       1        6    

Classification Report:
              precision    recall  f1-score   support

       False       1.00      0.65      0.78       493
        True       0.03      0.86      0.06         7

    accuracy                           0.65       500
   macro avg       0.52      0.75      0.42       500
weighted avg       0.98      0.65      0.77       500



[I 2024-12-07 08:44:05,747] Trial 27 finished with value: 0.3386454283021091 and parameters: {'num_leaves': 89, 'learning_rate': 0.016608812657268206, 'n_estimators': 118, 'reg_lambda': 0.18836366455631795, 'min_child_samples': 79, 'threshold': 0.4566199915963566}. Best is trial 27 with value: 0.3386454283021091.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.714
特異度 (Specificity): 0.736
適合率 (Precision): 0.037
F1スコア: 0.070

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      363      130   
      Positive       2        5    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.74      0.85       493
        True       0.04      0.71      0.07         7

    accuracy                           0.74       500
   macro avg       0.52      0.73      0.46       500
weighted avg       0.98      0.74      0.84       500



[I 2024-12-07 08:44:06,637] Trial 28 finished with value: 0.08717948717948718 and parameters: {'num_leaves': 102, 'learning_rate': 0.058163055163663796, 'n_estimators': 93, 'reg_lambda': 0.13752265809675435, 'min_child_samples': 66, 'threshold': 0.4578820333006548}. Best is trial 27 with value: 0.3386454283021091.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.143
特異度 (Specificity): 0.921
適合率 (Precision): 0.025
F1スコア: 0.043

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      454       39   
      Positive       6        1    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.92      0.95       493
        True       0.03      0.14      0.04         7

    accuracy                           0.91       500
   macro avg       0.51      0.53      0.50       500
weighted avg       0.97      0.91      0.94       500



[I 2024-12-07 08:44:09,267] Trial 29 finished with value: 0.08717948717948718 and parameters: {'num_leaves': 85, 'learning_rate': 0.029278284891202648, 'n_estimators': 116, 'reg_lambda': 0.5353431766271068, 'min_child_samples': 7, 'threshold': 0.5679119433676689}. Best is trial 27 with value: 0.3386454283021091.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.143
特異度 (Specificity): 0.941
適合率 (Precision): 0.033
F1スコア: 0.054

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      464       29   
      Positive       6        1    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.94      0.96       493
        True       0.03      0.14      0.05         7

    accuracy                           0.93       500
   macro avg       0.51      0.54      0.51       500
weighted avg       0.97      0.93      0.95       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2024-12-07 08:44:10,034] Trial 30 finished with value: 0.19096133751306166 and parameters: {'num_leaves': 85, 'learning_rate': 0.017485725908342985, 'n_estimators': 128, 'reg_lambda': 1.3990813244716809, 'min_child_samples': 79, 'threshold': 0.3590483431810362}. Best is trial 27 with value: 0.3386454283021091.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 1.000
特異度 (Specificity): 0.000
適合率 (Precision): 0.014
F1スコア: 0.028

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative       0       493   
      Positive       0        7    

Classification Report:
              precision    recall  f1-score   support

       False       0.00      0.00      0.00       493
        True       0.01      1.00      0.03         7

    accuracy                           0.01       500
   macro avg       0.01      0.50      0.01       500
weighted avg       0.00      0.01      0.00       500



[I 2024-12-07 08:44:11,161] Trial 31 finished with value: 0.08095238095238096 and parameters: {'num_leaves': 98, 'learning_rate': 0.013465820225237951, 'n_estimators': 218, 'reg_lambda': 0.07887267048077193, 'min_child_samples': 72, 'threshold': 0.46677681315236436}. Best is trial 27 with value: 0.3386454283021091.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.143
特異度 (Specificity): 0.907
適合率 (Precision): 0.021
F1スコア: 0.037

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      447       46   
      Positive       6        1    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.91      0.95       493
        True       0.02      0.14      0.04         7

    accuracy                           0.90       500
   macro avg       0.50      0.52      0.49       500
weighted avg       0.97      0.90      0.93       500



[I 2024-12-07 08:44:11,857] Trial 32 finished with value: 0.3372663468372424 and parameters: {'num_leaves': 69, 'learning_rate': 0.024723761768681785, 'n_estimators': 54, 'reg_lambda': 0.2818858399552767, 'min_child_samples': 58, 'threshold': 0.3657108073138502}. Best is trial 27 with value: 0.3386454283021091.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.857
特異度 (Specificity): 0.635
適合率 (Precision): 0.032
F1スコア: 0.062

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      313      180   
      Positive       1        6    

Classification Report:
              precision    recall  f1-score   support

       False       1.00      0.63      0.78       493
        True       0.03      0.86      0.06         7

    accuracy                           0.64       500
   macro avg       0.51      0.75      0.42       500
weighted avg       0.98      0.64      0.77       500



[I 2024-12-07 08:44:12,663] Trial 33 finished with value: 0.272202380952381 and parameters: {'num_leaves': 84, 'learning_rate': 0.049061732339355346, 'n_estimators': 58, 'reg_lambda': 0.19467881253761182, 'min_child_samples': 57, 'threshold': 0.36983553056904883}. Best is trial 27 with value: 0.3386454283021091.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.571
特異度 (Specificity): 0.789
適合率 (Precision): 0.037
F1スコア: 0.070

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      389      104   
      Positive       3        4    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.79      0.88       493
        True       0.04      0.57      0.07         7

    accuracy                           0.79       500
   macro avg       0.51      0.68      0.47       500
weighted avg       0.98      0.79      0.87       500



[I 2024-12-07 08:44:13,496] Trial 34 finished with value: 0.25414598108747044 and parameters: {'num_leaves': 106, 'learning_rate': 0.02268071046264776, 'n_estimators': 127, 'reg_lambda': 0.3743128390678571, 'min_child_samples': 60, 'threshold': 0.396230962648822}. Best is trial 27 with value: 0.3386454283021091.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.571
特異度 (Specificity): 0.730
適合率 (Precision): 0.029
F1スコア: 0.056

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      360      133   
      Positive       3        4    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.73      0.84       493
        True       0.03      0.57      0.06         7

    accuracy                           0.73       500
   macro avg       0.51      0.65      0.45       500
weighted avg       0.98      0.73      0.83       500



[I 2024-12-07 08:44:14,365] Trial 35 finished with value: 0.18531976744186046 and parameters: {'num_leaves': 69, 'learning_rate': 0.12763919880398347, 'n_estimators': 83, 'reg_lambda': 1.6059297936587391, 'min_child_samples': 70, 'threshold': 0.506990678950586}. Best is trial 27 with value: 0.3386454283021091.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.429
特異度 (Specificity): 0.813
適合率 (Precision): 0.032
F1スコア: 0.059

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      401       92   
      Positive       4        3    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.81      0.89       493
        True       0.03      0.43      0.06         7

    accuracy                           0.81       500
   macro avg       0.51      0.62      0.48       500
weighted avg       0.98      0.81      0.88       500



[I 2024-12-07 08:44:15,202] Trial 36 finished with value: 0.0 and parameters: {'num_leaves': 118, 'learning_rate': 0.017776981490070405, 'n_estimators': 52, 'reg_lambda': 1.1357959243323164, 'min_child_samples': 39, 'threshold': 0.5560158309492307}. Best is trial 27 with value: 0.3386454283021091.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.000
特異度 (Specificity): 0.974
適合率 (Precision): 0.000
F1スコア: 0.000

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      480       13   
      Positive       7        0    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.97      0.98       493
        True       0.00      0.00      0.00         7

    accuracy                           0.96       500
   macro avg       0.49      0.49      0.49       500
weighted avg       0.97      0.96      0.97       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2024-12-07 08:44:16,186] Trial 37 finished with value: 0.19096133751306166 and parameters: {'num_leaves': 82, 'learning_rate': 0.011276171565925279, 'n_estimators': 107, 'reg_lambda': 0.2973016816439695, 'min_child_samples': 56, 'threshold': 0.2018506761766258}. Best is trial 27 with value: 0.3386454283021091.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 1.000
特異度 (Specificity): 0.000
適合率 (Precision): 0.014
F1スコア: 0.028

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative       0       493   
      Positive       0        7    

Classification Report:
              precision    recall  f1-score   support

       False       0.00      0.00      0.00       493
        True       0.01      1.00      0.03         7

    accuracy                           0.01       500
   macro avg       0.01      0.50      0.01       500
weighted avg       0.00      0.01      0.00       500



[I 2024-12-07 08:44:17,278] Trial 38 finished with value: 0.09714285714285714 and parameters: {'num_leaves': 53, 'learning_rate': 0.03075740262735425, 'n_estimators': 302, 'reg_lambda': 0.11117886060242159, 'min_child_samples': 80, 'threshold': 0.4421897884357589}. Best is trial 27 with value: 0.3386454283021091.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.143
特異度 (Specificity): 0.959
適合率 (Precision): 0.048
F1スコア: 0.071

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      473       20   
      Positive       6        1    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.96      0.97       493
        True       0.05      0.14      0.07         7

    accuracy                           0.95       500
   macro avg       0.52      0.55      0.52       500
weighted avg       0.97      0.95      0.96       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2024-12-07 08:44:18,324] Trial 39 finished with value: 0.19096133751306166 and parameters: {'num_leaves': 92, 'learning_rate': 0.020446795718504114, 'n_estimators': 141, 'reg_lambda': 2.4577483084743195, 'min_child_samples': 51, 'threshold': 0.1333675390405924}. Best is trial 27 with value: 0.3386454283021091.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 1.000
特異度 (Specificity): 0.000
適合率 (Precision): 0.014
F1スコア: 0.028

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative       0       493   
      Positive       0        7    

Classification Report:
              precision    recall  f1-score   support

       False       0.00      0.00      0.00       493
        True       0.01      1.00      0.03         7

    accuracy                           0.01       500
   macro avg       0.01      0.50      0.01       500
weighted avg       0.00      0.01      0.00       500



[I 2024-12-07 08:44:19,038] Trial 40 finished with value: 0.09714285714285714 and parameters: {'num_leaves': 59, 'learning_rate': 0.01427438679406793, 'n_estimators': 80, 'reg_lambda': 0.057574414048725445, 'min_child_samples': 84, 'threshold': 0.6158399849759787}. Best is trial 27 with value: 0.3386454283021091.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.143
特異度 (Specificity): 0.923
適合率 (Precision): 0.026
F1スコア: 0.043

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      455       38   
      Positive       6        1    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.92      0.95       493
        True       0.03      0.14      0.04         7

    accuracy                           0.91       500
   macro avg       0.51      0.53      0.50       500
weighted avg       0.97      0.91      0.94       500



[I 2024-12-07 08:44:20,075] Trial 41 finished with value: 0.30133267342413683 and parameters: {'num_leaves': 45, 'learning_rate': 0.003934967623474842, 'n_estimators': 366, 'reg_lambda': 0.006428609850481295, 'min_child_samples': 92, 'threshold': 0.27702844122052245}. Best is trial 27 with value: 0.3386454283021091.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.857
特異度 (Specificity): 0.582
適合率 (Precision): 0.028
F1スコア: 0.055

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      287      206   
      Positive       1        6    

Classification Report:
              precision    recall  f1-score   support

       False       1.00      0.58      0.73       493
        True       0.03      0.86      0.05         7

    accuracy                           0.59       500
   macro avg       0.51      0.72      0.39       500
weighted avg       0.98      0.59      0.73       500



[I 2024-12-07 08:44:21,283] Trial 42 finished with value: 0.25119331065759637 and parameters: {'num_leaves': 44, 'learning_rate': 0.0040644812367912975, 'n_estimators': 369, 'reg_lambda': 0.0030323072449634117, 'min_child_samples': 90, 'threshold': 0.37939245288323364}. Best is trial 27 with value: 0.3386454283021091.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.571
特異度 (Specificity): 0.761
適合率 (Precision): 0.033
F1スコア: 0.062

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      375      118   
      Positive       3        4    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.76      0.86       493
        True       0.03      0.57      0.06         7

    accuracy                           0.76       500
   macro avg       0.51      0.67      0.46       500
weighted avg       0.98      0.76      0.85       500



[I 2024-12-07 08:44:22,125] Trial 43 finished with value: 0.08499999999999999 and parameters: {'num_leaves': 39, 'learning_rate': 0.024646375227809485, 'n_estimators': 201, 'reg_lambda': 0.0013288592014031977, 'min_child_samples': 93, 'threshold': 0.2666031986149986}. Best is trial 27 with value: 0.3386454283021091.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.143
特異度 (Specificity): 0.919
適合率 (Precision): 0.024
F1スコア: 0.042

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      453       40   
      Positive       6        1    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.92      0.95       493
        True       0.02      0.14      0.04         7

    accuracy                           0.91       500
   macro avg       0.51      0.53      0.50       500
weighted avg       0.97      0.91      0.94       500



[I 2024-12-07 08:44:23,875] Trial 44 finished with value: 0.07727272727272727 and parameters: {'num_leaves': 71, 'learning_rate': 0.010113973819245605, 'n_estimators': 312, 'reg_lambda': 0.026306599020487046, 'min_child_samples': 75, 'threshold': 0.3425983555692874}. Best is trial 27 with value: 0.3386454283021091.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.143
特異度 (Specificity): 0.886
適合率 (Precision): 0.018
F1スコア: 0.031

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      437       56   
      Positive       6        1    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.89      0.93       493
        True       0.02      0.14      0.03         7

    accuracy                           0.88       500
   macro avg       0.50      0.51      0.48       500
weighted avg       0.97      0.88      0.92       500



[I 2024-12-07 08:44:25,402] Trial 45 finished with value: 0.0 and parameters: {'num_leaves': 48, 'learning_rate': 0.04772199582583651, 'n_estimators': 400, 'reg_lambda': 0.00593230585080254, 'min_child_samples': 66, 'threshold': 0.21126707533106287}. Best is trial 27 with value: 0.3386454283021091.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.000
特異度 (Specificity): 0.986
適合率 (Precision): 0.000
F1スコア: 0.000

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      486       7    
      Positive       7        0    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.99      0.99       493
        True       0.00      0.00      0.00         7

    accuracy                           0.97       500
   macro avg       0.49      0.49      0.49       500
weighted avg       0.97      0.97      0.97       500



[I 2024-12-07 08:44:26,441] Trial 46 finished with value: 0.07727272727272727 and parameters: {'num_leaves': 25, 'learning_rate': 0.0034952111291648128, 'n_estimators': 354, 'reg_lambda': 0.002265341230113739, 'min_child_samples': 85, 'threshold': 0.48360427253634247}. Best is trial 27 with value: 0.3386454283021091.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.143
特異度 (Specificity): 0.846
適合率 (Precision): 0.013
F1スコア: 0.024

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      417       76   
      Positive       6        1    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.85      0.91       493
        True       0.01      0.14      0.02         7

    accuracy                           0.84       500
   macro avg       0.50      0.49      0.47       500
weighted avg       0.97      0.84      0.90       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2024-12-07 08:44:27,941] Trial 47 finished with value: 0.19096133751306166 and parameters: {'num_leaves': 63, 'learning_rate': 0.0018285490590022302, 'n_estimators': 447, 'reg_lambda': 0.271958186213709, 'min_child_samples': 61, 'threshold': 0.1617060908086895}. Best is trial 27 with value: 0.3386454283021091.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 1.000
特異度 (Specificity): 0.000
適合率 (Precision): 0.014
F1スコア: 0.028

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative       0       493   
      Positive       0        7    

Classification Report:
              precision    recall  f1-score   support

       False       0.00      0.00      0.00       493
        True       0.01      1.00      0.03         7

    accuracy                           0.01       500
   macro avg       0.01      0.50      0.01       500
weighted avg       0.00      0.01      0.00       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2024-12-07 08:44:29,880] Trial 48 finished with value: 0.19096133751306166 and parameters: {'num_leaves': 12, 'learning_rate': 0.006269666507601246, 'n_estimators': 286, 'reg_lambda': 0.7568288796363507, 'min_child_samples': 19, 'threshold': 0.327630870288042}. Best is trial 27 with value: 0.3386454283021091.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 1.000
特異度 (Specificity): 0.000
適合率 (Precision): 0.014
F1スコア: 0.028

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative       0       493   
      Positive       0        7    

Classification Report:
              precision    recall  f1-score   support

       False       0.00      0.00      0.00       493
        True       0.01      1.00      0.03         7

    accuracy                           0.01       500
   macro avg       0.01      0.50      0.01       500
weighted avg       0.00      0.01      0.00       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2024-12-07 08:44:31,293] Trial 49 finished with value: 0.0 and parameters: {'num_leaves': 89, 'learning_rate': 0.2903085281205109, 'n_estimators': 327, 'reg_lambda': 0.1147362277587716, 'min_child_samples': 94, 'threshold': 0.7169394954701456}. Best is trial 27 with value: 0.3386454283021091.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.000
特異度 (Specificity): 1.000
適合率 (Precision): 0.000
F1スコア: 0.000

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      493       0    
      Positive       7        0    

Classification Report:
              precision    recall  f1-score   support

       False       0.99      1.00      0.99       493
        True       0.00      0.00      0.00         7

    accuracy                           0.99       500
   macro avg       0.49      0.50      0.50       500
weighted avg       0.97      0.99      0.98       500


Best parameters: {'num_leaves': 89, 'learning_rate': 0.016608812657268206, 'n_estimators': 118, 'reg_lambda': 0.18836366455631795, 'min_child_samples': 79}
Best threshold: 0.4566199915963566

=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.714
特異度 (Specificity): 0.736
適合率 (Precision): 0.037
F1スコア: 0.070

=== 混同行列 ===
                  Predicted
       

In [16]:
import joblib


In [17]:
# 日時をstringsで
timestamp = time.strftime('%Y%m%d%H%M%S')

In [18]:

# モデルと関連オブジェクトの保存
model_data = {
    'model': model,
    'vectorizer': vectorizer,
    'threshold': threshold,
    'best_params': best_params
}
joblib.dump(model_data, f'{timestamp}_model_data.joblib')
print("\nモデルと関連オブジェクトを 'model_data.joblib' に保存しました。")


モデルと関連オブジェクトを 'model_data.joblib' に保存しました。


In [20]:
models = glob.glob('*.joblib')
print(models)

['model_data.joblib', '20241207084511_model_data.joblib']


In [21]:
# 保存したモデルデータをロード
model_data = joblib.load(models[1])

# 各オブジェクトの取り出し
loaded_model = model_data['model']
loaded_vectorizer = model_data['vectorizer']
loaded_threshold = model_data['threshold']
loaded_best_params = model_data['best_params']

print("モデルと関連オブジェクトをロードしました。")


モデルと関連オブジェクトをロードしました。


In [22]:
df2 = pd.read_csv(csv_files[1])
df2.columns = ["PMID", "title", "abstract"]
df2["tiab"] = df2.apply(create_json_text, axis=1)

In [23]:
df2.shape

(10, 4)

新しいデータの形状: (10, 10000)


In [36]:
def predict_and_summarize(df, create_json_text, loaded_vectorizer, loaded_model, loaded_threshold):
    # "tiab"列の作成 (DataFrame dfに対して適用)
    df["tiab"] = df.apply(create_json_text, axis=1)

    # テキストデータの前処理とベクトル化
    X_new = loaded_vectorizer.transform(df["tiab"].fillna(''))

    print(f"新しいデータの形状: {X_new.shape}")

    # 予測確率の取得
    y_prob_new = loaded_model.predict_proba(X_new)[:, 1]

    # 閾値を適用してクラスラベルを決定
    y_pred_new = (y_prob_new >= loaded_threshold).astype(int)

    # 予測結果をデータフレームに追加
    df['prediction'] = y_pred_new
    df['probability'] = y_prob_new

    print("予測を実行し、結果をデータフレームに追加しました。")

    # 予測結果の表示
    print("\n=== 予測結果 ===")
    print(df[['tiab', 'prediction', 'probability']])

    # クラス分布の確認
    class_dist = df['prediction'].value_counts()
    print("\nクラスの分布:")
    print(class_dist)

    # 確率の統計情報
    prob_stats = df['probability'].describe()
    print("\n予測確率の統計情報:")
    print(prob_stats)

    # 必要に応じて、更新後のdfや統計情報を返す
    return df, class_dist, prob_stats


In [28]:
# 必要なオブジェクトを準備
# df2, create_json_text, loaded_vectorizer, loaded_model, loaded_threshold を用意済みとする
updated_df, class_distribution, probability_stats = predict_and_summarize(
    df2, create_json_text, loaded_vectorizer, loaded_model, loaded_threshold
)


新しいデータの形状: (10, 10000)
予測を実行し、結果をデータフレームに追加しました。

=== 予測結果 ===
                                                tiab  prediction  probability
0  {"title": "A multicenter, randomized, controll...           1     0.649690
1  {"title": "Transfusion requirements after head...           1     0.639201
2  {"title": "Restrictive vs Liberal Transfusion ...           1     0.566336
3  {"title": "Liberal or Restrictive Transfusion ...           1     0.573175
4  {"title": "Effect of Transfusion on Mortality ...           1     0.549861
5  {"title": "Transfusion-related risk of seconda...           1     0.555931
6  {"title": "Impact of Blood Product Transfusion...           1     0.604583
7  {"title": "Transfusion practice and blood stre...           1     0.576636
8  {"title": "Anemia, transfusions and hospital o...           1     0.633104
9  {"title": "Transfusion of platelets, but not o...           1     0.589637

クラスの分布:
prediction
1    10
Name: count, dtype: int64

予測確率の統計情報:
count    10.0

# 残っているファイルに対して予測を行い、Trueであったものだけを残す

In [33]:
df3 = pd.read_csv(r"searchdata\remaining_data_20241130.csv")

In [34]:
df3.head()

Unnamed: 0,key,title,authors,journal,volume,issue,pages,abstract,year,publisher,url,issn
0,cbe5dc78-690a-4f42-aa14-1fa928ee6f4a,Contemporary management of major haemorrhage i...,"[{'author': 'Maier, Cheryl L', 'author_abbrevi...",Intensive care medicine,50,3,319-331,Haemorrhagic shock is frequent in critical car...,2024,United States,https://doi.org/10.1007/s00134-023-07303-5,"{""pmid"": 38189930, ""electronic_issn"": ""1432-12..."
1,a6ee585f-1ba3-45a1-a701-9076213ea3e3,Pediatric Moderate and Severe Traumatic Brain ...,"[{'author': 'Ben Abdeljelil, Anis', 'author_ab...",Journal of neurotrauma,40,21-22,2270-2281,Traumatic brain injury (TBI) is the leading ca...,2023,United States,https://doi.org/10.1089/neu.2023.0149,"{""pmid"": 37341019, ""electronic_issn"": ""1557-90..."
2,fa95dc24-7daa-4768-8820-cf018f87ea59,Obstetric Disorders and Critical Illness.,"[{'author': 'Griffin, Kelly M', 'author_abbrev...",Clinics in chest medicine,43,3,471-488,"In this article, we discuss some of the more c...",2022,United States,https://doi.org/10.1016/j.ccm.2022.04.008,"{""pmid"": 36116815, ""electronic_issn"": ""1557-82..."
3,4c2a0dc7-e665-49bc-bf11-4208d2a5c141,How I diagnose and treat neonatal thrombocytop...,"[{'author': 'Stanworth, Simon J', 'author_abbr...",Blood,141,22,2685-2697,"Neonatal thrombocytopenia, defined as the pres...",2023,United States,https://doi.org/10.1182/blood.2022018017,"{""pmid"": 36787503, ""electronic_issn"": ""1528-00..."
4,426cc244-1011-4f64-935d-e1bada53efcf,Liberal or Restrictive Transfusion Strategy in...,"[{'author': 'Turgeon, Alexis F', 'author_abbre...",The New England journal of medicine,391,8,722-735,BACKGROUND: The effect of a liberal transfusio...,2024,United States,https://doi.org/10.1056/NEJMoa2404360,"{""pmid"": 38869931, ""electronic_issn"": ""1533-44..."


In [37]:
# 必要なオブジェクトを準備
# df2, create_json_text, loaded_vectorizer, loaded_model, loaded_threshold を用意済みとする
updated_df, class_distribution, probability_stats = predict_and_summarize(
    df3, create_json_text, loaded_vectorizer, loaded_model, loaded_threshold
)


新しいデータの形状: (34900, 10000)
予測を実行し、結果をデータフレームに追加しました。

=== 予測結果 ===
                                                    tiab  prediction  \
0      {"title": "Contemporary management of major ha...           1   
1      {"title": "Pediatric Moderate and Severe Traum...           1   
2      {"title": "Obstetric Disorders and Critical Il...           0   
3      {"title": "How I diagnose and treat neonatal t...           1   
4      {"title": "Liberal or Restrictive Transfusion ...           1   
...                                                  ...         ...   
34895  {"title": "Age of Blood evalution.", "abstract...           1   
34896  {"title": "Autologous Umbilical Cord Blood Mon...           1   
34897  {"title": "Age of BLood Evaluation (ABLE) tria...           1   
34898  {"title": "Age of Red Blood Cells in Premature...           1   
34899  {"title": "The effect of transfusion on cerebr...           1   

       probability  
0         0.549861  
1         0.682654  
2     

In [38]:
print(df3.columns)

Index(['key', 'title', 'authors', 'journal', 'volume', 'issue', 'pages',
       'abstract', 'year', 'publisher', 'url', 'issn', 'tiab', 'prediction',
       'probability'],
      dtype='object')


In [39]:
df3["prediction"].value_counts()

prediction
0    23996
1    10904
Name: count, dtype: int64

In [40]:
df_export = df3[df3["prediction"] == 1]

In [41]:
df_export = df_export[['key', 'title', 'authors', 'journal', 'volume', 'issue', 'pages',
       'abstract', 'year', 'publisher', 'url', 'issn']]

In [42]:
df_export.shape

(10904, 12)

In [43]:
# df_exportをランダムに500とそれ以外に分割して、それぞれをCSVファイルとしてエクスポート, seed設定
df_export.sample(frac=1, random_state=42).iloc[:500].to_csv('exported_data500.csv', index=False)
df_export.sample(frac=1, random_state=42).iloc[500:].to_csv('remaining_data10404.csv', index=False)

