# 最初から機械学習でやってみる

In [1]:
import os
import json
import time
import glob
from typing import TypeVar, Type, List, Optional
from dataclasses import dataclass
from pydantic import BaseModel
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from openai import AzureOpenAI
from tqdm import tqdm
from typing import List, Dict, Optional, Any, Tuple

In [2]:
csv_files = glob.glob('*.csv')
print(csv_files)

['extracted_500_first_20241130_検索結果1204.csv', 'pubmed_true_data.csv', 'exported_data500.csv', 'remaining_data10404.csv', 'exported_data500_スクリーニング結果1210.csv']


In [3]:
df = pd.read_csv(csv_files[4])
# Assuming your dataframe is named 'df'
df['included'] = df['notes'].astype(str).str.contains('Included')

print(df['included'].value_counts())
print(f'不一致の数: {df[df["included"] == True]["notes"].str.contains("Excluded").sum()}')
#予測の元となる変数

import json

def create_json_text(row):
    return json.dumps({
        'title': str(row['title']),
        'abstract': str(row['abstract'])
    })

df['tiab'] = df.apply(create_json_text, axis=1)

included
False    473
True      27
Name: count, dtype: int64
不一致の数: 0


In [10]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report
import optuna
from lightgbm import LGBMClassifier

def prepare_text_data(df, text_column='tiab', label_column='included'):
    """テキストデータの前処理とTF-IDF変換"""
    texts = df[text_column].fillna('')

    vectorizer = TfidfVectorizer(
        max_features=10000,
        min_df=2,
        max_df=0.95,
        ngram_range=(1, 2)
    )

    X_vec = vectorizer.fit_transform(texts)
    y = df[label_column]

    class_weights = dict(zip(
        y.unique(),
        [1 / (len(y) * (y == label).mean()) for label in y.unique()]
    ))

    return X_vec, y, vectorizer, class_weights

def fbeta_score_custom(y_true, y_pred, beta=1):
    """カスタムF-betaスコアの計算"""
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0,1]).ravel()
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    if precision == 0.0 and recall == 0.0:
        return 0.0
    fbeta = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)
    return fbeta

def calculate_detailed_metrics(y_true, y_pred):
    """詳細な評価指標の計算と表示"""
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0,1]).ravel()

    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    f1 = 2 * (precision * sensitivity) / (precision + sensitivity) if (precision + sensitivity) > 0 else 0

    print("\n=== 詳細な評価指標 ===")
    print(f"感度 (Sensitivity/Recall): {sensitivity:.3f}")
    print(f"特異度 (Specificity): {specificity:.3f}")
    print(f"適合率 (Precision): {precision:.3f}")
    print(f"F1スコア: {f1:.3f}")

    print("\n=== 混同行列 ===")
    print("                  Predicted")
    print("                  Negative  Positive")
    print(f"Actual Negative    {tn:^8} {fp:^8}")
    print(f"      Positive    {fn:^8} {tp:^8}")

    return {
        'sensitivity': sensitivity,
        'specificity': specificity,
        'precision': precision,
        'f1': f1,
        'confusion_matrix': {'tn': tn, 'fp': fp, 'fn': fn, 'tp': tp}
    }

def evaluate_model(X_vec, y, params, threshold, class_weights, beta=2):
    """モデルの評価（交差検証）"""
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = []
    all_predictions = []
    all_true_values = []

    for train_idx, val_idx in skf.split(X_vec, y):
        X_tr = X_vec[train_idx]
        X_val = X_vec[val_idx]
        y_tr = y.iloc[train_idx]
        y_val = y.iloc[val_idx]

        model = LGBMClassifier(
            **params,
            random_state=42,
            class_weight=class_weights
        )

        model.fit(X_tr, y_tr)
        y_prob = model.predict_proba(X_val)[:, 1]
        y_pred = (y_prob >= threshold).astype(int)

        score = fbeta_score_custom(y_val, y_pred, beta=beta)
        scores.append(score)

        all_predictions.extend(y_pred)
        all_true_values.extend(y_val)

    if len(scores) == 5:
        print("\n=== 交差検証全体の評価 ===")
        metrics = calculate_detailed_metrics(all_true_values, all_predictions)
        print("\nClassification Report:")
        print(classification_report(all_true_values, all_predictions))

    return np.mean(scores)

def objective(trial, X_vec, y, class_weights, beta=2):
    """Optunaの目的関数"""
    params = {
        "num_leaves": trial.suggest_int("num_leaves", 8, 128),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 10.0, log=True),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "verbosity": -1
    }
    threshold = trial.suggest_float("threshold", 0.1, 0.9)

    score = evaluate_model(X_vec, y, params, threshold, class_weights, beta=beta)
    return score

def train_final_model(df, beta_value=2, n_trials=50):
    """完全なトレーニングパイプライン"""
    # データの準備
    X_vec, y, vectorizer, class_weights = prepare_text_data(df)

    print(f"データセットの形状: {X_vec.shape}")
    print(f"クラスの分布:\n{y.value_counts(normalize=True)}")
    print(f"クラスの重み: {class_weights}")

    # ハイパーパラメータの最適化
    study = optuna.create_study(direction="maximize")
    study.optimize(
        lambda trial: objective(trial, X_vec, y, class_weights, beta=beta_value),
        n_trials=n_trials
    )

    best_params = study.best_trial.params
    threshold = best_params.pop("threshold")

    print("\nBest parameters:", best_params)
    print("Best threshold:", threshold)

    final_score = evaluate_model(X_vec, y, best_params, threshold, class_weights, beta=beta_value)
    print(f"\nFinal Mean F{beta_value} Score:", final_score)

    # 最終モデルの学習
    final_model = LGBMClassifier(
        **best_params,
        random_state=42,
        class_weight=class_weights
    )
    final_model.fit(X_vec, y)

    # 最終評価
    print("\n=== 最終モデルの評価（全データ） ===")
    y_prob = final_model.predict_proba(X_vec)[:, 1]
    y_pred = (y_prob >= threshold).astype(int)
    final_metrics = calculate_detailed_metrics(y, y_pred)

    return final_model, vectorizer, threshold, best_params

def predict_new_text(text, model, vectorizer, threshold):
    """新しいテキストの予測"""
    X_new = vectorizer.transform([text])
    prob = model.predict_proba(X_new)[0, 1]
    prediction = prob >= threshold
    return prediction, prob



In [11]:
# 使用例
if __name__ == "__main__":
    # データの読み込み
    #df = pd.read_csv('your_data.csv')  # tiab列とincluded列を含むデータ

    # モデルのトレーニング
    model, vectorizer, threshold, best_params = train_final_model(
        df,
        beta_value=4,  # より高い値で再現率を重視
        n_trials=50
    )


[I 2024-12-11 07:20:16,500] A new study created in memory with name: no-name-05183813-9a77-4c2a-8aee-b5a9b0f2e8c1


データセットの形状: (500, 10000)
クラスの分布:
included
False    0.946
True     0.054
Name: proportion, dtype: float64
クラスの重み: {np.False_: np.float64(0.0021141649048625794), np.True_: np.float64(0.037037037037037035)}


[I 2024-12-11 07:20:18,952] Trial 0 finished with value: 0.6678733424377314 and parameters: {'num_leaves': 37, 'learning_rate': 0.07045554768608817, 'n_estimators': 402, 'reg_lambda': 0.026739358097090483, 'min_child_samples': 42, 'threshold': 0.13010222312353817}. Best is trial 0 with value: 0.6678733424377314.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.704
特異度 (Specificity): 0.928
適合率 (Precision): 0.358
F1スコア: 0.475

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      439       34   
      Positive       8        19   

Classification Report:
              precision    recall  f1-score   support

       False       0.98      0.93      0.95       473
        True       0.36      0.70      0.47        27

    accuracy                           0.92       500
   macro avg       0.67      0.82      0.71       500
weighted avg       0.95      0.92      0.93       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2024-12-11 07:20:20,344] Trial 1 finished with value: 0.0 and parameters: {'num_leaves': 43, 'learning_rate': 0.003350803179075622, 'n_estimators': 472, 'reg_lambda': 8.350992043244512, 'min_child_samples': 93, 'threshold': 0.7217472334138532}. Best is trial 0 with value: 0.6678733424377314.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.000
特異度 (Specificity): 1.000
適合率 (Precision): 0.000
F1スコア: 0.000

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      473       0    
      Positive       27       0    

Classification Report:
              precision    recall  f1-score   support

       False       0.95      1.00      0.97       473
        True       0.00      0.00      0.00        27

    accuracy                           0.95       500
   macro avg       0.47      0.50      0.49       500
weighted avg       0.89      0.95      0.92       500



[I 2024-12-11 07:20:23,035] Trial 2 finished with value: 0.31915479372806654 and parameters: {'num_leaves': 25, 'learning_rate': 0.01745596219001907, 'n_estimators': 413, 'reg_lambda': 3.2102298797775664, 'min_child_samples': 20, 'threshold': 0.5671606860257529}. Best is trial 0 with value: 0.6678733424377314.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.333
特異度 (Specificity): 0.911
適合率 (Precision): 0.176
F1スコア: 0.231

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      431       42   
      Positive       18       9    

Classification Report:
              precision    recall  f1-score   support

       False       0.96      0.91      0.93       473
        True       0.18      0.33      0.23        27

    accuracy                           0.88       500
   macro avg       0.57      0.62      0.58       500
weighted avg       0.92      0.88      0.90       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2024-12-11 07:20:24,072] Trial 3 finished with value: 0.4914965986394558 and parameters: {'num_leaves': 35, 'learning_rate': 0.11389903870651615, 'n_estimators': 70, 'reg_lambda': 3.068573921312616, 'min_child_samples': 40, 'threshold': 0.2473976090933966}. Best is trial 0 with value: 0.6678733424377314.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 1.000
特異度 (Specificity): 0.000
適合率 (Precision): 0.054
F1スコア: 0.102

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative       0       473   
      Positive       0        27   

Classification Report:
              precision    recall  f1-score   support

       False       0.00      0.00      0.00       473
        True       0.05      1.00      0.10        27

    accuracy                           0.05       500
   macro avg       0.03      0.50      0.05       500
weighted avg       0.00      0.05      0.01       500



[I 2024-12-11 07:20:26,136] Trial 4 finished with value: 0.18310341223451085 and parameters: {'num_leaves': 35, 'learning_rate': 0.03126896742544645, 'n_estimators': 380, 'reg_lambda': 0.021063335801917486, 'min_child_samples': 50, 'threshold': 0.5696971622992018}. Best is trial 0 with value: 0.6678733424377314.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.185
特異度 (Specificity): 0.975
適合率 (Precision): 0.294
F1スコア: 0.227

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      461       12   
      Positive       22       5    

Classification Report:
              precision    recall  f1-score   support

       False       0.95      0.97      0.96       473
        True       0.29      0.19      0.23        27

    accuracy                           0.93       500
   macro avg       0.62      0.58      0.60       500
weighted avg       0.92      0.93      0.92       500



[I 2024-12-11 07:20:27,483] Trial 5 finished with value: 0.287787963871317 and parameters: {'num_leaves': 59, 'learning_rate': 0.07125105367918415, 'n_estimators': 224, 'reg_lambda': 0.6213750256144609, 'min_child_samples': 76, 'threshold': 0.6893724114034288}. Best is trial 0 with value: 0.6678733424377314.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.296
特異度 (Specificity): 0.970
適合率 (Precision): 0.364
F1スコア: 0.327

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      459       14   
      Positive       19       8    

Classification Report:
              precision    recall  f1-score   support

       False       0.96      0.97      0.97       473
        True       0.36      0.30      0.33        27

    accuracy                           0.93       500
   macro avg       0.66      0.63      0.65       500
weighted avg       0.93      0.93      0.93       500



[I 2024-12-11 07:20:30,486] Trial 6 finished with value: 0.178150283321015 and parameters: {'num_leaves': 126, 'learning_rate': 0.007792887449006219, 'n_estimators': 429, 'reg_lambda': 0.006572039781734771, 'min_child_samples': 63, 'threshold': 0.7724077042930896}. Best is trial 0 with value: 0.6678733424377314.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.185
特異度 (Specificity): 0.987
適合率 (Precision): 0.455
F1スコア: 0.263

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      467       6    
      Positive       22       5    

Classification Report:
              precision    recall  f1-score   support

       False       0.96      0.99      0.97       473
        True       0.45      0.19      0.26        27

    accuracy                           0.94       500
   macro avg       0.70      0.59      0.62       500
weighted avg       0.93      0.94      0.93       500



[I 2024-12-11 07:20:31,226] Trial 7 finished with value: 0.07481962481962481 and parameters: {'num_leaves': 10, 'learning_rate': 0.2657168387084672, 'n_estimators': 59, 'reg_lambda': 0.046346875268681635, 'min_child_samples': 81, 'threshold': 0.7275076314687147}. Best is trial 0 with value: 0.6678733424377314.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.074
特異度 (Specificity): 0.985
適合率 (Precision): 0.222
F1スコア: 0.111

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      466       7    
      Positive       25       2    

Classification Report:
              precision    recall  f1-score   support

       False       0.95      0.99      0.97       473
        True       0.22      0.07      0.11        27

    accuracy                           0.94       500
   macro avg       0.59      0.53      0.54       500
weighted avg       0.91      0.94      0.92       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2024-12-11 07:20:32,000] Trial 8 finished with value: 0.0 and parameters: {'num_leaves': 111, 'learning_rate': 0.005622945452729391, 'n_estimators': 169, 'reg_lambda': 1.475670535027922, 'min_child_samples': 89, 'threshold': 0.6436468175429922}. Best is trial 0 with value: 0.6678733424377314.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.000
特異度 (Specificity): 1.000
適合率 (Precision): 0.000
F1スコア: 0.000

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      473       0    
      Positive       27       0    

Classification Report:
              precision    recall  f1-score   support

       False       0.95      1.00      0.97       473
        True       0.00      0.00      0.00        27

    accuracy                           0.95       500
   macro avg       0.47      0.50      0.49       500
weighted avg       0.89      0.95      0.92       500



[I 2024-12-11 07:20:33,264] Trial 9 finished with value: 0.0 and parameters: {'num_leaves': 33, 'learning_rate': 0.017690164946003752, 'n_estimators': 380, 'reg_lambda': 0.06642379523257452, 'min_child_samples': 86, 'threshold': 0.8761246905264085}. Best is trial 0 with value: 0.6678733424377314.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.000
特異度 (Specificity): 0.998
適合率 (Precision): 0.000
F1スコア: 0.000

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      472       1    
      Positive       27       0    

Classification Report:
              precision    recall  f1-score   support

       False       0.95      1.00      0.97       473
        True       0.00      0.00      0.00        27

    accuracy                           0.94       500
   macro avg       0.47      0.50      0.49       500
weighted avg       0.89      0.94      0.92       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2024-12-11 07:20:40,768] Trial 10 finished with value: 0.4914965986394558 and parameters: {'num_leaves': 85, 'learning_rate': 0.0012847966860804558, 'n_estimators': 304, 'reg_lambda': 0.0011136698593142147, 'min_child_samples': 5, 'threshold': 0.10749639543951567}. Best is trial 0 with value: 0.6678733424377314.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 1.000
特異度 (Specificity): 0.000
適合率 (Precision): 0.054
F1スコア: 0.102

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative       0       473   
      Positive       0        27   

Classification Report:
              precision    recall  f1-score   support

       False       0.00      0.00      0.00       473
        True       0.05      1.00      0.10        27

    accuracy                           0.05       500
   macro avg       0.03      0.50      0.05       500
weighted avg       0.00      0.05      0.01       500



[I 2024-12-11 07:20:41,943] Trial 11 finished with value: 0.7146593959475703 and parameters: {'num_leaves': 68, 'learning_rate': 0.15494411308967468, 'n_estimators': 98, 'reg_lambda': 0.31081859680313667, 'min_child_samples': 40, 'threshold': 0.17561206246252792}. Best is trial 11 with value: 0.7146593959475703.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.889
特異度 (Specificity): 0.755
適合率 (Precision): 0.171
F1スコア: 0.287

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      357      116   
      Positive       3        24   

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.75      0.86       473
        True       0.17      0.89      0.29        27

    accuracy                           0.76       500
   macro avg       0.58      0.82      0.57       500
weighted avg       0.95      0.76      0.83       500



[I 2024-12-11 07:20:43,360] Trial 12 finished with value: 0.7325882058193379 and parameters: {'num_leaves': 76, 'learning_rate': 0.09002059617392276, 'n_estimators': 184, 'reg_lambda': 0.2092668749428803, 'min_child_samples': 34, 'threshold': 0.342780460116221}. Best is trial 12 with value: 0.7325882058193379.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.778
特異度 (Specificity): 0.922
適合率 (Precision): 0.362
F1スコア: 0.494

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      436       37   
      Positive       6        21   

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.92      0.95       473
        True       0.36      0.78      0.49        27

    accuracy                           0.91       500
   macro avg       0.67      0.85      0.72       500
weighted avg       0.95      0.91      0.93       500



[I 2024-12-11 07:20:44,893] Trial 13 finished with value: 0.5978219790748341 and parameters: {'num_leaves': 85, 'learning_rate': 0.2173513603010754, 'n_estimators': 136, 'reg_lambda': 0.27017312324497195, 'min_child_samples': 27, 'threshold': 0.36672486765577395}. Best is trial 12 with value: 0.7325882058193379.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.630
特異度 (Specificity): 0.937
適合率 (Precision): 0.362
F1スコア: 0.459

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      443       30   
      Positive       10       17   

Classification Report:
              precision    recall  f1-score   support

       False       0.98      0.94      0.96       473
        True       0.36      0.63      0.46        27

    accuracy                           0.92       500
   macro avg       0.67      0.78      0.71       500
weighted avg       0.94      0.92      0.93       500



[I 2024-12-11 07:20:46,351] Trial 14 finished with value: 0.6869313871171991 and parameters: {'num_leaves': 78, 'learning_rate': 0.041254650982105734, 'n_estimators': 144, 'reg_lambda': 0.2371306479745468, 'min_child_samples': 26, 'threshold': 0.38292687426724703}. Best is trial 12 with value: 0.7325882058193379.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.778
特異度 (Specificity): 0.863
適合率 (Precision): 0.244
F1スコア: 0.372

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      408       65   
      Positive       6        21   

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.86      0.92       473
        True       0.24      0.78      0.37        27

    accuracy                           0.86       500
   macro avg       0.61      0.82      0.65       500
weighted avg       0.95      0.86      0.89       500



[I 2024-12-11 07:20:47,537] Trial 15 finished with value: 0.7492490465532535 and parameters: {'num_leaves': 61, 'learning_rate': 0.13291091399446409, 'n_estimators': 250, 'reg_lambda': 0.21273066710582006, 'min_child_samples': 62, 'threshold': 0.2830291880275029}. Best is trial 15 with value: 0.7492490465532535.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.815
特異度 (Specificity): 0.903
適合率 (Precision): 0.324
F1スコア: 0.463

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      427       46   
      Positive       5        22   

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.90      0.94       473
        True       0.32      0.81      0.46        27

    accuracy                           0.90       500
   macro avg       0.66      0.86      0.70       500
weighted avg       0.95      0.90      0.92       500



[I 2024-12-11 07:20:48,961] Trial 16 finished with value: 0.33616210547925063 and parameters: {'num_leaves': 103, 'learning_rate': 0.08307164119431713, 'n_estimators': 268, 'reg_lambda': 0.009363805110095456, 'min_child_samples': 63, 'threshold': 0.38614883682692286}. Best is trial 15 with value: 0.7492490465532535.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.333
特異度 (Specificity): 0.970
適合率 (Precision): 0.391
F1スコア: 0.360

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      459       14   
      Positive       18       9    

Classification Report:
              precision    recall  f1-score   support

       False       0.96      0.97      0.97       473
        True       0.39      0.33      0.36        27

    accuracy                           0.94       500
   macro avg       0.68      0.65      0.66       500
weighted avg       0.93      0.94      0.93       500



[I 2024-12-11 07:20:50,556] Trial 17 finished with value: 0.7083614941043311 and parameters: {'num_leaves': 59, 'learning_rate': 0.04214669840270299, 'n_estimators': 226, 'reg_lambda': 0.11747287893174942, 'min_child_samples': 62, 'threshold': 0.28447297520712156}. Best is trial 15 with value: 0.7492490465532535.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.815
特異度 (Specificity): 0.841
適合率 (Precision): 0.227
F1スコア: 0.355

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      398       75   
      Positive       5        22   

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.84      0.91       473
        True       0.23      0.81      0.35        27

    accuracy                           0.84       500
   macro avg       0.61      0.83      0.63       500
weighted avg       0.95      0.84      0.88       500



[I 2024-12-11 07:20:51,933] Trial 18 finished with value: 0.4346195586401772 and parameters: {'num_leaves': 54, 'learning_rate': 0.1451837056945193, 'n_estimators': 319, 'reg_lambda': 0.9627914314653203, 'min_child_samples': 71, 'threshold': 0.46660202716066446}. Best is trial 15 with value: 0.7492490465532535.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.481
特異度 (Specificity): 0.890
適合率 (Precision): 0.200
F1スコア: 0.283

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      421       52   
      Positive       14       13   

Classification Report:
              precision    recall  f1-score   support

       False       0.97      0.89      0.93       473
        True       0.20      0.48      0.28        27

    accuracy                           0.87       500
   macro avg       0.58      0.69      0.60       500
weighted avg       0.93      0.87      0.89       500



[I 2024-12-11 07:20:52,897] Trial 19 finished with value: 0.5043601211161937 and parameters: {'num_leaves': 98, 'learning_rate': 0.028661555090874005, 'n_estimators': 201, 'reg_lambda': 0.1316703863854268, 'min_child_samples': 100, 'threshold': 0.26685980520926356}. Best is trial 15 with value: 0.7492490465532535.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.741
特異度 (Specificity): 0.533
適合率 (Precision): 0.083
F1スコア: 0.149

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      252      221   
      Positive       7        20   

Classification Report:
              precision    recall  f1-score   support

       False       0.97      0.53      0.69       473
        True       0.08      0.74      0.15        27

    accuracy                           0.54       500
   macro avg       0.53      0.64      0.42       500
weighted avg       0.92      0.54      0.66       500



[I 2024-12-11 07:20:54,898] Trial 20 finished with value: 0.6352066731893987 and parameters: {'num_leaves': 74, 'learning_rate': 0.01078511392254017, 'n_estimators': 264, 'reg_lambda': 0.0024057186034101294, 'min_child_samples': 52, 'threshold': 0.4607536282718918}. Best is trial 15 with value: 0.7492490465532535.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.667
特異度 (Specificity): 0.937
適合率 (Precision): 0.375
F1スコア: 0.480

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      443       30   
      Positive       9        18   

Classification Report:
              precision    recall  f1-score   support

       False       0.98      0.94      0.96       473
        True       0.38      0.67      0.48        27

    accuracy                           0.92       500
   macro avg       0.68      0.80      0.72       500
weighted avg       0.95      0.92      0.93       500



[I 2024-12-11 07:20:55,899] Trial 21 finished with value: 0.7603845435112412 and parameters: {'num_leaves': 67, 'learning_rate': 0.16529953908242206, 'n_estimators': 97, 'reg_lambda': 0.24905086778477417, 'min_child_samples': 39, 'threshold': 0.19817760193776568}. Best is trial 21 with value: 0.7603845435112412.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.889
特異度 (Specificity): 0.827
適合率 (Precision): 0.226
F1スコア: 0.361

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      391       82   
      Positive       3        24   

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.83      0.90       473
        True       0.23      0.89      0.36        27

    accuracy                           0.83       500
   macro avg       0.61      0.86      0.63       500
weighted avg       0.95      0.83      0.87       500



[I 2024-12-11 07:20:57,190] Trial 22 finished with value: 0.7285081877857286 and parameters: {'num_leaves': 52, 'learning_rate': 0.2556925008929699, 'n_estimators': 102, 'reg_lambda': 0.42248278534925365, 'min_child_samples': 32, 'threshold': 0.200974981107573}. Best is trial 21 with value: 0.7603845435112412.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.852
特異度 (Specificity): 0.820
適合率 (Precision): 0.213
F1スコア: 0.341

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      388       85   
      Positive       4        23   

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.82      0.90       473
        True       0.21      0.85      0.34        27

    accuracy                           0.82       500
   macro avg       0.60      0.84      0.62       500
weighted avg       0.95      0.82      0.87       500



[I 2024-12-11 07:21:00,882] Trial 23 finished with value: 0.5298649219079781 and parameters: {'num_leaves': 65, 'learning_rate': 0.10147803607938774, 'n_estimators': 165, 'reg_lambda': 0.19386995558632233, 'min_child_samples': 9, 'threshold': 0.31906768837337834}. Best is trial 21 with value: 0.7603845435112412.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.556
特異度 (Specificity): 0.953
適合率 (Precision): 0.405
F1スコア: 0.469

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      451       22   
      Positive       12       15   

Classification Report:
              precision    recall  f1-score   support

       False       0.97      0.95      0.96       473
        True       0.41      0.56      0.47        27

    accuracy                           0.93       500
   macro avg       0.69      0.75      0.72       500
weighted avg       0.94      0.93      0.94       500



[I 2024-12-11 07:21:02,524] Trial 24 finished with value: 0.7883618012422361 and parameters: {'num_leaves': 93, 'learning_rate': 0.058683931871508825, 'n_estimators': 197, 'reg_lambda': 0.06822287904960997, 'min_child_samples': 46, 'threshold': 0.20585375192201869}. Best is trial 24 with value: 0.7883618012422361.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.889
特異度 (Specificity): 0.867
適合率 (Precision): 0.276
F1スコア: 0.421

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      410       63   
      Positive       3        24   

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.87      0.93       473
        True       0.28      0.89      0.42        27

    accuracy                           0.87       500
   macro avg       0.63      0.88      0.67       500
weighted avg       0.95      0.87      0.90       500



[I 2024-12-11 07:21:04,184] Trial 25 finished with value: 0.6751651665742902 and parameters: {'num_leaves': 93, 'learning_rate': 0.05795283091410832, 'n_estimators': 314, 'reg_lambda': 0.04608112903748101, 'min_child_samples': 48, 'threshold': 0.17860192582758536}. Best is trial 24 with value: 0.7883618012422361.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.741
特異度 (Specificity): 0.899
適合率 (Precision): 0.294
F1スコア: 0.421

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      425       48   
      Positive       7        20   

Classification Report:
              precision    recall  f1-score   support

       False       0.98      0.90      0.94       473
        True       0.29      0.74      0.42        27

    accuracy                           0.89       500
   macro avg       0.64      0.82      0.68       500
weighted avg       0.95      0.89      0.91       500



[I 2024-12-11 07:21:05,552] Trial 26 finished with value: 0.43122739329635884 and parameters: {'num_leaves': 110, 'learning_rate': 0.1762838058659797, 'n_estimators': 236, 'reg_lambda': 0.013275222331917231, 'min_child_samples': 58, 'threshold': 0.22510126435433248}. Best is trial 24 with value: 0.7883618012422361.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.444
特異度 (Specificity): 0.960
適合率 (Precision): 0.387
F1スコア: 0.414

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      454       19   
      Positive       15       12   

Classification Report:
              precision    recall  f1-score   support

       False       0.97      0.96      0.96       473
        True       0.39      0.44      0.41        27

    accuracy                           0.93       500
   macro avg       0.68      0.70      0.69       500
weighted avg       0.94      0.93      0.93       500



[I 2024-12-11 07:21:06,959] Trial 27 finished with value: 0.6392269676516311 and parameters: {'num_leaves': 87, 'learning_rate': 0.0474624044742977, 'n_estimators': 92, 'reg_lambda': 0.08067117929720184, 'min_child_samples': 18, 'threshold': 0.10186899126459915}. Best is trial 24 with value: 0.7883618012422361.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.926
特異度 (Specificity): 0.562
適合率 (Precision): 0.108
F1スコア: 0.193

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      266      207   
      Positive       2        25   

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.56      0.72       473
        True       0.11      0.93      0.19        27

    accuracy                           0.58       500
   macro avg       0.55      0.74      0.46       500
weighted avg       0.94      0.58      0.69       500



[I 2024-12-11 07:21:07,925] Trial 28 finished with value: 0.42650955308435873 and parameters: {'num_leaves': 127, 'learning_rate': 0.29794249087296987, 'n_estimators': 130, 'reg_lambda': 0.6441889847152067, 'min_child_samples': 71, 'threshold': 0.416292741670685}. Best is trial 24 with value: 0.7883618012422361.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.481
特異度 (Specificity): 0.869
適合率 (Precision): 0.173
F1スコア: 0.255

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      411       62   
      Positive       14       13   

Classification Report:
              precision    recall  f1-score   support

       False       0.97      0.87      0.92       473
        True       0.17      0.48      0.25        27

    accuracy                           0.85       500
   macro avg       0.57      0.68      0.59       500
weighted avg       0.92      0.85      0.88       500



[I 2024-12-11 07:21:09,990] Trial 29 finished with value: 0.5690834800668203 and parameters: {'num_leaves': 47, 'learning_rate': 0.1349981442750112, 'n_estimators': 354, 'reg_lambda': 0.029501348420970137, 'min_child_samples': 57, 'threshold': 0.15895279434291748}. Best is trial 24 with value: 0.7883618012422361.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.593
特異度 (Specificity): 0.941
適合率 (Precision): 0.364
F1スコア: 0.451

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      445       28   
      Positive       11       16   

Classification Report:
              precision    recall  f1-score   support

       False       0.98      0.94      0.96       473
        True       0.36      0.59      0.45        27

    accuracy                           0.92       500
   macro avg       0.67      0.77      0.70       500
weighted avg       0.94      0.92      0.93       500



[I 2024-12-11 07:21:12,363] Trial 30 finished with value: 0.7704483858954176 and parameters: {'num_leaves': 66, 'learning_rate': 0.02777901820371329, 'n_estimators': 243, 'reg_lambda': 0.1148788460516164, 'min_child_samples': 45, 'threshold': 0.29479812921049386}. Best is trial 24 with value: 0.7883618012422361.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.889
特異度 (Specificity): 0.844
適合率 (Precision): 0.245
F1スコア: 0.384

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      399       74   
      Positive       3        24   

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.84      0.91       473
        True       0.24      0.89      0.38        27

    accuracy                           0.85       500
   macro avg       0.62      0.87      0.65       500
weighted avg       0.95      0.85      0.88       500



[I 2024-12-11 07:21:14,852] Trial 31 finished with value: 0.7889312998813136 and parameters: {'num_leaves': 64, 'learning_rate': 0.025999478014992004, 'n_estimators': 248, 'reg_lambda': 0.0998410986638099, 'min_child_samples': 43, 'threshold': 0.30117219313875077}. Best is trial 31 with value: 0.7889312998813136.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.889
特異度 (Specificity): 0.869
適合率 (Precision): 0.279
F1スコア: 0.425

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      411       62   
      Positive       3        24   

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.87      0.93       473
        True       0.28      0.89      0.42        27

    accuracy                           0.87       500
   macro avg       0.64      0.88      0.68       500
weighted avg       0.95      0.87      0.90       500



[I 2024-12-11 07:21:16,368] Trial 32 finished with value: 0.7645192373136903 and parameters: {'num_leaves': 71, 'learning_rate': 0.02549517764486482, 'n_estimators': 201, 'reg_lambda': 0.10136827567482833, 'min_child_samples': 45, 'threshold': 0.3104010301803653}. Best is trial 31 with value: 0.7889312998813136.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.889
特異度 (Specificity): 0.835
適合率 (Precision): 0.235
F1スコア: 0.372

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      395       78   
      Positive       3        24   

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.84      0.91       473
        True       0.24      0.89      0.37        27

    accuracy                           0.84       500
   macro avg       0.61      0.86      0.64       500
weighted avg       0.95      0.84      0.88       500



[I 2024-12-11 07:21:17,985] Trial 33 finished with value: 0.7705304160438297 and parameters: {'num_leaves': 73, 'learning_rate': 0.02380814225412336, 'n_estimators': 204, 'reg_lambda': 0.030406045541309597, 'min_child_samples': 46, 'threshold': 0.314564043777419}. Best is trial 31 with value: 0.7889312998813136.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.852
特異度 (Specificity): 0.890
適合率 (Precision): 0.307
F1スコア: 0.451

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      421       52   
      Positive       4        23   

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.89      0.94       473
        True       0.31      0.85      0.45        27

    accuracy                           0.89       500
   macro avg       0.65      0.87      0.69       500
weighted avg       0.95      0.89      0.91       500



[I 2024-12-11 07:21:19,689] Trial 34 finished with value: 0.5759641087261433 and parameters: {'num_leaves': 79, 'learning_rate': 0.013333792129859967, 'n_estimators': 284, 'reg_lambda': 0.023645635543574257, 'min_child_samples': 45, 'threshold': 0.5393770026171281}. Best is trial 31 with value: 0.7889312998813136.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.593
特異度 (Specificity): 0.962
適合率 (Precision): 0.471
F1スコア: 0.525

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      455       18   
      Positive       11       16   

Classification Report:
              precision    recall  f1-score   support

       False       0.98      0.96      0.97       473
        True       0.47      0.59      0.52        27

    accuracy                           0.94       500
   macro avg       0.72      0.78      0.75       500
weighted avg       0.95      0.94      0.95       500



[I 2024-12-11 07:21:21,344] Trial 35 finished with value: 0.7416557196278615 and parameters: {'num_leaves': 47, 'learning_rate': 0.01875259144540448, 'n_estimators': 204, 'reg_lambda': 0.04830324547145245, 'min_child_samples': 35, 'threshold': 0.2365576355188396}. Best is trial 31 with value: 0.7889312998813136.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.889
特異度 (Specificity): 0.799
適合率 (Precision): 0.202
F1スコア: 0.329

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      378       95   
      Positive       3        24   

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.80      0.89       473
        True       0.20      0.89      0.33        27

    accuracy                           0.80       500
   macro avg       0.60      0.84      0.61       500
weighted avg       0.95      0.80      0.86       500



[I 2024-12-11 07:21:23,109] Trial 36 finished with value: 0.4768098858948222 and parameters: {'num_leaves': 91, 'learning_rate': 0.024886976106486238, 'n_estimators': 292, 'reg_lambda': 0.0056993522405092475, 'min_child_samples': 54, 'threshold': 0.4222238023103042}. Best is trial 31 with value: 0.7889312998813136.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.481
特異度 (Specificity): 0.968
適合率 (Precision): 0.464
F1スコア: 0.473

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      458       15   
      Positive       14       13   

Classification Report:
              precision    recall  f1-score   support

       False       0.97      0.97      0.97       473
        True       0.46      0.48      0.47        27

    accuracy                           0.94       500
   macro avg       0.72      0.72      0.72       500
weighted avg       0.94      0.94      0.94       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2024-12-11 07:21:24,711] Trial 37 finished with value: 0.4914965986394558 and parameters: {'num_leaves': 103, 'learning_rate': 0.004514444042967217, 'n_estimators': 246, 'reg_lambda': 0.013263528369062937, 'min_child_samples': 44, 'threshold': 0.14191288443479877}. Best is trial 31 with value: 0.7889312998813136.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 1.000
特異度 (Specificity): 0.000
適合率 (Precision): 0.054
F1スコア: 0.102

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative       0       473   
      Positive       0        27   

Classification Report:
              precision    recall  f1-score   support

       False       0.00      0.00      0.00       473
        True       0.05      1.00      0.10        27

    accuracy                           0.05       500
   macro avg       0.03      0.50      0.05       500
weighted avg       0.00      0.05      0.01       500



[I 2024-12-11 07:21:26,190] Trial 38 finished with value: 0.4961372375435566 and parameters: {'num_leaves': 82, 'learning_rate': 0.012840477637678345, 'n_estimators': 344, 'reg_lambda': 8.060820367974427, 'min_child_samples': 49, 'threshold': 0.501782557903264}. Best is trial 31 with value: 0.7889312998813136.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.593
特異度 (Specificity): 0.825
適合率 (Precision): 0.162
F1スコア: 0.254

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      390       83   
      Positive       11       16   

Classification Report:
              precision    recall  f1-score   support

       False       0.97      0.82      0.89       473
        True       0.16      0.59      0.25        27

    accuracy                           0.81       500
   macro avg       0.57      0.71      0.57       500
weighted avg       0.93      0.81      0.86       500



[I 2024-12-11 07:21:28,246] Trial 39 finished with value: 0.5140142914912609 and parameters: {'num_leaves': 42, 'learning_rate': 0.058621941088505956, 'n_estimators': 220, 'reg_lambda': 0.03205177259424357, 'min_child_samples': 26, 'threshold': 0.31446962436292897}. Best is trial 31 with value: 0.7889312998813136.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.519
特異度 (Specificity): 0.960
適合率 (Precision): 0.424
F1スコア: 0.467

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      454       19   
      Positive       13       14   

Classification Report:
              precision    recall  f1-score   support

       False       0.97      0.96      0.97       473
        True       0.42      0.52      0.47        27

    accuracy                           0.94       500
   macro avg       0.70      0.74      0.72       500
weighted avg       0.94      0.94      0.94       500



[I 2024-12-11 07:21:32,430] Trial 40 finished with value: 0.5714681882922255 and parameters: {'num_leaves': 12, 'learning_rate': 0.033668569828504476, 'n_estimators': 493, 'reg_lambda': 0.06575262147858754, 'min_child_samples': 19, 'threshold': 0.25471765063106655}. Best is trial 31 with value: 0.7889312998813136.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.593
特異度 (Specificity): 0.945
適合率 (Precision): 0.381
F1スコア: 0.464

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      447       26   
      Positive       11       16   

Classification Report:
              precision    recall  f1-score   support

       False       0.98      0.95      0.96       473
        True       0.38      0.59      0.46        27

    accuracy                           0.93       500
   macro avg       0.68      0.77      0.71       500
weighted avg       0.94      0.93      0.93       500



[I 2024-12-11 07:21:34,186] Trial 41 finished with value: 0.7568688855103819 and parameters: {'num_leaves': 72, 'learning_rate': 0.022187901674744215, 'n_estimators': 201, 'reg_lambda': 0.11914514835948789, 'min_child_samples': 44, 'threshold': 0.32179808724397413}. Best is trial 31 with value: 0.7889312998813136.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.889
特異度 (Specificity): 0.822
適合率 (Precision): 0.222
F1スコア: 0.356

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      389       84   
      Positive       3        24   

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.82      0.90       473
        True       0.22      0.89      0.36        27

    accuracy                           0.83       500
   macro avg       0.61      0.86      0.63       500
weighted avg       0.95      0.83      0.87       500



[I 2024-12-11 07:21:35,459] Trial 42 finished with value: 0.6632479512340245 and parameters: {'num_leaves': 62, 'learning_rate': 0.008299578195847267, 'n_estimators': 178, 'reg_lambda': 0.09982559155691963, 'min_child_samples': 36, 'threshold': 0.3029862413363782}. Best is trial 31 with value: 0.7889312998813136.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.889
特異度 (Specificity): 0.660
適合率 (Precision): 0.130
F1スコア: 0.226

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      312      161   
      Positive       3        24   

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.66      0.79       473
        True       0.13      0.89      0.23        27

    accuracy                           0.67       500
   macro avg       0.56      0.77      0.51       500
weighted avg       0.94      0.67      0.76       500



[I 2024-12-11 07:21:36,742] Trial 43 finished with value: 0.7413098974930471 and parameters: {'num_leaves': 70, 'learning_rate': 0.03178530427286741, 'n_estimators': 152, 'reg_lambda': 0.01707061989868207, 'min_child_samples': 48, 'threshold': 0.22732273936008346}. Best is trial 31 with value: 0.7889312998813136.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.852
特異度 (Specificity): 0.848
適合率 (Precision): 0.242
F1スコア: 0.377

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      401       72   
      Positive       4        23   

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.85      0.91       473
        True       0.24      0.85      0.38        27

    accuracy                           0.85       500
   macro avg       0.62      0.85      0.65       500
weighted avg       0.95      0.85      0.88       500



[I 2024-12-11 07:21:37,968] Trial 44 finished with value: 0.7413899856130101 and parameters: {'num_leaves': 54, 'learning_rate': 0.01674005194902105, 'n_estimators': 196, 'reg_lambda': 0.04049166014844279, 'min_child_samples': 54, 'threshold': 0.35554921690426783}. Best is trial 31 with value: 0.7889312998813136.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.852
特異度 (Specificity): 0.848
適合率 (Precision): 0.242
F1スコア: 0.377

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      401       72   
      Positive       4        23   

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.85      0.91       473
        True       0.24      0.85      0.38        27

    accuracy                           0.85       500
   macro avg       0.62      0.85      0.65       500
weighted avg       0.95      0.85      0.88       500



[I 2024-12-11 07:21:39,647] Trial 45 finished with value: 0.48306347227992436 and parameters: {'num_leaves': 115, 'learning_rate': 0.06139838798295169, 'n_estimators': 224, 'reg_lambda': 0.06564159785980235, 'min_child_samples': 40, 'threshold': 0.4238723375459651}. Best is trial 31 with value: 0.7889312998813136.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.481
特異度 (Specificity): 0.964
適合率 (Precision): 0.433
F1スコア: 0.456

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      456       17   
      Positive       14       13   

Classification Report:
              precision    recall  f1-score   support

       False       0.97      0.96      0.97       473
        True       0.43      0.48      0.46        27

    accuracy                           0.94       500
   macro avg       0.70      0.72      0.71       500
weighted avg       0.94      0.94      0.94       500



[I 2024-12-11 07:21:41,323] Trial 46 finished with value: 0.6838748827726051 and parameters: {'num_leaves': 25, 'learning_rate': 0.02118511187411466, 'n_estimators': 251, 'reg_lambda': 0.40175933187207724, 'min_child_samples': 31, 'threshold': 0.2762369374121927}. Best is trial 31 with value: 0.7889312998813136.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.889
特異度 (Specificity): 0.700
適合率 (Precision): 0.145
F1スコア: 0.249

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      331      142   
      Positive       3        24   

Classification Report:
              precision    recall  f1-score   support

       False       0.99      0.70      0.82       473
        True       0.14      0.89      0.25        27

    accuracy                           0.71       500
   macro avg       0.57      0.79      0.53       500
weighted avg       0.95      0.71      0.79       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[I 2024-12-11 07:21:42,703] Trial 47 finished with value: 0.4914965986394558 and parameters: {'num_leaves': 91, 'learning_rate': 0.0018888414004855885, 'n_estimators': 167, 'reg_lambda': 0.16170509231857955, 'min_child_samples': 38, 'threshold': 0.20704728135040634}. Best is trial 31 with value: 0.7889312998813136.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 1.000
特異度 (Specificity): 0.000
適合率 (Precision): 0.054
F1スコア: 0.102

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative       0       473   
      Positive       0        27   

Classification Report:
              precision    recall  f1-score   support

       False       0.00      0.00      0.00       473
        True       0.05      1.00      0.10        27

    accuracy                           0.05       500
   macro avg       0.03      0.50      0.05       500
weighted avg       0.00      0.05      0.01       500



[I 2024-12-11 07:21:43,760] Trial 48 finished with value: 0.684548664612836 and parameters: {'num_leaves': 79, 'learning_rate': 0.03587619162050904, 'n_estimators': 125, 'reg_lambda': 0.02330293076838238, 'min_child_samples': 45, 'threshold': 0.3362345279738537}. Best is trial 31 with value: 0.7889312998813136.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.741
特異度 (Specificity): 0.903
適合率 (Precision): 0.303
F1スコア: 0.430

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      427       46   
      Positive       7        20   

Classification Report:
              precision    recall  f1-score   support

       False       0.98      0.90      0.94       473
        True       0.30      0.74      0.43        27

    accuracy                           0.89       500
   macro avg       0.64      0.82      0.69       500
weighted avg       0.95      0.89      0.91       500



[I 2024-12-11 07:21:45,345] Trial 49 finished with value: 0.4811461412151067 and parameters: {'num_leaves': 57, 'learning_rate': 0.02633184156596739, 'n_estimators': 273, 'reg_lambda': 0.08432237047317603, 'min_child_samples': 57, 'threshold': 0.6335684729987513}. Best is trial 31 with value: 0.7889312998813136.



=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.481
特異度 (Specificity): 0.975
適合率 (Precision): 0.520
F1スコア: 0.500

=== 混同行列 ===
                  Predicted
                  Negative  Positive
Actual Negative      461       12   
      Positive       14       13   

Classification Report:
              precision    recall  f1-score   support

       False       0.97      0.97      0.97       473
        True       0.52      0.48      0.50        27

    accuracy                           0.95       500
   macro avg       0.75      0.73      0.74       500
weighted avg       0.95      0.95      0.95       500


Best parameters: {'num_leaves': 64, 'learning_rate': 0.025999478014992004, 'n_estimators': 248, 'reg_lambda': 0.0998410986638099, 'min_child_samples': 43}
Best threshold: 0.30117219313875077

=== 交差検証全体の評価 ===

=== 詳細な評価指標 ===
感度 (Sensitivity/Recall): 0.889
特異度 (Specificity): 0.869
適合率 (Precision): 0.279
F1スコア: 0.425

=== 混同行列 ===
                  Predicted
       

In [12]:

import joblib

# 日時をstringsで
timestamp = time.strftime('%Y%m%d%H%M%S')

# モデルと関連オブジェクトの保存
model_data = {
    'model': model,
    'vectorizer': vectorizer,
    'threshold': threshold,
    'best_params': best_params
}
joblib.dump(model_data, f'{timestamp}_v2_model_data.joblib')
print("\nモデルと関連オブジェクトを保存しました。")


モデルと関連オブジェクトを保存しました。


In [13]:
models = glob.glob('*.joblib')
print(models)

['model_data.joblib', '20241207084511_model_data.joblib', '20241211072208_v2_model_data.joblib']


In [14]:
# 保存したモデルデータをロード
model_data = joblib.load(models[2])

# 各オブジェクトの取り出し
loaded_model = model_data['model']
loaded_vectorizer = model_data['vectorizer']
loaded_threshold = model_data['threshold']
loaded_best_params = model_data['best_params']

print("モデルと関連オブジェクトをロードしました。")


モデルと関連オブジェクトをロードしました。


# 絶対に落としてはいけない文献を落とさないことを確認

In [15]:
csv_files

['extracted_500_first_20241130_検索結果1204.csv',
 'pubmed_true_data.csv',
 'exported_data500.csv',
 'remaining_data10404.csv',
 'exported_data500_スクリーニング結果1210.csv']

In [16]:
df2 = pd.read_csv(csv_files[1])
df2.columns = ["PMID", "title", "abstract"]
df2["tiab"] = df2.apply(create_json_text, axis=1)

In [17]:
df2.shape

(10, 4)

In [18]:
def predict_and_summarize(df, create_json_text, loaded_vectorizer, loaded_model, loaded_threshold):
    # "tiab"列の作成 (DataFrame dfに対して適用)
    df["tiab"] = df.apply(create_json_text, axis=1)

    # テキストデータの前処理とベクトル化
    X_new = loaded_vectorizer.transform(df["tiab"].fillna(''))

    print(f"新しいデータの形状: {X_new.shape}")

    # 予測確率の取得
    y_prob_new = loaded_model.predict_proba(X_new)[:, 1]

    # 閾値を適用してクラスラベルを決定
    y_pred_new = (y_prob_new >= loaded_threshold).astype(int)

    # 予測結果をデータフレームに追加
    df['prediction'] = y_pred_new
    df['probability'] = y_prob_new

    print("予測を実行し、結果をデータフレームに追加しました。")

    # 予測結果の表示
    print("\n=== 予測結果 ===")
    print(df[['tiab', 'prediction', 'probability']])

    # クラス分布の確認
    class_dist = df['prediction'].value_counts()
    print("\nクラスの分布:")
    print(class_dist)

    # 確率の統計情報
    prob_stats = df['probability'].describe()
    print("\n予測確率の統計情報:")
    print(prob_stats)

    # 必要に応じて、更新後のdfや統計情報を返す
    return df, class_dist, prob_stats


In [19]:
# 必要なオブジェクトを準備
# df2, create_json_text, loaded_vectorizer, loaded_model, loaded_threshold を用意済みとする
updated_df, class_distribution, probability_stats = predict_and_summarize(
    df2, create_json_text, loaded_vectorizer, loaded_model, loaded_threshold
)


新しいデータの形状: (10, 10000)
予測を実行し、結果をデータフレームに追加しました。

=== 予測結果 ===
                                                tiab  prediction  probability
0  {"title": "A multicenter, randomized, controll...           1     0.520263
1  {"title": "Transfusion requirements after head...           1     0.734563
2  {"title": "Restrictive vs Liberal Transfusion ...           1     0.318891
3  {"title": "Liberal or Restrictive Transfusion ...           1     0.341484
4  {"title": "Effect of Transfusion on Mortality ...           1     0.306149
5  {"title": "Transfusion-related risk of seconda...           1     0.770659
6  {"title": "Impact of Blood Product Transfusion...           1     0.529030
7  {"title": "Transfusion practice and blood stre...           1     0.427367
8  {"title": "Anemia, transfusions and hospital o...           1     0.862065
9  {"title": "Transfusion of platelets, but not o...           1     0.793162

クラスの分布:
prediction
1    10
Name: count, dtype: int64

予測確率の統計情報:
count    10.0

# 残っているファイルに対して予測を行い、Trueであったものだけを残す

'remaining_data10404.csv'

In [22]:
df3 = pd.read_csv(csv_files[3])

In [23]:
df3.head()

Unnamed: 0,key,title,authors,journal,volume,issue,pages,abstract,year,publisher,url,issn
0,03fffde5-8aca-49b6-8666-d98a2ba3d4ad,Association of transfusion red blood cell stor...,"[{'author': 'Yamal, José-Miguel', 'author_abbr...",The journal of trauma and acute care surgery,79,5.0,843-9,BACKGROUND: The effect of red blood cell (RBC)...,2015,United States,https://doi.org/10.1097/TA.0000000000000834,"{""pmid"": 26496111, ""electronic_issn"": ""2163-07..."
1,4944c10a-6c8f-4cd3-a1c8-838fff9273d1,Transfusion of packed red blood cells does not...,"['Mas, A.', 'Roig, I.', 'Baigorri, F.', 'Martí...",Medicina Intensiva,20,7.0,327-332,Background. The usefulness of stored red blood...,1996/09//,"Ediciones Doyma, S.L., Travesera de Gracia 17-...",['https://go.openathens.net/redirector/rsm.ac....,"{""accession_number"": ""685936757; 26332003"", ""t..."
2,0d828515-1129-4a05-ae50-14a360c1f16b,Red blood cell transfusion in the critically i...,"[{'author': 'Lelubre, Christophe', 'author_abb...",Annals of intensive care,1,,43,Red blood cell (RBC) transfusion is a common i...,2011,Germany,https://doi.org/10.1186/2110-5820-1-43,"{""pmid"": 21970512, ""electronic_issn"": ""2110-58..."
3,84e99a2c-3267-4239-97b4-ab1d2b1a2d88,Frequency of upper gastrointestinal bleeding i...,"[{'author': 'Lacroix, J', 'author_abbreviated'...",Critical care medicine,20,1.0,35-42,OBJECTIVE: To determine the frequency of upper...,1992,United States,https://doi.org/10.1097/00003246-199201000-00013,"{""pmid"": 1729041, ""print_issn"": ""0090-3493"", ""..."
4,ff429945-9122-41d9-9adc-2b39a77cbc0f,Prehospital fluid management of abdominal orga...,"[{'author': 'Heuer, Matthias', 'author_abbrevi...",Langenbeck's archives of surgery,400,3.0,371-9,PURPOSE: Severe bleeding after trauma frequent...,2015,Germany,https://doi.org/10.1007/s00423-015-1274-2,"{""pmid"": 25681238, ""electronic_issn"": ""1435-24..."


In [24]:
# 必要なオブジェクトを準備
# df2, create_json_text, loaded_vectorizer, loaded_model, loaded_threshold を用意済みとする
updated_df, class_distribution, probability_stats = predict_and_summarize(
    df3, create_json_text, loaded_vectorizer, loaded_model, loaded_threshold
)


新しいデータの形状: (10404, 10000)
予測を実行し、結果をデータフレームに追加しました。

=== 予測結果 ===
                                                    tiab  prediction  \
0      {"title": "Association of transfusion red bloo...           1   
1      {"title": "Transfusion of packed red blood cel...           1   
2      {"title": "Red blood cell transfusion in the c...           1   
3      {"title": "Frequency of upper gastrointestinal...           0   
4      {"title": "Prehospital fluid management of abd...           0   
...                                                  ...         ...   
10399  {"title": "Guidelines on the management of ana...           1   
10400  {"title": "Blood transfusion and lung function...           1   
10401  {"title": "Syncope and epilepsy", "abstract": ...           0   
10402  {"title": "Anemia and blood transfusion in cri...           1   
10403  {"title": "Allogeneic umbilical cord blood red...           0   

       probability  
0         0.313773  
1         0.549822  
2     

In [25]:
print(df3.columns)

Index(['key', 'title', 'authors', 'journal', 'volume', 'issue', 'pages',
       'abstract', 'year', 'publisher', 'url', 'issn', 'tiab', 'prediction',
       'probability'],
      dtype='object')


In [26]:
df3["prediction"].value_counts()

prediction
0    8780
1    1624
Name: count, dtype: int64

In [27]:
df_export = df3[df3["prediction"] == 1]

In [28]:
df_export = df_export[['key', 'title', 'authors', 'journal', 'volume', 'issue', 'pages',
       'abstract', 'year', 'publisher', 'url', 'issn']]

In [29]:
df_export.shape

(1624, 12)

In [30]:
df_export.to_csv('2024-12-11exported_1624.csv', index=False)

In [43]:
# df_exportをランダムに500とそれ以外に分割して、それぞれをCSVファイルとしてエクスポート, seed設定
df_export.sample(frac=1, random_state=42).iloc[:500].to_csv('exported_data500.csv', index=False)
df_export.sample(frac=1, random_state=42).iloc[500:].to_csv('remaining_data10404.csv', index=False)

