In [None]:
# 必要なライブラリのインポート
import os
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

# Japanese font setup.
# Start from a safe default; it is replaced below only when a Japanese-capable
# font is actually registered with matplotlib.
plt.rcParams['font.family'] = 'DejaVu Sans'
plt.rcParams['axes.unicode_minus'] = False

# Assigning rcParams['font.family'] never raises for a missing font, so a
# try/except around the assignment cannot detect availability. Compare the
# candidates against the fonts matplotlib has registered instead.
from matplotlib import font_manager

japanese_fonts = ['Hiragino Sans', 'Hiragino Kaku Gothic Pro', 'Yu Gothic', 'Meiryo']
installed_fonts = {entry.name for entry in font_manager.fontManager.ttflist}
for font in japanese_fonts:
    if font in installed_fonts:
        plt.rcParams['font.family'] = font
        break

plt.rcParams['figure.figsize'] = (10, 6)
print("ライブラリのインポートが完了しました")

In [None]:
def parse_directory_name(dir_name):
    """Extract (cpNum, cpNum_range, cpNum_dir) from a comma-separated directory name.

    The name is split on ',' and must yield at least 30 fields. Fields 0-9,
    10-19 and 20-29 form three groups; for each group the first non-zero
    integer is returned, or None when every examined field in it is zero.

    Returns:
        A 3-tuple of ints/None. (None, None, None) is returned when there are
        fewer than 30 fields or when a field that had to be examined is not an
        integer. Scanning of a group stops at its first non-zero field, so
        later fields in that group are never parsed.
    """
    fields = dir_name.split(',')
    if len(fields) < 30:
        return None, None, None

    def first_nonzero(start, stop):
        # Lazy scan: stop as soon as a non-zero value is found.
        for position in range(start, stop):
            value = int(fields[position])
            if value != 0:
                return value
        return None

    try:
        cpnum = first_nonzero(0, 10)
        cpnum_range = first_nonzero(10, 20)
        cpnum_dir = first_nonzero(20, 30)
    except (ValueError, IndexError):
        return None, None, None
    return cpnum, cpnum_range, cpnum_dir

def collect_data_task2(logs_root='../../Logs/'):
    """Build the Task 2 dataset from the log directory tree.

    Walks ``<logs_root>/tree=<int>/<param_dir>/detected_bugs.csv``. Each
    parameter directory whose name parses via ``parse_directory_name`` and
    whose CSV holds at least five non-empty rows contributes one record.

    Args:
        logs_root: Root directory containing the ``tree=*`` sub-directories.

    Returns:
        pandas.DataFrame with columns ``tree``, ``cpNum``, ``cpNum_range``,
        ``cpNum_dir`` and ``bug_detected_all``. ``bug_detected_all`` is 1 when
        none of the first five results is 'timeout' or 'null', else 0. The
        columns are present even when no record was collected.
    """
    columns = ['tree', 'cpNum', 'cpNum_range', 'cpNum_dir', 'bug_detected_all']
    data_records = []
    logs_path = Path(logs_root)
    # glob()/iterdir() order is filesystem-dependent; sort so the row order
    # (and therefore any seeded split of the frame) is reproducible.
    for tree_dir in sorted(logs_path.glob('tree=*')):
        if not tree_dir.is_dir():
            continue  # a stray file matching the pattern
        try:
            tree_value = int(tree_dir.name.split('=')[1])
        except ValueError:
            # Non-numeric suffix: skip it, as unparseable parameter dirs are.
            continue
        for param_dir in sorted(tree_dir.iterdir()):
            if not param_dir.is_dir():
                continue
            cpnum, cpnum_range, cpnum_dir = parse_directory_name(param_dir.name)
            if cpnum is None:
                continue
            bug_path = param_dir / 'detected_bugs.csv'
            if not bug_path.exists():
                continue
            # newline='' is what the csv module expects for file objects.
            with open(bug_path, 'r', newline='') as f:
                # csv.reader yields [] for blank lines; row[0] would raise on them.
                bug_results = [row[0] for row in csv.reader(f) if row]
            if len(bug_results) < 5:
                continue
            first_five = bug_results[:5]
            bug_detected_all = 1 if all(r not in ('timeout', 'null') for r in first_five) else 0
            data_records.append({
                'tree': tree_value, 'cpNum': cpnum,
                'cpNum_range': cpnum_range, 'cpNum_dir': cpnum_dir,
                'bug_detected_all': bug_detected_all
            })
    # Explicit columns keep the schema stable when nothing was collected.
    df = pd.DataFrame(data_records, columns=columns)
    print(f"データ収集完了: {len(df)}件のレコード")
    return df

# Build the Task 2 dataset using the default logs_root of collect_data_task2;
# the cells below read their features and target from df_agg.
df_agg = collect_data_task2()

# タスク2: 5回実行して5回ともバグ発見の予測（ランダムフォレスト）

In [None]:
# Feature matrix (parameters parsed from the directory names) and binary target.
feature_columns = ['tree', 'cpNum', 'cpNum_range', 'cpNum_dir']
X = df_agg[feature_columns]
y = df_agg['bug_detected_all']

# Hold out 20% of the rows as a test set, stratified on the target; the seed
# is fixed so the split is repeatable for a given row order.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Shuffled, stratified 10-fold splitter used by the grid search.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
print(f"訓練データ: {len(X_train)}, テストデータ: {len(X_test)}")

## GridSearchCVによるランダムフォレストの探索

In [None]:
# Scaler and classifier are chained in one estimator, so the grid search fits
# both steps together on each training fold.
pipeline_rf = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(n_jobs=-1, random_state=42)),
])

# Keys follow the "<step name>__<parameter>" convention to target the 'rf' step.
param_grid_rf = {
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [None, 10, 20],
    'rf__min_samples_leaf': [1, 5, 0.01, 0.05],
    'rf__max_features': ['sqrt', 'log2'],
}

# Exhaustive search over the grid, scored by F1 on the `cv` splitter.
grid_search_rf = GridSearchCV(
    estimator=pipeline_rf,
    param_grid=param_grid_rf,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

print("探索を開始します...")
grid_search_rf.fit(X_train, y_train)

best_params = grid_search_rf.best_params_
best_cv_f1 = grid_search_rf.best_score_
print(f"\nBest Parameters: {best_params}")
print(f"Best CV F1-Score: {best_cv_f1:.4f}")

## 最良モデルの評価と重要度分析

In [None]:
# Evaluate the best pipeline found by the grid search on the held-out test split.
best_model_rf = grid_search_rf.best_estimator_
y_pred = best_model_rf.predict(X_test)

print("【テストデータでの評価】")
print(f"Accuracy:  {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall:    {recall_score(y_test, y_pred):.4f}")
print(f"F1-Score:  {f1_score(y_test, y_pred):.4f}")

print("\n【混同行列】")
print(confusion_matrix(y_test, y_pred))

print("\n【特徴量重要度】")
# Importances come from the forest step inside the pipeline, labelled with the
# feature column names and sorted from most to least important.
rf_inner = best_model_rf.named_steps['rf']
importances = pd.Series(rf_inner.feature_importances_, index=X.columns).sort_values(ascending=False)
print(importances)

fig, ax = plt.subplots(figsize=(10, 6))
# seaborn 0.13 deprecates `palette` without `hue`, and the notebook's global
# warning filter hides that warning. Map the colours through `hue` instead.
# `dodge=False` keeps full-width bars and the legend is removed by hand, so the
# call works on older and newer seaborn releases alike.
sns.barplot(
    x=importances.values,
    y=importances.index,
    hue=importances.index,
    palette='viridis',
    dodge=False,
    ax=ax,
)
legend = ax.get_legend()
if legend is not None:
    legend.remove()
ax.set_title('Feature Importances of Best RF Model (Task 2: All Bug)')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()

## 探索結果の可視化

In [None]:
# One row per parameter combination evaluated by the grid search.
results_df = pd.DataFrame(grid_search_rf.cv_results_)

# The grid mixes int and float leaf sizes; cast to str so each setting is
# plotted as its own discrete category.
leaf_col = 'param_rf__min_samples_leaf'
results_df[leaf_col] = results_df[leaf_col].astype(str)

fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=results_df, x=leaf_col, y='mean_test_score', ax=ax)
ax.set_title('Min Samples Leaf vs F1-Score (Task 2: All Bug)')
ax.set_ylabel('Mean CV F1-Score')
plt.show()