# Bachelorarbeit – Vorhersage technischer Schuld mittels maschinellem Lernen auf Basis von Code-Metriken und Commit-Historien

(Zur vollständigen Reproduktion die hier importierten Module installieren, die Repos klonen und deren Pfade ggf. konfigurieren sowie die Umgebungsvariable OPENAI_API_KEY setzen)

In [64]:
import os, re, json, math, asyncio, datetime as dt, collections, hashlib
from pathlib import Path
import pandas as pd, numpy as np
from pydriller import Repository
from pydriller.domain.commit import ModificationType as MT
from tqdm import tqdm

# Base directory that the relative repo paths below are resolved against:
# the folder of this file when run as a script, otherwise the working directory.
if '__file__' in globals():
    BASE_DIR = Path(__file__).parent
else:
    BASE_DIR = Path.cwd()

# Projects to mine: location of the local clone and the branch to traverse.
REPOS = {
    'requests': {'path': 'repos/requests', 'branch': 'main'},
    'fastapi': {'path': 'repos/fastapi', 'branch': 'master'},
    'scrapy': {'path': 'repos/scrapy', 'branch': 'master'},
    'flask': {'path': 'repos/flask', 'branch': 'main'},
    'keras': {'path': 'repos/keras', 'branch': 'master'},
}

# Upper bound (UTC) passed as `to=` when mining commits and process metrics.
CUTOFF = dt.datetime(
    year=2025, month=6, day=15,
    hour=23, minute=59, second=59,
    tzinfo=dt.timezone.utc,
)

# Case-insensitive, whole-word keywords that mark a line as self-admitted
# technical debt (SATD).
SATD = re.compile(
    r'\b(TODO|FIXME|BUG|HACK|XXX|WORKAROUND|TEMP|KLUDGE|UGLY|DIRTY|BROKEN|FIX)\b',
    re.IGNORECASE,
)

LLM_FRAC = 0.25
LLM_MODEL = 'gpt-4o'

# Output folders (relative to the working directory); created on import.
OUT = Path('data')
OUT.mkdir(exist_ok=True)
SPLIT = Path('splits')
SPLIT.mkdir(exist_ok=True)
LLM_DIR = Path('llm_batch')
LLM_DIR.mkdir(exist_ok=True)

In [65]:
def is_comment_or_docstring(line):
    """Heuristically tell whether a single source line is a comment or docstring edge.

    After stripping surrounding whitespace, a line qualifies when it starts
    with ``#`` or starts/ends with a triple quote. Trailing inline comments
    (``x = 1  # note``) and the inner lines of a multi-line docstring are
    therefore not recognised.

    Args:
        line: One line of source text.

    Returns:
        ``True`` if the line matches the heuristic, otherwise ``False``.
    """
    text = line.strip()
    triple_quotes = ('"""', "'''")
    if text.startswith('#'):
        return True
    return text.startswith(triple_quotes) or text.endswith(triple_quotes)

def satd_delta(mod):
    """Return the net number of SATD comment lines introduced by one file change.

    A line counts when ``is_comment_or_docstring`` accepts it and the ``SATD``
    keyword pattern matches it.

    Args:
        mod: Object exposing ``diff_parsed``, a mapping whose ``'added'`` and
            ``'deleted'`` entries are sequences of ``(line_no, text)`` pairs
            (presumably a PyDriller modified file; verify against the caller).

    Returns:
        Matching added lines minus matching deleted lines.
    """
    def count_satd(entries):
        # Only the text of each (line_no, text) pair is inspected.
        return sum(
            1
            for _, text in entries
            if is_comment_or_docstring(text) and SATD.search(text)
        )

    introduced = count_satd(mod.diff_parsed['added'])
    removed = count_satd(mod.diff_parsed['deleted'])
    return introduced - removed

def is_py(m):
    """Return ``True`` if the modified file is a ``.py`` file.

    The new path is preferred; the old path is used when the new one is
    empty or ``None``. If both are missing the result is ``False``.
    """
    path = m.new_path if m.new_path else m.old_path
    return (path or '').endswith('.py')

def quick_hunks_count(mod):
    """Count the runs of consecutive changed lines in ``mod.diff``.

    A changed line is any line starting with ``+`` or ``-``. Every maximal run
    of such lines counts once; any other line (context, hunk header, empty
    line) ends the current run.

    Args:
        mod: Object with a ``diff`` string attribute.

    Returns:
        The number of runs found, ``0`` for an empty diff.
    """
    runs = 0
    inside_run = False
    for text in mod.diff.splitlines():
        changed = text.startswith(('+', '-'))
        if changed and not inside_run:
            runs += 1
        inside_run = changed
    return runs

#def diff_snippet(txt,max_lines=3000):
#    return '\n'.join(txt.split('\n')[:max_lines])

#def summary(r):
#    return (f"adds {r['lines_added']} LOC ({r['lines_deleted']} del) across {r['files_changed']} py‑files; "
#            f"ΔCCmax {r['cc_delta_max']}; methods {r['n_methods_changed']}; "
#            f"commits90d {r['n_commits_file_past90d']}; authors tot {r['n_authors_till_now']}")

## Mining & Feature‑Engineering

In [None]:
import csv
import json
import collections
from pathlib import Path
from pydriller import Repository
from pydriller.metrics.process.commits_count import CommitsCount
from pydriller.metrics.process.contributors_experience import ContributorsExperience
from pydriller.metrics.process.history_complexity import HistoryComplexity

# LLM batch setup: whether request files are generated, and the per-file
# limits (number of requests, size in MB) used when splitting them into parts.
GENERATE_LLM_BATCH = True
MAX_REQ = 50000
MAX_MB = 180

# Column order of the per-repo feature CSV files.
COLS = [
    # commit identification
    'repo_id',
    'commit_hash',
    'commit_uid',
    'commit_date',
    # size of the change
    'lines_added',
    'lines_deleted',
    'files_changed',
    'hunks',
    'n_methods_changed',
    # complexity and churn
    'cc_delta_sum',
    'cc_delta_max',
    'complexity_current_sum',
    'churn_delta',
    'churn_cum',
    # contributors and activity
    'contributors_count',
    'contributors_cum',
    'n_authors_till_now',
    'n_commits_file_past90d',
    # PyDriller process metrics
    'commits_count_file',
    'contributors_experience',
    'history_complexity',
    # delta maintainability model values of the commit
    'dmm_unit_complexity',
    'dmm_unit_size',
    'dmm_unit_interfacing',
    # SATD delta and the label derived from it
    'satd_delta',
    'label_td_satd',
]

# Hilfsfunktionen
def prepare_llm_prompt(row, diff_text):
    """Build one JSONL request line asking the LLM whether a commit adds technical debt.

    Args:
        row: Mapping with the commit's feature values; the keys read here are
            ``commit_uid`` and the metric columns listed in the summary below.
        diff_text: Diff text that is embedded verbatim into the prompt.

    Returns:
        A JSON object serialised on a single line and terminated by ``\\n``.
        It carries ``custom_id`` (the commit uid), ``method``, ``url`` and a
        chat-completion ``body`` that uses ``LLM_MODEL``, ``max_tokens=1`` and
        ``temperature=0``.
    """
    # Metric summary shown to the model, joined with " | " in this order.
    summary_fields = [
        f"Lines Added: {row['lines_added']}",
        f"Lines Deleted: {row['lines_deleted']}",
        f"Files Changed: {row['files_changed']}",
        f"Hunks: {row['hunks']}",
        f"Methods Changed: {row['n_methods_changed']}",
        f"Complexity Δ (Sum/Max): {row['cc_delta_sum']}/{row['cc_delta_max']}",
        f"Churn Δ: {row['churn_delta']}",
        f"Churn Cumulative: {row['churn_cum']}",
        f"Contributors (this commit): {row['contributors_count']}",
        f"Commits (past 90d): {row['n_commits_file_past90d']}",
        f"Contributors (cumulative): {row['contributors_cum']}",
        f"DMM Complexity: {row['dmm_unit_complexity']}",
    ]
    summary = " | ".join(summary_fields)

    prompt = "".join([
        "You are a senior reviewer.\n\n",
        "Commit Summary:\n",
        f"{summary}\n\n",
        "DIFF:\n",
        f"{diff_text}\n\n",
        "Question: Does this commit introduce technical debt? Answer yes or no.",
    ])

    request_body = {
        'model': LLM_MODEL,
        'messages': [{'role': 'user', 'content': prompt}],
        'max_tokens': 1,
        'temperature': 0,
    }
    record = {
        'custom_id': row['commit_uid'],
        'method': 'POST',
        'url': '/v1/chat/completions',
        'body': request_body,
    }

    # One record per line (JSONL); non-ASCII characters are kept as-is.
    return json.dumps(record, ensure_ascii=False) + '\n'


# Processing
def diff_snippet(txt, max_lines=3000):
    """Return at most the first ``max_lines`` lines of the diff text ``txt``.

    Defined here because the mining loop below calls ``diff_snippet`` while the
    earlier definition in this file is commented out, which would otherwise
    raise ``NameError`` for the first commit sent to the LLM batch.
    """
    return '\n'.join(txt.split('\n')[:max_lines])


# Mine every configured repository. For each commit that touches Python files
# one feature row is written to OUT/features_<repo>.csv and, if enabled, one
# LLM batch request is written for commits without a positive SATD delta.
for repo, cfg in tqdm(REPOS.items(), desc="Repos"):
    path = (BASE_DIR / cfg['path']).expanduser().resolve()
    if not path.is_dir():
        raise FileNotFoundError(f"{path} existiert nicht – REPOS-Eintrag prüfen!")

    # Running per-file state, keyed by file path and reset for every repo.
    file_prev_cc = collections.defaultdict(int)       # complexity at the file's previous commit
    file_current_cc = collections.defaultdict(int)    # complexity after the current commit
    file_churn_cum = collections.defaultdict(int)     # added + deleted lines so far
    file_authors = collections.defaultdict(set)       # author e-mails seen so far
    file_times = collections.defaultdict(list)        # author dates of the commits so far
    file_contributors = collections.defaultdict(set)  # author e-mails seen so far

    START_DATE = dt.datetime(1970, 1, 1, tzinfo=dt.timezone.utc)
    print(f"[{repo}] Berechne Prozessmetriken...")

    # NOTE(review): these metrics are computed once over the whole range up to
    # CUTOFF and then looked up for every commit, so the value attached to an
    # early commit also reflects later history - confirm this is intended.
    commits_count_dict = CommitsCount(str(path), since=START_DATE, to=CUTOFF).count()
    contrib_exp_dict = ContributorsExperience(str(path), since=START_DATE, to=CUTOFF).count()
    hist_complexity_dict = HistoryComplexity(str(path), since=START_DATE, to=CUTOFF).count()

    print(f"[{repo}] Prozessmetriken berechnet.")

    csv_file = OUT / f'features_{repo}.csv'

    # State of the LLM request file that is currently being written.
    part = 1
    cur_size = 0   # bytes written to the current part
    cur_reqs = 0   # requests written to the current part
    handle = None

    commits = Repository(
        path_to_repo=str(path),
        only_in_branch=cfg['branch'],
        to=CUTOFF,
        only_modifications_with_file_types=['.py'],
        # A single worker keeps the commits in traversal order. The cumulative
        # per-file state above (complexity deltas, churn, 90-day window) is
        # only meaningful when commits arrive in order; PyDriller does not
        # maintain the order with more than one worker.
        num_workers=1,
        skip_whitespaces=True,
        histogram_diff=True
    ).traverse_commits()

    try:
        # The context manager guarantees the CSV is flushed and closed before
        # later steps read it.
        with csv_file.open('w', newline='', encoding='utf-8') as csv_handle:
            writer = csv.DictWriter(csv_handle, fieldnames=COLS)
            writer.writeheader()

            for c in tqdm(commits, desc=repo, leave=False):
                py = [m for m in c.modified_files if is_py(m)]
                if not py:
                    continue
                # Path of every modified Python file, aligned with `py`.
                paths = [m.new_path or m.old_path for m in py]

                la = sum(m.added_lines for m in py)
                ld = sum(m.deleted_lines for m in py)
                files_changed = len(py)
                hunks = sum(quick_hunks_count(m) for m in py)
                n_methods = sum(len(m.changed_methods) for m in py)

                cc_delta_sum = 0
                cc_delta_max = 0
                churn_delta = 0
                complexity_current_sum = 0
                churn_cum_sum = 0
                contributors_in_commit = set()

                for m, fp in zip(py, paths):
                    # Complexity change relative to the last value seen for this path.
                    delta_cc = (m.complexity or 0) - file_prev_cc[fp]
                    cc_delta_sum += delta_cc
                    cc_delta_max = max(cc_delta_max, delta_cc)
                    file_prev_cc[fp] = m.complexity or 0
                    file_current_cc[fp] = m.complexity or 0
                    complexity_current_sum += file_current_cc[fp]
                    churn_this = m.added_lines + m.deleted_lines
                    churn_delta += churn_this
                    file_churn_cum[fp] += churn_this
                    churn_cum_sum += file_churn_cum[fp]
                    file_contributors[fp].add(c.author.email)
                    contributors_in_commit.update(file_contributors[fp])
                    file_authors[fp].add(c.author.email)
                    file_times[fp].append(c.author_date)

                # Commits to the touched files within the 90 days before this
                # commit (the commit itself included). Only the touched paths
                # are inspected instead of every file seen so far.
                cutoff90 = c.author_date - dt.timedelta(days=90)
                n_commits90 = sum(
                    sum(1 for t in file_times[fp] if t >= cutoff90)
                    for fp in set(paths)
                )

                commits_count_file = sum(commits_count_dict.get(fp, 0) for fp in paths)
                contributors_experience = sum(contrib_exp_dict.get(fp, 0) for fp in paths)
                history_complexity = sum(hist_complexity_dict.get(fp, 0) for fp in paths)
                satd = sum(satd_delta(m) for m in py)
                label_td_satd = 1 if satd > 0 else 0

                row_dict = {
                    'repo_id': repo,
                    'commit_hash': c.hash,
                    'commit_uid': f'{repo}#{c.hash}',
                    'commit_date': c.author_date.isoformat(),
                    'lines_added': la,
                    'lines_deleted': ld,
                    'files_changed': files_changed,
                    'hunks': hunks,
                    'n_methods_changed': n_methods,
                    'cc_delta_sum': cc_delta_sum,
                    'cc_delta_max': cc_delta_max,
                    'complexity_current_sum': complexity_current_sum,
                    'churn_delta': churn_delta,
                    'churn_cum': churn_cum_sum,
                    'contributors_count': len(contributors_in_commit),
                    'contributors_cum': sum(len(file_contributors[fp]) for fp in paths),
                    'n_authors_till_now': len({a for s in file_authors.values() for a in s}),
                    'n_commits_file_past90d': n_commits90,
                    'commits_count_file': commits_count_file,
                    'contributors_experience': contributors_experience,
                    'history_complexity': history_complexity,
                    'dmm_unit_complexity': c.dmm_unit_complexity,
                    'dmm_unit_size': c.dmm_unit_size,
                    'dmm_unit_interfacing': c.dmm_unit_interfacing,
                    'satd_delta': satd,
                    'label_td_satd': label_td_satd
                }

                writer.writerow(row_dict)

                if GENERATE_LLM_BATCH and row_dict['satd_delta'] <= 0:
                    diff_text = '\n'.join([diff_snippet(m.diff) for m in py])
                    jsonl_line = prepare_llm_prompt(row_dict, diff_text)
                    line_bytes = len(jsonl_line.encode('utf-8'))

                    # Start a new part when this line would exceed the size
                    # limit or the current part already holds MAX_REQ requests.
                    # The request count is tracked explicitly; the file
                    # position is a byte offset, not a number of requests.
                    if handle is not None and (
                        cur_size + line_bytes > MAX_MB * 1_000_000
                        or cur_reqs >= MAX_REQ
                    ):
                        handle.close()
                        handle = None
                        part += 1

                    if handle is None:
                        f = LLM_DIR / f"{repo}_part{part}.jsonl"
                        handle = f.open('w', encoding='utf-8')
                        cur_size = 0
                        cur_reqs = 0

                    handle.write(jsonl_line)
                    cur_size += line_bytes
                    cur_reqs += 1
    finally:
        # Close the request file even if mining aborts half-way.
        if handle is not None:
            handle.close()

    print(f"[{repo}] abgeschlossen.")

print("Alle Repos verarbeitet.")


## Splits

In [None]:
import pandas as pd
from pathlib import Path

# Merge the per-repo feature CSVs. Sorting the file list makes the row order
# of the merged frame (and of the LOPO files) independent of directory order.
csv_files = sorted(OUT.glob('features_*.csv'))
df = pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True)

# Build the splits
df['commit_dt'] = pd.to_datetime(df.commit_date, utc=True)
train_idx = []
test_idx = []

# Time-based split per project (aggregated afterwards): within each repo the
# chronologically first 70% of the commits go to train, the rest to test.
for repo, g in df.groupby('repo_id'):
    g = g.sort_values('commit_dt')
    n = int(0.7 * len(g))
    train_idx += list(g.index[:n])  # first 70%
    test_idx += list(g.index[n:])  # last 30%

# Save. Letting pandas write the file directly avoids building the whole CSV
# as one string and passing it through locale-dependent encoding and newline
# translation in write_text().
df.loc[train_idx].to_csv(SPLIT / 'time_train.csv', index=False)
df.loc[test_idx].to_csv(SPLIT / 'time_test.csv', index=False)

# Leave-one-project-out (LOPO) splits: train on all repos but one, test on it.
for repo in df.repo_id.unique():
    df[df.repo_id != repo].to_csv(SPLIT / f'lopo_train_excl_{repo}.csv', index=False)
    df[df.repo_id == repo].to_csv(SPLIT / f'lopo_test_{repo}.csv', index=False)

print('Globale Splits erstellt.')


## Erste Iteration

In [None]:
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
import shap
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
from pathlib import Path
from tqdm import tqdm

# Load the time-based train/test split from the splits folder
SPLIT = Path("splits")
train_df, test_df = (
    pd.read_csv(SPLIT / 'time_train.csv'),
    pd.read_csv(SPLIT / 'time_test.csv'),
)

# Funktionen
def prepare_xy(df):
    """Split a feature frame into model inputs and the SATD target.

    Args:
        df: DataFrame holding all columns listed in ``non_features``.

    Returns:
        ``(X, y)`` where ``X`` is ``df`` without the identifier columns, the
        raw SATD delta and the label, and ``y`` is the ``label_td_satd`` column.

    Raises:
        KeyError: If one of the columns to drop is missing.
    """
    non_features = [
        # identifiers and timestamps, not used as features
        'repo_id', 'commit_hash', 'commit_uid', 'commit_date', 'commit_dt',
        # the label is derived directly from this value
        'satd_delta',
        # the target itself
        'label_td_satd',
    ]
    features = df.drop(columns=non_features)
    return features, df['label_td_satd']

def evaluate_model(name, model, X_test, y_test):
    """Print the standard evaluation of a fitted classifier and return its predictions.

    Prints the classification report, the confusion matrix and the ROC AUC
    (based on the second column of ``predict_proba``).

    Args:
        name: Label used in the printed headings.
        model: Fitted estimator offering ``predict`` and ``predict_proba``.
        X_test: Test features.
        y_test: True test labels.

    Returns:
        ``(y_pred, y_proba)``: the predicted labels and the values of the
        second ``predict_proba`` column.
    """
    predictions = model.predict(X_test)
    positive_proba = model.predict_proba(X_test)[:, 1]

    print(f"\n{name} – Klassifikationsbericht:")
    print(classification_report(y_test, predictions, digits=3))

    print(f"Confusion Matrix ({name}):")
    print(confusion_matrix(y_test, predictions))

    auc = roc_auc_score(y_test, positive_proba)
    print(f"ROC AUC ({name}): {auc:.4f}")

    return predictions, positive_proba

# Prepare features and target for both parts of the time split
X_train, y_train = prepare_xy(train_df)
X_test, y_test = prepare_xy(test_df)

MODELS_DIR = Path("trained_models")
MODELS_DIR.mkdir(exist_ok=True)

# The three tree ensembles share the number of estimators and the seed.
rf = RandomForestClassifier(n_estimators=250, random_state=42, n_jobs=-1)
lgbm = lgb.LGBMClassifier(n_estimators=250, random_state=42, n_jobs=-1)
xgbm = xgb.XGBClassifier(n_estimators=250, random_state=42, n_jobs=-1, use_label_encoder=False)

# Fit, persist and evaluate one model after the other.
for title, estimator, filename in (
    ("Random Forest", rf, "rf_model.joblib"),
    ("LightGBM", lgbm, "lgbm_model.joblib"),
    ("XGBoost", xgbm, "xgb_model.joblib"),
):
    estimator.fit(X_train, y_train)
    joblib.dump(estimator, MODELS_DIR / filename)
    evaluate_model(title, estimator, X_test, y_test)

print("Alle Modelle trainiert und gespeichert.")


In [None]:
# LOPO-Demonstration für alle Repos und Modelle

from pathlib import Path
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Folder containing the splits
SPLIT = Path("splits")


def lopo_repo_names(split_dir):
    """Return the sorted repo names that have a first-iteration LOPO test file.

    The names are taken from files called ``lopo_test_<repo>.csv``. Files of
    later iterations (``lopo_test_<repo>_iteration-2.csv``) live in the same
    folder and match the same glob; they are skipped so that they are not
    mistaken for additional repos.

    Args:
        split_dir: ``pathlib.Path`` of the folder to scan.

    Returns:
        Sorted list of repo names; empty if nothing matches.
    """
    names = []
    for p in split_dir.glob("lopo_test_*.csv"):
        name = p.name.replace("lopo_test_", "").replace(".csv", "")
        if "_iteration-" in name:
            continue
        names.append(name)
    return sorted(names)


# Derive the repos from the LOPO split files:
repos = lopo_repo_names(SPLIT)

# Funktion zur Aufbereitung
def prepare_xy(df):
    """Return ``(X, y)`` for a split frame: feature columns and the SATD label.

    ``X`` is ``df`` without identifier/time columns, ``satd_delta`` and the
    label; ``y`` is the ``label_td_satd`` column. A missing column raises
    ``KeyError``.
    """
    excluded = (
        'repo_id', 'commit_hash', 'commit_uid', 'commit_date', 'commit_dt',
        'satd_delta', 'label_td_satd',
    )
    features = df.drop(columns=list(excluded))
    target = df['label_td_satd']
    return features, target

# Funktion zur Evaluation
def evaluate_model(name, model, X_test, y_test):
    """Print classification report, confusion matrix and ROC AUC for one LOPO run.

    Args:
        name: Label used in the printed headings.
        model: Fitted estimator offering ``predict`` and ``predict_proba``.
        X_test: Features of the held-out project.
        y_test: True labels of the held-out project.
    """
    predictions = model.predict(X_test)
    # Second predict_proba column, used as the score for the ROC AUC.
    positive_proba = model.predict_proba(X_test)[:, 1]

    print(f"\n{name} – LOPO-Ergebnisse:")
    print(classification_report(y_test, predictions, digits=3))
    print(f"Confusion Matrix ({name}):")
    print(confusion_matrix(y_test, predictions))
    auc = roc_auc_score(y_test, positive_proba)
    print(f"ROC AUC ({name}): {auc:.4f}")

# LOPO across all projects: train on every other repo, test on the held-out one
for repo in repos:
    print(f"\n=== LOPO: {repo} excluded ===")

    # Load and prepare the split of this fold
    train_df = pd.read_csv(SPLIT / f'lopo_train_excl_{repo}.csv')
    test_df = pd.read_csv(SPLIT / f'lopo_test_{repo}.csv')
    X_train, y_train = prepare_xy(train_df)
    X_test, y_test = prepare_xy(test_df)

    # Fresh models for every fold; the names stay bound after the loop.
    rf = RandomForestClassifier(n_estimators=250, random_state=42, n_jobs=-1)
    lgbm = lgb.LGBMClassifier(n_estimators=250, random_state=42, n_jobs=-1)
    xgbm = xgb.XGBClassifier(n_estimators=250, random_state=42, n_jobs=-1, use_label_encoder=False)

    # Train and evaluate them one after the other
    for title, estimator in (
        (f"Random Forest (LOPO: {repo})", rf),
        (f"LightGBM (LOPO: {repo})", lgbm),
        (f"XGBoost (LOPO: {repo})", xgbm),
    ):
        estimator.fit(X_train, y_train)
        evaluate_model(title, estimator, X_test, y_test)


In [None]:
import shap
import matplotlib.pyplot as plt

# Prepare a TreeExplainer per model (tree models: RF, LightGBM, XGBoost).
# NOTE(review): rf / lgbm / xgbm and X_test are whatever the previously run
# cell left behind; in file order that is the last LOPO fold - confirm that
# this is the model/test set the plots should describe.
explainer_rf = shap.TreeExplainer(rf)
explainer_lgbm = shap.TreeExplainer(lgbm)
explainer_xgbm = shap.TreeExplainer(xgbm)

# Compute the SHAP values for the test set
shap_values_rf = explainer_rf.shap_values(X_test)
shap_values_lgbm = explainer_lgbm.shap_values(X_test)
shap_values_xgbm = explainer_xgbm.shap_values(X_test)

# One summary plot per model, each saved as a PNG in the working directory
for values, title, filename in (
    (shap_values_rf, "SHAP Summary – Random Forest", "shap_rf_summary.png"),
    (shap_values_lgbm, "SHAP Summary – LightGBM", "shap_lgbm_summary.png"),
    (shap_values_xgbm, "SHAP Summary – XGBoost", "shap_xgbm_summary.png"),
):
    shap.summary_plot(values, X_test, show=False)
    plt.title(title)
    plt.savefig(filename, bbox_inches='tight')
    plt.close()

print("SHAP-Analyse abgeschlossen. Plots als PNG gespeichert.")


## LLM-as-Judge OpenAI GPT-API Batching (Upload & Batch Start)

In [None]:
import os
import json
from pathlib import Path
from tqdm import tqdm
from openai import OpenAI

# Load the OpenAI API key from the environment and fail early with a clear
# message if it is missing (a raise is not stripped under `python -O`).
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("OPENAI_API_KEY Umgebungsvariable ist nicht gesetzt.")
client = OpenAI(api_key=api_key)

LLM_DIR = Path("llm_batch")
BATCH_INFO_FILE = LLM_DIR / "batch_metadata.json"

batch_metadata = {}

# Only request files are submitted. The status/download step stores its
# results as "<name>_output.jsonl" / "<name>_errors.jsonl" in the same folder;
# those also match "*.jsonl" and must not be uploaded as new batches.
jsonl_files = sorted(
    p for p in LLM_DIR.glob("*.jsonl")
    if not p.name.endswith(("_output.jsonl", "_errors.jsonl"))
)

for jsonl_file in tqdm(jsonl_files, desc="Batch uploads + creations"):
    # Upload the request file
    with open(jsonl_file, "rb") as f:
        batch_input_file = client.files.create(
            file=f,
            purpose="batch"
        )

    file_id = batch_input_file.id
    print(f"File {jsonl_file.name} uploaded: {file_id}")

    # Start the batch
    batch = client.batches.create(
        input_file_id=file_id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={"description": f"TD Detection Batch for {jsonl_file.name}"}
    )

    batch_id = batch.id
    print(f"Batch gestartet für {jsonl_file.name}: {batch_id}")

    # Remember the ids of this batch
    batch_metadata[jsonl_file.name] = {
        "file_id": file_id,
        "batch_id": batch_id,
        "status": "submitted"
    }

    # Persist after every batch so the ids of batches that were already
    # started survive a failure in a later upload.
    with open(BATCH_INFO_FILE, "w", encoding="utf-8") as f:
        json.dump(batch_metadata, f, indent=2)

# Final write; also creates the (empty) metadata file when nothing was submitted
with open(BATCH_INFO_FILE, "w", encoding="utf-8") as f:
    json.dump(batch_metadata, f, indent=2)

print(f"Alle Batches gestartet. Metadaten gespeichert in {BATCH_INFO_FILE}")


## LLM-as-Judge OpenAI GPT-API Batching (Status & ggf. Download)

In [None]:
import os
import json
from pathlib import Path
from tqdm import tqdm
from openai import OpenAI

# Create the client with the API key from the environment
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Folders and files
LLM_DIR = Path("llm_batch")
BATCH_INFO_FILE = LLM_DIR / "batch_metadata.json"
BATCH_STATUS_FILE = LLM_DIR / "batch_status.json"

# Load the metadata written when the batches were started
with open(BATCH_INFO_FILE, "r", encoding="utf-8") as f:
    batch_metadata = json.load(f)

# Query the status of every known batch
batch_status_results = {}

for name, meta in tqdm(batch_metadata.items(), desc="Batch Status Check"):
    # Fetch the current batch state from the API
    batch = client.batches.retrieve(meta["batch_id"])
    status_info = {
        "status": batch.status,
        "input_file_id": batch.input_file_id,
        "output_file_id": batch.output_file_id,
        "error_file_id": batch.error_file_id,
        "request_counts": batch.request_counts
    }

    # Download the output and error file, whichever of them exists
    downloads = (
        (batch.output_file_id, LLM_DIR / f"{name}_output.jsonl"),
        (batch.error_file_id, LLM_DIR / f"{name}_errors.jsonl"),
    )
    for remote_id, local_path in downloads:
        if not remote_id:
            continue
        with open(local_path, "wb") as target:
            target.write(client.files.content(remote_id).read())

    batch_status_results[name] = status_info

# Ergebnisse sichern
def safe_json(obj):
    """Return ``obj`` unchanged if ``json.dumps`` accepts it, else ``str(obj)``.

    Only ``TypeError`` from ``json.dumps`` triggers the string fallback; any
    other exception propagates.
    """
    try:
        json.dumps(obj)
    except TypeError:
        return str(obj)
    else:
        return obj

with open(BATCH_STATUS_FILE, "w", encoding="utf-8") as f:
    # Values that json cannot serialise are stored as their string form.
    serialisable = {
        batch_name: {field: safe_json(value) for field, value in info.items()}
        for batch_name, info in batch_status_results.items()
    }
    json.dump(serialisable, f, indent=2)

print(f"Batch-Status und Ergebnisse gespeichert in {BATCH_STATUS_FILE}")


## Zweite Iteration

In [None]:
import pandas as pd
import json
from pathlib import Path
from tqdm import tqdm

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import lightgbm as lgb
import xgboost as xgb
import shap
import matplotlib.pyplot as plt
import joblib

# === Paths and setup ===
LLM_DIR = Path("llm_batch")
OUT = Path("data")
SPLIT = Path("splits")
SPLIT.mkdir(exist_ok=True)

# === 1. Read the LLM judgements ===
# Maps commit uid -> 1 ("yes") / 0 ("no"); any other answer is ignored.
llm_labels = {}

jsonl_files = sorted(LLM_DIR.glob("*_output.jsonl"))

for file in tqdm(jsonl_files, desc="LLM Judgements einlesen"):
    with open(file, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines
            data = json.loads(line)
            commit_uid = data.get("custom_id")
            # Every level may be present but null/empty (e.g. a failed
            # request); `.get(key, default)` does not cover that case, so
            # fall back explicitly instead of crashing on None.
            body = (data.get("response") or {}).get("body") or {}
            choices = body.get("choices") or [{}]
            message = choices[0].get("message") or {}
            content = (message.get("content") or "").strip().lower()
            if content in ("yes", "no"):
                llm_labels[commit_uid] = 1 if content == "yes" else 0

print(f"LLM labels geladen: {len(llm_labels)}")

# === 2. Add the LLM labels to the feature data and rebuild the splits ===

# Sorted so that the row order of the merged frame is reproducible.
csv_files = sorted(OUT.glob('features_*.csv'))
dfs = []

for file in tqdm(csv_files, desc="CSV-Dateien verarbeiten"):
    df = pd.read_csv(file)
    # Commits without a parsed LLM answer get label_llm = 0.
    df["label_llm"] = df["commit_uid"].map(llm_labels).fillna(0).astype(int)
    # Combined label: positive if either the SATD label or the LLM says so.
    df["label_td_combined"] = ((df["label_td_satd"] == 1) | (df["label_llm"] == 1)).astype(int)
    dfs.append(df)

df_all = pd.concat(dfs, ignore_index=True)

# Build the splits: per repo, the chronologically first 70% go to train
df_all['commit_dt'] = pd.to_datetime(df_all.commit_date, utc=True)
train_idx, test_idx = [], []

for repo, g in df_all.groupby('repo_id'):
    g = g.sort_values('commit_dt')
    n = int(0.7 * len(g))
    train_idx += list(g.index[:n])
    test_idx += list(g.index[n:])

# pandas writes the files directly; this avoids building each CSV as one
# string and the locale-dependent encoding/newline handling of write_text().
df_all.loc[train_idx].to_csv(SPLIT / 'time_train_iteration-2.csv', index=False)
df_all.loc[test_idx].to_csv(SPLIT / 'time_test_iteration-2.csv', index=False)

for repo in df_all.repo_id.unique():
    df_all[df_all.repo_id != repo].to_csv(SPLIT / f'lopo_train_excl_{repo}_iteration-2.csv', index=False)
    df_all[df_all.repo_id == repo].to_csv(SPLIT / f'lopo_test_{repo}_iteration-2.csv', index=False)

print("Neue Splits erstellt (Iteration 2)")

# === 3. Training, Test, LOPO, SHAP wie Iteration 1 ===

# Hilfsfunktionen
def prepare_xy(df):
    """Split an iteration-2 frame into features and the combined target.

    Args:
        df: DataFrame holding all columns listed in ``non_features``.

    Returns:
        ``(X, y)`` where ``y`` is the ``label_td_combined`` column and ``X`` is
        ``df`` without identifiers, ``satd_delta`` and all three label columns.

    Raises:
        KeyError: If one of the columns to drop is missing.
    """
    non_features = [
        'repo_id', 'commit_hash', 'commit_uid', 'commit_date', 'commit_dt',
        'satd_delta', 'label_td_satd', 'label_llm',
        # The target must not remain among the features; otherwise the models
        # are trained and evaluated with the label itself as an input.
        'label_td_combined',
    ]
    X = df.drop(columns=non_features)
    y = df['label_td_combined']
    return X, y

def evaluate_model(name, model, X_test, y_test):
    """Print classification report, confusion matrix and ROC AUC of a fitted model.

    Args:
        name: Label used in the printed headings.
        model: Fitted estimator offering ``predict`` and ``predict_proba``.
        X_test: Test features.
        y_test: True test labels.
    """
    predictions = model.predict(X_test)
    # Second predict_proba column, used as the score for the ROC AUC.
    positive_proba = model.predict_proba(X_test)[:, 1]
    print(f"\n{name} – Evaluation:")
    print(classification_report(y_test, predictions, digits=3))
    print(f"Confusion Matrix ({name}):\n{confusion_matrix(y_test, predictions)}")
    auc = roc_auc_score(y_test, positive_proba)
    print(f"ROC AUC ({name}): {auc:.4f}")

# Load the iteration-2 time split and prepare features/target
train_df = pd.read_csv(SPLIT / 'time_train_iteration-2.csv')
test_df = pd.read_csv(SPLIT / 'time_test_iteration-2.csv')

X_train, y_train = prepare_xy(train_df)
X_test, y_test = prepare_xy(test_df)

# Models: same configuration for all three tree ensembles
rf = RandomForestClassifier(n_estimators=250, random_state=42, n_jobs=-1)
lgbm = lgb.LGBMClassifier(n_estimators=250, random_state=42, n_jobs=-1)
xgbm = xgb.XGBClassifier(n_estimators=250, random_state=42, n_jobs=-1, use_label_encoder=False)

# Fit and evaluate one model after the other
for title, estimator in (
    ("Random Forest (Iteration 2)", rf),
    ("LightGBM (Iteration 2)", lgbm),
    ("XGBoost (Iteration 2)", xgbm),
):
    estimator.fit(X_train, y_train)
    evaluate_model(title, estimator, X_test, y_test)

# Persist the fitted models in the working directory
for estimator, filename in (
    (rf, "trained_rf_iteration2.joblib"),
    (lgbm, "trained_lgbm_iteration2.joblib"),
    (xgbm, "trained_xgbm_iteration2.joblib"),
):
    joblib.dump(estimator, filename)

# LOPO: the existing estimator objects are re-fitted for every held-out repo
repos = df_all.repo_id.unique()

for repo in repos:
    print(f"\n=== LOPO: {repo} excluded ===")
    train_df = pd.read_csv(SPLIT / f'lopo_train_excl_{repo}_iteration-2.csv')
    test_df = pd.read_csv(SPLIT / f'lopo_test_{repo}_iteration-2.csv')
    X_train, y_train = prepare_xy(train_df)
    X_test, y_test = prepare_xy(test_df)

    for title, estimator in (
        (f"Random Forest (LOPO {repo})", rf),
        (f"LightGBM (LOPO {repo})", lgbm),
        (f"XGBoost (LOPO {repo})", xgbm),
    ):
        estimator.fit(X_train, y_train)
        evaluate_model(title, estimator, X_test, y_test)

# SHAP summary for LightGBM.
# NOTE(review): at this point lgbm and X_test come from the last LOPO fold of
# the loop above, not from the time split - confirm that this is intended.
explainer = shap.TreeExplainer(lgbm)
shap_values = explainer.shap_values(X_test)

shap.summary_plot(shap_values, X_test, show=False)
plot_title = "SHAP Summary – LightGBM – Iteration 2"
plot_file = "shap_lgbm_iteration2.png"
plt.title(plot_title)
plt.savefig(plot_file, bbox_inches='tight')
plt.close()

print("Iteration 2 abgeschlossen.")
