# Klasyfikacja – szczegółowe raporty czasów, metryk i macierzy konfuzji
Notatnik wypisuje – **dla każdego modelu i każdego katalogu** – czas trenowania, pełny `classification_report` oraz rysuje macierz konfuzji.

Lista modeli została odchudzona o te, które wcześniej powodowały błędy.

In [None]:

# -------------------- 📦 Imports
import time, warnings
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import (classification_report, confusion_matrix,
                             ConfusionMatrixDisplay)

# Classifiers
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
import xgboost as xgb

class _EstimatorTags(dict):
    """Dict-backed, minimal stand-in for scikit-learn's estimator tags.

    Tags stored as keys can also be read as attributes. A tag that was never
    set reads as ``None`` instead of raising ``AttributeError``.
    """

    def __getattr__(self, tag_name):
        # Python only calls __getattr__ after normal attribute lookup has
        # failed, so dict's own methods and attributes are unaffected.
        try:
            return self[tag_name]
        except KeyError:
            return None

class XGBClassifierPatched(xgb.XGBClassifier):
    """XGBClassifier with the ``__sklearn_tags__`` hook supplied.

    Intended for xgboost builds that lack ``__sklearn_tags__`` when used with
    newer scikit-learn releases (the original note says scikit-learn >= 1.4).

    The class is defined exactly once. A second definition with the same name
    would silently replace this one, and a variant returning a plain ``{}``
    cannot answer attribute-style reads such as ``tags.requires_fit``.
    NOTE(review): the returned object only mimics the real tags structure;
    confirm it is sufficient for the installed scikit-learn version.
    """

    def __sklearn_tags__(self):
        # More flags can be added here; ``requires_fit`` is the one the
        # original author found sufficient. Any other tag reads as ``None``.
        return _EstimatorTags({'requires_fit': True})

# Single seed reused wherever a random_state is needed, so runs are repeatable.
RANDOM_STATE = 42

# Suppress every warning category for the rest of the session to keep the
# printed reports readable.
warnings.simplefilter('ignore')


In [None]:

# -------------------- 🗂 Paths
def _label_from_path(dir_path):
    """Return the run label encoded in a data directory name.

    The directory name is expected to look like ``<index>.<label>``; the
    label is everything after the first dot. Working on the final path
    component (rather than fixed character offsets) keeps this correct for
    any root folder, any number of index digits, and with or without a
    trailing slash. A name without a dot is returned unchanged.
    """
    dir_name = Path(dir_path).name
    _, dot, label = dir_name.partition('.')
    return label if dot else dir_name


# Data directories to process; uncomment entries to include more runs.
file_paths2 = [
    #'testData/21.10_A+NA+AZA_30s_600ms_3/',
    'testData/22.130_A+NA+AZA_30s_600ms_3/',
    #'testData/23.60_A+NA+AZA_30s_200ms_3/',
    #'testData/24.60_A+NA+AZA_30s_1100ms_3/',
    #'testData/25.60_A+NA+AZA_5s_600ms_3/',
    #'testData/26.60_A+NA+AZA_50s_600ms_3/',
    #'testData/27.60_A+NA+AZA_30s_600ms_2/',
    #'testData/28.60_A+NA+AZA_30s_600ms_8/',
]
# One human-readable label per directory, in the same order as file_paths2.
labels2 = [_label_from_path(p) for p in file_paths2]


In [None]:

def load_dataset(csv_path: Path, target='user_class'):
    """Read one CSV file and split it into model inputs and encoded labels.

    Args:
        csv_path: Location of the CSV file to read.
        target: Name of the column to predict.

    Returns:
        A 5-tuple ``(X, y, numeric_cols, categorical_cols, classes)``:
        the feature frame, the integer-encoded target, the names of the
        numeric and categorical feature columns, and the original class
        labels as stored by the label encoder.
    """
    frame = pd.read_csv(csv_path)

    numeric_cols = [
        'application_time', 'database_time', 'api_time',
        'cpu_usage_db_test', 'cpu_usage_db', 'cpu_usage_web',
    ]
    # 'endpoint_url' is a feature unless it is itself the prediction target.
    if target == 'endpoint_url':
        categorical_cols = []
    else:
        categorical_cols = ['endpoint_url']

    features = frame[[*numeric_cols, *categorical_cols]].copy()

    # Target values are cast to strings first, then mapped to integer codes.
    label_text = frame[target].astype(str)
    encoder = LabelEncoder()
    encoder.fit(label_text)
    codes = encoder.transform(label_text)

    return features, codes, numeric_cols, categorical_cols, encoder.classes_


In [None]:

def make_pipelines(num_cols, cat_cols, n_classes):
    """Build one preprocessing + classifier pipeline per model.

    Args:
        num_cols: Names of numeric feature columns (standard-scaled).
        cat_cols: Names of categorical feature columns (one-hot encoded;
            categories unseen during fit are ignored at predict time).
        n_classes: Number of target classes, forwarded to XGBoost.

    Returns:
        Dict mapping a model name to a ``Pipeline`` with the steps
        ``'prep'`` (column preprocessing) and ``'clf'`` (the classifier).
    """
    def build_preprocessor():
        # A fresh transformer per pipeline: a single shared instance would be
        # refitted by every pipeline, so fitting one pipeline would overwrite
        # the preprocessing state used by all the others.
        return ColumnTransformer([
            ('num', StandardScaler(), num_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ])

    models = {
        'DecisionTree': DecisionTreeClassifier(max_depth=8, random_state=RANDOM_STATE),
        'ExtraTree': ExtraTreeClassifier(random_state=RANDOM_STATE),
        'RandomForest': RandomForestClassifier(n_estimators=500, max_depth=8,
                                               min_samples_split=8, max_features='sqrt',
                                               random_state=RANDOM_STATE, bootstrap=False),
        'ExtraTrees': ExtraTreesClassifier(n_estimators=800, max_depth=10,
                                           min_samples_split=4, max_features='sqrt',
                                           random_state=RANDOM_STATE, bootstrap=False),
        'GaussianNB': GaussianNB(),
        'BernoulliNB': BernoulliNB(),
        'KNN': KNeighborsClassifier(n_neighbors=7),
        'LDA': LinearDiscriminantAnalysis(),
        'MLP': MLPClassifier(hidden_layer_sizes=(120, 80, 40), early_stopping=True,
                             max_iter=500, random_state=RANDOM_STATE),
        'LinearSVC': LinearSVC(C=10, class_weight='balanced', penalty='l1',
                               loss='squared_hinge', dual=False, max_iter=10_000,
                               random_state=RANDOM_STATE),
        'XGB': XGBClassifierPatched(
            booster='gbtree', objective='multi:softmax',
            num_class=n_classes, eval_metric='mlogloss',
            max_depth=10, n_estimators=300, learning_rate=0.1,
            random_state=RANDOM_STATE),
    }
    return {
        name: Pipeline([('prep', build_preprocessor()), ('clf', model)])
        for name, model in models.items()
    }


In [None]:

# -------------------- 🚀 Training & detailed reports
# For every data directory: load the CSV, make a stratified 80/20 split, then
# fit each pipeline and print its training time and classification report and
# draw its confusion matrix.
for path, label in zip(file_paths2, labels2):
    csv_path = Path(path) / 'stockApp_merged_data.csv'
    print(f'\n{label}')
    X, y, num_cols, cat_cols, class_names = load_dataset(csv_path)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=RANDOM_STATE)
    pipes = make_pipelines(num_cols, cat_cols, len(class_names))

    # LabelEncoder codes are 0..n-1 in class_names order. Passing them
    # explicitly keeps the report rows and matrix axes aligned with
    # class_names even if a class is absent from both y_test and y_pred.
    class_ids = np.arange(len(class_names))

    for name, pipe in pipes.items():
        # One failing model must not abort the remaining ones, so any error
        # is reported and the loop moves on.
        try:
            # perf_counter is monotonic and high-resolution; time.time() can
            # jump when the system clock is adjusted.
            start = time.perf_counter()
            pipe.fit(X_train, y_train)
            elapsed = time.perf_counter() - start
            y_pred = pipe.predict(X_test)
            print(f'{name} time: {elapsed}')
            print(classification_report(y_test, y_pred, labels=class_ids,
                                        target_names=class_names))
            cm = confusion_matrix(y_test, y_pred, labels=class_ids)
            ConfusionMatrixDisplay(cm, display_labels=class_names).plot(
                cmap='Blues', xticks_rotation=45)
            plt.title(f'Confusion Matrix | {name}')
            plt.show()
        except Exception as e:
            print(f'⚠️ {name} failed: {e}')
