# ML regresja i klasyfikacja na Apple Silicon (M1)

Ten notatnik odtwarza pipeline z Twojego projektu, ale działa **natywnie** na macOS arm64. Zawiera zarówno regresję na XGBoost, jak i klasyfikację wieloklasową (RandomForest, ExtraTrees, DecisionTree, KNN, LinearSVC, MLP, XGB).


In [None]:

from __future__ import annotations
import itertools, time, sys, warnings, pathlib
from typing import List

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_squared_error, r2_score,
    classification_report, confusion_matrix
)
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# xgboost is an optional dependency: when it cannot be loaded the notebook
# keeps working and only the XGB regression / XGB classifier are skipped.
try:
    import xgboost as xgb
    _xgb_ok = True
except Exception as err:  # noqa: BLE001 - deliberate best-effort import
    # NOTE(review): a broken native install (e.g. a missing OpenMP runtime on
    # macOS) is reported by xgboost with its own error type, which is neither
    # ImportError nor RuntimeError, so the narrower tuple still crashed the
    # cell. The failure is not swallowed silently: it is surfaced below.
    _xgb_ok = False
    warnings.warn(
        f'Nie udało się załadować xgboost ({err}). '
        'Regresja i klasyfikacja XGB zostaną pominięte.', RuntimeWarning
    )


In [None]:
# --- Dataset directories (carried over from the previous notebook) ---
# Only the uncommented entries are active; the commented ones are kept so a
# dataset can be switched on again by removing the leading '#'.
file_paths = [
    #'testData/1.10_A_30s_600ms_3/',
    #'testData/2.50_A_30s_600ms_3/',
    #'testData/3.90_A_30s_600ms_3/',
    'testData/4.130_A_30s_600ms_3/',
    #'testData/5.60_A_30s_200ms_3/',
    #'testData/6.60_A_30s_500ms_3/',
    #'testData/7.60_A_30s_800ms_3/',
    #'testData/8.60_A_30s_1100ms_3/',
    #'testData/9.60_A_5s_600ms_3/',
    #'testData/10.60_A_20s_600ms_3/',
    #'testData/11.60_A_35s_600ms_3/',
    #'testData/12.60_A_50s_600ms_3/',
    'testData/13.60_A_30s_600ms_2/',
    #'testData/14.60_A_30s_600ms_4/',
    #'testData/15.60_A_30s_600ms_6/',
    #'testData/16.60_A_30s_600ms_8/',
    #'testData/17.60_A_30s_600ms_3/',
    'testData/18.60_A+NA+AZA_30s_600ms_3/',
    #'testData/19.60_A+NA_30s_600ms_3/',
    #'testData/20.60_A+AZA_30s_600ms_3/',
]
file_paths2 = [
    'testData/21.10_A+NA+AZA_30s_600ms_3/',
    'testData/22.130_A+NA+AZA_30s_600ms_3/',
    #'testData/23.60_A+NA+AZA_30s_200ms_3/',
    #'testData/24.60_A+NA+AZA_30s_1100ms_3/',
    #'testData/25.60_A+NA+AZA_5s_600ms_3/',
    #'testData/26.60_A+NA+AZA_50s_600ms_3/',
    #'testData/27.60_A+NA+AZA_30s_600ms_2/',
    #'testData/28.60_A+NA+AZA_30s_600ms_8/',
]


def _label_from_dir(dir_path: str) -> str:
    """Return the dataset label: the directory name without its ``N.`` prefix.

    The label is everything after the first dot of the last path component,
    e.g. ``'testData/13.60_A_30s_600ms_2/'`` -> ``'60_A_30s_600ms_2'``.
    A directory name that contains no dot is returned unchanged.

    Splitting on the path component and the dot (instead of fixed character
    offsets) keeps this correct for any root directory and any index width.
    """
    dir_name = pathlib.PurePath(dir_path).name  # trailing '/' is ignored
    _, dot, label = dir_name.partition('.')
    return label if dot else dir_name


labels = [_label_from_dir(path) for path in file_paths]
labels2 = [_label_from_dir(path) for path in file_paths2]


# Aliases for compatibility with the old notebook. They forward at call time
# because xgb_tree / multi_class are defined in a later cell; looking them up
# here eagerly would bind the aliases to None on a top-to-bottom run.
def xgbTree(*args, **kwargs):
    """Backward-compatible alias that forwards to ``xgb_tree``."""
    return xgb_tree(*args, **kwargs)


def multiClass(*args, **kwargs):
    """Backward-compatible alias that forwards to ``multi_class``."""
    return multi_class(*args, **kwargs)


In [None]:

# ---------------- Regresja -----------------
def xgb_tree(csv_path: pathlib.Path, target: str) -> None:
    """Train an XGBoost regressor on one CSV file and print its test metrics.

    Args:
        csv_path: CSV file to load; its parent directory name is used in the
            progress message.
        target: Name of the column to predict. Every remaining column (after
            the bookkeeping columns listed below are dropped) is a feature.

    Returns:
        None. MSE and R² on the held-out 20 % split are printed. When xgboost
        is unavailable a notice is printed and nothing is trained.
    """
    if not _xgb_ok:
        print('⚠️  xgboost niedostępny – pomijam regresję')
        return

    # Columns that are never used as regression features; any that are
    # missing from the file are ignored.
    excluded = [
        'user_class',
        'timestamp', 'endpoint_url', 'api_method',
        'application_time_trade', 'database_time_trade',
        'number_of_sell_offers', 'number_of_buy_offers',
    ]
    frame = pd.read_csv(csv_path).drop(columns=excluded, errors='ignore')

    features = frame.drop(columns=[target])
    response = frame[target]
    feat_train, feat_test, y_tr, y_te = train_test_split(
        features, response, test_size=0.2, random_state=42
    )
    dtrain = xgb.DMatrix(feat_train, label=y_tr)
    dtest = xgb.DMatrix(feat_test, label=y_te)

    params = {
        'objective': 'reg:squarederror',
        'tree_method': 'hist',
        'device': 'cpu',
        'max_depth': 10,
        'learning_rate': 0.1,
    }
    print(f'Trenuję XGB regresor dla “{target}” w {csv_path.parent.name}')
    booster = xgb.train(params, dtrain, num_boost_round=100)
    preds = booster.predict(dtest)
    print(f'MSE={mean_squared_error(y_te, preds):.4f} | R²={r2_score(y_te, preds):.4f}')
# --------------- Klasyfikacja --------------
# Registry of the classifiers evaluated by multi_class, keyed by the display
# name used in reports and plot titles. Insertion order is the run order.
_CLASSIFIERS = {
    'RandomForest': RandomForestClassifier(
        n_estimators=500,
        criterion='gini',
        max_depth=8,
        min_samples_split=8,
        max_features='sqrt',
        bootstrap=False,
        random_state=42,
    ),
    'ExtraTrees': ExtraTreesClassifier(
        n_estimators=1000,
        criterion='gini',
        max_depth=10,
        min_samples_split=4,
        max_features='sqrt',
        bootstrap=False,
        random_state=42,
    ),
    'DecisionTree': DecisionTreeClassifier(
        criterion='gini',
        splitter='best',
        max_depth=8,
        min_samples_split=8,
        max_features='sqrt',
        random_state=42,
    ),
    'KNN': KNeighborsClassifier(
        n_neighbors=7,
        algorithm='auto',
        leaf_size=20,
    ),
    'LinearSVC': LinearSVC(
        penalty='l1',
        loss='squared_hinge',
        dual=False,
        C=10,
        multi_class='ovr',
        class_weight='balanced',
        max_iter=10000,
        random_state=42,
    ),
    'MLP': MLPClassifier(
        hidden_layer_sizes=(120, 80, 40),
        activation='relu',
        solver='adam',
        alpha=1e-4,
        batch_size=16,
        learning_rate='constant',
        max_iter=200,
        random_state=42,
    ),
}

# The XGBoost classifier is registered only when the library loaded.
if _xgb_ok:
    _CLASSIFIERS['XGB'] = xgb.XGBClassifier(
        booster='gbtree',
        objective='multi:softmax',
        eval_metric='mlogloss',
        random_state=42,
    )
def _scale_if_needed(name, X_tr, X_te):
    """Standardise the features for the models registered as needing it.

    Args:
        name: Classifier display name (a key of the classifier registry).
        X_tr: Training features; the scaler is fitted on these only.
        X_te: Test features; transformed with the training statistics.

    Returns:
        ``(X_tr, X_te)`` scaled for ``'LinearSVC'`` and ``'MLP'``; the inputs
        themselves, untouched, for every other name.
    """
    if name not in ('LinearSVC', 'MLP'):
        return X_tr, X_te

    scaler = StandardScaler()
    scaler.fit(X_tr)
    scaled_train = scaler.transform(X_tr)
    scaled_test = scaler.transform(X_te)
    return scaled_train, scaled_test
def multi_class(csv_path: pathlib.Path, label_col='user_class'):
    """Train and evaluate every registered classifier on one CSV file.

    The label column is encoded as integer ids following the sorted order of
    its unique values. Features are the timing and CPU-usage columns, plus an
    integer-encoded ``endpoint_url`` unless that column is the label itself.
    The data is split 80/20 (stratified, seed 42); for each classifier the
    fit+predict time, a classification report and a confusion-matrix plot are
    produced.

    Args:
        csv_path: CSV file to load; its parent directory name is used in the
            plot titles.
        label_col: Name of the column holding the class labels.

    Returns:
        None. Results are printed and plotted.
    """
    df = pd.read_csv(csv_path)

    classes = sorted(df[label_col].unique())
    class_ids = list(range(len(classes)))
    # Report/plot captions must be strings even when the raw labels are not.
    class_names = [str(c) for c in classes]
    df['mapped'] = df[label_col].map(dict(zip(classes, class_ids)))

    features = ['application_time', 'database_time', 'api_time',
                'cpu_usage_db_test', 'cpu_usage_db', 'cpu_usage_web']
    if label_col != 'endpoint_url':
        # The endpoint is a usable feature only when it is not the target.
        endpoints = sorted(df['endpoint_url'].unique())
        df['endpoint_url_mapped'] = df['endpoint_url'].map(
            {url: idx for idx, url in enumerate(endpoints)})
        features = ['endpoint_url_mapped'] + features

    X, y = df[features], df['mapped']
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, train_size=0.8,
                                              random_state=42, stratify=y)
    for name, clf in _CLASSIFIERS.items():
        Xt, Xe = _scale_if_needed(name, X_tr, X_te)
        start = time.time()
        clf.fit(Xt, y_tr)
        preds = clf.predict(Xe)
        print(f'\n{name} | czas {time.time()-start:.2f}s')
        # Passing the full id list pins rows/columns to ``class_names`` even
        # if some class is absent from both the test fold and the
        # predictions; otherwise the names and the matrix get out of step.
        print(classification_report(y_te, preds, labels=class_ids,
                                    target_names=class_names, zero_division=0))
        cm = confusion_matrix(y_te, preds, labels=class_ids)
        _plot_cm(cm, class_names, title=f'{csv_path.parent.name} | {name}')
def _plot_cm(cm, labels, title='CM'):
    """Show a confusion matrix as an annotated heat map.

    Args:
        cm: 2-D array of counts; rows are labelled "True" and columns
            "Predicted".
        labels: Tick captions, one per class, used on both axes.
        title: Figure title.
    """
    fig, ax = plt.subplots(figsize=(6, 5))
    heatmap = ax.imshow(cm, cmap='Blues')
    ax.figure.colorbar(heatmap, ax=ax)

    tick_positions = range(len(labels))
    ax.set_xticks(tick_positions, labels, rotation=45, ha='right')
    ax.set_yticks(tick_positions, labels)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title(title)

    # Cells darker than half of the maximum get white text for contrast.
    half_max = cm.max() / 2.0
    cells = itertools.product(range(cm.shape[0]), range(cm.shape[1]))
    for row, col in cells:
        count = cm[row, col]
        text_color = 'white' if count > half_max else 'black'
        ax.text(col, row, count, ha='center', va='center', color=text_color)
    plt.show()


In [None]:
# Root directory scanned for CSV files when no dataset directory is selected.
DATA_ROOT = pathlib.Path('testData')  # <= CHANGE TO YOUR OWN PATH
# Explicit check rather than ``assert``: assertions are removed when Python
# runs with -O, which would let a wrong path slip through unnoticed.
if not DATA_ROOT.exists():
    raise FileNotFoundError(f'Ścieżka {DATA_ROOT} nie istnieje')

In [None]:

# --- Automatyczne tworzenie list plików CSV na podstawie odkomentowanych katalogów ---
# Jeśli w file_paths (1‑20) lub file_paths2 (21‑28) wszystko jest zakomentowane,
# notebook skanuje całe DATA_ROOT tak jak dotychczas.

from pathlib import Path

def _csvs_from_dirs(dir_list, fallback_root):
    """Build the list of CSV paths for the selected dataset directories.

    Args:
        dir_list: Candidate directory entries. Entries that are not strings,
            are blank, or start with ``#`` (after stripping) are skipped.
        fallback_root: Directory searched recursively for
            ``*_merged_data.csv`` when no entry of ``dir_list`` is usable.

    Returns:
        One ``<dir>/stockApp_merged_data.csv`` path per usable entry, or the
        recursive search result under ``fallback_root`` if there is none.
    """
    def _is_selected(entry):
        if not isinstance(entry, str):
            return False
        stripped = entry.strip()
        return bool(stripped) and not stripped.startswith('#')

    selected = list(filter(_is_selected, dir_list))
    if not selected:
        return list(fallback_root.rglob('*_merged_data.csv'))
    return [Path(directory) / 'stockApp_merged_data.csv' for directory in selected]

# Resolve the CSV inputs for both tasks in one pass:
# file_paths feeds the regression, file_paths2 feeds the classification.
CSV_FILES_REG, CSV_FILES_CLS = (
    _csvs_from_dirs(selected_dirs, DATA_ROOT)
    for selected_dirs in (file_paths, file_paths2)
)
print(f'📈 CSV do regresji: {len(CSV_FILES_REG)} plików')
print(f'🔤 CSV do klasyfikacji: {len(CSV_FILES_CLS)} plików')


In [None]:
# Run the XGBoost regression once per (CSV file, target column) pair;
# the whole step is skipped when xgboost could not be loaded.
if _xgb_ok:
    targets = ['api_time', 'application_time', 'database_time']
    for csv_file, t in itertools.product(CSV_FILES_REG, targets):
        xgb_tree(csv_file, t)

## 4 – Klasyfikacja wieloklasowa
Wykonaj komórkę poniżej – otrzymasz metryki i macierze konfuzji dla kilku klasyfikatorów.

In [None]:
# Evaluate all classifiers on every classification CSV; the reports and
# confusion-matrix plots are emitted by multi_class itself.
for csv_file in CSV_FILES_CLS:
    multi_class(csv_path=csv_file, label_col='user_class')

### To wszystko!
Jeśli chcesz zmienić hiperparametry, edytuj odpowiednie sekcje z definicjami modeli.