In [9]:
# -*- coding: utf-8 -*-
# Pipeline + dodatkowe wykresy + XGBoost
import os, glob, re
import numpy as np
import pandas as pd
from scipy import signal
from scipy.fft import rfft, rfftfreq
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, auc, roc_auc_score, accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import joblib
import warnings
warnings.filterwarnings('ignore')

# REQUIRED: xgboost (pip install xgboost).
# The import is wrapped only to turn a raw import failure into an actionable
# install hint; the script does not continue without XGBClassifier.
try:
    from xgboost import XGBClassifier
except Exception as e:
    # Re-raised with the original error chained so the root cause stays visible.
    raise RuntimeError("Brakuje xgboost. Zainstaluj: pip install xgboost") from e

# ========== SETTINGS ==========
# NOTE(review): none of the four windowing constants is referenced again in the
# visible part of this script; presumably they are consumed by the segmentation
# helper defined elsewhere — confirm.
SAMPLE_RATE = 50  # presumably samples per second (Hz) — confirm against the data source
WINDOW_SEC = 2.56  # window length in seconds
STEP_SEC = WINDOW_SEC / 2  # step between windows: half of the window length
WINDOW_SAMPLES = int(WINDOW_SEC * SAMPLE_RATE)  # window length expressed in samples

# Directory that receives every figure this script saves; created if missing.
OUTPUT_FIGURES = './figures'
os.makedirs(OUTPUT_FIGURES, exist_ok=True)

# Call load_and_segment_auto('.') to obtain segments, labels and meta.
# The helper is not defined in this cell; it must already exist in the session.
# Its result is unpacked into three values; `meta` is not used in this cell.
segments, labels, meta = load_and_segment_auto('.')  # reuse the existing function
# Quick sanity check: number of windows per label.
print("Labels sample:", pd.Series(labels).value_counts().to_dict())

# ========== Preprocessing (filtering) ==========
# For every segment and every sensor channel that is present, store a filtered
# and detrended copy in a new '<channel>_filt' column.  `apply_filter` is
# defined elsewhere in the session.
#
# The step stays best-effort: when filtering or detrending fails, the raw
# signal is used for that channel.  Unlike before, every fallback is recorded
# and reported, so silently unfiltered channels no longer go unnoticed.
filter_failures = []  # (segment index, channel, error) for each fallback
for seg_idx, seg in enumerate(segments):
    for c in ['ax', 'ay', 'az', 'gx', 'gy', 'gz']:
        if c not in seg.columns:
            continue
        try:
            seg[c + '_filt'] = signal.detrend(apply_filter(seg[c].values))
        except Exception as exc:
            # Fallback: keep the raw signal so downstream code still finds the column.
            seg[c + '_filt'] = seg[c].values
            filter_failures.append((seg_idx, c, repr(exc)))

if filter_failures:
    # print() rather than warnings.warn(): warnings are globally silenced above.
    print("Filtering failed for %d segment/channel pairs; raw signal used instead. First error: %s"
          % (len(filter_failures), filter_failures[0][2]))

# ========== Feature extraction ==========
# One feature row per (segment, label) pair, in the original order.
# `extract_features_from_segment` is defined elsewhere in the session.
feature_rows = [extract_features_from_segment(seg) for seg, _ in zip(segments, labels)]
# Windows without a label are kept and tagged 'UNKNOWN'.
labels_clean = ['UNKNOWN' if lab is None else lab for _, lab in zip(segments, labels)]

# Missing feature values are replaced with 0.
X_df = pd.DataFrame(feature_rows)
X_df = X_df.fillna(0)
y = np.array(labels_clean)
print("Feature matrix:", X_df.shape, "n_classes:", len(np.unique(y)))

# ========== Visualization 1: class distribution ==========
# Bar chart of the number of windows per label, most frequent first.
vc = pd.Series(y).value_counts().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(vc.index.astype(str), vc.values)
plt.xticks(rotation=45, ha='right')  # acts on the current axes, i.e. `ax`
ax.set_ylabel('Liczba okien')
ax.set_title('Rozkład etykiet (okna)')
fig.tight_layout()

class_dist_path = os.path.join(OUTPUT_FIGURES, 'class_distribution.png')
fig.savefig(class_dist_path, dpi=150)
plt.close(fig)

# ========== VarianceThreshold + scaling + PCA (for visualization) ==========
# Drop near-constant features.  X_var (unscaled) is what the models are trained
# on further down, which keeps feature importances easy to interpret.
vt = VarianceThreshold(threshold=1e-4)
X_var = vt.fit_transform(X_df)
feature_mask = vt.get_support()
# Column names that survived the variance filter, aligned with X_var's columns.
feature_names_kept = X_df.columns[feature_mask].tolist()

# Standardised copy, used here only as input to PCA.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_var)

# PCA cannot extract more components than min(n_samples, n_features); cap the
# requested 10 by both dimensions so small datasets do not raise.
n_pca_components = min(10, *X_scaled.shape)
pca = PCA(n_components=n_pca_components)
X_pca = pca.fit_transform(X_scaled)
print("PCA explained cum:", pca.explained_variance_ratio_.cumsum())

# ========== Visualization 2: PCA scatter of the first two components ==========
le_vis = LabelEncoder()
y_enc_vis = le_vis.fit_transform(y)

plt.figure(figsize=(6, 5))
# Encoded label k corresponds to le_vis.classes_[k]; draw one series per class.
for code, class_name in enumerate(le_vis.classes_):
    members = y_enc_vis == code
    plt.scatter(
        X_pca[members, 0],
        X_pca[members, 1],
        s=8,
        alpha=0.6,
        label=str(class_name),
    )
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('PCA 1 vs 2')
# Legend is placed outside the axes so it does not cover the points.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize='small')
plt.tight_layout()
pca_scatter_path = os.path.join(OUTPUT_FIGURES, 'pca_scatter.png')
plt.savefig(pca_scatter_path, dpi=150)
plt.close()

# ========== Label preparation for the models ==========
# Map string labels to integer codes; `classes` keeps the code -> name lookup.
le = LabelEncoder().fit(y)
y_enc = le.transform(y)
classes = le.classes_
print("Classes:", classes)

# Hold out 20% of the windows, stratified by class, for the final plots/evaluation.
# NOTE(review): STEP_SEC is half of WINDOW_SEC, so if the windows really overlap,
# a random split can put overlapping windows on both sides — consider a
# group-aware split; confirm against the segmentation helper.
X_train, X_test, y_train, y_test = train_test_split(
    X_var,
    y_enc,
    test_size=0.2,
    stratify=y_enc,
    random_state=42,
)

# ========== Training: RandomForest and XGBoost (multi-class) ==========
rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
xgb = XGBClassifier(
    n_estimators=200,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42,
)

# 5-fold stratified cross-validation on the training split, comparing accuracy.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_options = dict(cv=cv, scoring='accuracy', n_jobs=-1)
rf_scores = cross_val_score(rf, X_train, y_train, **cv_options)
xgb_scores = cross_val_score(xgb, X_train, y_train, **cv_options)
print(f"RF CV acc: {rf_scores.mean():.3f} ± {rf_scores.std():.3f}")
print(f"XGB CV acc: {xgb_scores.mean():.3f} ± {xgb_scores.std():.3f}")

# Fit the final models on the whole training split.
for model in (rf, xgb):
    model.fit(X_train, y_train)

# Evaluate both models on the held-out test split.
rf_preds, xgb_preds = (model.predict(X_test) for model in (rf, xgb))
rf_acc, xgb_acc = (accuracy_score(y_test, preds) for preds in (rf_preds, xgb_preds))
print("RF test acc:", rf_acc, "XGB test acc:", xgb_acc)

# ========== Visualization 3: feature importances (top 20) ==========
importances_rf = rf.feature_importances_
importances_xgb = xgb.feature_importances_
# Indices of the 20 largest importances, in descending order.
idx_rf = np.argsort(importances_rf)[::-1][:20]
idx_xgb = np.argsort(importances_xgb)[::-1][:20]

# Both charts share the same layout; only data, title and file name differ.
importance_plots = (
    (importances_rf, idx_rf,
     'Top 20 feature importances (RandomForest)', 'feature_importances_rf.png'),
    (importances_xgb, idx_xgb,
     'Top 20 feature importances (XGBoost)', 'feature_importances_xgb.png'),
)
for all_scores, top_idx, chart_title, chart_file in importance_plots:
    names = [feature_names_kept[i] for i in top_idx]
    vals = all_scores[top_idx]
    # Reversed positions put the most important feature at the top of the chart.
    bar_positions = range(len(names))[::-1]

    plt.figure(figsize=(8, 6))
    plt.barh(bar_positions, vals, align='center')
    plt.yticks(bar_positions, names)
    plt.xlabel('Importance')
    plt.title(chart_title)
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_FIGURES, chart_file), dpi=150)
    plt.close()

# ========== Visualization 4: confusion matrix (XGB) ==========
# Pass the full list of encoded labels explicitly.  Without `labels=`, a class
# absent from both y_test and the predictions is dropped from the matrix, which
# shrinks it and shifts every class name on the axes below.
cm_labels = np.arange(len(classes))
cm = confusion_matrix(y_test, xgb_preds, labels=cm_labels)

plt.figure(figsize=(8, 6))
plt.imshow(cm, interpolation='nearest', cmap='Blues')
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45, ha='right')
plt.yticks(tick_marks, classes)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.title('Confusion matrix (XGBoost) on test set')

# Annotate each cell with its count; switch to white text on dark cells.
thresh = cm.max() / 2
for i, j in np.ndindex(*cm.shape):
    plt.text(j, i, format(cm[i, j], 'd'),
             horizontalalignment="center",
             color="white" if cm[i, j] > thresh else "black")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_FIGURES, 'confusion_matrix_xgb.png'), dpi=150)
plt.close()

# ========== Extra: ROC comparison for the binary case (only with 2 classes) ==========
if len(classes) == 2:
    # Uses the already fitted rf/xgb models; with two classes no one-vs-rest is
    # needed.  roc_curve and auc come from the file-level sklearn.metrics import.
    # Column 1 of predict_proba is the probability of the class encoded as 1.
    rf_prob = rf.predict_proba(X_test)[:, 1]
    xgb_prob = xgb.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, rf_prob)
    fpr2, tpr2, _ = roc_curve(y_test, xgb_prob)

    plt.figure(figsize=(6, 5))
    for tag, x_rate, y_rate in (('RF', fpr, tpr), ('XGB', fpr2, tpr2)):
        plt.plot(x_rate, y_rate, label=f'{tag} AUC={auc(x_rate, y_rate):.2f}')
    # Diagonal = chance level.
    plt.plot([0, 1], [0, 1], '--', color='gray')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.title('ROC (test)')
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_FIGURES, 'roc_compare.png'), dpi=150)
    plt.close()

# ========== Persist models and fitted objects ==========
# NOTE: `scaler` and `pca` were fitted for the PCA visualisation only; rf and
# xgb were trained on the unscaled X_var split, so inference needs `vt` and
# `le` but not the scaler.
artifacts = (
    ('rf_model.joblib', rf),
    ('xgb_model.joblib', xgb),
    ('scaler.joblib', scaler),
    ('variance_threshold.joblib', vt),
    ('pca.joblib', pca),
    ('label_encoder.joblib', le),
)
for artifact_file, artifact in artifacts:
    joblib.dump(artifact, artifact_file)
print("Saved RF/XGB models and preprocessors to disk. Wygenerowane wykresy w", OUTPUT_FIGURES)


Załadowane CSV: ['./train.csv', './test.csv']
Połączone shape: (10299, 563)
Kolumny przykładowe: ['tBodyAcc-mean()-X', 'tBodyAcc-mean()-Y', 'tBodyAcc-mean()-Z', 'tBodyAcc-std()-X', 'tBodyAcc-std()-Y', 'tBodyAcc-std()-Z', 'tBodyAcc-mad()-X', 'tBodyAcc-mad()-Y', 'tBodyAcc-mad()-Z', 'tBodyAcc-max()-X', 'tBodyAcc-max()-Y', 'tBodyAcc-max()-Z', 'tBodyAcc-min()-X', 'tBodyAcc-min()-Y', 'tBodyAcc-min()-Z', 'tBodyAcc-sma()', 'tBodyAcc-energy()-X', 'tBodyAcc-energy()-Y', 'tBodyAcc-energy()-Z', 'tBodyAcc-iqr()-X', 'tBodyAcc-iqr()-Y', 'tBodyAcc-iqr()-Z', 'tBodyAcc-entropy()-X', 'tBodyAcc-entropy()-Y', 'tBodyAcc-entropy()-Z', 'tBodyAcc-arCoeff()-X,1', 'tBodyAcc-arCoeff()-X,2', 'tBodyAcc-arCoeff()-X,3', 'tBodyAcc-arCoeff()-X,4', 'tBodyAcc-arCoeff()-Y,1']
Detected mapping: {'ax': 'tBodyAcc-mean()-X', 'ay': 'tBodyAcc-mean()-Y', 'az': 'tBodyAcc-mean()-Z', 'gx': 'tBodyGyro-mean()-X', 'gy': 'tBodyGyro-mean()-Y', 'gz': 'tBodyGyro-mean()-Z'}
Utworzono okien: 159
Labels sample: {'LAYING': 55, 'STANDING': 49,