# Adding New Features

This notebook performs feature engineering to enhance the metadata with additional statistical and signal-based features extracted from the raw signals data. The main steps are:

1. **Loading Data**
   - Loads `signals.csv`, `metadata.csv`, and their respective test and Kaggle versions.

2. **Feature Extraction**
   - For each `user_snippet`, statistical and signal-based features are extracted for the x, y, and z axes, including:
     - **Statistical features**: mean, std, range, IQR, MAD, skewness, kurtosis, coefficient of variation.
     - **Rolling statistics**: rolling mean and rolling std.
     - **Frequency-domain features**: FFT peak, spectral energy, spectral entropy.
     - **Hjorth parameters**: mobility and complexity.
     - **Transition-based features**: zero-crossing rate.

3. **Additional Derived Features**
   - Computes additional global features such as:
     - **SMA (Signal Magnitude Area)**
     - **Axis correlations** (corr_xy, corr_xz, corr_yz)
     - **Jerk statistics** (mean and std)

4. **Merging Features**
   - All newly computed features are merged with the original `metadata.csv` to create enriched datasets.

5. **Saving Outputs**
   - The final feature-enriched datasets are saved as:
     - `Updated_Metadata.csv`
     - `Updated_Metadata_Test.csv`
     - `Updated_Metadata_Kaggle.csv`

These updated files serve as the training and evaluation input for all machine learning models developed in this project.


In [6]:
import pandas as pd
import numpy as np
from scipy.stats import kurtosis, skew, iqr
from scipy.signal import welch
from scipy.fft import fft

# ---------- Load all data ----------
metadata_df = pd.read_csv('metadata.csv')
metadata_test_df = pd.read_csv('metadata_test.csv')
metadata_kaggle_df = pd.read_csv('metadata_kaggle.csv')

signals_df = pd.read_csv('signals.csv')
signals_test_df = pd.read_csv('signals_test.csv')
signals_kaggle_df = pd.read_csv('signals_kaggle.csv')

# ---------- Statistical & Spectral Feature Extraction ----------
def extract_features_from_signals(df):
    feature_rows = []

    for snippet_id, group in df.groupby("user_snippet"):
        feature_dict = {"user_snippet": snippet_id}

        for axis in ["x-axis", "y-axis", "z-axis"]:
            signal = group[axis].fillna(0).values
            series = pd.Series(signal)

            # Basic statistics
            mean_val = np.mean(signal)
            std_val = np.std(signal)
            var_0 = np.var(signal)

            # Hjorth parameters
            first_deriv = np.diff(signal, prepend=signal[0])
            second_deriv = np.diff(first_deriv, prepend=first_deriv[0])
            var_d1 = np.var(first_deriv)
            var_d2 = np.var(second_deriv)
            mobility = np.sqrt(var_d1 / var_0) if var_0 != 0 else 0
            complexity = np.sqrt(var_d2 / var_d1) / mobility if var_d1 != 0 else 0

            # Frequency & spectral features
            f, Pxx = welch(signal, nperseg=min(256, len(signal)))
            Pxx_norm = Pxx / np.sum(Pxx) if np.sum(Pxx) != 0 else np.ones_like(Pxx)

            # Feature dictionary updates
            feature_dict.update({
                f"{axis}__fft_peak": np.max(np.abs(fft(signal))),
                f"{axis}__hjorth_mobility": mobility,
                f"{axis}__hjorth_complexity": complexity,
                f"{axis}__rolling_mean": series.rolling(window=5, min_periods=1).mean().mean(),
                f"{axis}__rolling_std": series.rolling(window=5, min_periods=1).std().mean(),
                f"{axis}__zero_crossing_rate": ((series.shift(1) * series) < 0).sum() / len(series),
                f"{axis}__spectral_energy": np.sum(Pxx),
                f"{axis}__spectral_entropy": -np.sum(Pxx_norm * np.log2(Pxx_norm + 1e-12)),
                f"{axis}__iqr": iqr(signal),
                f"{axis}__coeff_var": std_val / mean_val if mean_val != 0 else 0,
                f"{axis}__energy": np.sum(signal ** 2),
                f"{axis}_MAD": np.mean(np.abs(signal - mean_val)),
                f"{axis}_range": np.max(signal) - np.min(signal),
                f"{axis}_kurtosis": kurtosis(signal),
                f"{axis}_skewness": skew(signal),
            })

        feature_rows.append(feature_dict)

    return pd.DataFrame(feature_rows)

# ---------- Additional Features: SMA, Correlations, Jerk ----------
def compute_sma(group):
    return np.mean(np.abs(group['x-axis']) + np.abs(group['y-axis']) + np.abs(group['z-axis']))

def compute_correlations(group):
    return pd.Series({
        'corr_xy': group['x-axis'].corr(group['y-axis']),
        'corr_xz': group['x-axis'].corr(group['z-axis']),
        'corr_yz': group['y-axis'].corr(group['z-axis']),
    })

def compute_jerk_stats(group):
    jerk_x = np.diff(group['x-axis']) / np.diff(group['timestamp'])
    jerk_y = np.diff(group['y-axis']) / np.diff(group['timestamp'])
    jerk_z = np.diff(group['z-axis']) / np.diff(group['timestamp'])
    return pd.Series({
        'jerk_x_mean': np.mean(jerk_x),
        'jerk_x_std': np.std(jerk_x),
        'jerk_y_mean': np.mean(jerk_y),
        'jerk_y_std': np.std(jerk_y),
        'jerk_z_mean': np.mean(jerk_z),
        'jerk_z_std': np.std(jerk_z),
    })

def enrich_metadata(signals_df, base_metadata):
    features = extract_features_from_signals(signals_df)
    grouped = signals_df.groupby('user_snippet')
    sma_series = grouped.apply(compute_sma).rename("sma")
    correlation_df = grouped.apply(compute_correlations)
    jerk_df = grouped.apply(compute_jerk_stats)
    more_features = pd.concat([sma_series, correlation_df, jerk_df], axis=1).reset_index()
    metadata_enriched = pd.merge(base_metadata, features, on='user_snippet', how='left')
    metadata_enriched = pd.merge(metadata_enriched, more_features, on='user_snippet', how='left')
    return metadata_enriched

# ---------- Apply to all datasets ----------
metadata_all = enrich_metadata(signals_df, metadata_df)
metadata_test_all = enrich_metadata(signals_test_df, metadata_test_df)
metadata_kaggle_all = enrich_metadata(signals_kaggle_df, metadata_kaggle_df)

# ---------- Save to CSV ----------
metadata_all.to_csv('Updated_Metadata.csv', index=False)
metadata_test_all.to_csv('Updated_Metadata_Test.csv', index=False)
metadata_kaggle_all.to_csv('Updated_Metadata_Kaggle.csv', index=False)


  sma_series = grouped.apply(compute_sma).rename("sma")
  correlation_df = grouped.apply(compute_correlations)
  jerk_df = grouped.apply(compute_jerk_stats)
  sma_series = grouped.apply(compute_sma).rename("sma")
  correlation_df = grouped.apply(compute_correlations)
  jerk_df = grouped.apply(compute_jerk_stats)
  sma_series = grouped.apply(compute_sma).rename("sma")
  correlation_df = grouped.apply(compute_correlations)
  jerk_df = grouped.apply(compute_jerk_stats)
