<a href="https://colab.research.google.com/github/zhangling297/Substance-Use/blob/master/Linear_Regression_Forecasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import warnings
# NOTE(review): this silences every warning for the whole session, including
# library deprecation notices - consider narrowing the filter.
warnings.filterwarnings('ignore')

# Seed NumPy's global random number generator so repeated runs are reproducible.
np.random.seed(123)

## 1. Function Definitions

def make_sliding_windows(data, input_length, output_length):
    """
    Cut a time series into overlapping (history, future) window pairs.

    A window pair starts at every row offset for which ``input_length`` rows
    of history followed by ``output_length`` rows of future fit inside
    ``data``. Any pair containing a NaN in either part is dropped.

    Parameters
    ----------
    data : array-like sliceable along its first axis (rows are time steps)
    input_length : number of rows in each history window
    output_length : number of rows in each future window

    Returns
    -------
    dict with keys ``'inputs'`` and ``'outputs'``; each maps to a list of
    slices of ``data`` in chronological order. Both lists are empty when
    ``data`` is too short for a single pair.
    """
    window_span = input_length + output_length
    histories, futures = [], []

    for start in range(len(data) - window_span + 1):
        boundary = start + input_length
        history = data[start:boundary]
        future = data[boundary:start + window_span]

        # Keep only fully observed pairs
        has_gap = np.any(np.isnan(history)) or np.any(np.isnan(future))
        if not has_gap:
            histories.append(history)
            futures.append(future)

    return {'inputs': histories, 'outputs': futures}

def normalize_minus1_to1_ignore_zero_na(df, lower_pct=0.1, upper_pct=0.99):
    """
    Rescale every numeric column of ``df`` into the range [-1, 1].

    For each numeric column, missing values are forward-filled and then
    back-filled. The ``lower_pct`` and ``upper_pct`` quantiles of the filled
    values are mapped to -1 and +1, and values outside that span are clipped.
    Non-numeric columns are copied through unchanged, and ``df`` itself is
    not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table.
    lower_pct, upper_pct : float
        Quantiles expressed as fractions (multiplied by 100 before being
        passed to ``np.nanpercentile``).
        NOTE(review): the defaults are asymmetric (10th vs 99th percentile);
        confirm 0.1 is intended rather than 0.01.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the numeric columns normalized. A column that is
        entirely NaN, or whose two quantiles coincide, is set to 0.

    Notes
    -----
    Despite the function name, zeros are treated like any other value, and
    NaNs are imputed rather than skipped.
    """
    df_normalized = df.copy()

    for col in df.select_dtypes(include=[np.number]).columns:
        # ffill()/bfill() are the supported spelling; fillna(method=...) is
        # deprecated and no longer accepted by recent pandas releases.
        x_filled = pd.Series(df[col].values).ffill().bfill().values

        # Nothing to fill from: the column carries no information
        if np.all(np.isnan(x_filled)):
            df_normalized[col] = 0
            continue

        x_low = np.nanpercentile(x_filled, lower_pct * 100)
        x_high = np.nanpercentile(x_filled, upper_pct * 100)

        rng = x_high - x_low
        if rng == 0:
            # Degenerate span: avoid dividing by zero
            df_normalized[col] = 0
            continue

        # Map [x_low, x_high] linearly onto [-1, 1], then clip the tails
        x_norm = 2 * (x_filled - x_low) / rng - 1
        df_normalized[col] = np.clip(x_norm, -1, 1)

    return df_normalized

def revin_xy_train(x, y):
    """
    Standardize each training window, and its target, by the window's own
    statistics (RevIN-style instance normalization).

    The NaN-aware mean and standard deviation are computed along axis 1 of
    ``x`` and applied to both ``x`` and ``y``, so inputs and targets share
    one scale per row.

    Returns a dict with ``'x_norm'``, ``'y_norm'`` and the per-row ``'mu'``
    and ``'sigma'`` (kept with a trailing axis of size 1).
    """
    window_mean = np.nanmean(x, axis=1, keepdims=True)
    window_std = np.nanstd(x, axis=1, keepdims=True)
    # A constant window has zero spread; use a unit scale instead of
    # dividing by zero.
    window_std[window_std == 0] = 1

    def standardize(values):
        return (values - window_mean) / window_std

    return {
        'x_norm': standardize(x),
        'y_norm': standardize(y),
        'mu': window_mean,
        'sigma': window_std,
    }

def revin_x(x):
    """
    Standardize each input window by its own NaN-aware mean and standard
    deviation (computed along axis 1).

    Returns a dict with ``'x_norm'`` plus the per-row ``'mu'`` and
    ``'sigma'`` (trailing axis of size 1), which callers use to map
    predictions back to the original scale.
    """
    stats = {
        'mu': np.nanmean(x, axis=1, keepdims=True),
        'sigma': np.nanstd(x, axis=1, keepdims=True),
    }
    # Zero spread would divide by zero below; fall back to a unit scale
    stats['sigma'][stats['sigma'] == 0] = 1

    return {'x_norm': (x - stats['mu']) / stats['sigma'], **stats}

## 2. Load and Preprocess Data

# NOTE(review): absolute local path - the file must exist at this location.
DATA_CSV = "C:/Users/18350/Downloads/Workman North Stream Data for STA Final.csv"
FIRST_ROW = 231725   # rows before this position are discarded
ROW_STRIDE = 40      # keep every 40th remaining row
DATE_FORMAT = '%Y-%m-%d %H:%M:%S'

mY_data = pd.read_csv(DATA_CSV)

# Scale the numeric columns to [-1, 1].
# NOTE(review): the quantiles are taken over the whole file, i.e. before the
# train/test split further down.
mY_data_normalized = normalize_minus1_to1_ignore_zero_na(mY_data)

# Drop the leading rows and thin the remainder, then renumber from 0
partial_data = mY_data_normalized.iloc[FIRST_ROW::ROW_STRIDE].reset_index(drop=True)

# Parse the timestamp column when the file has one
if 'PosixDate' in partial_data.columns:
    partial_data['PosixDate'] = pd.to_datetime(
        partial_data['PosixDate'], format=DATE_FORMAT
    )

## 3. Time Series Visualization

# Only drawn when both the timestamp and the streamflow column are present
if {'PosixDate', 'Q_cms'}.issubset(partial_data.columns):
    timestamps = partial_data['PosixDate']
    streamflow = partial_data['Q_cms']

    plt.figure(figsize=(12, 6))
    plt.plot(timestamps, streamflow, color='blue', linewidth=2)
    plt.title('Streamflow Over Time (Normalized)')
    plt.xlabel('Date')
    plt.ylabel('Normalized Streamflow (cms)')
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

## 4. Split Dataset

# Positional column slice: the 4th and 5th columns (0-based positions 3 and 4)
feature_cols = slice(3, 5)
total_rows = len(partial_data)

# Preferred split: everything before the first row dated 2025 is training
# data, the rest is test data. partial_data carries a fresh 0..n-1 index, so
# the index label of that row is also its position.
split_row = None
if 'PosixDate' in partial_data.columns:
    rows_2025 = partial_data.index[partial_data['PosixDate'].dt.year == 2025]
    if len(rows_2025) > 0:
        start_2025 = rows_2025[0]
        split_row = start_2025
    else:
        # Previously this case crashed with a bare IndexError
        print("No rows dated 2025 found; falling back to a 50/50 split.")

# Fallback (no date column, or no 2025 rows): split the rows in half
if split_row is None:
    split_row = total_rows // 2

train_end = split_row
# NOTE(review): train_end == valid_end, so valid_data is always empty -
# confirm that no validation window is intended.
valid_end = split_row

train_data = partial_data.iloc[:train_end, feature_cols].reset_index(drop=True)
valid_data = partial_data.iloc[train_end:valid_end, feature_cols].reset_index(drop=True)
test_data = partial_data.iloc[valid_end:total_rows, feature_cols].reset_index(drop=True)

## 5. Visualize Train vs Test Distributions

# Last selected column of each split, with missing values dropped
train_last = train_data.iloc[:, -1].dropna().values
test_last = test_data.iloc[:, -1].dropna().values

plt.figure(figsize=(10, 6))
# density=True rescales each histogram to unit area, so the two splits can be
# compared even when they contain different numbers of rows.
plt.hist(train_last, bins=30, alpha=0.5, color='blue', density=True, label='Train')
plt.hist(test_last, bins=30, alpha=0.5, color='red', density=True, label='Test')
plt.xlabel('Value')
# The bars are densities (density=True), not raw counts
plt.ylabel('Density')
plt.title('Histogram of Train vs Test (Q_cms)')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## 6. Sliding Window Setup

# History length and forecast horizon, both counted in rows
input_length = output_length = 48

# Window the training split first, then the test split
train_samples, test_samples = (
    make_sliding_windows(split.values, input_length, output_length)
    for split in (train_data, test_data)
)

## 7. Convert to 3D Arrays

def list_to_array3d(list_of_arrays):
    """
    Stack a non-empty list of equally shaped windows into one float array.

    The layout is decided from the first window: when it has more than one
    column the result is ``(samples, timesteps, features)``; otherwise each
    window is flattened and the result is ``(samples, timesteps)``.
    """
    first = list_of_arrays[0]
    n_samples = len(list_of_arrays)
    n_steps = len(first)
    n_features = first.shape[1] if first.ndim > 1 else 1

    # Multivariate windows keep their feature axis
    if n_features > 1:
        stacked = np.zeros((n_samples, n_steps, n_features))
        for row, window in enumerate(list_of_arrays):
            stacked[row] = window
        return stacked

    # Univariate windows collapse to one row per sample
    stacked = np.zeros((n_samples, n_steps))
    for row, window in enumerate(list_of_arrays):
        stacked[row] = window.flatten()
    return stacked

# Stack the window lists into arrays (training split first, then test split)
X_train, Y_train = (
    list_to_array3d(train_samples[part]) for part in ('inputs', 'outputs')
)
X_test, Y_test = (
    list_to_array3d(test_samples[part]) for part in ('inputs', 'outputs')
)

# Keep a single variable (Q_cms, the second selected column; 0-based index 1)
variable_idx = 1
if X_train.ndim == 3:
    X_train, Y_train, X_test, Y_test = (
        windows[:, :, variable_idx]
        for windows in (X_train, Y_train, X_test, Y_test)
    )

## 8. Train Baseline Linear Model

L_in = X_train.shape[1]
L_out = Y_train.shape[1]

# Rows whose input window has no NaN. This does not depend on the horizon
# step, so it is computed once instead of once per step.
clean_inputs = ~np.any(np.isnan(X_train), axis=1)

# One independent linear model per forecast step
models = []
for j in range(L_out):
    y_col = Y_train[:, j]

    # Also require an observed target for this step
    valid_mask = clean_inputs & ~np.isnan(y_col)
    X_valid = X_train[valid_mask]
    y_valid = y_col[valid_mask]

    if len(X_valid) > 0:
        model = LinearRegression()
        model.fit(X_valid, y_valid)
        models.append(model)
    else:
        # No usable rows for this step; its predictions stay at zero
        models.append(None)

# Make predictions, one output column per step
pred_matrix = np.zeros_like(Y_test)
for j, model in enumerate(models):
    if model is not None:
        pred_matrix[:, j] = model.predict(X_test)

linear_rmse = np.sqrt(mean_squared_error(Y_test, pred_matrix))
linear_mae = mean_absolute_error(Y_test, pred_matrix)

## 9. Apply RevIN Normalization

# Standardize every training window (and its target) by the window's own
# mean and standard deviation
train_revin = revin_xy_train(X_train, Y_train)
x_train_norm = train_revin['x_norm']
y_train_norm = train_revin['y_norm']

# Rows whose normalized input has no NaN; identical for every horizon step,
# so computed once outside the loop.
clean_inputs = ~np.any(np.isnan(x_train_norm), axis=1)

# Train one linear model per forecast step on the normalized data
revin_models = []
for j in range(L_out):
    y_col = y_train_norm[:, j]

    # Also require an observed target for this step
    valid_mask = clean_inputs & ~np.isnan(y_col)
    X_valid = x_train_norm[valid_mask]
    y_valid = y_col[valid_mask]

    if len(X_valid) > 0:
        model = LinearRegression()
        model.fit(X_valid, y_valid)
        revin_models.append(model)
    else:
        # No usable rows for this step; its normalized prediction stays 0
        revin_models.append(None)

# Normalize the test inputs by their own statistics and predict
test_revin = revin_x(X_test)
x_test_norm = test_revin['x_norm']
test_mu = test_revin['mu']
test_sigma = test_revin['sigma']

pred_norm = np.zeros_like(Y_test)
for j, model in enumerate(revin_models):
    if model is not None:
        pred_norm[:, j] = model.predict(x_test_norm)

# Map the predictions back to the original scale of each test window
pred_matrix_revIN = pred_norm * test_sigma + test_mu

revin_rmse = np.sqrt(mean_squared_error(Y_test, pred_matrix_revIN))
revin_mae = mean_absolute_error(Y_test, pred_matrix_revIN)

## 10. Compare Results

# Same four lines as before: both RMSE values first, then both MAE values
for label, score in (
    ("Baseline Linear RMSE:", linear_rmse),
    ("RevIN Linear RMSE   :", revin_rmse),
    ("Baseline Linear MAE :", linear_mae),
    ("RevIN Linear MAE    :", revin_mae),
):
    print(label, score)

## 11. Plot Predictions

# Show the test window on which the RevIN model has the lowest mean squared error
sample_id = np.argmin(np.nanmean((pred_matrix_revIN - Y_test) ** 2, axis=1))
L_in = X_test.shape[1]
L_out = Y_test.shape[1]
time_axis = np.arange(1, L_in + L_out + 1)

x_input = X_test[sample_id, :]
y_true = Y_test[sample_id, :]
input_and_truth = np.concatenate([x_input, y_true])
# NaN padding keeps the forecast line blank over the input part of the axis
input_padding = np.full(L_in, np.nan)

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: baseline model; right panel: RevIN model
panels = (
    (f'Linear Regression (Sample {sample_id})', pred_matrix),
    (f'Linear Regression + RevIN (Sample {sample_id})', pred_matrix_revIN),
)
for ax, (panel_title, forecasts) in zip(axes, panels):
    input_and_pred = np.concatenate([input_padding, forecasts[sample_id, :]])

    ax.plot(time_axis, input_and_truth, 'k-', linewidth=2, label='Input + Ground Truth')
    ax.plot(time_axis, input_and_pred, 'b--', linewidth=2, label='Predicted Output')
    # Dotted line marks the last input step
    ax.axvline(x=L_in, color='gray', linestyle=':', alpha=0.7)
    ax.set_xlabel('Time Step')
    ax.set_ylabel('Normalized cms')
    ax.set_title(panel_title)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 12. Additional Model Comparisons (as suggested)

def train_and_evaluate_model(model, X_train, Y_train, X_test, Y_test, use_revin=False):
    """
    Fit one copy of ``model`` per forecast step and score it on the test set.

    Parameters
    ----------
    model : estimator used as a template; ``clone_model`` makes a fresh copy
        for every output column, so ``model`` itself is never fitted here.
    X_train, Y_train : 2-D arrays of training inputs / targets (rows are
        samples; ``Y_train`` has one column per forecast step).
    X_test, Y_test : 2-D arrays of test inputs / targets.
    use_revin : when True, every window is standardized by its own mean and
        standard deviation before fitting and predicting, and predictions are
        mapped back to the original scale with the test-window statistics.

    Returns
    -------
    (rmse, mae, predictions) where ``predictions`` has the shape of
    ``Y_test``. A step with no usable training rows is not fitted: its
    prediction is 0 in the working scale (i.e. the window mean under RevIN).
    """
    # Choose the working representation once; the fit/predict loop below is
    # shared by both modes.
    if use_revin:
        train_stats = revin_xy_train(X_train, Y_train)
        fit_inputs, fit_targets = train_stats['x_norm'], train_stats['y_norm']
        test_stats = revin_x(X_test)
        predict_inputs = test_stats['x_norm']
    else:
        fit_inputs, fit_targets, predict_inputs = X_train, Y_train, X_test

    # Rows with a complete input window; the same for every step, so computed
    # once instead of once per step.
    clean_inputs = ~np.any(np.isnan(fit_inputs), axis=1)

    predictions = np.zeros_like(Y_test)
    for j in range(Y_train.shape[1]):
        y_col = fit_targets[:, j]
        valid_mask = clean_inputs & ~np.isnan(y_col)
        if not np.any(valid_mask):
            continue  # nothing to fit; leave this column at zero

        step_model = clone_model(model)
        step_model.fit(fit_inputs[valid_mask], y_col[valid_mask])
        predictions[:, j] = step_model.predict(predict_inputs)

    if use_revin:
        # Undo the per-window standardization
        predictions = predictions * test_stats['sigma'] + test_stats['mu']

    rmse = np.sqrt(mean_squared_error(Y_test, predictions))
    mae = mean_absolute_error(Y_test, predictions)

    return rmse, mae, predictions

def clone_model(model):
    """
    Return a new, unfitted copy of ``model`` with the same hyperparameters.

    Estimators exposing ``get_params`` are copied with ``sklearn.base.clone``,
    which preserves every constructor parameter. The previous hand-written
    version kept only ``alpha`` for Ridge/Lasso and always rebuilt random
    forests with ``n_estimators=100, random_state=42``, whatever was passed in.

    Any other object falls back to calling its type with no arguments, as
    before.
    """
    if hasattr(model, 'get_params'):
        # Imported here so the fallback path works without scikit-learn
        from sklearn.base import clone
        return clone(model)

    return type(model)()

# Candidate regressors, each evaluated with and without RevIN
models_to_test = {
    'Ridge (α=1.0)': Ridge(alpha=1.0),
    'Lasso (α=0.1)': Lasso(alpha=0.1),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42)
}

banner = "=" * 50
print("\n" + banner)
print("MODEL COMPARISON")
print(banner)

# The same data is passed to every evaluation
eval_data = (X_train, Y_train, X_test, Y_test)

for name, model in models_to_test.items():
    # Raw inputs first, then the RevIN-normalized variant
    rmse_standard, mae_standard, _ = train_and_evaluate_model(
        model, *eval_data, use_revin=False
    )
    rmse_revin, mae_revin, _ = train_and_evaluate_model(
        model, *eval_data, use_revin=True
    )

    print(f"\n{name}:")
    print(f"  Standard - RMSE: {rmse_standard:.4f}, MAE: {mae_standard:.4f}")
    print(f"  RevIN    - RMSE: {rmse_revin:.4f}, MAE: {mae_revin:.4f}")