# ATMS 523 Module 5 Project (FAST Version)

This notebook implements the radar_parameters regression analysis following the assignment prompt.
It includes:
- Baseline computation (Z–R relation)
- Multiple Linear Regression
- Polynomial Regression (0–9 degree, 7-fold CV)
- Random Forest (optimized grid search)


In [2]:
import os, warnings, gc, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.ensemble import RandomForestRegressor

# System and warnings optimization
os.environ['OMP_NUM_THREADS'] = '1'
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ['MKL_NUM_THREADS'] = '1'
warnings.filterwarnings('ignore', category=FutureWarning)

# === Load dataset ===
df = pd.read_csv('radar_parameters.csv')
df = df.rename(columns={
    'Zh (dBZ)': 'Zh', 'Zdr (dB)': 'Zdr', 'Ldr (dB)': 'Ldr',
    'Kdp (deg km-1)': 'Kdp', 'Ah (dBZ/km)': 'Ah', 'Adr (dB/km)': 'Adp',
    'R (mm/hr)': 'R'
})
df = df.dropna().astype('float32')

FEATURES = ['Zh','Zdr','Ldr','Kdp','Ah','Adp']
TARGET = 'R'
X, y = df[FEATURES].values, df[TARGET].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# === Baseline ===
def baseline_predict(Zh):
    Z_lin = 10 ** (Zh / 10.0)
    return (Z_lin / 200.0) ** (1.0 / 1.6)

y_pred_train_base = baseline_predict(X_train[:,0])
y_pred_test_base  = baseline_predict(X_test[:,0])
baseline_results = {
    'model': 'Baseline (Z=200R^1.6)',
    'R2_train': r2_score(y_train, y_pred_train_base),
    'RMSE_train': rmse(y_train, y_pred_train_base),
    'R2_test': r2_score(y_test, y_pred_test_base),
    'RMSE_test': rmse(y_test, y_pred_test_base)
}
print(baseline_results)

# === Linear Regression ===
lr = LinearRegression().fit(X_train, y_train)
y_pred_train_lr, y_pred_test_lr = lr.predict(X_train), lr.predict(X_test)
lr_results = {
    'model': 'Linear Regression',
    'R2_train': r2_score(y_train, y_pred_train_lr),
    'RMSE_train': rmse(y_train, y_pred_train_lr),
    'R2_test': r2_score(y_test, y_pred_test_lr),
    'RMSE_test': rmse(y_test, y_pred_test_lr)
}
print(lr_results)

# === Polynomial Regression (0–9 degrees, fast CV) ===
N_small = min(2000, len(X_train))
idx = np.random.default_rng(42).choice(len(X_train), N_small, replace=False)
X_small, y_small = X_train[idx], y_train[idx]

poly_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(include_bias=False)),
    ('lr', LinearRegression())
])
param_grid = {'poly__degree': list(range(0, 10))}
cv = KFold(n_splits=7, shuffle=True, random_state=42)
gs_poly = GridSearchCV(poly_pipe, param_grid, scoring='r2', cv=cv, n_jobs=1, verbose=0)
gs_poly.fit(X_small, y_small)
best_poly = gs_poly.best_estimator_
best_d = gs_poly.best_params_['poly__degree']
y_pred_train_poly = best_poly.predict(X_train)
y_pred_test_poly = best_poly.predict(X_test)
poly_results = {
    'model': f'Polynomial Regression (deg={best_d})',
    'R2_train': r2_score(y_train, y_pred_train_poly),
    'RMSE_train': rmse(y_train, y_pred_train_poly),
    'R2_test': r2_score(y_test, y_pred_test_poly),
    'RMSE_test': rmse(y_test, y_pred_test_poly)
}
print(poly_results)

# === Random Forest (very light HalvingGridSearchCV) ===
param_rf = {
    'bootstrap': [True],
    'max_depth': [10, 30, None],
    'max_features': ['sqrt'],
    'min_samples_leaf': [1, 4],
    'min_samples_split': [2, 5],
}
rf = RandomForestRegressor(random_state=42, n_jobs=1, max_samples=0.6)
hgs = HalvingGridSearchCV(rf, param_rf, scoring='r2', cv=cv, resource='n_estimators', min_resources=10, max_resources=400, factor=4, n_jobs=1, verbose=0)
N_rf = min(1000, len(X_train))
idx_rf = np.random.default_rng(42).choice(len(X_train), N_rf, replace=False)
hgs.fit(X_train[idx_rf], y_train[idx_rf])


params = {k: v for k, v in hgs.best_params_.items() if k != 'n_estimators'}

best_rf = RandomForestRegressor(
    **params,
    n_estimators=400,      
    random_state=42,
    n_jobs=1,
    max_samples=0.6
)


best_rf.fit(X_train, y_train)
y_pred_train_rf, y_pred_test_rf = best_rf.predict(X_train), best_rf.predict(X_test)
rf_results = {
    'model': 'Random Forest (fast)',
    'R2_train': r2_score(y_train, y_pred_train_rf),
    'RMSE_train': rmse(y_train, y_pred_train_rf),
    'R2_test': r2_score(y_test, y_pred_test_rf),
    'RMSE_test': rmse(y_test, y_pred_test_rf)
}
print(rf_results)

# === Summary ===
summary = pd.DataFrame([baseline_results, lr_results, poly_results, rf_results])
print('\nFinal Model Comparison:')
display(summary)


{'model': 'Baseline (Z=200R^1.6)', 'R2_train': 0.2755506634712219, 'RMSE_train': np.float64(7.143949773870337), 'R2_test': 0.3566429018974304, 'RMSE_test': np.float64(7.1893164951531014)}
{'model': 'Linear Regression', 'R2_train': 0.8271994590759277, 'RMSE_train': np.float64(3.4890477618782993), 'R2_test': 0.8410272002220154, 'RMSE_test': np.float64(3.573740175653319)}


7 fits failed out of a total of 70.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
7 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.11/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/sklearn/pipeline.py", line 655, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/pytho

{'model': 'Polynomial Regression (deg=2)', 'R2_train': 0.8905119895935059, 'RMSE_train': np.float64(2.7772672618522156), 'R2_test': 0.9930459260940552, 'RMSE_test': np.float64(0.7474483156995063)}
{'model': 'Random Forest (fast)', 'R2_train': 0.9877867966913689, 'RMSE_train': np.float64(0.9275752809371336), 'R2_test': 0.9779099738301961, 'RMSE_test': np.float64(1.332170018059722)}

Final Model Comparison:


Unnamed: 0,model,R2_train,RMSE_train,R2_test,RMSE_test
0,Baseline (Z=200R^1.6),0.275551,7.14395,0.356643,7.189316
1,Linear Regression,0.827199,3.489048,0.841027,3.57374
2,Polynomial Regression (deg=2),0.890512,2.777267,0.993046,0.747448
3,Random Forest (fast),0.987787,0.927575,0.97791,1.33217
