In [9]:
# For this assignment, use the dataset called `radar_parameters.csv` provided in the GitHub repository in the folder `homework`.

## Dataset Description

# The training data consists of polarimetric radar parameters calculated from a disdrometer (an instrument that measures rain drop sizes, shapes, and rainfall rate) 
# measurements from several years in Huntsville, Alabama. A model called `pytmatrix` is used to calculate polarimetric radar parameters from the droplet observations, 
# which can be used as a way to compare what a remote sensing instrument would see and rainfall.

## Data columns

#Features (radar measurements):

#`Zh` - radar reflectivity factor (dBZ) - use the formula $dBZ = 10\log_{10}(Z)$

#`Zdr` - differential reflectivity

#`Ldr` - linear depolarization ratio

#`Kdp` - specific differential phase

#`Ah` - specific attenuation

#`Adp` - differential attenuation

#Target :

#`R` - rain rate

In [10]:
# 1. Split the data into a 70-30 split for training and testing data.
# 2. Using the split created in (1), train a multiple linear regression dataset using the training dataset, and validate it using the testing dataset.  

In [11]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (ConfusionMatrixDisplay, RocCurveDisplay, PrecisionRecallDisplay,
                             accuracy_score, precision_score, recall_score)
from pathlib import Path



In [16]:
from sklearn.linear_model import LinearRegression

In [12]:
# Read in the .csv dataset using pandas
import pandas as pd

radar = pd.read_csv('/home/hildebr1/Module_5/ATMS-523-Module-5/homework/radar_parameters.csv')

radar

Unnamed: 0.1,Unnamed: 0,Zh (dBZ),Zdr (dB),Ldr (dB),Kdp (deg km-1),Ah (dBZ/km),Adr (dB/km),R (mm/hr)
0,0,23.144878,0.418637,-41.757733,0.005395,0.000290,0.000012,2.393520
1,1,22.737156,0.322850,-43.772069,0.005194,0.000360,0.000012,3.502699
2,2,26.869826,0.330948,-43.577399,0.013385,0.000903,0.000030,8.627561
3,3,28.540561,0.399480,-42.139731,0.018872,0.001036,0.000043,8.424447
4,4,30.500127,0.543758,-39.763087,0.027438,0.001157,0.000064,8.189291
...,...,...,...,...,...,...,...,...
18964,18964,31.515997,0.579955,-39.244229,0.034048,0.001417,0.000080,10.648020
18965,18965,29.993334,0.567935,-39.399188,0.024134,0.001032,0.000057,7.981875
18966,18966,31.685913,0.655681,-38.375696,0.033971,0.001165,0.000081,6.822691
18967,18967,32.980096,0.768586,-37.166218,0.043117,0.001285,0.000105,6.801169


In [13]:
# Need to convert Zh (dBZ) Z = 10**(radar['Zh (dBZ)']/10)
radar['Zh (dBZ)'] = 10 ** (radar['Zh (dBZ)'] / 10)
radar.rename(columns={'Zh (dBZ)': 'Z'}, inplace=True)

In [14]:
# Check to see if this worked

radar

Unnamed: 0.1,Unnamed: 0,Z,Zdr (dB),Ldr (dB),Kdp (deg km-1),Ah (dBZ/km),Adr (dB/km),R (mm/hr)
0,0,206.294563,0.418637,-41.757733,0.005395,0.000290,0.000012,2.393520
1,1,187.808651,0.322850,-43.772069,0.005194,0.000360,0.000012,3.502699
2,2,486.387732,0.330948,-43.577399,0.013385,0.000903,0.000030,8.627561
3,3,714.588688,0.399480,-42.139731,0.018872,0.001036,0.000043,8.424447
4,4,1122.051192,0.543758,-39.763087,0.027438,0.001157,0.000064,8.189291
...,...,...,...,...,...,...,...,...
18964,18964,1417.750266,0.579955,-39.244229,0.034048,0.001417,0.000080,10.648020
18965,18965,998.466291,0.567935,-39.399188,0.024134,0.001032,0.000057,7.981875
18966,18966,1474.318332,0.655681,-38.375696,0.033971,0.001165,0.000081,6.822691
18967,18967,1986.138902,0.768586,-37.166218,0.043117,0.001285,0.000105,6.801169


In [17]:
FEATURES = ['Z', 'Zdr (dB)', 'Ldr (dB)', 'Kdp (deg km-1)', 'Ah (dBZ/km)', 'Adr (dB/km)']
TARGET = 'R (mm/hr)'


X = radar[FEATURES].values
y = radar[TARGET].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) # 70-30 split for training/testing, so test size = 0.3

# Multiple Linear Regression
reg = LinearRegression().fit(X_train, y_train)

# Prediction on test set
y_pred = reg.predict(X_test)

print("Coefficients:", reg.coef_)
print("Intercept:", reg.intercept_)
print("Test set size:", len(y_test))

Coefficients: [ 5.36418741e-04  2.34143684e+00 -3.63876741e-01 -1.12122164e+02
  9.67537224e+03 -1.56995723e+04]
Intercept: -15.519776322079338
Test set size: 5691


In [25]:

# 2) Compare the $R^2$ and root mean square errors of model on the training and testing sets to a baseline prediction of rain rate using the formula $Z = 200 R^{1.6}$.

from sklearn.metrics import r2_score, root_mean_squared_error


# Model predictions
y_pred_train = reg.predict(X_train)
y_pred_test = reg.predict(X_test)

# Baseline prediction (Z) - need to rearrange from $Z = 200 R^{1.6}$ to R = (Z/200)^(1/1.6)
Z_train = X_train[:, FEATURES.index('Z')]
Z_test = X_test[:, FEATURES.index('Z')]

y_base_train = (Z_train / 200) ** (1 / 1.6)
y_base_test = (Z_test / 200) ** (1 / 1.6)

# Print R^2 and RMSE for each of these 
def evaluate(y_rainrate, y_pred, label):
    r2 = r2_score(y_rainrate, y_pred)
    rmse = root_mean_squared_error(y_rainrate, y_pred)
    print(f"{label}: R^2: {r2:.3f}, RMSE: {rmse:.3f}")

print("Model Performance:")
evaluate(y_train, y_pred_train, "Train (Linear Regression Model)")
evaluate(y_test, y_pred_test, "Test (Linear Regression Model)")

print("Baseline Rain Rate Prediction:")
evaluate(y_train, y_base_train, "Train (Baseline Prediction)")
evaluate(y_test, y_base_test, "Test (Baseline Prediction)")

Model Performance:
Train (Linear Regression Model): R^2: 0.993, RMSE: 0.705
Test (Linear Regression Model): R^2: 0.992, RMSE: 0.735
Baseline Rain Rate Prediction:
Train (Baseline Prediction): R^2: 0.333, RMSE: 7.072
Test (Baseline Prediction): R^2: 0.227, RMSE: 7.352


In [26]:
# 3. Repeat 1 doing a grid search over polynomial orders, using a grid search over orders 0-9, and use cross-validation of 7 folds.  
# For the best polynomial model in terms of $R^2$, does it outperform the baseline and the linear regression model in terms of $R^2$ and root mean square error?

In [27]:
from sklearn.preprocessing import PolynomialFeatures

In [28]:
from sklearn.pipeline import make_pipeline

In [30]:
from sklearn.model_selection import GridSearchCV

In [33]:
poly_model = make_pipeline(PolynomialFeatures(include_bias=False), LinearRegression())

# Grid search over orders (degrees) 0–9
param_grid = {
    "polynomialfeatures__degree": np.arange(0, 10)
}

grid = GridSearchCV(
    poly_model,
    param_grid,
    cv=7,            # cross-validation of 7 folds
    scoring='r2',     
    n_jobs=-1          
)

grid.fit(X_train, y_train)

print("Best degree:", grid.best_params_)
print("Best CV R²:", grid.best_score_)

#Evaluate on test data
y_pred = grid.predict(X_test)
r2_test = r2_score(y_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))

print("Test R²:", r2_test)
print("Test RMSE:", rmse_test)

Best degree: {'polynomialfeatures__degree': np.int64(1)}
Best CV R²: 0.9913232243238621
Test R²: 0.9922723995013984
Test RMSE: 0.7349394990981041


7 fits failed out of a total of 70.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
7 fits failed with the following error:
Traceback (most recent call last):
  File "/home/hildebr1/envs/xarray-climate/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/hildebr1/envs/xarray-climate/lib/python3.13/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/hildebr1/envs/xarray-climate/lib/python3.13/site-packages/sklearn/pipeline.py", line 655, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
  File "/home/h

In [None]:
# 4. Repeat 1 with a Random Forest Regressor, and perform a grid_search on the following parameters:
   
   ```python
   param_grid = {
    "bootstrap": [True, False],
    "max_depth": [10, 100],
    "max_features": ["sqrt", 1.0],  
    "min_samples_leaf": [1, 4],
    "min_samples_split": [2, 10],
    "n_estimators": [200, 1000]}
   ```
 # Can you beat the baseline, or the linear regression, or best polynomial model with the best optimized Random Forest Regressor in terms of $R^2$ and root mean square error?

In [34]:
from sklearn.ensemble import RandomForestRegressor

In [36]:
forest = RandomForestRegressor(random_state=0)

# Grid search parameters
param_grid = {
    "bootstrap": [True, False],
    "max_depth": [10, 100],
    "max_features": ["sqrt", 1.0],  
    "min_samples_leaf": [1, 4],
    "min_samples_split": [2, 10],
    "n_estimators": [200, 1000]
}

# Same grid search used above
grid = GridSearchCV(
    estimator=forest,
    param_grid=param_grid,
    cv=7,
    scoring='r2',
    n_jobs=-1
)

grid.fit(X_train, y_train)

print("Best hyperparameters:", grid.best_params_)
print("Best CV R²:", grid.best_score_)

# Evaluate on test data
y_pred_rf = grid.predict(X_test)

r2_test = r2_score(y_test, y_pred_rf)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_rf))

print("Test R²:", r2_test)
print("Test RMSE:", rmse_test)




Best hyperparameters: {'bootstrap': True, 'max_depth': 100, 'max_features': 1.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV R²: 0.9782696244025216
Test R²: 0.952697335303038
Test RMSE: 1.8183273091028826


In [37]:
# Model Performance:
# Train (Linear Regression Model): R^2: 0.993, RMSE: 0.705
# Test (Linear Regression Model): R^2: 0.992, RMSE: 0.735

# Baseline Rain Rate Prediction:
# Train (Baseline Prediction): R^2: 0.333, RMSE: 7.072
# Test (Baseline Prediction): R^2: 0.227, RMSE: 7.352

# Polynomial Features:
# Best CV R²: 0.9913232243238621
# Test R²: 0.9922723995013984
# Test RMSE: 0.7349394990981041

# Random Forest Regressor: 
# Best CV R²: 0.9782696244025216
# Test R²: 0.952697335303038
# Test RMSE: 1.8183273091028826