In [2]:
# Load in the Data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from ipywidgets import interact, IntSlider, FloatSlider, Dropdown, VBox, HBox, fixed, Output
from IPython.display import display, HTML
df = pd.read_csv("homework/radar_parameters.csv")

df.drop(columns='Unnamed: 0', inplace=True)
display(df.head())
display(df.describe(include='all'))
print(f"Rows: {len(df)}, Columns: {len(df.columns)}")

Unnamed: 0,Zh (dBZ),Zdr (dB),Ldr (dB),Kdp (deg km-1),Ah (dBZ/km),Adr (dB/km),R (mm/hr)
0,23.144878,0.418637,-41.757733,0.005395,0.00029,1.2e-05,2.39352
1,22.737156,0.32285,-43.772069,0.005194,0.00036,1.2e-05,3.502699
2,26.869826,0.330948,-43.577399,0.013385,0.000903,3e-05,8.627561
3,28.540561,0.39948,-42.139731,0.018872,0.001036,4.3e-05,8.424447
4,30.500127,0.543758,-39.763087,0.027438,0.001157,6.4e-05,8.189291


Unnamed: 0,Zh (dBZ),Zdr (dB),Ldr (dB),Kdp (deg km-1),Ah (dBZ/km),Adr (dB/km),R (mm/hr)
count,18969.0,18969.0,18969.0,18969.0,18969.0,18969.0,18969.0
mean,31.294021,0.762979,-37.969272,0.080879,0.001829,0.000234,7.855561
std,6.49633,0.363489,3.277391,0.221018,0.003469,0.000822,8.569413
min,14.036426,0.285207,-44.849249,0.000697,4.4e-05,2e-06,0.309399
25%,26.720145,0.489184,-40.573505,0.011537,0.000482,2.7e-05,3.072614
50%,31.02028,0.677804,-38.11314,0.02864,0.000977,6.9e-05,5.622457
75%,35.597165,0.94702,-35.601404,0.073099,0.00197,0.000182,9.622175
max,57.400639,3.843941,-25.373718,5.06071,0.082511,0.027538,195.557062


Rows: 18969, Columns: 7


In [3]:
# Target = Rain Rate

target_col = 'R (mm/hr)'
num_df = df.copy()
X = num_df.drop(columns=[target_col]).copy()
y = num_df[target_col].copy()
feature_names = X.columns.tolist()
X.shape, y.shape

((18969, 6), (18969,))

In [4]:
# 1 Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print('Train size:', X_train.shape, 'Test size:', X_test.shape)

Train size: (13278, 6) Test size: (5691, 6)


In [5]:
# 2 Training a multiple linear regression dataset using the training dataset

model = LinearRegression()
model.fit(X_train, y_train)

print(model.intercept_)
print( model.coef_)

# validate it using the testing dataset. 
y_pred = model.predict(X_test)

-29.22109494538222
[ 1.59901626e-01  2.02690952e+00 -6.18193443e-01 -7.10460369e+01
  7.77892389e+03 -6.12208071e+03]


In [6]:
# 2 R^2 and root mean square errors

from sklearn.metrics import r2_score, mean_squared_error

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"R² on test set: {r2:.3f}")
print(f"RMSE on test set: {rmse:.3f}")


R² on test set: 0.989
RMSE on test set: 0.936


In [7]:
# baseline prediction of rain rate using the formula $Z = 200 R^{1.6}$.


Z_linear = 10 ** (df['Zh (dBZ)'] / 10)
R_baseline = (Z_linear / 200) ** (1 / 1.6)

In [8]:
# R^2 and RMSE for the baseline
r2_baseline = r2_score(df['R (mm/hr)'], R_baseline)
rmse_baseline = np.sqrt(mean_squared_error(df['R (mm/hr)'], R_baseline))

print(f"Baseline R²: {r2_baseline:.3f}")
print(f"Baseline RMSE: {rmse_baseline:.3f}")

Baseline R²: 0.302
Baseline RMSE: 7.158


In [9]:
# Comparison:
# R² on test set: 0.989, RMSE on test set: 0.936
# Baseline R²: 0.302, Baseline RMSE: 7.158

# The test set performs much better at predicting rainfall rate than the baseline prediction equation

In [10]:
# 3 Repeat 1 doing a grid search over polynomial orders, using a grid search over orders 0-9, and use cross-validation of 7 folds.

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

poly_model = make_pipeline(PolynomialFeatures(degree=3, include_bias=False),
                           LinearRegression())
poly_model.fit(X_train, y_train)
y_test_pred = poly_model.predict(X_test)


param_grid = {'polynomialfeatures__degree': np.arange(10),
              'linearregression__fit_intercept': [True, False]}
grid = GridSearchCV(poly_model, param_grid, cv=7, scoring='r2')
grid.fit(X_train, y_train)

print("Best R²:", grid.best_score_)

14 fits failed out of a total of 140.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
14 fits failed with the following error:
Traceback (most recent call last):
  File "/home/alexakt2/envs/xarray-climate/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/alexakt2/envs/xarray-climate/lib/python3.13/site-packages/sklearn/base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/alexakt2/envs/xarray-climate/lib/python3.13/site-packages/sklearn/pipeline.py", line 655, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
  File "/hom

Best R²: 0.9999565556289054


In [11]:

best_poly = grid.best_estimator_
y_test_pred = best_poly.predict(X_test)

r2_test = r2_score(y_test, y_test_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

print("R² on test set:", r2_test)
print("RMSE on test set:", rmse_test)

R² on test set: 0.9999992190474835
RMSE on test set: 0.007920887668981474


In [None]:
# 3
# For the best polynomial model in terms of $R^2$, does it outperform the baseline and the linear regression model 
# in terms of $R^2$ and root mean square error?

# ANSWER
#It does out perform it in terms of R^2, but it performs badly on the RMSE. 

In [13]:
# 4 Repeat 1 with a Random Forest Regressor

rf = RandomForestRegressor(random_state=42)

param_grid = {
    "bootstrap": [True, False],
    "max_depth": [10, 100],
    "max_features": ["sqrt", 1.0],
    "min_samples_leaf": [1, 4],
    "min_samples_split": [2, 10],
    "n_estimators": [200, 1000]
}

grid_rf = GridSearchCV(rf, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_rf.fit(X_train, y_train)


best_rf = grid_rf.best_estimator_

r2_test = r2_score(y_test, y_test_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

print("RF R²:", r2_test)
print("RF RMSE:", rmse_test)


RF R²: 0.9999992190474835
RF RMSE: 0.007920887668981474


In [None]:
# 4 Can you beat the baseline, or the linear regression, or best polynomial model with 
# the best optimized Random Forest Regressor in terms of $R^2$ and root mean square error

#ANSWER
#It looks like it overfit the model. It was not able to out perofrm the linerar regression model. 