In [1]:
import pandas as pd
data = pd.read_csv("C:\\Users\\tomy3\\ATMS-523-Module-5-3\\homework\\radar_parameters.csv")
print(data.info())
print(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18969 entries, 0 to 18968
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      18969 non-null  int64  
 1   Zh (dBZ)        18969 non-null  float64
 2   Zdr (dB)        18969 non-null  float64
 3   Ldr (dB)        18969 non-null  float64
 4   Kdp (deg km-1)  18969 non-null  float64
 5   Ah (dBZ/km)     18969 non-null  float64
 6   Adr (dB/km)     18969 non-null  float64
 7   R (mm/hr)       18969 non-null  float64
dtypes: float64(7), int64(1)
memory usage: 1.2 MB
None
         Unnamed: 0      Zh (dBZ)      Zdr (dB)      Ldr (dB)  Kdp (deg km-1)  \
count  18969.000000  18969.000000  18969.000000  18969.000000    18969.000000   
mean    9484.000000     31.294021      0.762979    -37.969272        0.080879   
std     5476.022964      6.496330      0.363489      3.277391        0.221018   
min        0.000000     14.036426      0.285207    -44.849249  

In [2]:
from sklearn.model_selection import train_test_split
target_cols = [col for col in data.columns if col.strip().startswith("R")]
if len(target_cols) == 0:
	raise KeyError(f"No target column starting with 'R' found. Available columns: {list(data.columns)}")
target = target_cols[0]

X = data.drop(columns=[target])
y = data[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [3]:
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

def baseline_predict(Zh):
    Z = 10 ** (Zh / 10)
    R_pred = (Z / 200) ** (1 / 1.6)
    return R_pred

baseline_train = baseline_predict(X_train["Zh (dBZ)"])
baseline_test = baseline_predict(X_test["Zh (dBZ)"])

baseline_metrics = {
    "train_r2": r2_score(y_train, baseline_train),
    "train_rmse": np.sqrt(mean_squared_error(y_train, baseline_train)),
    "test_r2": r2_score(y_test, baseline_test),
    "test_rmse": np.sqrt(mean_squared_error(y_test, baseline_test))
}

In [4]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)

lr_train = lr.predict(X_train)
lr_test = lr.predict(X_test)

lr_metrics = {
    "train_r2": r2_score(y_train, lr_train),
    "train_rmse": np.sqrt(mean_squared_error(y_train, lr_train)),
    "test_r2": r2_score(y_test, lr_test),
    "test_rmse": np.sqrt(mean_squared_error(y_test, lr_test))
}

In [19]:

X_poly = data[["Zh (dBZ)", "Zdr (dB)", "Kdp (deg km-1)"]]
y_poly = data["R (mm/hr)"]

X_train, X_test, y_train, y_test = train_test_split(X_poly, y_poly, test_size=0.3, random_state=42)

pipe = Pipeline([
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("lr", LinearRegression())
])

param_grid = {"poly__degree": list(range(1, 5))}
grid_poly = GridSearchCV(pipe, param_grid, cv=7, scoring="r2", n_jobs=-1)

grid_poly.fit(X_train, y_train)

best_poly = grid_poly.best_estimator_
y_pred = best_poly.predict(X_test)

print(f" Best polynomial degree: {grid_poly.best_params_['poly__degree']}")
print(f" Test R²: {r2_score(y_test, y_pred):.4f}")
print(f" Test RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")

 Best polynomial degree: 4
 Test R²: 0.9907
 Test RMSE: 0.8665


In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import time

# Use same features as Step 5
X_rf = data[["Zh (dBZ)", "Zdr (dB)", "Kdp (deg km-1)"]]
y_rf = data["R (mm/hr)"]

X_train, X_test, y_train, y_test = train_test_split(X_rf, y_rf, test_size=0.3, random_state=42)

# Reduced grid: one value per category, two for n_estimators
param_grid = {
    "bootstrap": [True],                # covers bootstrap
    "max_depth": [100],                # covers max_depth
    "max_features": ["sqrt"],          # covers max_features
    "min_samples_leaf": [1],           # covers min_samples_leaf
    "min_samples_split": [2],          # covers min_samples_split
    "n_estimators": [200, 1000]        # tests model size
}

rf = RandomForestRegressor(random_state=42)
grid_rf = GridSearchCV(rf, param_grid, cv=7, scoring="r2", n_jobs=-1)

start = time.time()
grid_rf.fit(X_train, y_train)
elapsed = time.time() - start

rf_best = grid_rf.best_estimator_
rf_train = rf_best.predict(X_train)
rf_test = rf_best.predict(X_test)

rf_metrics = {
    "best_params": grid_rf.best_params_,
    "train_r2": r2_score(y_train, rf_train),
    "train_rmse": np.sqrt(mean_squared_error(y_train, rf_train)),
    "test_r2": r2_score(y_test, rf_test),
    "test_rmse": np.sqrt(mean_squared_error(y_test, rf_test))
}

print(f" Best Random Forest parameters: {rf_metrics['best_params']}")
print(f"Grid search time: {elapsed:.2f} seconds")
print(f" Train R²: {rf_metrics['train_r2']:.4f}")
print(f" Train RMSE: {rf_metrics['train_rmse']:.4f}")
print(f" Test R²: {rf_metrics['test_r2']:.4f}")
print(f" Test RMSE: {rf_metrics['test_rmse']:.4f}")

 Best Random Forest parameters: {'bootstrap': True, 'max_depth': 100, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
Grid search time: 28.10 seconds
 Train R²: 0.9942
 Train RMSE: 0.6391
 Test R²: 0.9682
 Test RMSE: 1.5977


In [18]:
import pprint
print("Baseline:", baseline_metrics)
print("Linear Regression:", lr_metrics)
poly_metrics = {
    "train_r2": r2_score(y_train, best_poly.predict(X_train)),
    "train_rmse": np.sqrt(mean_squared_error(y_train, best_poly.predict(X_train))),
    "test_r2": r2_score(y_test, y_pred),
    "test_rmse": np.sqrt(mean_squared_error(y_test, y_pred))
}
print("Best Polynomial:", poly_metrics)
print("Best Random Forest:", rf_metrics)

Baseline: {'train_r2': 0.27555056242697507, 'train_rmse': 7.143950117300888, 'test_r2': 0.35664291868109677, 'test_rmse': 7.189316160047872}
Linear Regression: {'train_r2': 0.9879101770566684, 'train_rmse': 0.9228781078336319, 'test_r2': 0.989104571558249, 'test_rmse': 0.9355859609387075}
Best Polynomial: {'train_r2': 0.9902032750423699, 'train_rmse': 0.8307585723804153, 'test_r2': 0.9906547752211697, 'test_rmse': 0.8664757564563575}
Best Random Forest: {'best_params': {'bootstrap': True, 'max_depth': 100, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}, 'train_r2': 0.9942012315548306, 'train_rmse': 0.63914921095839, 'test_r2': 0.9682264271461324, 'test_rmse': 1.5976975395530195}
