In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error


In [2]:
# Read the radar CSV and strip leading/trailing whitespace from every column
# label in one pass, so the exact-name lookups below match the header.
df = pd.read_csv('homework/radar_parameters.csv').rename(columns=str.strip)

# Feature matrix: the six radar columns, addressed by their exact
# (whitespace-stripped) names. Target: the 'R (mm/hr)' column.
X = df[[
    'Zh (dBZ)',
    'Zdr (dB)',
    'Ldr (dB)',
    'Kdp (deg km-1)',
    'Ah (dBZ/km)',
    'Adr (dB/km)',
]]
y = df['R (mm/hr)']

In [3]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [4]:
# Fit an ordinary linear regression on the training split
# (fit() returns the estimator itself, so it can be bound directly).
lin_model = LinearRegression().fit(X_train, y_train)

# Predict on both splits so train and test scores can be compared.
y_pred_train, y_pred_test = (
    lin_model.predict(split) for split in (X_train, X_test)
)

def evaluate(y_true, y_pred, label):
    """Print the R² score and RMSE of ``y_pred`` measured against ``y_true``.

    Parameters
    ----------
    y_true : observed target values.
    y_pred : predicted values, scored against ``y_true``.
    label : text printed in front of the two metrics.

    Returns
    -------
    None. The metrics are only printed, formatted to three decimals.
    """
    # RMSE is the square root of scikit-learn's mean squared error.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f"{label} → R²: {r2:.3f}, RMSE: {rmse:.3f}")

# Score the linear model on both splits.
evaluate(y_true=y_train, y_pred=y_pred_train, label="Linear Train")
evaluate(y_true=y_test, y_pred=y_pred_test, label="Linear Test")

# Reference baseline from the power law Z = 200 * R^1.6,
# inverted to R = (Z / 200)^(1/1.6).
# The 'Zh (dBZ)' column is in decibels, so it is first converted to
# linear units with 10^(dBZ / 10).
Z = np.power(10, X_test['Zh (dBZ)'] / 10)
R_baseline = np.power(Z / 200, 1 / 1.6)
evaluate(y_true=y_test, y_pred=R_baseline, label="Baseline")


Linear Train → R²: 0.988, RMSE: 0.923
Linear Test → R²: 0.989, RMSE: 0.936
Baseline → R²: 0.357, RMSE: 7.189


In [5]:
def PolynomialRegression(degree):
    """Build a pipeline that expands features polynomially, then fits a linear model.

    Parameters
    ----------
    degree : degree passed to ``PolynomialFeatures``.

    Returns
    -------
    The pipeline produced by ``make_pipeline``; its steps are named
    automatically, which is why the grid search addresses the degree as
    ``polynomialfeatures__degree``.
    """
    expansion = PolynomialFeatures(degree)
    regressor = LinearRegression()
    return make_pipeline(expansion, regressor)

# Candidate polynomial degrees 0..8, compared by 7-fold cross-validated R².
param_grid = {'polynomialfeatures__degree': np.arange(9)}
poly_grid = GridSearchCV(
    estimator=PolynomialRegression(2),  # the 2 is only a placeholder; the grid sets the degree
    param_grid=param_grid,
    cv=7,
    scoring='r2',
)
poly_grid.fit(X_train, y_train)

# Use the refitted best pipeline to predict on both splits.
# NOTE(review): the recorded run selected degree 8, the largest value in the
# grid — worth checking whether the search range should be extended.
best_poly = poly_grid.best_estimator_
y_poly_train, y_poly_test = (
    best_poly.predict(split) for split in (X_train, X_test)
)

evaluate(y_true=y_train, y_pred=y_poly_train, label="Poly Train")
evaluate(y_true=y_test, y_pred=y_poly_test, label="Poly Test")
print("Best Polynomial Degree:", poly_grid.best_params_)


Poly Train → R²: 1.000, RMSE: 0.005
Poly Test → R²: 1.000, RMSE: 0.007
Best Polynomial Degree: {'polynomialfeatures__degree': np.int64(8)}


In [6]:
# Scoring helper used by the random-forest cell.
# NOTE(review): this re-declares `evaluate` exactly as it is defined in the
# linear-regression cell; the second definition silently shadows the first.
# It behaves identically, so one of the two could be dropped.
def evaluate(y_true, y_pred, label):
    """Print the R² score and RMSE of ``y_pred`` measured against ``y_true``.

    Parameters
    ----------
    y_true : observed target values.
    y_pred : predicted values, scored against ``y_true``.
    label : text printed in front of the two metrics.

    Returns
    -------
    None. The metrics are only printed, formatted to three decimals.
    """
    # RMSE is the square root of scikit-learn's mean squared error.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f"{label} → R²: {r2:.3f}, RMSE: {rmse:.3f}")

# Full hyper-parameter grid (the original author notes it is required by the
# assignment): 2 values for each of 6 parameters = 64 candidates.
param_grid = dict(
    bootstrap=[True, False],
    max_depth=[10, 100],
    max_features=["sqrt", 1.0],
    min_samples_leaf=[1, 4],
    min_samples_split=[2, 10],
    n_estimators=[200, 1000],
)

# Base estimator; the fixed seed makes each forest reproducible.
rf = RandomForestRegressor(random_state=42)

# Exhaustive search: 64 candidates x 7 folds = 448 fits, with up to 1000 trees
# per forest, spread over all available cores.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# budget for a long runtime before re-running.
# NOTE(review): verbose=2 requests per-fit log lines; with n_jobs=-1 they may
# not all appear in the notebook output — confirm.
rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='r2',
    cv=7,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training split only.
rf_grid.fit(X_train, y_train)

# Take the refitted best forest and predict on both splits.
best_rf = rf_grid.best_estimator_
y_rf_train, y_rf_test = (
    best_rf.predict(split) for split in (X_train, X_test)
)

# Report train/test metrics, then the winning parameter combination.
evaluate(y_true=y_train, y_pred=y_rf_train, label="RF Train")
evaluate(y_true=y_test, y_pred=y_rf_test, label="RF Test")
print("Best RF Parameters:", rf_grid.best_params_)


Fitting 7 folds for each of 64 candidates, totalling 448 fits


KeyboardInterrupt: 