## 0) Setup

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np


In [2]:
data = pd.read_csv('radar_parameters.csv')

In [3]:
data

params = data.drop('R (mm/hr)', axis=1)

params

Unnamed: 0.1,Unnamed: 0,Zh (dBZ),Zdr (dB),Ldr (dB),Kdp (deg km-1),Ah (dBZ/km),Adr (dB/km)
0,0,23.144878,0.418637,-41.757733,0.005395,0.000290,0.000012
1,1,22.737156,0.322850,-43.772069,0.005194,0.000360,0.000012
2,2,26.869826,0.330948,-43.577399,0.013385,0.000903,0.000030
3,3,28.540561,0.399480,-42.139731,0.018872,0.001036,0.000043
4,4,30.500127,0.543758,-39.763087,0.027438,0.001157,0.000064
...,...,...,...,...,...,...,...
18964,18964,31.515997,0.579955,-39.244229,0.034048,0.001417,0.000080
18965,18965,29.993334,0.567935,-39.399188,0.024134,0.001032,0.000057
18966,18966,31.685913,0.655681,-38.375696,0.033971,0.001165,0.000081
18967,18967,32.980096,0.768586,-37.166218,0.043117,0.001285,0.000105


## 1) Split Data 70-30

In [4]:
X = params
y = data['R (mm/hr)']

X1, X2, y1, y2 = train_test_split(X, y, random_state=0, train_size=0.7)

## 2) Multiple Linear Regression Dataset

In [5]:
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error

model = LinearRegression(fit_intercept=True)
model.fit(X1, y1)

y_train_pred = model.predict(X1)
y_test_pred = model.predict(X2)

r2_train = r2_score(y1, y_train_pred)
r2_test = r2_score(y2, y_test_pred)

rmse_train = np.sqrt(mean_squared_error(y1, y_train_pred))
rmse_test = np.sqrt(mean_squared_error(y2, y_test_pred))

print('Training R2:', r2_train)
print('Testing R2:', r2_test)

print('Training RMSE:', rmse_train)
print('Testing RMSE:', rmse_test)

Training R2: 0.9888357865565246
Testing R2: 0.9868605147786396
Training RMSE: 0.9146705347774786
Testing RMSE: 0.9583373917841848


In [6]:
#Z=200*R^1.6 --> R = (Z/200)^(1/1.6)

Z_test = X2['Zh (dBZ)']

R_baseline = (Z_test / 200) ** (1 / 1.6)

r2_baseline = r2_score(y2, R_baseline)
rmse_baseline = np.sqrt(mean_squared_error(y2, R_baseline))

print('Baseline R2:', r2_baseline)
print('Baseline RMSE:', rmse_baseline)

Baseline R2: -0.7841055717910694
Baseline RMSE: 11.167077569585746


## 3) Grid Search over Polynomial Orders

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

def PolynomialRegression(degree=2, **kwargs):
    return make_pipeline(PolynomialFeatures(degree),
                         LinearRegression(**kwargs))

param_grid = {'polynomialfeatures__degree': np.arange(0,6),
              'linearregression__fit_intercept': [True, False]}


grid = GridSearchCV(PolynomialRegression(), param_grid, cv=7, scoring='r2')

In [8]:
grid.fit(X, y);

In [9]:
best = grid.best_estimator_

y_pred = best.predict(X)

r2 = r2_score(y, y_pred)
rmse = np.sqrt(mean_squared_error(y, y_pred))


print('Best Parameters:', grid.best_params_)
print('Grid R2:', r2)
print('Grid RMSE:', rmse)

Best Parameters: {'linearregression__fit_intercept': True, 'polynomialfeatures__degree': np.int64(2)}
Grid R2: 0.9982148731289727
Grid RMSE: 0.36205469220221986


## 4) Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

param_grid = {
 "bootstrap": [True, False],
 "max_depth": [10, 100],
 "max_features": ["sqrt", 1.0],  
 "min_samples_leaf": [1, 4],
 "min_samples_split": [2, 10],
 "n_estimators": [200, 1000]}

grid = GridSearchCV(RandomForestRegressor(), param_grid, cv=7, scoring='r2')

grid.fit(X, y)

In [None]:
best_rf = grid.best_estimator_

y_pred_rf = best_rf.predict(X)

r2_rf = r2_score(y, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y, y_pred_rf))


print('Best Parameters:', grid.best_params_)
print('Grid R2:', r2_rf)
print('Grid RMSE:', rmse_rf)