In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
df = pd.read_csv('radar_parameters.csv', index_col=0)

In [3]:
df

Unnamed: 0,Zh (dBZ),Zdr (dB),Ldr (dB),Kdp (deg km-1),Ah (dBZ/km),Adr (dB/km),R (mm/hr)
0,23.144878,0.418637,-41.757733,0.005395,0.000290,0.000012,2.393520
1,22.737156,0.322850,-43.772069,0.005194,0.000360,0.000012,3.502699
2,26.869826,0.330948,-43.577399,0.013385,0.000903,0.000030,8.627561
3,28.540561,0.399480,-42.139731,0.018872,0.001036,0.000043,8.424447
4,30.500127,0.543758,-39.763087,0.027438,0.001157,0.000064,8.189291
...,...,...,...,...,...,...,...
18964,31.515997,0.579955,-39.244229,0.034048,0.001417,0.000080,10.648020
18965,29.993334,0.567935,-39.399188,0.024134,0.001032,0.000057,7.981875
18966,31.685913,0.655681,-38.375696,0.033971,0.001165,0.000081,6.822691
18967,32.980096,0.768586,-37.166218,0.043117,0.001285,0.000105,6.801169


In [4]:
features = df.columns[:-1]
target = df.columns[-1]

X = df[features]
y = df[target]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=22)

In [6]:
model = LinearRegression()
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

In [7]:
def get_baseline(Zh):
    Z = 10.**(Zh/10.)
    R_base = (Z/200.)**(1./1.6)
    return R_base

In [8]:
r2_train = r2_score(y_pred_train, y_train)
r2_test = r2_score(y_pred_test, y_test)

mse_train = np.sqrt(mean_squared_error(y_pred_train, y_train))
mse_test = np.sqrt(mean_squared_error(y_pred_test, y_test))

print("Linear Model: Train versus Test")
print('___________________________________')
print(f'Training R^2: {r2_train:.4f}, RMSE: {mse_train:.4f}')
print(f'Testing R^2: {r2_test:.4f}, RMSE: {mse_test:.4f}')

Linear Model: Train versus Test
___________________________________
Training R^2: 0.9881, RMSE: 0.9297
Testing R^2: 0.9884, RMSE: 0.9194


In [9]:
y_train_base = get_baseline(X_train['Zh (dBZ)'])
y_test_base = get_baseline(X_test['Zh (dBZ)'])


In [10]:
r2_train_base = r2_score(y_train_base, y_train)
r2_test_base = r2_score(y_test_base, y_test)

mse_train_base = np.sqrt(mean_squared_error(y_train_base, y_train))
mse_test_base = np.sqrt(mean_squared_error(y_test_base, y_test))

print('Baseline: Train versus Test')
print('___________________________________')
print(f'Training R^2: {r2_train_base:.4f}, RMSE: {mse_train_base:.4f}')
print(f'Testing R^2: {r2_test_base:.4f}, RMSE: {mse_test_base:.4f}')

Baseline: Train versus Test
___________________________________
Training R^2: 0.2287, RMSE: 7.0970
Testing R^2: 0.2660, RMSE: 7.2969


In [11]:
poly_pipeline = Pipeline([
    ('poly', PolynomialFeatures()),
    ('linear', LinearRegression())
])

param_grid = {'poly__degree': np.arange(0, 10)}

poly_grid = GridSearchCV(poly_pipeline, param_grid, cv=7, scoring='r2', n_jobs=-1)

poly_grid.fit(X_train, y_train)

0,1,2
,estimator,Pipeline(step...egression())])
,param_grid,"{'poly__degree': array([0, 1, ..., 6, 7, 8, 9])}"
,scoring,'r2'
,n_jobs,-1
,refit,True
,cv,7
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,degree,np.int64(6)
,interaction_only,False
,include_bias,True
,order,'C'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [12]:
print('Polynomial Regression Grid Search Results')
print('____________________________________________')
print(f'Best polynomial degree: {poly_grid.best_params_['poly__degree']}')
print(f'Best CV R^2: {poly_grid.best_score_:.4f}')

best_poly_model = poly_grid.best_estimator_

Polynomial Regression Grid Search Results
____________________________________________
Best polynomial degree: 6
Best CV R^2: 0.9999


In [13]:
y_pred_train_poly = best_poly_model.predict(X_train)
y_pred_test_poly = best_poly_model.predict(X_test)

In [14]:
r2_train_poly = r2_score(y_pred_train_poly, y_train)
r2_test_poly = r2_score(y_pred_test_poly, y_test)

mse_train_poly = np.sqrt(mean_squared_error(y_pred_train_poly, y_train))
mse_test_poly = np.sqrt(mean_squared_error(y_pred_test_poly, y_test))

print('Poly Model: Train versus Test')
print('_________________________________')
print(f'Training R^2: {r2_train_poly:.4f}, RMSE: {mse_train_poly:.4f}')
print(f'Testing R^2: {r2_test_poly:.4f}, RMSE: {mse_test_poly:.4f}')

Poly Model: Train versus Test
_________________________________
Training R^2: 1.0000, RMSE: 0.0065
Testing R^2: 0.9994, RMSE: 0.2141


In [15]:
param_grid = {
    "bootstrap": [True, False],
    "max_depth": [10, 100],
    "max_features": ["sqrt", 1.0],  
    "min_samples_leaf": [1, 4],
    "min_samples_split": [2, 10],
    "n_estimators": [200, 1000]
}

rf_model = RandomForestRegressor(random_state=22)
rf_grid = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=7, n_jobs=-1, scoring='r2', verbose=2)

rf_grid.fit(X_train, y_train)

Fitting 7 folds for each of 64 candidates, totalling 448 fits


[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  39.7s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  40.5s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  40.9s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  41.0s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  41.8s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  43.6s
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  45.1s
[CV] END bootstrap=True, max_depth=10, max_featu



[CV] END bootstrap=True, max_depth=10, max_features=1.0, min_samples_leaf=1, min_samples_split=10, n_estimators=1000; total time=13.1min
[CV] END bootstrap=True, max_depth=10, max_features=1.0, min_samples_leaf=1, min_samples_split=10, n_estimators=1000; total time=13.1min
[CV] END bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 1.2min
[CV] END bootstrap=True, max_depth=10, max_features=1.0, min_samples_leaf=1, min_samples_split=10, n_estimators=1000; total time=13.2min
[CV] END bootstrap=True, max_depth=100, max_features=sqrt, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time= 1.3min
[CV] END bootstrap=True, max_depth=10, max_features=1.0, min_samples_leaf=1, min_samples_split=10, n_estimators=1000; total time=13.3min
[CV] END bootstrap=True, max_depth=10, max_features=1.0, min_samples_leaf=1, min_samples_split=10, n_estimators=1000; total time=13.3min
[CV] END bootstrap=True, max_depth=10, ma

0,1,2
,estimator,RandomForestR...ndom_state=22)
,param_grid,"{'bootstrap': [True, False], 'max_depth': [10, 100], 'max_features': ['sqrt', 1.0], 'min_samples_leaf': [1, 4], ...}"
,scoring,'r2'
,n_jobs,-1
,refit,True
,cv,7
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,100
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [16]:
print('Random Forest Grid Search Results')
print('___________________________________')
print(f'Best parameters: {rf_grid.best_params_}')
print(f'Best CV R^2: {rf_grid.best_score_:.4f}')

Random Forest Grid Search Results
___________________________________
Best parameters: {'bootstrap': True, 'max_depth': 100, 'max_features': 1.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV R^2: 0.9778


In [17]:
best_rf_model = rf_grid.best_estimator_

In [18]:
y_pred_train_rf = best_rf_model.predict(X_train)
y_pred_test_rf = best_rf_model.predict(X_test)

In [19]:
r2_train_rf = r2_score(y_pred_train_rf, y_train)
r2_test_rf = r2_score(y_pred_test_rf, y_test)

mse_train_rf = np.sqrt(mean_squared_error(y_pred_train_rf, y_train))
mse_test_rf = np.sqrt(mean_squared_error(y_pred_test_rf, y_test))

print('Random Forest Model: Train versus Test')
print('_________________________________')
print(f'Training R^2: {r2_train_rf:.4f}, RMSE: {mse_train_rf:.4f}')
print(f'Testing R^2: {r2_test_rf:.4f}, RMSE: {mse_test_rf:.4f}')

Random Forest Model: Train versus Test
_________________________________
Training R^2: 0.9970, RMSE: 0.4602
Testing R^2: 0.9799, RMSE: 1.1823
