In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score, root_mean_squared_error

In [2]:
# Read the polarimetric radar parameter data. index_col=0 makes the first CSV
# column the row index instead of treating it as a data column.
df = pd.read_csv("homework/radar_parameters.csv", index_col=0)
# Display the loaded table as the cell's output.
df

Unnamed: 0,Zh (dBZ),Zdr (dB),Ldr (dB),Kdp (deg km-1),Ah (dBZ/km),Adr (dB/km),R (mm/hr)
0,23.144878,0.418637,-41.757733,0.005395,0.000290,0.000012,2.393520
1,22.737156,0.322850,-43.772069,0.005194,0.000360,0.000012,3.502699
2,26.869826,0.330948,-43.577399,0.013385,0.000903,0.000030,8.627561
3,28.540561,0.399480,-42.139731,0.018872,0.001036,0.000043,8.424447
4,30.500127,0.543758,-39.763087,0.027438,0.001157,0.000064,8.189291
...,...,...,...,...,...,...,...
18964,31.515997,0.579955,-39.244229,0.034048,0.001417,0.000080,10.648020
18965,29.993334,0.567935,-39.399188,0.024134,0.001032,0.000057,7.981875
18966,31.685913,0.655681,-38.375696,0.033971,0.001165,0.000081,6.822691
18967,32.980096,0.768586,-37.166218,0.043117,0.001285,0.000105,6.801169


In [3]:
# Separate the six radar predictors from the rain-rate target, then hold out
# 30% of the rows for testing (70-30 split).
target_column = "R (mm/hr)"
feature_columns = [
    "Zh (dBZ)",
    "Zdr (dB)",
    "Ldr (dB)",
    "Kdp (deg km-1)",
    "Ah (dBZ/km)",
    "Adr (dB/km)",
]

X = df[feature_columns]
y = df[target_column]

# A fixed random_state keeps the shuffle, and therefore every score reported
# later in the notebook, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

In [4]:
# Baseline: estimate rain rate from reflectivity alone by inverting the
# power law Z = 200 * R**1.6.
# Zh is logarithmic (dBZ), so convert it to linear reflectivity Z first.
baseline = pd.DataFrame({"Z": 10 ** (X_test["Zh (dBZ)"] / 10)})
baseline["R"] = (baseline["Z"] / 200) ** (1 / 1.6)

# Score the baseline against the held-out rain rates so it can be compared
# with the fitted models on the same test rows.
r2_baseline = r2_score(y_test, baseline["R"])
rmse_baseline = root_mean_squared_error(y_test, baseline["R"])

print("R^2 of the baseline prediction:", r2_baseline)
print("RMSE of the baseline prediction:", rmse_baseline)

R^2 of the baseline prediction: 0.22661047398943468
RMSE of the baseline prediction: 7.3523877227693095


As we can see, the baseline prediction of rain rate does not perform very well, with an R^2 value of 0.23 and an RMSE of 7.35 mm/hr.

In [5]:
# Fit a linear regression (with an intercept term) on the training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
model_lr = LinearRegression(fit_intercept=True).fit(X_train, y_train)

# Show the fitted intercept, then the coefficients (one per feature column,
# in the column order of X_train).
print(model_lr.intercept_)
print(model_lr.coef_)

-29.04312693861295
[ 1.54730156e-01  2.07180028e+00 -6.16452998e-01 -7.06270939e+01
  7.78165830e+03 -6.23505336e+03]


In [6]:
# Evaluate the linear regression on the held-out test split.
y_pred_lr = model_lr.predict(X_test)

r2_lr = r2_score(y_test, y_pred_lr)
rmse_lr = root_mean_squared_error(y_test, y_pred_lr)

print("R^2 of the Linear Regression model:", r2_lr)
print("RMSE of the Linear Regression model:", rmse_lr)

R^2 of the Linear Regression model: 0.9868599917483047
RMSE of the Linear Regression model: 0.9583564653829776


The Linear Regression model performs extremely well, with a near-perfect R^2 of 0.987 and an RMSE of 0.958 mm/hr.

In [7]:
# Define a pipeline that implements linear regression on polynomial features.
def PolynomialRegression(degree=2, **kwargs):
    """Build a polynomial-regression pipeline.

    Parameters
    ----------
    degree : int, default=2
        Degree passed to ``PolynomialFeatures``.
    **kwargs
        Forwarded unchanged to ``LinearRegression`` (e.g. ``fit_intercept``).

    Returns
    -------
    Pipeline
        An unfitted pipeline created by ``make_pipeline``: a
        ``PolynomialFeatures`` step followed by a ``LinearRegression`` step.
    """
    feature_expansion = PolynomialFeatures(degree)
    regressor = LinearRegression(**kwargs)
    return make_pipeline(feature_expansion, regressor)

In [8]:
# Define the parameters for the polynomial regression grid search: polynomial
# degrees 0-9, each with and without a fitted intercept, scored with 7-fold
# cross-validation. Keys use the "<step name>__<parameter>" form so they reach
# the right step inside the pipeline.
param_grid_pr = {
    "polynomialfeatures__degree": np.arange(10),
    "linearregression__fit_intercept": [True, False],
}

# 10 degrees x 2 intercept settings = 20 candidates, each fitted on 7 folds.
grid_pr = GridSearchCV(
    estimator=PolynomialRegression(),
    param_grid=param_grid_pr,
    cv=7,
)

In [9]:
# Fit the grid search on the training data only, so the test split stays
# unseen during model selection.
grid_pr.fit(X_train, y_train)

# Display the parameter combination the search selected as best.
grid_pr.best_params_

{'linearregression__fit_intercept': True,
 'polynomialfeatures__degree': np.int64(8)}

In [10]:
# Take the polynomial regression model selected by the grid search.
# GridSearchCV was created with its default refit=True, so best_estimator_ has
# already been refitted on the full training data with the best parameters;
# calling fit() again would only repeat that (expensive) work.
model_pr = grid_pr.best_estimator_
# Display the fitted pipeline as the cell's output.
model_pr

0,1,2
,steps,"[('polynomialfeatures', ...), ('linearregression', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,degree,np.int64(8)
,interaction_only,False
,include_bias,True
,order,'C'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [11]:
# Evaluate the selected polynomial regression pipeline on the held-out test split.
y_pred_pr = model_pr.predict(X_test)

r2_pr = r2_score(y_test, y_pred_pr)
rmse_pr = root_mean_squared_error(y_test, y_pred_pr)

print("R^2 of the Polynomial Regression model:", r2_pr)
print("RMSE of the Polynomial Regression model:", rmse_pr)

R^2 of the Polynomial Regression model: 0.9999304180914437
RMSE of the Polynomial Regression model: 0.06973927472956029


The Polynomial Regression model (degree 8, as selected by the grid search) performs even better than the linear regression model, with an R^2 value of 0.9999 and an RMSE of 0.07 mm/hr.

In [12]:
# Create the random forest regressor; the fixed seed keeps the fitted forest
# reproducible between runs.
forest = RandomForestRegressor(random_state=0)

# Two candidate values for each of six hyperparameters: 2**6 = 64 combinations.
param_grid_rfr = dict(
    bootstrap=[True, False],
    max_depth=[10, 100],
    max_features=["sqrt", 1.0],
    min_samples_leaf=[1, 4],
    min_samples_split=[2, 10],
    n_estimators=[200, 1000],
)

# 7-fold cross-validation over the grid; n_jobs=-1 runs the fits in parallel
# on all available cores.
grid_rfr = GridSearchCV(
    estimator=forest,
    param_grid=param_grid_rfr,
    cv=7,
    n_jobs=-1,
)

In [13]:
# Run the grid search on the training data only, so the test split stays
# unseen during model selection.
grid_rfr.fit(X_train, y_train)

# Display the parameter combination the search selected as best.
grid_rfr.best_params_



{'bootstrap': True,
 'max_depth': 100,
 'max_features': 1.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 200}

In [14]:
# Take the random forest regression model selected by the grid search.
# GridSearchCV was created with its default refit=True, so best_estimator_ has
# already been refitted on the full training data with the best parameters;
# calling fit() again would only retrain the same seeded forest.
model_rfr = grid_rfr.best_estimator_
# Display the fitted estimator as the cell's output.
model_rfr

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,100
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [15]:
# Evaluate the selected random forest regression model on the held-out test split.
y_pred_rfr = model_rfr.predict(X_test)

r2_rfr = r2_score(y_test, y_pred_rfr)
rmse_rfr = root_mean_squared_error(y_test, y_pred_rfr)

print("R^2 of the Random Forest Regression model:", r2_rfr)
print("RMSE of the Random Forest Regression model:", rmse_rfr)

R^2 of the Random Forest Regression model: 0.9531546685622041
RMSE of the Random Forest Regression model: 1.809515952313254


Even with optimized parameters, the Random Forest regression model (R^2 of 0.953, RMSE of 1.81 mm/hr) does not perform as well as the linear and polynomial regression models, but it still performs very well.