In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
# Load in data
df = pd.read_csv('radar_parameters.csv')
df.drop(columns='Unnamed: 0', inplace=True)
df

Unnamed: 0,Zh (dBZ),Zdr (dB),Ldr (dB),Kdp (deg km-1),Ah (dBZ/km),Adr (dB/km),R (mm/hr)
0,23.144878,0.418637,-41.757733,0.005395,0.000290,0.000012,2.393520
1,22.737156,0.322850,-43.772069,0.005194,0.000360,0.000012,3.502699
2,26.869826,0.330948,-43.577399,0.013385,0.000903,0.000030,8.627561
3,28.540561,0.399480,-42.139731,0.018872,0.001036,0.000043,8.424447
4,30.500127,0.543758,-39.763087,0.027438,0.001157,0.000064,8.189291
...,...,...,...,...,...,...,...
18964,31.515997,0.579955,-39.244229,0.034048,0.001417,0.000080,10.648020
18965,29.993334,0.567935,-39.399188,0.024134,0.001032,0.000057,7.981875
18966,31.685913,0.655681,-38.375696,0.033971,0.001165,0.000081,6.822691
18967,32.980096,0.768586,-37.166218,0.043117,0.001285,0.000105,6.801169


# Part 1: Split the data intro training and test sets


In [3]:
target_col = 'R (mm/hr)' # Target column (rain rate)
num_df = df.copy()
X = num_df.drop(columns=[target_col]).copy() # Feature params
y = num_df[target_col].copy() # Target params
feature_names = X.columns.tolist()

# Train/Test split (70/30 split)
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.3, 
                                                    random_state=42)

print('Train size:', X_train.shape, 'Test size:', X_test.shape)

Train size: (13278, 6) Test size: (5691, 6)


# Part 2

In [4]:
# Run LinReg on training data
model = LinearRegression()
model.fit(X_train, y_train)

# Validate using test data
pred_rain_rate = model.predict(X_test)
r2 = r2_score(y_test.values, pred_rain_rate)
mse = mean_squared_error(y_test.values, pred_rain_rate)
rmse = np.sqrt(mse)

# Performance metrics
print(f'R-squared: {r2}\nRMSE: {rmse}')


R-squared: 0.9890992951689396
RMSE: 0.9358124742086974


In [5]:
# Baseline prediction using Z-R relationship
dbz = df['Zh (dBZ)']
Z = 10**(dbz/10) 
R = (Z/200)**(1/1.6) # Predicted rain rate
actual_R = df['R (mm/hr)']

r2 = r2_score(actual_R.values, R.values)
mse = mean_squared_error(actual_R.values, R.values)
rmse = np.sqrt(mse)

print(f'R-squared: {r2}\nRMSE: {rmse}')

R-squared: 0.3023229070437503
RMSE: 7.157590840042378


The LinReg model has a better performance than the Z-R relationship formula judging by the lower RMSE and higher R-squared.

# Part 3: Grid Search and Polynomial Fit


In [6]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

def PolynomialRegression(degree=2, **kwargs):
    """
    Polynomial regression function to return polynomial fit
    based on degree of polynomial inputted. 
    """
    return make_pipeline(PolynomialFeatures(degree),
                         LinearRegression(**kwargs))

In [7]:

param_grid = {'polynomialfeatures__degree': np.arange(9), # Search over orders 0-9
              'linearregression__fit_intercept': [True, False]} # Flag telling whether we need to fit the intercept


# Grid search cross val over 7 folds (cv=7)
grid = GridSearchCV(PolynomialRegression(), param_grid, cv=7)

# Fit training data to grid search
grid.fit(X_train, y_train)

# Show best
grid.best_params_

{'linearregression__fit_intercept': False, 'polynomialfeatures__degree': 8}

In [8]:
# Get best model based on grid search
model = grid.best_estimator_

model.fit(X_train, y_train)

# Validate using test data
pred_rain_rate = model.predict(X_test)
r2 = r2_score(y_test.values, pred_rain_rate)
mse = mean_squared_error(y_test.values, pred_rain_rate)
rmse = np.sqrt(mse)

# Performance metrics
print(f'R-squared: {r2}\nRMSE: {rmse}')

R-squared: 0.9999992190475068
RMSE: 0.007920887550775944


This polynomial model does appear to outperform both the baseline and LinReg models shown earlier.

# Part 4: Random Forest

In [7]:

# Set dict with parameters to do grid search on
rf_param_grid = param_grid = {
    "bootstrap": [True, False],
    "max_depth": [10, 100],
    "max_features": ["sqrt", 1.0],  
    "min_samples_leaf": [1, 4],
    "min_samples_split": [2, 10],
    "n_estimators": [200, 1000]}

# Do grid search on RF regressor
rf_grid = GridSearchCV(RandomForestRegressor(), 
                       rf_param_grid, 
                       verbose=1, 
                       cv=2,
                       n_jobs=-1)

print(rf_grid)

GridSearchCV(cv=2, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True, False], 'max_depth': [10, 100],
                         'max_features': ['sqrt', 1.0],
                         'min_samples_leaf': [1, 4],
                         'min_samples_split': [2, 10],
                         'n_estimators': [200, 1000]},
             verbose=1)


In [None]:
# Fit training data to grid search
rf_grid.fit(X_train, y_train)
print('done')


Fitting 2 folds for each of 64 candidates, totalling 128 fits


In [None]:
# Show best
rf_grid.best_params_

{'bootstrap': False,
 'max_depth': 100,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 1000}

In [None]:
# Get best model based on grid search with RF
model = rf_grid.best_estimator_

model.fit(X_train, y_train)

# Validate using test data
pred_rain_rate = model.predict(X_test)
r2 = r2_score(y_test.values, pred_rain_rate)
mse = mean_squared_error(y_test.values, pred_rain_rate)
rmse = np.sqrt(mse)

# Performance metrics
print(f'R-squared: {r2}\nRMSE: {rmse}')

R-squared: 0.986030434969219
RMSE: 1.0593829569040536
