In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor 
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt



In [None]:
# Loading Data
radar_data = pd.read_csv('homework/radar_parameters.csv')

Part 1 - Generating the testing and training data

In [None]:
# Generating the feature dataset by dropping the target - Rainrate
X = radar_data.drop('R (mm/hr)', axis = 1)

# Generating the target dataset
Y = radar_data['R (mm/hr)']

# Using 4 to 1 train-test split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2)

print(len(x_train), len(x_test))

In [None]:
def base_rain_calc(Zh: np.array) -> np.array:
    '''
    Function to calculate baseline rainrate base only radar return strength.
    Funtion take a numpy array as an input and returns a numpy array.
    '''
    z = 10 ** (Zh/10)
    r_baseline = (z/200) ** (1/1.6)

    return r_baseline

In [None]:
# Calculating the baseline rain rate from the sample data

baseline_rainrate_test = base_rain_calc(radar_data['Zh (dBZ)'][y_test.index])

Part 2 - Generating a Mulitple Linear Regression Model

In [None]:
model = LinearRegression(fit_intercept=True)

model.fit(x_train,y_train)

# Calculating the R2 score and Root Mean Square error for the Linear Regression
lin_reg_predicted = model.predict(x_test)
lin_reg_R2 = model.score(x_test, y_test)
lin_reg_rmse = root_mean_squared_error(y_test, lin_reg_predicted)

# Calculating the R2 score and Root Mean Square error for the baseline prediction.

baseline_R2 = r2_score(y_test, baseline_rainrate_test)
baseline_rmse = root_mean_squared_error(y_test, baseline_rainrate_test)




In [None]:
print(f"R-squared value for baseline prediction is {np.round(baseline_R2, 3)}")
print(f"RMSE value for baseline prediction is {np.round(baseline_rmse, 3)}")
print(f"R-squared value for multiple linear regression is {np.round(lin_reg_R2, 3)}")
print(f"RMSE value for baseline prediction is {np.round(lin_reg_rmse, 3)}")

In [None]:
# Defining which parameters we want our grid search to search.
# We're looking at polynomials up to the ninth order.

grid_params = {'polynomialfeatures__degree': np.arange(10)}

# Creating the pipeline for a polynomial regression.

def poly_regression(degree = 2, **kwargs):
    return make_pipeline(PolynomialFeatures(degree),
                             LinearRegression(**kwargs))
      

grid_search = GridSearchCV(poly_regression(), param_grid = grid_params, cv = 7)

In [None]:
# Takes about 43 minutes

grid_search.fit(X, Y)


In [None]:
grid_search.best_params_

Best parameters as determined by the CV Grid Search is having 2nd order polynomials

In [None]:
# Creating a 2nd order Polynomial regression and 
# generating predicted values off the x_test set.

poly_2model = poly_regression(degree = 2)
poly_2model.fit(x_train, y_train)
poly_2model_pred = poly_2model.predict(x_test)

In [None]:
# Calculating the R2 score and Root Mean Square error for the 2nd order Polynomial Regression

poly_2model_R2 = r2_score(y_test, poly_2model_pred)
poly_2model_rmse = root_mean_squared_error(y_test, poly_2model_pred)

In [None]:
print(f"R-squared value for multiple polynomial regression of the 2nd order is {np.round(poly_2model_R2, 3)}")
print(f"RMSE value for multiple polynomial regression of the 2nd order is {np.round(poly_2model_rmse, 3)}")

In [89]:
param_grid = {
 "bootstrap": [True, False],
 "max_depth": [10, 100],
 "max_features": ["sqrt", 1.0],  
 "min_samples_leaf": [1, 4],
 "min_samples_split": [2, 10],
 "n_estimators": [200, 1000]}

random_forest_grid = GridSearchCV(RandomForestRegressor(), param_grid, cv=3)

In [None]:
# Takes roughly 2-4 hours

random_forest_grid.fit(X, Y)

0,1,2
,estimator,RandomForestRegressor()
,param_grid,"{'bootstrap': [True, False], 'max_depth': [10, 100], 'max_features': ['sqrt', 1.0], 'min_samples_leaf': [1, 4], ...}"
,scoring,
,n_jobs,
,refit,True
,cv,3
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,1000
,criterion,'squared_error'
,max_depth,100
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


Best Parameters as determined by the Random Forest CV grid search are :

{'bootstrap': True,
 'max_depth': 100,
 'max_features': 1.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 1000}

In [None]:
# Training a random forest model with the best parameters
# as determined by the Random Forest CV grid search.
rf_model = RandomForestRegressor(bootstrap = True, max_depth = 100, max_features = 1, min_samples_leaf = 1, min_samples_split = 2, n_estimators = 1000)
rf_model.fit(x_train, y_train)

0,1,2
,n_estimators,1000
,criterion,'squared_error'
,max_depth,100
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
# Generating predicted values from the test set.

rf_model_pred = rf_model.predict(x_test)

In [None]:
# Calculating the R2 score and Root Mean Square error for the Random Forest Regression

rf_model_R2 = r2_score(y_test, rf_model_pred)
rf_model_rmse = root_mean_squared_error(y_test, rf_model_pred)

print(f"R-squared value for the Random Forest best parameters model is {np.round(rf_model_R2, 3)}")
print(f"RMSE value for the Random Forest best parameters model is {np.round(rf_model_rmse, 3)}")

R-squared value for the Random Forest best parameters model is 0.979
RMSE value for the Random Forest best parameters model is 1.208


For my solution, the best results were with the 2nd order polynomial regression. Notably, for the random forest I was 
unable to test across the full parameter space that was initially provided due to lack of compute strength. I also had to
reduce my cross-validation from 7 ways to 3, again to reduce computation time.