In [103]:
# get modules

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
# Seed NumPy's global random number generator with a fixed value.
np.random.seed(282828)

In [None]:
# Load the radar parameter table, swap the unit-annotated CSV headers for
# short names, and convert reflectivity from dBZ to linear units.
df = (
    pd.read_csv("../ATMS-523-MODULE-5/homework/radar_parameters.csv", index_col=0)
    .rename(columns={
        'Zh (dBZ)': 'Zh',
        'Zdr (dB)': 'Zdr',
        'Ldr (dB)': 'Ldr',
        'Kdp (deg km-1)': 'Kdp',
        'Ah (dBZ/km)': 'Ah',
        'Adr (dB/km)': 'Adr',
        'R (mm/hr)': 'R',
    })
    .assign(Zh=lambda frame: 10 ** (frame['Zh'] / 10))
)

# Only Zh is used as a predictor; the remaining renamed columns
# (Zdr, Ldr, Kdp, Ah, Adr) are loaded but left out of the feature set.
features = ['Zh']
# R is the regression target (kept as a one-element list of column names).
target = ['R']

In [83]:
# Display the loaded table as the cell output.
df

Unnamed: 0,Zh,Zdr,Ldr,Kdp,Ah,Adr,R
0,206.294563,0.418637,-41.757733,0.005395,0.000290,0.000012,2.393520
1,187.808651,0.322850,-43.772069,0.005194,0.000360,0.000012,3.502699
2,486.387732,0.330948,-43.577399,0.013385,0.000903,0.000030,8.627561
3,714.588688,0.399480,-42.139731,0.018872,0.001036,0.000043,8.424447
4,1122.051192,0.543758,-39.763087,0.027438,0.001157,0.000064,8.189291
...,...,...,...,...,...,...,...
18964,1417.750266,0.579955,-39.244229,0.034048,0.001417,0.000080,10.648020
18965,998.466291,0.567935,-39.399188,0.024134,0.001032,0.000057,7.981875
18966,1474.318332,0.655681,-38.375696,0.033971,0.001165,0.000081,6.822691
18967,1986.138902,0.768586,-37.166218,0.043117,0.001285,0.000105,6.801169


### 1. Split the data into a 70-30 split for training and testing data.

In [84]:
# Hold out 30% of the rows for testing. The split is pinned with an explicit
# random_state so it does not depend on how much of NumPy's global random
# stream earlier (or re-run) cells have already consumed.
X = df[features].values
y = df[target].values  # 2-D, because `target` is a list of column names
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=282828
)
# Sanity check: fraction of rows that landed in the training set.
print(len(X_train)/len(df))

0.6999841847224418


### 2. Using the split created in (1), train a multiple linear regression model using the training dataset, and validate it using the testing dataset.  Compare the $R^2$ and root mean square errors of the model on the training and testing sets to a baseline prediction of rain rate using the formula $Z = 200 R^{1.6}$.

In [85]:
# Fit a least-squares linear model on the training split, then predict the
# rain rate for the held-out rows.
model = LinearRegression(fit_intercept=True).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Baseline: solve Z = 200 R^1.6 for R, taking the first feature column as Z.
R_baseline = np.power(X_test[:, 0] / 200, 1 / 1.6)

In [98]:
# Score the linear model and the Z-R baseline on the held-out split.
r2_linear = r2_score(y_true=y_test, y_pred=y_pred)
r2_baseline = r2_score(y_true=y_test, y_pred=R_baseline)
rmse_linear = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
rmse_baseline = np.sqrt(mean_squared_error(y_true=y_test, y_pred=R_baseline))

# Report R^2 first, then RMSE, model before baseline in each pair.
print("The R2 score for the model is", r2_linear)
print("The R2 score for the baseline is", r2_baseline)
print("The root mean square error of the model is", rmse_linear)
print("The root mean square error of the baseline is", rmse_baseline)

The R2 score for the model is 0.3461045729867669
The R2 score for the baseline is 0.3360241963942211
The root mean square error of the model is 7.3155895546578025
The root mean square error of the baseline is 7.371762038500547


### 3. Repeat 1 using a grid search over polynomial orders 0-9 with 7-fold cross-validation.  For the best polynomial model in terms of $R^2$, does it outperform the baseline and the linear regression model in terms of $R^2$ and root mean square error?

In [99]:
def PolynomialRegression(degree=2, **kwargs):
    """Build a pipeline that expands the inputs polynomially, then fits a linear model.

    Parameters
    ----------
    degree : int, default 2
        Passed to ``PolynomialFeatures`` as its first argument.
    **kwargs
        Forwarded unchanged to ``LinearRegression``.

    Returns
    -------
    The pipeline produced by ``make_pipeline`` from the two steps, in that order.
    """
    expansion = PolynomialFeatures(degree)
    regressor = LinearRegression(**kwargs)
    return make_pipeline(expansion, regressor)

# Search polynomial orders 0-9 (as the task specifies), each with and without
# an intercept, using 7-fold cross-validation on the training split.
param_grid = {'polynomialfeatures__degree': np.arange(10),
              'linearregression__fit_intercept': [True, False]}

# Candidates are ranked by R^2, stated explicitly to match the task wording
# and the random-forest search.
grid = GridSearchCV(PolynomialRegression(), param_grid, cv=7, scoring='r2')
grid.fit(X_train, y_train)
# Show the winning parameter combination as the cell output.
grid.best_params_

{'linearregression__fit_intercept': True,
 'polynomialfeatures__degree': np.int64(3)}

In [104]:
# Take the best pipeline found by the polynomial grid search and predict the
# held-out rain rates with it.
model_poly = grid.best_estimator_
y_pred_poly = model_poly.predict(X_test)

# Score all three predictors on the same held-out split.
r2_linear = r2_score(y_true=y_test, y_pred=y_pred)
r2_baseline = r2_score(y_true=y_test, y_pred=R_baseline)
r2_poly = r2_score(y_true=y_test, y_pred=y_pred_poly)
rmse_linear = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
rmse_baseline = np.sqrt(mean_squared_error(y_true=y_test, y_pred=R_baseline))
rmse_poly = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_poly))

# Report R^2 values first, then RMSE values, in a fixed model order.
print("The R2 score for the linear model is", r2_linear)
print("The R2 score for the baseline is", r2_baseline)
print("The R2 score for the polynomial model is", r2_poly)
print("The root mean square error of the linear model is", rmse_linear)
print("The root mean square error of the baseline is", rmse_baseline)
print("The root mean square error of the polynomial model is", rmse_poly)

The R2 score for the linear model is 0.3461045729867669
The R2 score for the baseline is 0.3360241963942211
The R2 score for the polynomial model is 0.3982811987853676
The root mean square error of the linear model is 7.3155895546578025
The root mean square error of the baseline is 7.371762038500547
The root mean square error of the polynomial model is 7.017654323454563


### 4. Repeat 1 with a Random Forest Regressor, and perform a grid_search on the following parameters:

   ```python
param_grid = {
 "bootstrap": [True, False],
 "max_depth": [10, 100],
 "max_features": ["sqrt", 1.0],  
 "min_samples_leaf": [1, 4],
 "min_samples_split": [2, 10],
 "n_estimators": [200, 1000]}
   ```
  Can you beat the baseline, or the linear regression, or best polynomial model with the best optimized Random Forest Regressor in terms of $R^2$ and root mean square error?

In [None]:
def RandomForestPipeline(**kwargs):
    """Build a pipeline that standardizes the inputs, then fits a random forest.

    Parameters
    ----------
    **kwargs
        Forwarded to ``RandomForestRegressor`` alongside a fixed
        ``random_state=282828``.

    Returns
    -------
    The pipeline produced by ``make_pipeline`` from the scaler and the forest.
    """
    scaler = StandardScaler()
    forest = RandomForestRegressor(random_state=282828, **kwargs)
    return make_pipeline(scaler, forest)

# Hyperparameter grid for the forest. Each key carries the pipeline step
# prefix so the search addresses the RandomForestRegressor step.
param_grid_rf = {
    "randomforestregressor__bootstrap": [True, False],
    "randomforestregressor__max_depth": [10, 100],
    "randomforestregressor__max_features": ["sqrt", 1.0],
    "randomforestregressor__min_samples_leaf": [1, 4],
    "randomforestregressor__min_samples_split": [2, 10],
    "randomforestregressor__n_estimators": [200, 1000],
}

# Exhaustive 7-fold search over the grid, ranked by R^2.
grid_search_rf = GridSearchCV(
    estimator=RandomForestPipeline(),
    param_grid=param_grid_rf,
    scoring='r2',
    cv=7,
    n_jobs=-1,
    verbose=1,
)

# Fit on the training split with the target flattened to 1-D, keep the best
# pipeline, and predict the held-out rain rates with it.
model_rf = grid_search_rf.fit(X_train, np.ravel(y_train)).best_estimator_
y_pred_rf = model_rf.predict(X_test)
# Score all four predictors on the same held-out split.
r2_linear = r2_score(y_true=y_test, y_pred=y_pred)
r2_baseline = r2_score(y_true=y_test, y_pred=R_baseline)
r2_poly = r2_score(y_true=y_test, y_pred=y_pred_poly)
r2_forest = r2_score(y_true=y_test, y_pred=y_pred_rf)
rmse_linear = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
rmse_baseline = np.sqrt(mean_squared_error(y_true=y_test, y_pred=R_baseline))
rmse_poly = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_poly))
rmse_forest = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_rf))

# Report R^2 values first, then RMSE values, in a fixed model order.
print("The R2 score for the linear model is", r2_linear)
print("The R2 score for the baseline is", r2_baseline)
print("The R2 score for the polynomial model is", r2_poly)
print("The R2 score for the RF model is", r2_forest)
print("The root mean square error of the linear model is", rmse_linear)
print("The root mean square error of the baseline is", rmse_baseline)
print("The root mean square error of the polynomial model is", rmse_poly)
print("The root mean square error of the RF model is", rmse_forest)

Fitting 7 folds for each of 64 candidates, totalling 448 fits
The R2 score for the linear model is 0.3461045729867669
The R2 score for the baseline is 0.3360241963942211
The R2 score for the polynomial model is 0.3982811987853676
The R2 score for the RF model is 0.4766047617166058
The root mean square error of the linear model is 7.3155895546578025
The root mean square error of the baseline is 7.371762038500547
The root mean square error of the polynomial model is 7.017654323454563
The root mean square error of the RF model is 6.545006236042266
