In [31]:
import sys
import numpy as np
import seaborn as sb

import pandas as pd
import datetime
import tensorflow as tf
import tensorflow.keras as keras
import sklearn
import pydot
import matplotlib.pyplot as plt
import matplotlib.patches as patches
# %matplotlib inline


In [2]:
# Load the radar parameter table and discard the leftover 'Unnamed: 0' column.
df = pd.read_csv('homework/radar_parameters.csv').drop(columns=['Unnamed: 0'])

In [3]:
# Remove every row that contains a missing value.
df.dropna(inplace=True)

# `Zh` is the radar reflectivity factor in dBZ, defined by dBZ = 10*log10(Z).
# Invert that definition to recover the linear reflectivity Z, which is the
# quantity used by the reflectivity / rain-rate relation Z = 200 R^{1.6}.
zh_dbz = df['Zh (dBZ)']
df['Z (mm^6/m^3)'] = 10 ** (zh_dbz / 10)

# Show the cleaned table including the new column.
df

Unnamed: 0,Zh (dBZ),Zdr (dB),Ldr (dB),Kdp (deg km-1),Ah (dBZ/km),Adr (dB/km),R (mm/hr),Z (mm^6/m^3)
0,23.144878,0.418637,-41.757733,0.005395,0.000290,0.000012,2.393520,206.294563
1,22.737156,0.322850,-43.772069,0.005194,0.000360,0.000012,3.502699,187.808651
2,26.869826,0.330948,-43.577399,0.013385,0.000903,0.000030,8.627561,486.387732
3,28.540561,0.399480,-42.139731,0.018872,0.001036,0.000043,8.424447,714.588688
4,30.500127,0.543758,-39.763087,0.027438,0.001157,0.000064,8.189291,1122.051192
...,...,...,...,...,...,...,...,...
18964,31.515997,0.579955,-39.244229,0.034048,0.001417,0.000080,10.648020,1417.750266
18965,29.993334,0.567935,-39.399188,0.024134,0.001032,0.000057,7.981875,998.466291
18966,31.685913,0.655681,-38.375696,0.033971,0.001165,0.000081,6.822691,1474.318332
18967,32.980096,0.768586,-37.166218,0.043117,0.001285,0.000105,6.801169,1986.138902


In [4]:
# Target: the rain-rate column. Features: every remaining column.
target_column = 'R (mm/hr)'
y = df[target_column]
x = df.drop(columns=[target_column])

# Print the shape of the feature table, then the shape of the target.
for part in (x, y):
    print(part.shape)

(18969, 7)
(18969,)


In [5]:
from sklearn.model_selection import train_test_split

# Problem 1: 70-30 split into training and testing data (test_size = 0.3),
# with a fixed seed so the split is repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y,
    test_size=0.3,
    random_state=1,
)


In [6]:
# Standardize the features because the columns are different variables on
# different scales. The target does not need to be scaled.
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then reuse the fitted scaler
# for every other table.
scaler = StandardScaler().fit(xtrain)

xtrain_scaled = scaler.transform(xtrain)
xtest_scaled = scaler.transform(xtest)
x_scaled = scaler.transform(x)  # full data set, scaled with the training fit

for scaled in (xtrain_scaled, xtest_scaled, x_scaled):
    print(scaled.shape)



(13278, 7)
(5691, 7)
(18969, 7)


2. Using the split created in (1), train a multiple linear regression model using the training dataset, and validate it using the testing dataset.  Compare the $R^2$ and root mean square errors of the model on the training and testing sets to a baseline prediction of rain rate using the formula $Z = 200 R^{1.6}$.

   - For supervised learning, often we predict labels for unknown data using the ``predict()`` method. ---- this is the one we should use
   - For unsupervised learning, we often transform or infer properties of the data using the ``transform()`` or ``predict()`` method.


In [7]:
from sklearn.linear_model import LinearRegression

# Multiple linear regression with an intercept, fitted on the scaled
# training split.
model = LinearRegression(fit_intercept=True).fit(xtrain_scaled, ytrain)

# Fitted intercept, then the coefficient of each scaled feature column.
for fitted in (model.intercept_, model.coef_):
    print(fitted)

7.855626808368438
[  0.23280702   0.86441256  -1.38437146 -24.34811867  33.16927625
 -12.70831978  10.2548108 ]


In [8]:
# Model inferences on the held-out (scaled) test features.
y_pred = model.predict(X=xtest_scaled)

In [9]:
# Evaluate the linear regression on the test data with RMSE and R^2.
# mean_squared_error is imported here, but the RMSE below is computed with
# numpy directly.
from sklearn.metrics import mean_squared_error, r2_score

# RMSE: square the residuals (actual minus predicted), average, take the root.
test_residuals = ytest - y_pred
rmse_test = np.sqrt(np.mean(test_residuals ** 2))

r2_test = r2_score(ytest, y_pred)

for metric in (rmse_test, r2_test):
    print(metric)

0.6948409252416231
0.9937113650587397


In [10]:
# Baseline: estimate the rain rate from reflectivity alone by inverting
# Z = 200 R^{1.6}, evaluated here on the full (unsplit) data set.
z = df['Z (mm^6/m^3)']  # actual Z values
y = df['R (mm/hr)']     # actual R values, equivalent to the target data

# Rain rate predicted by the baseline relation.
r = (z / 200) ** (1/1.6)

baseline_residuals = y - r
rmse_z = np.sqrt(np.mean(baseline_residuals ** 2))
r2_z = r2_score(y, r)

# RMSE expressed as a percentage of the mean observed rain rate.
rmse_rate = rmse_z/(y.mean()) *100

print(rmse_z)
print(r2_z)
print(rmse_rate)  # looks abnormally high

7.157590840042378
0.3023229070437503
91.11495582285232


In [11]:
# Split reflectivity and rain rate for the baseline, using the same test
# fraction and seed as the feature split made for the regression models.
ztrain, ztest, ytrain_z, ytest_z = train_test_split(
    z, y,
    test_size=0.3,
    random_state=1,
)

In [12]:
# Baseline rain-rate predictions for each split, obtained by plugging the
# reflectivity into the inverted relation R = (Z / 200)^(1/1.6).
y_pred_train_z, y_pred_test_z = (
    (reflectivity / 200) ** (1/1.6) for reflectivity in (ztrain, ztest)
)

In [13]:
# RMSE of the baseline relation on each split (actual minus predicted).
test_residuals_b = ytest_z - y_pred_test_z
train_residuals_b = ytrain_z - y_pred_train_z
rmse_test_b = np.sqrt(np.mean(test_residuals_b ** 2))
rmse_train_b = np.sqrt(np.mean(train_residuals_b ** 2))

# R^2: how much of the variance of the target the baseline predictions explain.
r2_test_b = r2_score(ytest_z, y_pred_test_z)
r2_train_b = r2_score(ytrain_z, y_pred_train_z)

In [132]:
# Linear regression metrics on the test split.
print('test and train linear regression \n')
for label, value in (
    ('rmse test', rmse_test),
    ('r^2 test', r2_test),
    ('rmse test percentag of target mean', (rmse_test/ytest.mean())*100),
):
    print(label, value)

# Baseline relation metrics on both splits.
print('baseline \n')
for label, value in (
    ('rmse test baseline', rmse_test_b),
    ('rmse train baseline', rmse_train_b),
    ('r^2 test baseline', r2_test_b),
    ('r^2train baseline', r2_train_b),
    ('rmse baseline test percentage of target mean', (rmse_test_b/ytest_z.mean())*100),
):
    print(label, value)
print('rmse baseline train percentage of target mean', (rmse_train_b/ytrain_z.mean())*100, '\n')

 


test and train linear regression 

rmse test 0.6948409252416231
r^2 test 0.9937113650587397
rmse test percentag of target mean 8.845384017977953
baseline 

rmse test baseline 7.118734189050042
rmse train baseline 7.174180513738772
r^2 test baseline 0.3399282314234663
r^2train baseline 0.2851359692537385
rmse baseline test percentage of target mean 90.62209109539727
rmse baseline train percentage of target mean 91.32537337563265 



The closer $R^2$ is to 1, the more of the variance in rain rate the model explains; a lower RMSE likewise indicates more accurate predictions.

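Based on this, the multiple linear regression fitted on the training split (test $R^2 \approx 0.994$, RMSE $\approx 0.69$ mm/hr) is much more accurate than the baseline $Z = 200 R^{1.6}$ relation, which uses the reflectivity Z alone to predict rainfall (test $R^2 \approx 0.34$, RMSE $\approx 7.1$ mm/hr).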


3. Repeat 1 doing a grid search over polynomial orders, using a grid search over orders 0-21, and use cross-validation of 7 folds.  For the best polynomial model in terms of $R^2$, does it outperform the baseline and the linear regression model in terms of $R^2$ and root mean square error?

In [51]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

def PolynomialRegression(degree=2, **kwargs):
    """Build a polynomial-regression pipeline.

    The pipeline expands the inputs with ``PolynomialFeatures(degree)`` and
    then fits a ``LinearRegression`` on the expanded features.

    Parameters
    ----------
    degree : int, default 2
        Polynomial degree passed to ``PolynomialFeatures``.
    **kwargs
        Forwarded unchanged to the ``LinearRegression`` constructor.

    Returns
    -------
    The pipeline produced by ``make_pipeline``.
    """
    feature_expansion = PolynomialFeatures(degree)
    regressor = LinearRegression(**kwargs)
    return make_pipeline(feature_expansion, regressor)

In [17]:
# The assignment asks for polynomial orders 0-21 with 7-fold cross-validation.
# Both were reduced here because of processing time: orders 0-10 and 3 folds.
param_grid = dict(
    polynomialfeatures__degree=np.arange(11),
    linearregression__fit_intercept=[True, False],
)

# Grid search over every combination in param_grid with 3-fold cross-validation.
grid = GridSearchCV(estimator=PolynomialRegression(), param_grid=param_grid, cv=3)

# Fit the grid search on the scaled training data.
grid.fit(xtrain_scaled, ytrain)

In [55]:
# Keep the best pipeline found by the grid search and report its parameters.
grid_model = grid.best_estimator_
print(grid.best_params_)



{'linearregression__fit_intercept': False, 'polynomialfeatures__degree': 2}


In [56]:
# Predictions of the best polynomial pipeline on the scaled test features.
y_grid_pred = grid_model.predict(X=xtest_scaled)

In [133]:
# Test-set metrics of the best polynomial pipeline.
mse_grid = mean_squared_error(ytest, y_grid_pred)
rmse_grid = np.sqrt(mse_grid)
r2_grid = r2_score(ytest, y_grid_pred)

print(rmse_grid)
print(r2_grid)
print('rmse test percentage of target mean', (rmse_grid/ytest.mean())*100 )

0.1251115773147737
0.9997961171933879
rmse test percentage of target mean 1.5926810097710982


4. Repeat 1 with a Random Forest Regressor, and perform a grid_search on the following parameters:
   
   ```python
   {'bootstrap': [True, False],  
   'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],  
   'max_features': ['auto', 'sqrt'],  
   'min_samples_leaf': [1, 2, 4],  
   'min_samples_split': [2, 5, 10],  
   'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}
   ```
  Can you beat the baseline, or the linear regression, or best polynomial model with the best optimized Random Forest Regressor in terms of $R^2$ and root mean square error?


In [90]:
# Hyper-parameter search space for the random forest (problem 4). The models
# below reuse the train/test split and the scaled feature arrays created
# earlier in the notebook.
depth_choices = list(range(10, 101, 10)) + [None]  # 10, 20, ..., 100, then None
tree_count_choices = list(range(200, 2001, 200))   # 200, 400, ..., 2000

params = {
    'bootstrap': [True, False],
    'max_depth': depth_choices,
    'max_features': [None, 'sqrt'],  # None replaces the deprecated 'auto'
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'n_estimators': tree_count_choices,
}

In [82]:
# Estimators and search utilities for the random forest section (problem 4)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import RandomizedSearchCV



In [80]:
# Base estimator for the hyper-parameter search. Only the seed is fixed here:
# the search supplies the candidate settings itself. The search-space dict
# must not be passed to the constructor, because the first positional
# argument of RandomForestRegressor is n_estimators.
rf = RandomForestRegressor(random_state=42)

In [115]:
# Randomized search over the random forest search space: 11 sampled
# combinations, 3-fold cross-validation, n_jobs=-1 for parallel fits.
# A randomized search stands in for the full grid search, which would cover
# every combination in `params`. The fixed random_state makes the sampled
# combinations repeatable between runs.
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=params, n_iter = 11, cv=3, n_jobs = -1, verbose=2, random_state=42)

rf_random.fit(xtrain_scaled, ytrain) ###don't need to standardize in RF but I kept it


Fitting 3 folds for each of 11 candidates, totalling 33 fits


[CV] END bootstrap=False, max_depth=100, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1000; total time= 1.1min
[CV] END bootstrap=False, max_depth=100, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1000; total time= 1.1min
[CV] END bootstrap=False, max_depth=100, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=1000; total time= 1.1min
[CV] END bootstrap=False, max_depth=10, max_features=None, min_samples_leaf=4, min_samples_split=2, n_estimators=400; total time= 1.2min
[CV] END bootstrap=False, max_depth=10, max_features=None, min_samples_leaf=4, min_samples_split=2, n_estimators=400; total time= 1.2min
[CV] END bootstrap=False, max_depth=10, max_features=None, min_samples_leaf=4, min_samples_split=2, n_estimators=400; total time= 1.2min
[CV] END bootstrap=True, max_depth=60, max_features=None, min_samples_leaf=2, min_samples_split=10, n_estimators=1000; total time= 2.3min
[CV] END bootstrap=True, max_de

In [123]:
# Test-set predictions made through the fitted randomized search object.
y_pred_rf = rf_random.predict(xtest_scaled)

# R^2 score and root mean square error on the test split.
r2_rf = r2_score(ytest, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(ytest, y_pred_rf))

print('R^2', r2_rf)
print('rmse', rmse_rf)

R^2 0.9888386604414193
rmse 0.9256895436695578


In [116]:
# Build a fresh random forest from the best hyper-parameters found by the
# randomized search (note: seeded with 1 here) and fit it on the training data.
best_rf_params = rf_random.best_params_
rf_model = RandomForestRegressor(random_state=1, **best_rf_params)

rf_model.fit(xtrain_scaled, ytrain)


In [119]:
# Test-set predictions of the forest refitted with the best hyper-parameters.
y_pred_rf_best = rf_model.predict(xtest_scaled)

# R^2 score and root mean square error on the test split.
r2_rf_best = r2_score(ytest, y_pred_rf_best)
rmse_rf_best = np.sqrt(mean_squared_error(ytest, y_pred_rf_best))

print('Best params R^2', r2_rf_best)
print('best params rmse', rmse_rf_best)


Best params R^2 0.9891863041175504
best params rmse 0.911159218640905


In [130]:
from sklearn.model_selection import cross_val_score

# Side-by-side test metrics: linear regression, Z-R baseline and the best
# polynomial pipeline share one layout (heading, RMSE, R^2, blank line).
for heading, rmse_label, rmse_value, r2_label, r2_value in (
    ('Linear Regression \n', 'rmse test', rmse_test, 'r^2 test', r2_test),
    ('baseline \n', 'rmse test baseline', rmse_test_b, 'r^2 test baseline', r2_test_b),
    ('Grid \n', 'rmse test', rmse_grid, 'r^2 test', r2_grid),
):
    print(heading)
    print(rmse_label, rmse_value)
    print(r2_label, r2_value, '\n')

# Random forest refitted with the best hyper-parameters; its metrics are
# recomputed here from the stored test predictions.
print('RF Best Params \n')
print('rmse', np.sqrt(mean_squared_error(ytest, y_pred_rf_best)))
print('r^2 ', r2_score(ytest, y_pred_rf_best))



Linear Regression 

rmse test 0.6948409252416231
r^2 test 0.9937113650587397 

baseline 

rmse test baseline 7.118734189050042
r^2 test baseline 0.3399282314234663 

Grid 

rmse test 0.1251115773147737
r^2 test 0.9997961171933879 

RF Best Params 

rmse 0.911159218640905
r^2  0.9891863041175504
