In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

# Global plot styling: seaborn theme first, then matplotlib font-size overrides.

sns.set_style("dark")
sns.set_palette("deep")
plt.rcParams.update({
    "axes.labelsize": 15,   # axis label font size
    "axes.titlesize": 20,   # axes title font size
})
myblue = '#0b5394'  # hex colour kept in a name for reuse in plots

%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, VotingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


import pickle

# Display the output of every expression in a cell, not just the last one
# (the options are 'all', 'none', 'last' and 'last_expr').
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import time
import os

In [5]:
# Load the feature table (version 1) from the data directory.
data = pd.read_csv(filepath_or_buffer='../data/data_with_features_v1.csv')

In [6]:
# Show the column names of the loaded table.
data.columns

Index(['id', 'breath_id', 'R', 'C', 'time_step', 'u_in', 'u_out', 'pressure',
       'u_in_lag_1', 'u_in_lag_2', 'step_duration', 'mean_u_in',
       'u_in_rolling_mean', 'u_in_rolling_sum', 'u_in_cum_sum',
       'u_out_rolling_mean', 'u_out_rolling_sum', 'u_out_cum_sum',
       'u_in_x_time_step', 'R_x_C'],
      dtype='object')

In [7]:
# Train/test split done on breath ids rather than on rows, so all rows of a
# given breath end up on the same side of the split (40% of breaths held out).
unique_breaths = data['breath_id'].unique()
train_breath_ids, test_breath_ids = train_test_split(
    unique_breaths,
    test_size=0.4,
    random_state=0,
)

in_train = data['breath_id'].isin(train_breath_ids)
in_test = data['breath_id'].isin(test_breath_ids)
train = data[in_train]
test = data[in_test]

# Sanity check: every row belongs to exactly one of the two splits.
assert len(data) == len(train) + len(test)

# Features are everything except the identifiers and the target column.
non_feature_cols = ['id', 'breath_id', 'pressure']
x_train = train.drop(columns=non_feature_cols)
y_train = train['pressure']
x_test = test.drop(columns=non_feature_cols)
y_test = test['pressure']

In [17]:
#set up a random grid search for random forest regressor

RF = RandomForestRegressor(random_state=0, n_jobs=3)

# Search space from which RandomizedSearchCV draws its candidates.
params = {
        'n_estimators': range(100,200,10),
        'max_depth': range(25,41),
        'min_samples_split': range(5,15),
        # 1.0 = consider all features at each split. This is what "auto" meant
        # for regressors; the "auto" alias was deprecated in scikit-learn 1.1
        # and removed in 1.3, so the explicit value keeps the cell runnable.
        'max_features': [1.0]
    }

rs = RandomizedSearchCV(
        estimator = RF,
        param_distributions = params,
        cv = 3,
        verbose = 5,
        n_iter= 15,
        # Fixed seed so the same 15 candidates are sampled on every run.
        random_state = 0
    )

In [None]:
%%time 

# Run the randomized search on the training split.
# Slow: the logged fits below take roughly 30-50 minutes each.
rs.fit(x_train, y_train)

Fitting 3 folds for each of 15 candidates, totalling 45 fits
[CV 1/3] END max_depth=33, max_features=auto, min_samples_split=6, n_estimators=150;, score=0.987 total time=44.8min
[CV 2/3] END max_depth=33, max_features=auto, min_samples_split=6, n_estimators=150;, score=0.987 total time=46.5min
[CV 3/3] END max_depth=33, max_features=auto, min_samples_split=6, n_estimators=150;, score=0.987 total time=46.2min
[CV 1/3] END max_depth=34, max_features=auto, min_samples_split=7, n_estimators=170;, score=0.987 total time=52.0min
[CV 2/3] END max_depth=34, max_features=auto, min_samples_split=7, n_estimators=170;, score=0.987 total time=52.0min
[CV 3/3] END max_depth=34, max_features=auto, min_samples_split=7, n_estimators=170;, score=0.987 total time=52.3min
[CV 1/3] END max_depth=36, max_features=auto, min_samples_split=8, n_estimators=100;, score=0.986 total time=30.8min
[CV 2/3] END max_depth=36, max_features=auto, min_samples_split=8, n_estimators=100;, score=0.987 total time=30.6min
[CV

In [None]:
# Best estimator found by the randomized search.
best_rf_v1 = rs.best_estimator_

#export model

# Make sure the target directory exists so the dump cannot fail after the
# long search has finished.
os.makedirs('./models', exist_ok=True)
# Context manager guarantees the file is flushed and closed.
with open('./models/best_rf_v1.p', 'wb') as model_file:
    pickle.dump(best_rf_v1, model_file)

cv_results = rs.cv_results_

#export cv results
# cv_results is a dict, so np.save stores it as a pickled object array;
# read it back with np.load('cv_results.npy', allow_pickle=True).item().
np.save('cv_results.npy', cv_results)