In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

# seaborn plot styles

sns.set_style("dark")
sns.set_palette("deep")
plt.rcParams["axes.labelsize"] = 15
plt.rcParams["axes.titlesize"] = 20
myblue = '#0b5394'

%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, VotingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


import pickle

# Display the output of every expression in a cell, not just the last one
# (the options are 'all', 'none', 'last' and 'last_expr').

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import time


In [2]:
# Load train_clean.csv from the data directory (path is relative to the notebook).
data = pd.read_csv('../data/train_clean.csv')

In [3]:
# Preview the first rows to check the loaded columns.
data.head()

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure
0,1,1,20,50,0.0,0.083334,0,5.837492
1,2,1,20,50,0.033652,18.383041,0,5.907794
2,3,1,20,50,0.067514,22.509278,0,7.876254
3,4,1,20,50,0.101542,22.808822,0,11.742872
4,5,1,20,50,0.135756,25.35585,0,12.234987


In [4]:
# Split on breath_id rather than on rows: all rows sharing a breath_id land on
# the same side of the split (25% of the breath ids are held out for testing).
unique_breaths = data['breath_id'].unique()
train_breath_ids, test_breath_ids = train_test_split(
    unique_breaths,
    test_size=0.25,
    random_state=42,
)

in_train = data['breath_id'].isin(train_breath_ids)
in_test = data['breath_id'].isin(test_breath_ids)
train = data.loc[in_train]
test = data.loc[in_test]

# Row counts of the two subsets must add up to the full dataset.
assert len(train) + len(test) == len(data)

In [5]:
# Model inputs and target, selected the same way for both subsets.
feature_columns = ['R', 'C', 'time_step', 'u_in', 'u_out']
target_column = 'pressure'

x_train, y_train = train[feature_columns], train[target_column]
x_test, y_test = test[feature_columns], test[target_column]

In [7]:
#baseline score: always predict the mean training pressure

# np.full builds the constant prediction vectors directly instead of
# materialising multi-million-element Python lists.
baseline_train_preds = np.full(y_train.shape[0], y_train.mean())
baseline_test_preds = np.full(y_test.shape[0], y_train.mean())

baseline_score = mean_absolute_error(y_test, baseline_test_preds)

print(f'baseline train mean absolute error: {mean_absolute_error(y_train, baseline_train_preds)}')
print(f'baseline mean absolute error: {baseline_score}')
# RMSE is computed as sqrt(MSE): the `squared=False` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas this form works on every version.
print(f'baseline rmse: {np.sqrt(mean_squared_error(y_test, baseline_test_preds))}')

baseline train mean absolute error: 6.218913796889238
baseline mean absolute error: 6.18118475477399
baseline rmse: 8.056713303800064


In [11]:
%%time

model_1 = RandomForestRegressor(n_jobs = -1, max_depth=10)
model_1.fit(x_train, y_train)

CPU times: user 21min 42s, sys: 2.26 s, total: 21min 45s
Wall time: 5min 58s


RandomForestRegressor(max_depth=10, n_jobs=-1)

In [12]:
# Score of model_1 on the data it was trained on (R^2 for scikit-learn regressors).
model_1.score(x_train,y_train)

0.7484180929090944

In [13]:
# Predictions of model_1 on the held-out test features.
preds_1 = model_1.predict(x_test)

In [14]:
# Test-set errors for model_1.
print(f'model_1 mean absolute error: {mean_absolute_error(y_test, preds_1)}')
# RMSE as sqrt(MSE): `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6, so take the square root explicitly.
print(f'model_1 rmse: {np.sqrt(mean_squared_error(y_test, preds_1))}')

model_1 mean absolute error: 2.0809270467123895
model_1 rmse: 4.048649367933456


In [24]:
# Feature importances of model_1, one row per input column.
model_1_feature_importances = pd.DataFrame(
    dict(
        feature=x_train.columns,
        feature_importance=model_1.feature_importances_,
    )
)
model_1_feature_importances
# Optional export (disabled):
# model_1_feature_importances.to_csv('../models/model_1_feature_importances.csv', index=False)

Unnamed: 0,feature,feature_importance
0,R,0.040535
1,C,0.043829
2,time_step,0.803235
3,u_in,0.112402
4,u_out,0.0


In [17]:
#export model_1

# pickle.dump(model_1, open('../models/model_1.p', 'wb'))

In [7]:
# Round-trip check of the log transform: log(y + 1), its inverse exp(.) - 1,
# and the untouched target. All three are displayed because
# ast_node_interactivity is set to "all".
# NOTE(review): log(y + 1) is only defined for pressure > -1 — confirm the
# data contains no values at or below -1.
np.log(y_train + 1)
np.exp(np.log(y_train + 1))-1
y_train

0          1.922421
1          1.932650
2          2.183380
3          2.544972
4          2.582864
             ...   
4522555    1.582895
4522556    1.582895
4522557    1.568351
4522558    1.625299
4522559    1.582895
Name: pressure, Length: 3391920, dtype: float64

0           5.837492
1           5.907794
2           7.876254
3          11.742872
4          12.234987
             ...    
4522555     3.869032
4522556     3.869032
4522557     3.798729
4522558     4.079938
4522559     3.869032
Name: pressure, Length: 3391920, dtype: float64

0           5.837492
1           5.907794
2           7.876254
3          11.742872
4          12.234987
             ...    
4522555     3.869032
4522556     3.869032
4522557     3.798729
4522558     4.079938
4522559     3.869032
Name: pressure, Length: 3391920, dtype: float64

In [7]:
%%time

#predicting lof(Pressure + 1)

model_2 = RandomForestRegressor(n_jobs = -1, max_depth=10, verbose=5)
model_2.fit(x_train, np.log(y_train + 1))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


building tree 1 of 100building tree 2 of 100
building tree 3 of 100
building tree 4 of 100

building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   39.9s


building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56

[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  5.3min


building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78 of 100
building tree 79 of 100
building tree 80 of 100
building tree 81 of 100
building tree 82 of 100
building tree 83 of 100
building tree 84 of 100
building tree 85 of 100
building tree 86 of 100
building tree 87 of 100
building tree 88 of 100
building tree 89 of 100
building tree 90 of 100
building tree 91 of 100
building tree 92 of 100
building tree 93 of 100
building tree 94 of 100
building tree 95 of 100
building tree 96 of 100
building tree 97 of 100
building tree 98 of 100
building tree 99 of 100
building tree 100 of 100
CPU times: user 27min 51s, sys: 4.49 s, total: 27min 56s
Wall time: 8min 52s


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  8.9min finished


RandomForestRegressor(max_depth=10, n_jobs=-1, verbose=5)

In [13]:
# model_2 was fitted on log(pressure + 1), so its score is taken on that scale
# and predictions are mapped back with exp(.) - 1 before computing errors.
print(f'training score: {model_2.score(x_train,np.log(y_train + 1))}')
train_preds_2 = np.exp(model_2.predict(x_train)) - 1
print(f'model_2 train mean absolute error: {mean_absolute_error(y_train, train_preds_2)}')

# preds_2 stays on the log scale; test_preds_2 is the back-transformed version,
# computed once and reused for both metrics.
preds_2 = model_2.predict(x_test)
test_preds_2 = np.exp(preds_2) - 1

print(f'model_2 mean absolute error: {mean_absolute_error(y_test, test_preds_2)}')
# RMSE as sqrt(MSE): `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6, so take the square root explicitly.
print(f'model_2 rmse: {np.sqrt(mean_squared_error(y_test, test_preds_2))}')

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   6 out of  10 | elapsed:    3.1s remaining:    2.1s
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    4.3s finished


training score: 0.833619155634542


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   6 out of  10 | elapsed:    3.0s remaining:    2.0s
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    4.2s finished


model_2 train mean absolute error: 2.1048977044005484


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   6 out of  10 | elapsed:    1.1s remaining:    0.7s


model_2 mean absolute error: 2.082798784435768
model_2 rmse: 4.165162927128455


[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    1.4s finished


In [8]:
#export model_2

# The context manager closes (and flushes) the file even if pickling fails;
# a bare open() passed to pickle.dump leaves the handle open.
with open('../models/model_2.p', 'wb') as model_file:
    pickle.dump(model_2, model_file)

# Feature importances of model_2, one row per input column.
model_2_feature_importances = pd.DataFrame({'feature': x_train.columns, 'feature_importance': model_2.feature_importances_})
model_2_feature_importances

model_2_feature_importances.to_csv('../models/model_2_feature_importances.csv', index=False)

Unnamed: 0,feature,feature_importance
0,R,0.028744
1,C,0.030677
2,time_step,0.852383
3,u_in,0.088172
4,u_out,2.4e-05


In [9]:
# Role-swapped run: fit a random forest on the test subset
# (it is evaluated against the train subset in the following cell).
model_3 = RandomForestRegressor(
    max_depth=10,
    n_jobs=-1,
    random_state=0,
)
model_3.fit(x_test, y_test)

RandomForestRegressor(max_depth=10, n_jobs=-1, random_state=0)

In [12]:
# model_3 was fitted on the test subset, so "training" numbers use x_test/y_test
# and the held-out numbers use x_train/y_train.
print(f'training score: {model_3.score(x_test,y_test)}')
print(f'model_3 train mean absolute error: {mean_absolute_error(y_test, model_3.predict(x_test))}')

preds_3 = model_3.predict(x_train)

print(f'model_3 mean absolute error: {mean_absolute_error(y_train, preds_3)}')
# RMSE as sqrt(MSE): `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6, so take the square root explicitly.
print(f'model_3 rmse: {np.sqrt(mean_squared_error(y_train, preds_3))}')


training score: 0.7526649001575995
model_3 train mean absolute error: 2.053929094552179
model_3 mean absolute error: 2.0897300853619782
model_3 rmse: 4.0681852395049605


In [15]:
%%time 

# Gradient Boosting

model_4 = GradientBoostingRegressor(random_state=0, n_jobs=-1)
model_4.fit(x_train, y_train)

CPU times: user 8min 19s, sys: 1.82 s, total: 8min 21s
Wall time: 8min 21s


GradientBoostingRegressor(random_state=0)

In [16]:
#gradient boosting performance

print(f'training score: {model_4.score(x_train,y_train)}')
print(f'model_4 train mean absolute error: {mean_absolute_error(y_train, model_4.predict(x_train))}')

preds_4 = model_4.predict(x_test)

print(f'model_4 mean absolute error: {mean_absolute_error(y_test, preds_4)}')
# RMSE as sqrt(MSE): `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6, so take the square root explicitly.
print(f'model_4 rmse: {np.sqrt(mean_squared_error(y_test, preds_4))}')

training score: 0.7051985106615015
model_4 train mean absolute error: 2.427442886767309
model_4 mean absolute error: 2.400265437451546
model_4 rmse: 4.36826782369354


In [18]:
# AdaBoost

# AdaBoostRegressor does not accept an n_jobs argument; passing it raises
# TypeError, so only random_state is set here.
model_5 = AdaBoostRegressor(random_state=0)
model_5.fit(x_train, y_train)

AdaBoostRegressor(random_state=0)

In [37]:
#AdaBoost performance

print(f'training score: {model_5.score(x_train,y_train)}')
print(f'model_5 train mean absolute error: {mean_absolute_error(y_train, model_5.predict(x_train))}')

preds_5 = model_5.predict(x_test)

print(f'model_5 mean absolute error: {mean_absolute_error(y_test, preds_5)}')
# RMSE as sqrt(MSE): `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6, so take the square root explicitly.
print(f'model_5 rmse: {np.sqrt(mean_squared_error(y_test, preds_5))}')


training score: 0.5611622404917471
model_5 train mean absolute error: 3.2440964056741284
model_5 mean absolute error: 3.2212058315237218
model_5 rmse: 5.352084796453789
