In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from classes.model_factory import ModelFactory

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Lasso


## Treinamento regressão

* A principal métrica que irei acompanhar é o Root Mean Squared Error (RMSE), que indica, em número de ciclos, a margem de erro esperada nas previsões do modelo

In [2]:
# Load the pickled test frame from the local data directory.
# NOTE(review): pickle.load can execute arbitrary code; only use it on files
# produced by this project.
with open("data/df_test.pkl", "rb") as test_file:
    df_test = pickle.load(test_file)

In [3]:
# Load the pickled training frame from the local data directory.
# NOTE(review): pickle.load can execute arbitrary code; only use it on files
# produced by this project.
with open("data/df_train.pkl", "rb") as train_file:
    df_train = pickle.load(train_file)

In [4]:
# Preview the first rows of the training frame to check the available columns
# (identifiers, engine settings, sensor readings, derived stats, rul, failure_label).
df_train.head()

Unnamed: 0,asset_id,runtime,engine1,engine2,engine3,t1,t2,t3,t4,t5,...,sd14,sd15,sd16,sd17,sd18,sd19,sd20,sd21,rul,failure_label
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,191,0
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,5.041671,0.008697,0.0,0.0,0.0,0.0,0.042426,0.003253,190,0
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,3.71745,0.00764,0.0,1.154701,0.0,0.0,0.055076,0.044573,189,0
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,3.050906,0.028117,0.0,1.0,0.0,0.0,0.076322,0.037977,188,0
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2.651326,0.025953,0.0,1.095445,0.0,0.0,0.073621,0.033498,187,0


#### Usar apenas as features originais deu um resultado melhor

In [5]:
# Split the column names into model inputs and the regression target.
# Inputs are the three engine settings followed by sensor columns t1..t21.
features = ['engine{}'.format(idx) for idx in range(1, 4)] + \
    ['t{}'.format(idx) for idx in range(1, 22)]

# Column holding the value to predict.
target = 'rul'

In [6]:
# Feature matrix and target vector taken from the training frame.
X, y = df_train[features], df_train[target]

In [7]:
# Train / hold-out split: 40% of the rows are held out, with a fixed seed.
# NOTE(review): the split is row-level, and the preview of df_train shows many
# rows per asset_id, so cycles from one asset can end up on both sides of the
# split — consider a group-aware split if that matters for the evaluation.
split_parts = train_test_split(X, y, test_size=0.4, random_state=42)
X_train, X_test, y_train, y_test = split_parts

### Regressão Linear

In [8]:
# Ordinary least-squares baseline.
# The grid used to search over `normalize`, a LinearRegression argument that
# scikit-learn deprecated in 1.0 and removed in 1.2, so current releases reject
# it as an invalid parameter. Rescaling the inputs does not change the
# predictions of an unregularised least-squares fit, so the grid now varies
# `fit_intercept`, which is accepted by every scikit-learn release.
model_name= 'Linear Regression'
model = LinearRegression()
params = {'fit_intercept':[True, False]}
score = 'neg_mean_squared_error'

# NOTE(review): ModelFactory is project code not shown in this notebook;
# presumably it runs a parameter search over `params` scored by `score` — confirm.
linear_reg = ModelFactory(model_name, model, params, score)

In [9]:
# Run the Linear Regression factory on the split and report its metrics.
# The call returns a pair unpacked as (model, predictions); the predictions are
# then scored against y_test (RMSE, MAE, R^2, explained variance are printed).
# NOTE(review): ModelFactory.regression is defined outside this notebook;
# presumably it fits on (X_train, y_train) and predicts on X_test — confirm.
model_reg, pred_reg = linear_reg.regression(X_train, y_train, X_test, y_test)
linear_reg.regression_metrics(y_test, pred_reg)

Root Mean Squared Error: 44.42328963273888
Mean Absolute Error: 34.017976789626424
R^2: 0.5761048701950309
Explained Variance: 0.5762385678162177


### Lasso

In [10]:
# Lasso configuration: only the regularisation strength `alpha` is searched.
score = 'neg_mean_squared_error'
params = {
    'alpha': [0.001, 0.002, 0.010, 0.050, 0.1, 0.2, 0.5, 1],
}
model = Lasso()
model_name = 'Lasso'

# NOTE(review): ModelFactory is project code not shown in this notebook;
# presumably it searches `params` using `score` — confirm.
lasso = ModelFactory(model_name, model, params, score)

In [11]:
# Run the Lasso factory on the split and report its metrics.
# The call returns a pair unpacked as (model, predictions); the predictions are
# then scored against y_test (RMSE, MAE, R^2, explained variance are printed).
# NOTE(review): ModelFactory.regression is defined outside this notebook;
# presumably it fits on (X_train, y_train) and predicts on X_test — confirm.
model_lasso, pred_lasso = lasso.regression(X_train, y_train, X_test, y_test)
lasso.regression_metrics(y_test, pred_lasso)

Root Mean Squared Error: 44.43392106855771
Mean Absolute Error: 34.02846936737952
R^2: 0.5759019517183553
Explained Variance: 0.5760365624450068


### Decision Tree Regressor

In [12]:
# Decision tree configuration: only `max_depth` is searched.
# The estimator is seeded because scikit-learn permutes the candidate features
# at every split, so ties between equally good splits can otherwise be resolved
# differently from run to run. The seed matches the one used for the
# train/test split.
model_name= 'Decision Tree Regressor'
model = DecisionTreeRegressor(random_state=42)
params = {'max_depth':[5, 7, 10, 15, 20 ,25, 30]}
score = 'neg_mean_squared_error'

# NOTE(review): ModelFactory is project code not shown in this notebook;
# presumably it searches `params` using `score` — confirm.
tree = ModelFactory(model_name, model, params, score)

In [13]:
# Run the Decision Tree factory on the split and report its metrics.
# The call returns a pair unpacked as (model, predictions); the predictions are
# then scored against y_test (RMSE, MAE, R^2, explained variance are printed).
# NOTE(review): ModelFactory.regression is defined outside this notebook;
# presumably it fits on (X_train, y_train) and predicts on X_test — confirm.
model_tree, pred_tree = tree.regression(X_train, y_train, X_test, y_test)
tree.regression_metrics(y_test, pred_tree)

Root Mean Squared Error: 44.368745509350724
Mean Absolute Error: 32.05791018714476
R^2: 0.577145171089701
Explained Variance: 0.5772040437754056


### Random Forest Regressor

In [25]:
# Random forest configuration: number of trees, features considered per split
# and tree depth are searched.
# The estimator is seeded because bootstrap sampling and per-split feature
# sub-sampling are random; without a seed the reported metrics and the
# predictions exported at the end of the notebook change on every run.
# The seed matches the one used for the train/test split.
model_name= 'Random Forest Regressor'
model = RandomForestRegressor(random_state=42)
params = {'n_estimators':[100, 150, 200], "max_features":[3, 4],
         "max_depth":[4, 6, 8]}
score = 'neg_mean_squared_error'

# NOTE(review): ModelFactory is project code not shown in this notebook;
# presumably it searches `params` using `score` — confirm.
forest = ModelFactory(model_name, model, params, score)

In [26]:
# Run the Random Forest factory on the split and report its metrics.
# The call returns a pair unpacked as (model, predictions); model_forest is the
# object later used to predict on df_test.
# NOTE(review): ModelFactory.regression is defined outside this notebook;
# presumably it fits on (X_train, y_train) and predicts on X_test — confirm.
model_forest, pred_forest = forest.regression(X_train, y_train, X_test, y_test)
forest.regression_metrics(y_test, pred_forest)

Root Mean Squared Error: 41.590808191846186
Mean Absolute Error: 29.933250029186176
R^2: 0.6284376382748325
Explained Variance: 0.6285287243160482


#### Random Forest apresentou o melhor resultado, com RMSE de aproximadamente 41,6

In [27]:
# Empty frame that will receive the test-set predictions for export.
result = pd.DataFrame()

In [29]:
# Predict with the random forest model on df_test, using the same feature
# columns the model was trained on, and store the output as 'predicted'.
result['predicted'] = model_forest.predict(df_test[features])

In [30]:
# Write the predictions to the submission CSV, leaving out the index column.
result.to_csv(
    "regressao_ViniciusZambotti.csv",
    index=False,
)