In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from os import listdir
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Read the prepared dataset from the output folder.
data_folder = './output/'
df = pd.read_csv(data_folder + 'df_final_changing_windows.csv')

# Parse the timestamp column, discard rows whose 'koncentrat' equals 0 and
# remove the percentage columns that are not used further on.
df = (
    df
    .assign(czas_utc=lambda frame: pd.to_datetime(frame['czas_utc']))
    .loc[lambda frame: ~frame['koncentrat'].eq(0)]
    .drop(columns=['prob_corg_proc', 'prob_s_proc', 'prob_fe_proc',
                   'prazonka_s_proc', 'prazonka_fe_proc'])
)

# Features are the row-to-row differences of every column except the target,
# plus the target value of the previous row.
features = df.drop('temp_zuz', axis=1).diff()
features['prev_temp_zuz'] = df['temp_zuz'].shift(1)

# The first row has no predecessor (its diff/shift values are NaN), so it is
# dropped from both the features and the target.
features = features.iloc[1:, :]
label = df['temp_zuz'].iloc[1:]

# After diff() 'czas_utc' holds timedeltas; express them in hours.
# total_seconds() is required here: `.dt.seconds` only returns the
# within-day seconds component, so a gap of 24 h or more (or a negative gap)
# would silently produce a wrong value.
features['czas_utc'] = features['czas_utc'].dt.total_seconds() / 3600

In [37]:
# Search for the best hyper-parameters of the random forest, scored by R^2.
# NOTE(review): the search is fitted on the complete features/label set, i.e.
# before the train/test split performed further down, so the rows later used
# for testing also take part in the tuning.
grid_values = {
    'min_samples_leaf': np.arange(20, 51, 10),
    'max_depth': np.arange(2, 16, 1),
}
grid_rf = GridSearchCV(
    RandomForestRegressor(n_estimators=100, random_state=2137),
    param_grid=grid_values,
    scoring='r2',
)
grid_rf.fit(features, label)

params_rf = grid_rf.best_params_
print("Best parameters for RF:", params_rf)

Best parameters for RF: {'max_depth': 15, 'min_samples_leaf': 20}


In [None]:
# Search for the best hyper-parameters of gradient boosting.
# No `scoring` is passed, so GridSearchCV falls back to the estimator's own
# score method.
gbr_params = {
    'n_estimators': [10, 100],
    'max_depth': np.arange(2, 16, 1),
    'min_samples_split': np.arange(20, 51, 10),
}

clf = GridSearchCV(
    estimator=GradientBoostingRegressor(random_state=2137),
    param_grid=gbr_params,
)
clf.fit(features, label)

params_gbr = clf.best_params_
print("Best parameters for gbr:", params_gbr)

In [None]:
# Hold out 20% of the samples for testing.
# train_test_split returns (X_train, X_test, y_train, y_test) in exactly this
# order; unpacking it in a different order swaps the two subsets, so the
# models would be fitted on the 20% part and evaluated on the 80% part.
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=2137)

# Random forest configured with the hyper-parameters found by the grid search.
rf = RandomForestRegressor(n_estimators=100, min_samples_leaf=params_rf['min_samples_leaf'],
                           max_depth=params_rf['max_depth'], random_state=2137)
# Fit on the training data.
rf.fit(X_train, y_train)

# Predictions, absolute errors and MSE on the training data.
predictions_rf = rf.predict(X_train)
errors_rf = abs(predictions_rf - y_train)
mse_rf = mean_squared_error(y_train, predictions_rf)

In [None]:
# Gradient boosting configured with the hyper-parameters found by the grid search.
gbr = GradientBoostingRegressor(
    random_state=2137,
    max_depth=params_gbr['max_depth'],
    n_estimators=params_gbr['n_estimators'],
    min_samples_split=params_gbr['min_samples_split'],
)
# Fit on the training data.
gbr.fit(X_train, y_train)

# Predictions, MSE and absolute errors on the training data.
predictions_gbr = gbr.predict(X_train)
mse_gbr = mean_squared_error(y_train, predictions_gbr)
errors_gbr = abs(predictions_gbr - y_train)

In [None]:
# Linear regression, fitted on the training data.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions, MSE and absolute errors on the training data.
predictions_lr = lr.predict(X_train)
mse_lr = mean_squared_error(y_train, predictions_lr)
errors_lr = abs(predictions_lr - y_train)

In [None]:
# Plot the results for the training data: predictions of every model against
# the real target (upper panel) and the absolute prediction errors (lower panel).
plt.rcParams["figure.figsize"] = (20, 10)
x = range(len(y_train))
f, (ax1, ax2) = plt.subplots(2, 1)

# Upper panel: predictions vs. real values.
ax1.set_title('Train data')
ax1.plot(x, predictions_lr, linestyle='--', label='Prediction LR')
ax1.plot(x, predictions_rf, linestyle='--', label='Prediction RF')
ax1.plot(x, predictions_gbr, linestyle='--', label='Prediction GB')
ax1.plot(x, y_train, marker='.', label='Real')
# The plotted quantity is the target column 'temp_zuz'; the former label
# 'Charges' did not describe it.
ax1.set_ylabel('temp_zuz')
ax1.legend()
ax1.grid()

# Lower panel: absolute errors of every model.
ax2.plot(x, errors_rf, linestyle='-.', label='RF')
ax2.plot(x, errors_lr, linestyle='-.', label='LR')
ax2.plot(x, errors_gbr, linestyle='-.', label='GB')
ax2.set_ylabel('Prediction errors')
ax2.legend()
ax2.grid()

In [None]:
# Results for the test data: predictions and absolute errors of every model.
predictions_rf = rf.predict(X_test)
predictions_lr = lr.predict(X_test)
predictions_gbr = gbr.predict(X_test)

errors_rf = abs(predictions_rf - y_test)
errors_lr = abs(predictions_lr - y_test)
errors_gbr = abs(predictions_gbr - y_test)

# R^2 of every model on the training and on the test data.
print('R^2 for train data for RF = %.2f ' % rf.score(X_train, y_train))
print('R^2 for test data for RF = %.2f ' % rf.score(X_test, y_test))
print('R^2 for train data for LR = %.2f ' % lr.score(X_train, y_train))
print('R^2 for test data for LR = %.2f ' % lr.score(X_test, y_test))
print('R^2 for train data for GB = %.2f ' % gbr.score(X_train, y_train))
print('R^2 for test data for GB = %.2f ' % gbr.score(X_test, y_test))

# MSE of every model; the training values were computed when the models were fitted.
print('MSE for train data for RF = %.2f ' % mse_rf)
print('MSE for test data for RF = %.2f ' % mean_squared_error(y_test, predictions_rf))
print('MSE for train data for LR = %.2f ' % mse_lr)
print('MSE for test data for LR = %.2f ' % mean_squared_error(y_test, predictions_lr))
print('MSE for train data for GB = %.2f ' % mse_gbr)
print('MSE for test data for GB = %.2f ' % mean_squared_error(y_test, predictions_gbr))


# Plot the results for the test data: predictions of every model against the
# real target (upper panel) and the absolute prediction errors (lower panel).
x = range(len(y_test))
f, (ax1, ax2) = plt.subplots(2, 1)

# Upper panel: predictions vs. real values.
ax1.set_title('Test data')
ax1.plot(x, predictions_lr, linestyle='--', label='Prediction LR')
ax1.plot(x, predictions_rf, linestyle='--', label='Prediction RF')
ax1.plot(x, predictions_gbr, linestyle='--', label='Prediction GB')
ax1.plot(x, y_test, marker='.', label='Real')
# The plotted quantity is the target column 'temp_zuz'; the former label
# 'Charges' did not describe it.
ax1.set_ylabel('temp_zuz')
ax1.legend()
ax1.grid()

# Lower panel: absolute errors of every model.
ax2.plot(x, errors_rf, linestyle='-.', label='RF')
ax2.plot(x, errors_lr, linestyle='-.', label='LR')
ax2.plot(x, errors_gbr, linestyle='-.', label='GB')
ax2.set_ylabel('Prediction errors')
ax2.legend()
ax2.grid()

In [None]:
# Feature importances of both tree ensembles, one row per feature column,
# ordered from the most to the least important feature according to the RF.
imp = pd.DataFrame({
    'column': features.columns.values,
    'RF_importance': rf.feature_importances_,
    'GB_importance': gbr.feature_importances_,
})
imp = imp.sort_values(by='RF_importance', ascending=False)
imp

In [None]:
# Display labels for the feature columns (raw column name -> label).
x = {
    'prev_temp_zuz': 'Poprzednia temperatura',
    'prazonka_fe_masa': 'Prazonka_fe_masa',
    'moc_cieplna_odebrana': 'Moc cieplna odebrana',
    'pyl': 'Pył',
    'koncentrat': 'Koncentrat',
    'woda_zasil_temp': 'Woda_zasil_temp',
    'woda_powrotna_temp': 'Woda_powrotna_temp',
    'prazonka_s_masa': 'Prażonka_S_masa',
    'wentylator': 'Wentylator ODCZ_ZAD OBROTÓW',
    'kol_kan_temp': 'Temperatura na kol. kan.',
    'czas_utc': 'Diff czas',
    'prob_corg_masa': 'Prob_corg_masa',
    'prob_fe_masa': 'Prob_fe_masa',
    'woda_powrotna_przeplyw': 'Woda_powrotna_przepływ',
    'wymurowka_temp': 'Temperatura wymurówka',
    'prazonka': 'Prażonka',
    'prob_s_masa': 'Prob_s_masa',
}

In [None]:
# Replace the raw column names with their display labels from `x`.
# A name without an entry in `x` is kept unchanged instead of becoming NaN,
# which also makes the cell safe to re-run: labels that were already
# translated are simply left as they are.
imp['column'] = imp['column'].map(lambda name: x.get(name, name))
imp.set_index('column')