In [1]:
import pandas as pd

In [2]:
# Load the two yearly extracts and stack them into one frame.
# Missing values in the target column mean "no injection", so they are filled
# with the explicit label "None". The filled Series is assigned back to the
# column: calling `.fillna(..., inplace=True)` on `frame[col]` acts on an
# intermediate object and does not update the frame under pandas Copy-on-Write.
_2021 = pd.read_csv('_2021.csv', encoding='utf-8')
_2021['Injeção na rede (kWh)'] = _2021['Injeção na rede (kWh)'].fillna("None")

_2022 = pd.read_csv('_2022.csv', encoding='utf-8')
_2022['Injeção na rede (kWh)'] = _2022['Injeção na rede (kWh)'].fillna("None")

# NOTE(review): the name `all` shadows the Python builtin; it is kept because
# the remaining cells refer to it.
# 'Unnamed: 0' is dropped right after the concat, so the combined frame keeps
# only the measurement and weather columns.
all = pd.concat([_2021, _2022], ignore_index=True).drop(columns='Unnamed: 0')

In [3]:
# Show the column names, non-null counts and dtypes of the combined frame.
all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11016 entries, 0 to 11015
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Normal (kWh)             11016 non-null  float64
 1   Horário Económico (kWh)  11016 non-null  float64
 2   Autoconsumo (kWh)        11016 non-null  float64
 3   Injeção na rede (kWh)    11016 non-null  object 
 4   Data & Hora              11016 non-null  object 
 5   temp                     11016 non-null  float64
 6   feels_like               11016 non-null  float64
 7   temp_min                 11016 non-null  float64
 8   temp_max                 11016 non-null  float64
 9   pressure                 11016 non-null  int64  
 10  humidity                 11016 non-null  int64  
 11  wind_speed               11016 non-null  float64
 12  clouds_all               11016 non-null  int64  
 13  weather_description      11016 non-null  object 
dtypes: float64(8), int64(3

In [4]:
# Preview the first rows of the combined frame.
all.head()

Unnamed: 0,Normal (kWh),Horário Económico (kWh),Autoconsumo (kWh),Injeção na rede (kWh),Data & Hora,temp,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,clouds_all,weather_description
0,0.0,0.0,0.0,,2021-09-29 00:00:00,13.97,13.54,11.45,14.04,1027,81,1.96,87,overcast clouds
1,0.0,0.0,0.0,,2021-09-29 01:00:00,13.48,13.02,13.43,13.9,1028,82,1.83,91,overcast clouds
2,0.0,0.0,0.0,,2021-09-29 02:00:00,12.93,12.47,12.72,13.43,1027,84,1.96,93,overcast clouds
3,0.0,0.0,0.0,,2021-09-29 03:00:00,12.61,12.15,10.34,12.93,1027,85,1.85,95,overcast clouds
4,0.0,0.0,0.0,,2021-09-29 04:00:00,12.61,12.17,9.79,12.93,1027,86,1.83,93,overcast clouds


In [5]:
import numpy as np

import ants

import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV


from sklearn.metrics import classification_report, ConfusionMatrixDisplay, accuracy_score

from sklearn.model_selection import cross_val_score

import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# Remove the timestamp column before modelling.
# Rebinding the result with errors='ignore' keeps the cell idempotent: the
# original in-place drop raised KeyError when the cell was run a second time.
all = all.drop(columns='Data & Hora', errors='ignore')

In [7]:
# One-hot encode the categorical weather description.
# `pop` removes the text column from `all` and hands it to get_dummies;
# drop_first=True omits the first category's indicator column.
weather_description = pd.get_dummies(
    all.pop('weather_description'),
    drop_first=True,
)
# Append the indicator columns to the right of the remaining columns.
all = pd.concat([all, weather_description], axis=1)

# Label encoding of the target column is done in the next cell.

In [8]:
# Label-encode the target 'Injeção na rede (kWh)': swap the category names for
# the integers 0-4.
# The result is assigned back to the column rather than calling
# `.replace(..., inplace=True)` on `all[...]`: that chained in-place call acts
# on an intermediate Series and leaves `all` unchanged under pandas Copy-on-Write.
injection_levels = {'None': 0, 'Low': 1, 'Medium': 2, 'High': 3, 'Very High': 4}
all['Injeção na rede (kWh)'] = (
    all['Injeção na rede (kWh)']
    .replace(injection_levels)
    .astype('int64')  # explicit integer dtype; raises if an unmapped label remains
)
print(all['Injeção na rede (kWh)'].value_counts())

Injeção na rede (kWh)
0    7777
3    1103
2    1098
4     606
1     432
Name: count, dtype: int64


In [9]:
# Re-check column dtypes after the one-hot and label encoding steps.
all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11016 entries, 0 to 11015
Data columns (total 19 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Normal (kWh)             11016 non-null  float64
 1   Horário Económico (kWh)  11016 non-null  float64
 2   Autoconsumo (kWh)        11016 non-null  float64
 3   Injeção na rede (kWh)    11016 non-null  int64  
 4   temp                     11016 non-null  float64
 5   feels_like               11016 non-null  float64
 6   temp_min                 11016 non-null  float64
 7   temp_max                 11016 non-null  float64
 8   pressure                 11016 non-null  int64  
 9   humidity                 11016 non-null  int64  
 10  wind_speed               11016 non-null  float64
 11  clouds_all               11016 non-null  int64  
 12  few clouds               11016 non-null  bool   
 13  heavy intensity rain     11016 non-null  bool   
 14  light rain            

In [10]:
# Separate the feature matrix from the target.
# X: every column except the encoded target.
# y: the target kept as a one-column DataFrame.
# (A commented-out manual label-mapping helper that used to sit here as a bare
# string literal was removed; the target is already integer-encoded in `all`.)
X = all.drop(columns=['Injeção na rede (kWh)'])
y = all['Injeção na rede (kWh)'].to_frame()

y

Unnamed: 0,Injeção na rede (kWh)
0,0
1,0
2,0
3,0
4,0
...,...
11011,0
11012,0
11013,0
11014,0


In [11]:


# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified although the target classes are
# imbalanced (see the value counts printed earlier) — consider `stratify=y`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=2021,
)

In [12]:
# Hyper-parameter grid explored by the cross-validated search (3 * 2 * 3 = 18 candidates).
parameters = {
    'min_child_weight': [5, 6, 7],
    # NOTE(review): the XGBoost docs give the range of colsample_bytree as (0, 1];
    # 0 lies outside it — confirm whether a small positive value was intended.
    'colsample_bytree': [0, 0.2],
    'max_depth': [3, 4, 5],
}

# The target has five classes, so a multi-class objective is requested.
# The previous 'reg:squarederror' is a regression objective and does not
# describe this classification task.
_xgb = xgb.XGBClassifier(objective='multi:softprob')

# 5-fold grid search; refit=True retrains the best candidate on the whole
# training split, and error_score='raise' surfaces any failing fit immediately.
clf = GridSearchCV(
    estimator=_xgb,
    param_grid=parameters,
    cv=5,
    refit=True,
    verbose=3,
    error_score='raise',
)

In [13]:
# Run the grid search. `fit` returns the search object itself, so `best_clf`
# is simply another name for `clf`.
clf.fit(X_train, y_train)
best_clf = clf

# Fit the stand-alone `_xgb` estimator as well; it is scored separately below
# as the un-tuned baseline.
_xgb.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END colsample_bytree=0, max_depth=3, min_child_weight=5;, score=0.828 total time=   6.2s
[CV 2/5] END colsample_bytree=0, max_depth=3, min_child_weight=5;, score=0.837 total time=   2.1s
[CV 3/5] END colsample_bytree=0, max_depth=3, min_child_weight=5;, score=0.841 total time=   2.4s
[CV 4/5] END colsample_bytree=0, max_depth=3, min_child_weight=5;, score=0.816 total time=   0.9s
[CV 5/5] END colsample_bytree=0, max_depth=3, min_child_weight=5;, score=0.829 total time=   0.4s
[CV 1/5] END colsample_bytree=0, max_depth=3, min_child_weight=6;, score=0.828 total time=   0.3s
[CV 2/5] END colsample_bytree=0, max_depth=3, min_child_weight=6;, score=0.835 total time=   0.4s
[CV 3/5] END colsample_bytree=0, max_depth=3, min_child_weight=6;, score=0.839 total time=   0.6s
[CV 4/5] END colsample_bytree=0, max_depth=3, min_child_weight=6;, score=0.811 total time=   0.3s
[CV 5/5] END colsample_bytree=0, max_depth=3, min_child_w

In [14]:
# Score the un-tuned `_xgb` model on the held-out split and report it as a percentage.
xgb_score = _xgb.score(X_test, y_test)
xgb_score_pct = xgb_score * 100
print("Accuracy: %.2f%%" % xgb_score_pct)

Accuracy: 84.72%


In [15]:
# Predicted class labels of the un-tuned `_xgb` model for the held-out split.
xgb_predictions = _xgb.predict(X_test)

In [16]:
# Per-class precision, recall and F1 for the un-tuned `_xgb` model.
print(classification_report(y_test, xgb_predictions))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98      2323
           1       0.41      0.27      0.33       132
           2       0.53      0.55      0.54       329
           3       0.58      0.60      0.59       348
           4       0.57      0.58      0.58       173

    accuracy                           0.85      3305
   macro avg       0.61      0.60      0.60      3305
weighted avg       0.84      0.85      0.84      3305



In [17]:
# Score the grid-search result (refit on the training split) on the held-out
# split and report it as a percentage.
best_score = best_clf.score(X_test, y_test)
best_score_pct = best_score * 100
print("Accuracy: %.2f%%" % best_score_pct)

Accuracy: 85.05%


In [18]:
# Predicted class labels from the grid-search result for the held-out split.
best_predictions = best_clf.predict(X_test)

In [19]:
# Per-class precision, recall and F1 for the grid-search result.
print(classification_report(y_test, best_predictions))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97      2323
           1       0.52      0.19      0.28       132
           2       0.53      0.62      0.57       329
           3       0.58      0.62      0.60       348
           4       0.60      0.56      0.58       173

    accuracy                           0.85      3305
   macro avg       0.64      0.59      0.60      3305
weighted avg       0.85      0.85      0.85      3305

