In [None]:
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### Leitura de Dados

In [None]:
# Path of the CSV file read by the loading cell (a file path, despite the `_DIR` suffix).
DATA_DIR = 'assets/wind.csv'

In [None]:
# Load the raw measurements; the 'rec_fcast' column is not used in this notebook.
data = pd.read_csv(DATA_DIR).drop(columns='rec_fcast')
data['datetime'] = pd.to_datetime(data['datetime'], dayfirst=True)
data = data.set_index('datetime')

# Express the produced power as a fraction of the capacity.
data['normalized_wp'] = data['wpower'] / data['capacity']

# Keep the years 2014-2017 (both bounds are exclusive). The comparison is
# vectorised on the index instead of looping over every row in Python.
subset_year = (data.index.year > 2013) & (data.index.year < 2018)
data = data.loc[subset_year, :]

# Fill gaps in the raw rows, aggregate to hourly means, then fill the hours
# that had no rows at all.
data = data.ffill()
data = data.resample('H').mean().ffill()

# NOTE(review): this value is reassigned to 0.15 in the differencing-plot cell
# before it is first used for labelling; kept so the name stays defined here.
thr = 0.1

# Ratios above 1 are blanked and replaced by the previous value.
# `Series.mask` returns a new object, so `data` is never written through a
# column view (the original `series.iloc[np.where(...)] = nan` did that, with
# pandas-version-dependent effects on `data`).
series = data['normalized_wp'].mask(data['normalized_wp'] > 1).ffill()

series.head()

In [None]:
# Number of hourly observations left after cleaning.
series.shape

In [None]:
# Quick look at the whole hourly series.
series.plot()

In [None]:
# Daily means of the hourly series, drawn on a wide figure.
series.resample('D').mean().plot(figsize=(30, 12))

In [None]:
# Size of an hour-to-hour change that the later labelling cell counts as an event.
thr = 0.15

# Hourly differences with the +/- threshold drawn as red horizontal lines.
ax = series.diff().plot(figsize=(30, 12))
ax.axhline(y=thr, color='r')
ax.axhline(y=-thr, color='r')

### Construindo um modelo preditivo

In [None]:
from sklearn.model_selection import train_test_split
from src.tde import time_delay_embedding

# Preview the series again before the supervised data set is built.
series.head()

In [None]:
# Chronological split: shuffle=False keeps the last 30% of the series as the test set.
train, test = train_test_split(series, shuffle=False, test_size=0.3)

# Embed each part separately with the same settings and drop the rows that
# contain NaN after embedding.
# NOTE(review): presumably the helper yields lag columns plus a 't+1' target
# column (later cells read 't-0' and 't+1') - confirm against src.tde.
train_df, test_df = (
    time_delay_embedding(part, n_lags=24, horizon=1).dropna()
    for part in (train, test)
)

train_df.head()

In [None]:
# Features are every column except the 't+1' target.
X_train = train_df.drop(columns='t+1')
X_test = test_df.drop(columns='t+1')

print(X_train.head())

# Binary target: 1 when the absolute change between 't-0' and 't+1' is
# strictly greater than `thr`, else 0.
train_exc = train_df['t+1'].sub(train_df['t-0']).abs().gt(thr)
y_train_01 = train_exc.astype(int)

test_exc = test_df['t+1'].sub(test_df['t-0']).abs().gt(thr)
y_test_01 = test_exc.astype(int)

y_train_01

In [None]:
# Relative frequency of each class in the training labels.
y_train_01.value_counts(normalize=True)

In [None]:
# Relative frequency of each class in the test labels.
y_test_01.value_counts(normalize=True)

In [None]:
# Bar chart of the training class proportions.
y_train_01.value_counts(normalize=True).plot.bar()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Baseline classifier fitted on the training labels as they are (no resampling).
# A fixed random_state makes the forest, and every metric derived from it,
# repeatable across kernel restarts.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train_01)

In [None]:
# Score the test set: hard labels plus the probability of class 1, indexed
# like the test labels so both can share a time axis.
anomaly_pred = model.predict(X_test)
anomaly_prob = pd.Series(
    model.predict_proba(X_test)[:, 1],
    index=y_test_01.index,
    name='Anomaly Probability',
)

# Overlay the true 0/1 labels (red) and the predicted probability (green).
y_test_01 = y_test_01.rename('Anomaly')
ax = y_test_01.plot(legend=True, color='red', figsize=(12, 6), alpha=0.5)
anomaly_prob.plot(ax=ax, legend=True, color='green', alpha=0.75)

In [None]:
from sklearn.metrics import roc_auc_score, brier_score_loss, f1_score, recall_score, precision_score, classification_report

# Macro-averaged scores of the current predictions; 'support' is removed so
# that only precision, recall and F1 are plotted.
results_rf = dict(
    classification_report(y_true=y_test_01, y_pred=anomaly_pred, output_dict=True)['macro avg']
)
del results_rf['support']

pd.Series(results_rf).plot.bar()

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.ensemble import BalancedRandomForestClassifier

# Randomly under-sample the training set (imblearn defaults), then fit a fresh
# forest on the resampled data. Both steps draw random numbers, so both are
# seeded to keep the reported scores repeatable.
X_tr_resampled, y_tr_resampled = RandomUnderSampler(random_state=42).fit_resample(X_train, y_train_01)

model = RandomForestClassifier(random_state=42)
model.fit(X_tr_resampled, y_tr_resampled)

In [None]:
# Score the test set with the model fitted in the previous cell.
anomaly_pred = model.predict(X_test)
anomaly_prob = pd.Series(
    model.predict_proba(X_test)[:, 1],
    index=y_test_01.index,
    name='Anomaly Probability',
)

# True labels (red) against the predicted probability of class 1 (green).
ax = y_test_01.plot(legend=True, color='red', figsize=(12, 6), alpha=0.5)
anomaly_prob.plot(ax=ax, legend=True, color='green', alpha=0.5)

In [None]:
# Macro-averaged precision / recall / F1 for the forest trained on the
# under-sampled data ('support' is not a score, so it is left out of the chart).
results_rfr = dict(
    classification_report(y_true=y_test_01, y_pred=anomaly_pred, output_dict=True)['macro avg']
)
del results_rfr['support']

pd.Series(results_rfr).plot.bar()

In [None]:
# imblearn's balanced forest, fitted on the full training set (no manual
# resampling step). Seeded so the scores below are repeatable.
model = BalancedRandomForestClassifier(random_state=42)
model.fit(X_train, y_train_01)

# Probability of class 1 and hard labels for the test set.
anomaly_prob = model.predict_proba(X_test)[:, 1]
anomaly_pred = model.predict(X_test)

# True labels (red) against the predicted probability (green), drawn on the
# same axes explicitly rather than relying on matplotlib's "current axes".
anomaly_prob = pd.Series(anomaly_prob, index=y_test_01.index).rename('Anomaly Probability')
ax = y_test_01.plot(legend=True, color='red', figsize=(12, 6), alpha=0.5)
anomaly_prob.plot(ax=ax, legend=True, color='green', alpha=0.5)

In [None]:
# Macro-averaged precision / recall / F1 for the balanced forest.
results_brf = dict(
    classification_report(y_true=y_test_01, y_pred=anomaly_pred, output_dict=True)['macro avg']
)
del results_brf['support']

pd.Series(results_brf).plot.bar()

In [None]:
# Re-label with a custom cut-off: probabilities strictly above 0.2 count as
# class 1. `anomaly_prob` is whatever the most recent scoring cell produced.
anomaly_pred_thr = anomaly_prob.gt(0.2).astype(int)

results_thr_eg = dict(
    classification_report(y_true=y_test_01, y_pred=anomaly_pred_thr, output_dict=True)['macro avg']
)
del results_thr_eg['support']

pd.Series(results_thr_eg).plot.bar()

In [None]:
from sklearn.ensemble import IsolationForest
# Unsupervised detector fitted on the features only (no labels are passed).
# Seeded so the random trees, and therefore the predictions, are repeatable.
iforest = IsolationForest(n_estimators=100, max_samples='auto', random_state=42)
iforest.fit(X_train)

# IsolationForest.predict returns +1 for inliers and -1 for outliers; map that
# to the notebook's 0/1 convention (1 = anomaly) in one step, so the result
# does not depend on the order of two in-place replacements.
if_pred = (iforest.predict(X_test) == -1).astype(int)
pd.Series(if_pred).value_counts()

In [None]:
# Macro-averaged precision / recall / F1 of the isolation-forest labels
# measured against the threshold-based test labels.
results_if = dict(
    classification_report(y_true=y_test_01, y_pred=if_pred, output_dict=True)['macro avg']
)
del results_if['support']

pd.Series(results_if).plot.bar()

In [None]:
from pyod.models.knn import KNN 

In [None]:
# pyod KNN detector with default settings, fitted on the features only (no labels are passed).
# Note that this rebinds `model`, replacing the classifier from the earlier cells.
model = KNN()
model.fit(X_train)

In [None]:
# Hard 0/1 outlier labels for the test set.
knn_pred = model.predict(X_test)

# The following cell plots `knn_prob` as 'Anomaly Probability' next to the 0/1
# labels. pyod's decision_function returns raw, unbounded outlier scores, so
# use predict_proba instead: it rescales the scores to [0, 1] and column 1 is
# the outlier class.
knn_prob = model.predict_proba(X_test)[:, 1]
knn_prob

In [None]:
# Index the KNN output like the test labels so both share the time axis.
knn_prob = pd.Series(knn_prob, index=y_test_01.index, name='Anomaly Probability')

# True labels (red) against the KNN output (green) on the same axes.
ax = y_test_01.plot(legend=True, color='red', figsize=(12, 6), alpha=0.5)
knn_prob.plot(ax=ax, legend=True, color='green', alpha=0.5)

In [None]:
# Macro-averaged precision / recall / F1 of the KNN labels.
results_knn = dict(
    classification_report(y_true=y_test_01, y_pred=knn_pred, output_dict=True)['macro avg']
)
del results_knn['support']

pd.Series(results_knn).plot.bar()

In [None]:
# Collect the macro F1 of every approach, in the order they were evaluated,
# and compare them in a single bar chart.
f1_scores = {
    label: report['f1-score']
    for label, report in (
        ('RF', results_rf),
        ('RF+Resample', results_rfr),
        ('BalancedRF', results_brf),
        ('IsolationForest', results_if),
        ('KNN', results_knn),
    )
}

pd.Series(f1_scores).plot.bar()