<a href="https://colab.research.google.com/github/ubiratantavares/tcc_puc_minas/blob/main/datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Criação do conjunto de dados de treinamento e teste 

In [None]:
# importar bibliotecas
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [None]:
# load the dataset
def load_data(filename, path="/content/drive/MyDrive/tcc/data/"):
  """Read a comma-separated CSV file into a pandas DataFrame.

  Args:
    filename: Name of the CSV file inside `path`, e.g. "dados_tratados.csv".
    path: Directory that contains the file. Defaults to the directory that was
      previously hard-coded in this function, so existing calls are unaffected.

  Returns:
    The DataFrame produced by `pd.read_csv`.
  """
  # os.path.join works whether or not `path` ends with a separator,
  # unlike plain string concatenation.
  path_filename = os.path.join(path, filename)
  return pd.read_csv(path_filename, delimiter=',')

In [None]:
# create the DataFrame by loading the dataset file "dados_tratados.csv" through load_data
df = load_data("dados_tratados.csv")

In [None]:
# inspect the first 5 rows of the dataset
df.head()

Unnamed: 0,FP,Trimestre,Mes,Semana,Dia,Hora,Minuto,Evento
0,0.78,3,7,2,18,0,0,0
1,0.78,3,7,2,18,0,15,0
2,0.78,3,7,2,18,0,30,0
3,0.71,3,7,2,18,0,45,0
4,0.6,3,7,2,18,1,0,0


In [None]:
# predictor attributes: select the seven feature columns of df and
# convert the selection to a NumPy array in a single expression
X = df[
  ["FP", "Trimestre", "Mes", "Semana", "Dia", "Hora", "Minuto"]
].values
X

array([[ 0.78,  3.  ,  7.  , ..., 18.  ,  0.  ,  0.  ],
       [ 0.78,  3.  ,  7.  , ..., 18.  ,  0.  , 15.  ],
       [ 0.78,  3.  ,  7.  , ..., 18.  ,  0.  , 30.  ],
       ...,
       [ 0.83,  4.  , 12.  , ..., 31.  , 23.  , 30.  ],
       [ 0.84,  4.  , 12.  , ..., 31.  , 23.  , 45.  ],
       [ 0.82,  1.  ,  1.  , ...,  1.  ,  0.  ,  0.  ]])

In [None]:
# target attribute: the "Evento" column of df as a NumPy array
y = df["Evento"].values
y

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# split the predictor attributes into parts:
# X1 keeps only column 0 of X; the slice `:1` (instead of index 0) keeps the result 2-D
X1 = X[:, :1]
X1

array([[0.78],
       [0.78],
       [0.78],
       ...,
       [0.83],
       [0.84],
       [0.82]])

In [None]:
# X2 keeps columns 1 through 6 of X (the slice 1:7 stops before index 7)
X2 = X[:, 1:7]
X2

array([[ 3.,  7.,  2., 18.,  0.,  0.],
       [ 3.,  7.,  2., 18.,  0., 15.],
       [ 3.,  7.,  2., 18.,  0., 30.],
       ...,
       [ 4., 12.,  3., 31., 23., 30.],
       [ 4., 12.,  3., 31., 23., 45.],
       [ 1.,  1.,  4.,  1.,  0.,  0.]])

In [None]:
# apply one-hot encoding to the attributes of X2
#
# The raw values are rebuilt from `df` (same column selection and 1:7 slice used
# to create X and X2) instead of being read from X2 itself. This cell overwrites
# X2 with the encoded matrix, so reading X2 here would encode already-encoded
# data whenever the cell is executed a second time.
X2_source = df[["FP", "Trimestre", "Mes", "Semana", "Dia", "Hora", "Minuto"]].values[:, 1:7]

one_hot_encoder = OneHotEncoder(sparse_output=False)
X2 = one_hot_encoder.fit_transform(X2_source)
X2.shape

(86113, 82)

In [None]:
# join X2 and X1 column-wise (X2 columns first, X1 columns last)
X = np.column_stack([X2, X1])
# A notebook cell only displays its last expression, so the former bare
# `X.shape` line produced no output; print it so the shape is actually shown.
print(X.shape)
X

array([[0.  , 0.  , 1.  , ..., 0.  , 0.  , 0.78],
       [0.  , 0.  , 1.  , ..., 0.  , 0.  , 0.78],
       [0.  , 0.  , 1.  , ..., 1.  , 0.  , 0.78],
       ...,
       [0.  , 0.  , 0.  , ..., 1.  , 0.  , 0.83],
       [0.  , 0.  , 0.  , ..., 0.  , 1.  , 0.84],
       [1.  , 0.  , 0.  , ..., 0.  , 0.  , 0.82]])

In [None]:
# split predictors and target: 80% of the rows for training, 20% for testing;
# random_state is fixed so the split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.2,
  random_state=1,
)

In [None]:
# wrap each split array in its own DataFrame:
# training pair first (X, y), then the test pair (X, y)
df_X_train, df_y_train, df_X_test, df_y_test = map(
  pd.DataFrame, (X_train, y_train, X_test, y_test)
)

In [None]:
# save a DataFrame to a CSV file
def save_data(df, filename, path="/content/drive/MyDrive/tcc/data/"):
  """Write `df` to a CSV file without the index column.

  Args:
    df: DataFrame to write.
    filename: Name of the CSV file to create inside `path`, e.g. "x_train.csv".
    path: Target directory. Defaults to the directory that was previously
      hard-coded in this function, so existing calls are unaffected.

  Returns:
    None.
  """
  # os.path.join works whether or not `path` ends with a separator,
  # unlike plain string concatenation.
  path_filename = os.path.join(path, filename)
  df.to_csv(path_filename, index=False)

In [None]:
# persist every split, in the order: train X, train y, test X, test y
for frame, csv_name in (
  (df_X_train, "x_train.csv"),
  (df_y_train, "y_train.csv"),
  (df_X_test, "x_test.csv"),
  (df_y_test, "y_test.csv"),
):
  save_data(frame, csv_name)