In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

from datetime import datetime, timedelta
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

from sklearn.preprocessing import StandardScaler

import os
import sys
import logging
from logging.handlers import TimedRotatingFileHandler

In [None]:
# Fixed seed, passed as random_state to the RandomForestClassifier instances in the
# experiment loop.
SEED = 42

In [None]:
def log_setup(filename, log_level):
    """Create (or reconfigure) a logger that writes to ``logs/<filename>.log`` and stdout.

    The function is safe to call repeatedly with the same ``filename`` (e.g. when a
    notebook cell is re-executed): handlers already attached to the logger are closed
    and detached first, so each message is emitted once per destination instead of once
    per previous call.

    Parameters
    ----------
    filename : str
        Logger name; also the base name of the log file inside the ``logs`` directory.
    log_level : int
        Logging level applied to the logger and to the stdout handler.

    Returns
    -------
    logging.Logger
        The configured logger, carrying exactly one file handler and one stream handler.
    """
    logger = logging.getLogger(filename)
    logger.setLevel(log_level)

    # logging.getLogger returns the same object for the same name, so without this
    # cleanup every extra call would add another pair of handlers (duplicated lines).
    # Note: this detaches *all* handlers on this named logger, not only the ones
    # created by an earlier log_setup call.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    formatter = logging.Formatter(fmt='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                                  datefmt='%m-%d-%y %H:%M:%S')

    # The file handler opens its file on construction; make sure the target directory
    # exists rather than relying on the caller having created it.
    log_dir = "logs"
    os.makedirs(log_dir, exist_ok=True)

    # Rotates the file at midnight.
    fh = TimedRotatingFileHandler(os.path.join(log_dir, filename + ".log"), when='midnight')
    fh.setFormatter(formatter)

    sh = logging.StreamHandler(sys.stdout)
    sh.setLevel(log_level)
    sh.setFormatter(formatter)

    logger.addHandler(fh)
    logger.addHandler(sh)
    return logger

In [None]:
# Start every run of the notebook from an empty log file.
os.makedirs("logs", exist_ok=True)

# When this cell is re-executed in a live kernel, the logger created by the previous
# execution still owns an open handle on the log file. Close and detach those handlers
# first: otherwise the delete below fails on Windows, the old handler keeps writing to
# the unlinked file on POSIX, and every message is emitted once per stacked handler.
_previous_logger = logging.getLogger("rf_antigo_para_recente")
for _old_handler in list(_previous_logger.handlers):
    _previous_logger.removeHandler(_old_handler)
    _old_handler.close()

if os.path.isfile("logs/rf_antigo_para_recente.log"):
    os.remove("logs/rf_antigo_para_recente.log")
logger = log_setup("rf_antigo_para_recente", logging.INFO)

In [None]:
# Load the dataset from a CSV one directory above the current working directory.
# Later cells read the columns data_ano, data_mes, data_dia and output from this frame.
data = pd.read_csv("../mai_2022_fev_2023.csv")

In [None]:
# Number of rows in the loaded dataset.
len(data)

In [None]:
# Preview the first rows of the dataset.
data.head()

In [None]:
# Preview the last rows of the dataset.
data.tail()

In [None]:
# Select the model inputs by column name rather than by position.
# The experiment loop uses data['output'] as the label, so it must never be a feature;
# 'date' is a helper column appended by a later cell and is not a model input either.
# The previous positional rule ("every column except the last") gave the same result
# only while 'output' was the last column: re-running this cell after 'date' had been
# appended silently dropped 'date' and turned the label into a feature.
# NOTE(review): assumes 'output' is the only non-feature column in the CSV - confirm.
non_feature_columns = {"output", "date"}
features_names = [column for column in data.columns if column not in non_feature_columns]
logger.info(f"features_names: {features_names}")
logger.info(f"len(features_names): {len(features_names)}")

In [None]:
# Build one datetime column out of the separate year / month / day columns.
# pd.to_datetime assembles dates from a frame whose columns are named year/month/day.
date_parts = data[['data_ano', 'data_mes', 'data_dia']].rename(
    columns={'data_ano': 'year', 'data_mes': 'month', 'data_dia': 'day'}
)
data['date'] = pd.to_datetime(date_parts)

In [None]:
# Snapshot of the frame as it stands at this point; the experiment loop below reads
# from original_data.
original_data = data.copy()

In [None]:
def _fit_and_report(X_fit, y_fit, X_eval, y_eval):
    """Fit a depth-3 random forest and log its metrics on the evaluation split.

    Parameters
    ----------
    X_fit, y_fit
        Features and labels used to train the classifier.
    X_eval, y_eval
        Features and labels used to score it.

    Returns
    -------
    tuple or None
        ``(clf, y_pred, cm)`` on success. ``None`` when ``fit`` raised; the error has
        then already been logged and the caller is expected to skip this window.
    """
    clf = RandomForestClassifier(max_depth=3, random_state=SEED)
    try:
        clf.fit(X_fit, y_fit)
    except Exception as error:
        # Best-effort: keep going with the next window, but record the traceback at
        # ERROR level instead of hiding the failure in an INFO line.
        logger.exception("Erro: %s", error)
        return None

    y_pred = clf.predict(X_eval)

    logger.info('Precision: %.3f' % precision_score(y_eval, y_pred, average='weighted'))
    logger.info('Recall: %.3f' % recall_score(y_eval, y_pred, average='weighted'))
    logger.info('F1: %.3f' % f1_score(y_eval, y_pred, average='weighted'))
    logger.info('Accuracy: %.3f' % accuracy_score(y_eval, y_pred))

    # Use the union of training classes and evaluation labels. With only clf.classes_,
    # confusion_matrix raises when none of those classes occurs in y_eval, and rows of
    # classes seen only at evaluation time are silently dropped from the matrix.
    cm_labels = sorted(set(clf.classes_) | set(y_eval))
    cm = confusion_matrix(y_eval, y_pred, labels=cm_labels)
    logger.info(f"cm labels: {cm_labels}")
    logger.info(f"cm: {cm}")
    return clf, y_pred, cm


# The overall date span does not depend on day_range, so compute it once.
# normalize() truncates to midnight, matching the previous "first 10 chars" parsing.
min_date = original_data['date'].min().normalize().to_pydatetime()
max_date = original_data['date'].max().normalize().to_pydatetime()

# For each window size: train on the first `day_range` days of the dataset and test on
# the `day_range` days that immediately follow.
for day_range in [1, 7, 15, 30]:
    logger.info(f"day_range: {day_range}")

    logger.info(f"min_date: {min_date}")
    logger.info(f"max_date: {max_date}")

    initial_date_train = min_date
    final_date_train = initial_date_train + timedelta(days=day_range)

    # The test window starts exactly where the training window ends (half-open ranges).
    initial_date_test = final_date_train
    final_date_test = initial_date_test + timedelta(days=day_range)

    # Window sizes are ascending, so once one does not fit none of the larger ones will.
    # (A separate check on initial_date_test would be unreachable: it is the same
    # value as final_date_train.)
    if final_date_train > max_date:
        logger.info("A data final de treino é maior que a data máxima")
        break

    logger.info(f"initial_date_test: {initial_date_test}")
    logger.info(f"final_date_test: {final_date_test}")
    logger.info(f"initial_date_train: {initial_date_train}")
    logger.info(f"final_date_train: {final_date_train}")

    logger.info(f"original_data: {len(original_data)}")

    # Boolean-mask selection returns new frames, so original_data is never modified
    # and no per-iteration copy is needed.
    in_train_window = (original_data['date'] >= initial_date_train) & (original_data['date'] < final_date_train)
    in_test_window = (original_data['date'] >= initial_date_test) & (original_data['date'] < final_date_test)

    data_train = original_data.loc[in_train_window]
    logger.info(f"data_train: {len(data_train)}")
    data_test = original_data.loc[in_test_window]
    logger.info(f"data_test: {len(data_test)}")

    if data_test.empty:
        # A gap in the data can leave the test window without rows; predict() rejects
        # an empty input, which used to abort the remaining window sizes as well.
        logger.info("Janela de teste vazia - pulando este day_range")
        continue

    logger.info(f"min_date_train: {data_train['date'].min()}")
    logger.info(f"max_date_train: {data_train['date'].max()}")
    logger.info(f"min_date_test: {data_test['date'].min()}")
    logger.info(f"max_date_test: {data_test['date'].max()}")

    logger.info(f"Distribuição de classes - Treino: {data_train['output'].value_counts()}")
    logger.info(f"Distribuição de classes - Teste: {data_test['output'].value_counts()}")

    X_train = data_train[features_names]
    logger.info(f"X_train.shape: {X_train.shape}")

    # scikit-learn consumes arrays directly; no need to round-trip through Python lists.
    X_train = X_train.values
    y_train = data_train['output'].values
    logger.info(f"Classes de treino: {np.unique(y_train)}")

    X_test = data_test[features_names]
    logger.info(f"X_test.shape: {X_test.shape}")

    X_test = X_test.values
    y_test = data_test['output'].values
    logger.info(f"Classes de teste: {np.unique(y_test)}")

    # Train-Test on the raw features
    logger.info("\n-- Train/Test --")
    outcome = _fit_and_report(X_train, y_train, X_test, y_test)
    if outcome is None:
        continue
    clf, y_pred, cm = outcome

    # Train-Test on standardized features; the scaler is fitted on the training window
    # only and then applied to the test window.
    logger.info("\n-- Train/Test with data scaling --")
    scaler_train = StandardScaler()
    X_train_scaled = scaler_train.fit_transform(X_train)
    X_test_scaled = scaler_train.transform(X_test)

    outcome = _fit_and_report(X_train_scaled, y_train, X_test_scaled, y_test)
    if outcome is None:
        continue
    clf, y_pred, cm = outcome

    logger.info("\n ============================================================================= \n")