In [1]:
import pandas as pd
from sklearn import svm

from datetime import datetime, timedelta
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

from sklearn.preprocessing import StandardScaler

import os
import sys
import logging
from logging.handlers import TimedRotatingFileHandler

In [2]:
def log_setup(filename, log_level):
    logger = logging.getLogger(filename)
    logger.setLevel(log_level)
    formatter = logging.Formatter(fmt='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                                  datefmt='%m-%d-%y %H:%M:%S')
    fh = TimedRotatingFileHandler(os.path.join("logs", filename + ".log"), when='midnight')
    fh.setFormatter(formatter)
    sh = logging.StreamHandler(sys.stdout)
    sh.setLevel(log_level)
    sh.setFormatter(formatter)
    logger.addHandler(fh)
    logger.addHandler(sh)
    return logger

In [3]:
os.makedirs("logs", exist_ok=True)
if os.path.isfile("logs/svm_antigo_para_recente.log"):
    os.remove("logs/svm_antigo_para_recente.log")
logger = log_setup("svm_antigo_para_recente", logging.INFO)

In [4]:
data = pd.read_csv("../mai_2022_fev_2023.csv")

In [5]:
len(data)

81501

In [6]:
data.head()

Unnamed: 0,iluminancia_caixa,iluminancia_teto,temperatura,usuario_1,usuario_2,usuario_3,usuario_4,usuario_5,usuario_6,usuario_7,...,data_ano,data_mes,data_dia,data_hora,data_minuto,data_dia_da_semana,porta,janela1,janela2,output
0,263,62,23.73,0,0,0,0,0,0,0,...,2022,6,7,14,28,1,1,1,1,0
1,25,65,24.36,0,0,0,0,0,0,0,...,2022,6,7,14,30,1,1,1,1,0
2,266,66,23.87,0,0,0,0,0,0,0,...,2022,6,7,14,32,1,1,1,1,0
3,272,68,23.87,0,0,0,0,0,0,0,...,2022,6,7,14,34,1,1,1,1,0
4,275,71,23.87,0,0,0,0,0,0,0,...,2022,6,7,14,36,1,1,1,1,0


In [7]:
data.tail()

Unnamed: 0,iluminancia_caixa,iluminancia_teto,temperatura,usuario_1,usuario_2,usuario_3,usuario_4,usuario_5,usuario_6,usuario_7,...,data_ano,data_mes,data_dia,data_hora,data_minuto,data_dia_da_semana,porta,janela1,janela2,output
81496,164,195,35.12,0,0,1,1,0,1,1,...,2023,2,16,15,1,3,1,1,1,3
81497,165,195,35.16,0,0,1,1,0,1,1,...,2023,2,16,15,6,3,1,1,1,3
81498,165,194,35.17,0,0,1,1,0,1,1,...,2023,2,16,15,12,3,1,1,1,3
81499,164,189,35.17,0,0,1,1,0,1,1,...,2023,2,16,15,17,3,1,1,1,3
81500,162,183,35.17,0,0,1,1,0,1,1,...,2023,2,16,15,22,3,1,1,1,3


In [8]:
class_size = data.groupby(['output'])['output'].count()
print(f"class_size: {class_size}")

class_size: output
0    22798
1      441
2      320
3    57942
Name: output, dtype: int64


In [9]:
# Exemplo para o dataset completo
# Calculando os pesos das classes
# 0    22798
# 1      441
# 2      320
# 3    57942
# Total = 81501

# wc0 = 81501 / (4 * 22798)
# logger.info(f"wc0: {wc0}")
# wc1 = 81501 / (4 * 441)
# logger.info(f"wc1: {wc1}")
# wc2 = 81501 / (4 * 320)
# logger.info(f"wc2: {wc2}")
# wc3 = 81501 / (4 * 57942)
# logger.info(f"wc3: {wc3}")

def get_class_weights(class_size):
    weights = {}
    total_number_of_samples = class_size.sum()

    for idx, number_of_samples in class_size.items():
        weights[idx] = total_number_of_samples / (len(class_size) * number_of_samples)

    return weights

get_class_weights(class_size)

{0: 0.893729713132731,
 1: 46.20238095238095,
 2: 63.67265625,
 3: 0.3516490628559594}

In [10]:
features_names = list(data.columns)[:-1]
logger.info(f"features_names: {features_names}")
logger.info(f"len(features_names): {len(features_names)}")

09-23-23 14:10:11 svm_antigo_para_recente INFO     features_names: ['iluminancia_caixa', 'iluminancia_teto', 'temperatura', 'usuario_1', 'usuario_2', 'usuario_3', 'usuario_4', 'usuario_5', 'usuario_6', 'usuario_7', 'usuario_8', 'usuario_9', 'usuario_10', 'usuario_11', 'usuario_12', 'usuario_13', 'usuario_14', 'usuario_15', 'usuario_16', 'usuario_17', 'usuario_18', 'usuario_19', 'usuario_20', 'usuario_21', 'usuario_22', 'usuario_23', 'usuario_24', 'usuario_25', 'data_ano', 'data_mes', 'data_dia', 'data_hora', 'data_minuto', 'data_dia_da_semana', 'porta', 'janela1', 'janela2']
09-23-23 14:10:11 svm_antigo_para_recente INFO     len(features_names): 37


In [11]:
data['date'] = pd.to_datetime(dict(year=data.data_ano, month=data.data_mes, day=data.data_dia))

In [12]:
original_data = data.copy()

In [13]:
for day_range in [1, 7, 15, 30, 90, 120, 180, 365]:
    data = original_data.copy()
    logger.info(f"day_range: {day_range}")

    min_date = datetime.strptime(str(data['date'].min())[:10], '%Y-%m-%d')
    max_date = datetime.strptime(str(data['date'].max())[:10], '%Y-%m-%d')

    logger.info(f"min_date: {min_date}")
    logger.info(f"max_date: {max_date}")

    initial_date_train = datetime.strptime(str(data['date'].min())[:10], '%Y-%m-%d')
    final_date_train = initial_date_train + timedelta(days = day_range)

    initial_date_test = final_date_train
    final_date_test = initial_date_test + timedelta(days = day_range)

    if final_date_train > max_date:
        logger.info("A data final de treino é maior que a data máxima")
        break

    if initial_date_test > max_date:
        logger.info("A data inicial de teste é maior que a data máxima")
        break

    logger.info(f"initial_date_test: {initial_date_test}")
    logger.info(f"final_date_test: {final_date_test}")
    logger.info(f"initial_date_train: {initial_date_train}")
    logger.info(f"final_date_train: {final_date_train}")

    logger.info(f"original_data: {len(original_data)}")

    data_train = data.loc[(data['date'] >= initial_date_train) & (data['date'] < final_date_train)]
    logger.info(f"data_train: {len(data_train)}")
    data_test = data.loc[(data['date'] >= initial_date_test) & (data['date'] < final_date_test)]
    logger.info(f"data_test: {len(data_test)}")

    logger.info(f"min_date_train: {data_train['date'].min()}")
    logger.info(f"max_date_train: {data_train['date'].max()}")
    logger.info(f"min_date_test: {data_test['date'].min()}")
    logger.info(f"max_date_test: {data_test['date'].max()}")

    logger.info(f"Distribuição de classes - Treino: {data_train['output'].value_counts()}")
    logger.info(f"Distribuição de classes - Teste: {data_test['output'].value_counts()}")

    X_train = data_train[features_names]
    logger.info(f"X_train.shape: {X_train.shape}")

    X_train = X_train.values.tolist()
    y_train = data_train['output'].values.tolist()

    X_test = data_test[features_names]
    logger.info(f"X_test.shape: {X_test.shape}")

    X_test = X_test.values.tolist()
    y_test = data_test['output'].values.tolist()

    # Train-Test
    logger.info("\n-- Train/Test --")
    clf = svm.SVC(kernel="linear", C=1.0)
    try:
        clf.fit(X_train, y_train)
    except Exception as error:
        logger.info(f"Erro: {error}")
        continue

    y_pred = clf.predict(X_test)

    logger.info('Precision: %.3f' % precision_score(y_test, y_pred, average='weighted'))
    logger.info('Recall: %.3f' % recall_score(y_test, y_pred, average='weighted'))
    logger.info('F1: %.3f' % f1_score(y_test, y_pred, average='weighted'))
    logger.info('Accuracy: %.3f' % accuracy_score(y_test, y_pred))
    cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
    logger.info(f"cm: {cm}")

    logger.info("\n-- Weighted Train/Test --")
    class_size = data_train.groupby(['output'])['output'].count()
    class_weight = get_class_weights(class_size)
    logger.info(f"Classes Size: {class_size}")
    logger.info(f"Classes Weight: {class_weight}")
    
    wclf = svm.SVC(kernel="linear", class_weight=class_weight)
    try:
        wclf.fit(X_train, y_train)
    except Exception as error:
        logger.info(f"Erro: {error}")
        continue
    
    y_pred_w = wclf.predict(X_test)

    logger.info('Precision: %.3f' % precision_score(y_test, y_pred_w, average='weighted'))
    logger.info('Recall: %.3f' % recall_score(y_test, y_pred_w, average='weighted'))
    logger.info('F1: %.3f' % f1_score(y_test, y_pred_w, average='weighted'))
    logger.info('Accuracy: %.3f' % accuracy_score(y_test, y_pred_w))
    
    cm = confusion_matrix(y_test, y_pred_w, labels=wclf.classes_)
    logger.info(f"cm: {cm}")

    # Reescalando os dados
    logger.info("\n-- Train/Test with data scaling --")
    scaler_train = StandardScaler()
    scaler_train.fit(X_train)
    X_train_scaled = scaler_train.transform(X_train)

    # scaler_test = StandardScaler()
    # scaler_test.fit(X_test)
    X_test_scaled = scaler_train.transform(X_test)

    clf = svm.SVC(kernel="linear", C=1.0)
    try:
        clf.fit(X_train_scaled, y_train)
    except Exception as error:
        logger.info(f"Erro: {error}")
        continue

    y_pred = clf.predict(X_test_scaled)

    logger.info('Precision: %.3f' % precision_score(y_test, y_pred, average='weighted'))
    logger.info('Recall: %.3f' % recall_score(y_test, y_pred, average='weighted'))
    logger.info('F1: %.3f' % f1_score(y_test, y_pred, average='weighted'))
    logger.info('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

    cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
    logger.info(f"cm: {cm}")

    logger.info("\n-- Weighted Train/Test with data scaling --")
    # fit the model and get the separating hyperplane using weighted classes
    wclf = svm.SVC(kernel="linear", class_weight=class_weight)
    try:
        wclf.fit(X_train_scaled, y_train)
    except Exception as error:
        logger.info(f"Erro: {error}")
        continue

    y_pred_w = wclf.predict(X_test_scaled)

    logger.info("\n")
    logger.info("With weights")
    logger.info('Precision: %.3f' % precision_score(y_test, y_pred_w, average='weighted'))
    logger.info('Recall: %.3f' % recall_score(y_test, y_pred_w, average='weighted'))
    logger.info('F1: %.3f' % f1_score(y_test, y_pred_w, average='weighted'))
    logger.info('Accuracy: %.3f' % accuracy_score(y_test, y_pred_w))

    cm = confusion_matrix(y_test, y_pred_w, labels=wclf.classes_)
    logger.info(f"cm: {cm}")

    logger.info("\n ============================================================================= \n")

09-23-23 14:10:11 svm_antigo_para_recente INFO     day_range: 1
09-23-23 14:10:11 svm_antigo_para_recente INFO     min_date: 2022-06-07 00:00:00
09-23-23 14:10:11 svm_antigo_para_recente INFO     max_date: 2023-02-16 00:00:00
09-23-23 14:10:11 svm_antigo_para_recente INFO     initial_date_test: 2022-06-08 00:00:00
09-23-23 14:10:11 svm_antigo_para_recente INFO     final_date_test: 2022-06-09 00:00:00
09-23-23 14:10:11 svm_antigo_para_recente INFO     initial_date_train: 2022-06-07 00:00:00
09-23-23 14:10:11 svm_antigo_para_recente INFO     final_date_train: 2022-06-08 00:00:00
09-23-23 14:10:11 svm_antigo_para_recente INFO     original_data: 81501
09-23-23 14:10:11 svm_antigo_para_recente INFO     data_train: 276
09-23-23 14:10:11 svm_antigo_para_recente INFO     data_test: 694
09-23-23 14:10:11 svm_antigo_para_recente INFO     min_date_train: 2022-06-07 00:00:00
09-23-23 14:10:11 svm_antigo_para_recente INFO     max_date_train: 2022-06-07 00:00:00
09-23-23 14:10:11 svm_antigo_para_rec

  _warn_prf(average, modifier, msg_start, len(result))


09-23-23 14:10:13 svm_antigo_para_recente INFO     Precision: 0.758
09-23-23 14:10:13 svm_antigo_para_recente INFO     Recall: 0.637
09-23-23 14:10:13 svm_antigo_para_recente INFO     F1: 0.565
09-23-23 14:10:13 svm_antigo_para_recente INFO     Accuracy: 0.637
09-23-23 14:10:13 svm_antigo_para_recente INFO     cm: [[2567    0    0]
 [   1    0    0]
 [1608    0  476]]
09-23-23 14:10:13 svm_antigo_para_recente INFO     
-- Train/Test with data scaling --
09-23-23 14:10:13 svm_antigo_para_recente INFO     Precision: 0.757
09-23-23 14:10:13 svm_antigo_para_recente INFO     Recall: 0.637
09-23-23 14:10:13 svm_antigo_para_recente INFO     F1: 0.564
09-23-23 14:10:13 svm_antigo_para_recente INFO     Accuracy: 0.637
09-23-23 14:10:13 svm_antigo_para_recente INFO     cm: [[2566    0    1]
 [   1    0    0]
 [1607    0  477]]
09-23-23 14:10:13 svm_antigo_para_recente INFO     
-- Weighted Train/Test with data scaling --
09-23-23 14:10:13 svm_antigo_para_recente INFO     

09-23-23 14:10:13 svm_

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


09-23-23 14:10:13 svm_antigo_para_recente INFO     final_date_train: 2022-06-22 00:00:00
09-23-23 14:10:13 svm_antigo_para_recente INFO     original_data: 81501
09-23-23 14:10:13 svm_antigo_para_recente INFO     data_train: 9915
09-23-23 14:10:13 svm_antigo_para_recente INFO     data_test: 10373
09-23-23 14:10:13 svm_antigo_para_recente INFO     min_date_train: 2022-06-07 00:00:00
09-23-23 14:10:13 svm_antigo_para_recente INFO     max_date_train: 2022-06-21 00:00:00
09-23-23 14:10:13 svm_antigo_para_recente INFO     min_date_test: 2022-06-22 00:00:00
09-23-23 14:10:13 svm_antigo_para_recente INFO     max_date_test: 2022-07-06 00:00:00
09-23-23 14:10:13 svm_antigo_para_recente INFO     Distribuição de classes - Treino: output
3    5471
0    4319
1     123
2       2
Name: count, dtype: int64
09-23-23 14:10:13 svm_antigo_para_recente INFO     Distribuição de classes - Teste: output
3    5552
0    4725
2      93
1       3
Name: count, dtype: int64
09-23-23 14:10:13 svm_antigo_para_recente 

  _warn_prf(average, modifier, msg_start, len(result))


09-23-23 14:10:19 svm_antigo_para_recente INFO     

09-23-23 14:10:19 svm_antigo_para_recente INFO     With weights
09-23-23 14:10:19 svm_antigo_para_recente INFO     Precision: 0.968
09-23-23 14:10:19 svm_antigo_para_recente INFO     Recall: 0.954
09-23-23 14:10:19 svm_antigo_para_recente INFO     F1: 0.961
09-23-23 14:10:19 svm_antigo_para_recente INFO     Accuracy: 0.954
09-23-23 14:10:19 svm_antigo_para_recente INFO     cm: [[4402  208    0  115]
 [   3    0    0    0]
 [  64    0    0   29]
 [  21   36    2 5493]]
09-23-23 14:10:19 svm_antigo_para_recente INFO     

09-23-23 14:10:19 svm_antigo_para_recente INFO     day_range: 30
09-23-23 14:10:19 svm_antigo_para_recente INFO     min_date: 2022-06-07 00:00:00
09-23-23 14:10:19 svm_antigo_para_recente INFO     max_date: 2023-02-16 00:00:00
09-23-23 14:10:19 svm_antigo_para_recente INFO     initial_date_test: 2022-07-07 00:00:00
09-23-23 14:10:19 svm_antigo_para_recente INFO     final_date_test: 2022-08-06 00:00:00
09-23-23 14:10:1

  _warn_prf(average, modifier, msg_start, len(result))


09-23-23 14:21:20 svm_antigo_para_recente INFO     Precision: 0.963
09-23-23 14:21:20 svm_antigo_para_recente INFO     Recall: 0.396
09-23-23 14:21:20 svm_antigo_para_recente INFO     F1: 0.538
09-23-23 14:21:20 svm_antigo_para_recente INFO     Accuracy: 0.396
09-23-23 14:21:20 svm_antigo_para_recente INFO     cm: [[ 547    0    0   78]
 [   1    0    0    0]
 [   0    0    0    0]
 [6290 6392    0 7809]]
09-23-23 14:21:20 svm_antigo_para_recente INFO     
-- Weighted Train/Test with data scaling --
09-23-23 14:22:46 svm_antigo_para_recente INFO     

09-23-23 14:22:46 svm_antigo_para_recente INFO     With weights
09-23-23 14:22:46 svm_antigo_para_recente INFO     Precision: 0.976
09-23-23 14:22:46 svm_antigo_para_recente INFO     Recall: 0.920
09-23-23 14:22:46 svm_antigo_para_recente INFO     F1: 0.944
09-23-23 14:22:46 svm_antigo_para_recente INFO     Accuracy: 0.920
09-23-23 14:22:46 svm_antigo_para_recente INFO     cm: [[  506     0    42    77]
 [    1     0     0     0]
 [    0 

  _warn_prf(average, modifier, msg_start, len(result))
