In [1]:
from Core.DTO import *
from Core.Relations import *
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import re
from datetime import datetime 

## Conexão com banco

In [2]:
# 🔹 Configuração do banco (pode ser reutilizada para qualquer ModelDTOo)
mongo_url = "mongodb://localhost:27017/"
db_manager = DatabaseManager('mysql+pymysql://root:000000000@localhost/mydb', mongo_url = mongo_url)
session = db_manager.get_session()
dataset_repo = DatasetRepository(session)
conversor = ConverterDTO(session=session)
mongo_db = db_manager.get_mongo_db()


### Seoul

In [3]:
project_name = 'SeoulBike'
target_feature_name = 'rented_bike_count'
name_space= project_name

def process_raw_seoul_data():

    def remove_parentheses_content(text):
        return re.sub(r'\([^)]*\)', '', text)

    df = pd.read_csv('data//SeoulBikeData.csv', encoding='latin1')
    df.columns = [remove_parentheses_content(i.lower()).strip().replace(' ','_') for i in df.columns]
    df['timestamp'] = df['date'].map(lambda x : datetime.strptime(x,  "%d/%m/%Y"))
    df = df.drop(columns = 'date')
    df['timestamp'] = df['timestamp'] + pd.to_timedelta(df['hour'], unit='h')
    df_melt = df.melt(id_vars = ['timestamp'], value_vars = df.drop(columns = 'timestamp').columns)
    df_melt['idEntity'] = '1'
    df_melt['type'] = df_melt['value'].map(lambda x : type(x).__name__)
    df_melt.rename(columns ={'variable':'name'}, inplace=True)
    return df_melt

In [4]:

dataset = Dataset(name = project_name)
item_exists, dataset_dto = conversor.get_if_exists(dataset)

if not(item_exists):
    df = process_raw_seoul_data()

    dataset_dto = DatasetDTO(name = project_name ) 
    lst_features= df['name'].drop_duplicates().to_list()
    dataset_dto.process_feature_list(lst_features= lst_features, name_space=name_space)
    dataset_repo.save(dataset_dto)
    item_exists, dataset_dto = conversor.get_if_exists(dataset)
    dataset_dto.save_data_mongo(mongo_db ,df = df)

dataset_dto.load_data_from_mongo(mongo_db)
dataset = dataset_dto.dataset 

In [5]:
targetFeature = Feature(name = target_feature_name, nameSpace = FeatureNameSpace(name = name_space))
project = Project(name  = project_name, projectType = ProjectType(name = 'Regression'), targetFeature = targetFeature)

item_exists,project_dto = conversor.get_if_exists(project)

if not(item_exists):
	targetFeature = dataset_dto.get_feature_by_name(name = target_feature_name)
	project_dto =ProjectDTO(name  = project_name, projectType = ProjectTypeDTO(name = 'Regression'), targetFeature = targetFeature)
	ProjectRepository(session=session).save(project_dto)


In [6]:
datas = pd.date_range(start="2018-01-01", end="2018-11-30", freq="MS")  
for data_inicio in tqdm(datas):
    data_fim = pd.date_range(start=data_inicio, periods=1, freq="ME")[0]

    #treinando
    model = OHEDecisionTreeRegressor()
    task = SeoulBikeTrainingTask (dataset = dataset) 
    run = Run(project = project, task = task,  model = model)
    run.execute( task_parameters={'end_date':data_inicio})
    run_dto = conversor.converter_object_to_dto(run)
    run_repo = RunRepository(session=session)   
    run_repo.save(run_dto)

    #predição
    model.idModel =  run_dto.model.idModel
    task = SeoulBikePredictionTask (dataset = dataset) 
    run = Run(project = project, task = task,  model = model)
    run.execute( task_parameters={'start_date':data_inicio,'end_date':data_fim})
    run_dto = conversor.converter_object_to_dto(run)
    run_repo = RunRepository(session=session)
    run_repo.save(run_dto)


  0%|          | 0/11 [00:00<?, ?it/s]

In [71]:
from evidently.legacy.report import Report
from evidently.legacy.metric_preset  import DataDriftPreset 

In [72]:
# Simulando dados
np.random.seed(42)
ref_data = pd.DataFrame({
    'feature_1': np.random.normal(0, 1, 1000),
    'feature_2': np.random.choice(['A', 'B', 'C'], 1000, p=[0.6, 0.3, 0.1]),
})
prod_data = pd.DataFrame({
    'feature_1': np.random.normal(0.5, 1.2, 1000),
    'feature_2': np.random.choice(['A', 'B', 'C'], 1000, p=[0.4, 0.4, 0.2]),
})

# Gerar relatório
report = Report(metrics=[DataDriftPreset()])
report.run(reference_data=ref_data, current_data=prod_data)
#report.show()

In [85]:
results = report.as_dict()

# Exemplo: lista de features e se houve drift
drift_results = results['metrics'][1]['result']['drift_by_columns']

for feature, info in drift_results.items():
    print(f"Feature: {feature}")
    #print(f"  Drift detectado: {info['drift_detected']}")
    #print(f"  Tipo: {info['column_type']}")
    #print(f"  Score (ex: PSI): {info['drift_score']}")
    #print(f"  Teste usado: {info['stat_test']}")
    print(info)

Feature: feature_1
{'column_name': 'feature_1', 'column_type': 'num', 'stattest_name': 'K-S p_value', 'stattest_threshold': 0.05, 'drift_score': 1.3152720028193915e-21, 'drift_detected': True, 'current': {'small_distribution': {'x': [-3.1234145869849903, -2.384543304247292, -1.6456720215095937, -0.9068007387718953, -0.16792945603419707, 0.5709418267035011, 1.3098131094411998, 2.0486843921788975, 2.787555674916596, 3.526426957654295, 4.265298240391992], 'y': [0.010827325661322294, 0.031128561276301585, 0.10962667232088819, 0.2246670074724376, 0.32888001696266467, 0.31805269130134217, 0.207072603272789, 0.0879720209982436, 0.025714898445640434, 0.009473909953657019]}}, 'reference': {'small_distribution': {'x': [-3.2412673400690726, -2.531867456996693, -1.822467573924314, -1.1130676908519344, -0.40366780777955524, 0.30573207529282387, 1.0151319583652039, 1.724531841437583, 2.433931724509962, 3.143331607582341, 3.852731490654721], 'y': [0.0056385687331610155, 0.031012128032385608, 0.135325