In [1]:
from Core.DTO import *
from Core.Relations import *
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import re
from datetime import datetime 

## Conexão com banco

In [2]:
# 🔹 Configuração do banco (pode ser reutilizada para qualquer ModelDTOo)
mongo_url = "mongodb://localhost:27017/"
db_manager = DatabaseManager('mysql+pymysql://root:000000000@localhost/mydb', mongo_url = mongo_url)
session = db_manager.get_session()
dataset_repo = DatasetRepository(session)
conversor = ConverterDTO(session=session)
mongo_db = db_manager.get_mongo_db()


### Seoul

In [3]:
project_name = 'SeoulBike'
target_feature_name = 'rented_bike_count'
name_space= project_name

def process_raw_seoul_data():

    def remove_parentheses_content(text):
        return re.sub(r'\([^)]*\)', '', text)

    df = pd.read_csv('data//SeoulBikeData.csv', encoding='latin1')
    df.columns = [remove_parentheses_content(i.lower()).strip().replace(' ','_') for i in df.columns]
    df['timestamp'] = df['date'].map(lambda x : datetime.strptime(x,  "%d/%m/%Y"))
    df = df.drop(columns = 'date')
    df['timestamp'] = df['timestamp'] + pd.to_timedelta(df['hour'], unit='h')
    df_melt = df.melt(id_vars = ['timestamp','hour'], value_vars = df.drop(columns = 'timestamp').columns)
    df_melt.rename(columns = {'hour':'idEntity'}, inplace=True)
    df_melt['type'] = df_melt['value'].map(lambda x : type(x).__name__)
    df_melt.rename(columns ={'variable':'name'}, inplace=True)
    return df_melt

In [4]:

dataset = Dataset(name = project_name)
item_exists, dataset_dto = conversor.get_if_exists(dataset)

if not(item_exists):
    df = process_raw_seoul_data()

    dataset_dto = DatasetDTO(name = project_name ) 
    lst_features= df['name'].drop_duplicates().to_list()
    dataset_dto.process_feature_list(lst_features= lst_features, name_space=name_space)
    dataset_repo.save(dataset_dto)
    item_exists, dataset_dto = conversor.get_if_exists(dataset)
    dataset_dto.save_data_mongo(mongo_db ,df = df)

dataset_dto.load_data_from_mongo(mongo_db)
dataset = dataset_dto.dataset 

In [5]:
targetFeature = Feature(name = target_feature_name, nameSpace = FeatureNameSpace(name = name_space))
project = Project(name  = project_name, projectType = ProjectType(name = 'Regression'), targetFeature = targetFeature)

item_exists,project_dto = conversor.get_if_exists(project)

if not(item_exists):
	targetFeature = dataset_dto.get_feature_by_name(name = target_feature_name)
	project_dto =ProjectDTO(name  = project_name, projectType = ProjectTypeDTO(name = 'Regression'), targetFeature = targetFeature)
	ProjectRepository(session=session).save(project_dto)


In [None]:
datas = pd.date_range(start="2018-01-01", end="2018-11-30", freq="MS")  
for data_inicio in tqdm(datas):
    data_fim = pd.date_range(start=data_inicio, periods=1, freq="ME")[0]

    #treinando
    model = OHEDecisionTreeRegressor()
    task = SeoulBikeTrainingTask (dataset = dataset) 
    run = Run(project = project, task = task,  model = model)
    run.execute( task_parameters={'end_date':data_inicio})
    run_dto = conversor.converter_object_to_dto(run)
    run_repo = RunRepository(session=session)   
    run_repo.save(run_dto)

    #predição
    model.idModel =  run_dto.model.idModel
    task = SeoulBikePredictionTask (dataset = dataset) 
    run = Run(project = project, task = task,  model = model)
    run.execute( task_parameters={'start_date':data_inicio,'end_date':data_fim})
    run_dto = conversor.converter_object_to_dto(run)
    run_repo = RunRepository(session=session)
    run_repo.save(run_dto)


  0%|          | 0/11 [00:00<?, ?it/s]

In [None]:
import shap
X = dataset_dto.df
preprocessor = model.model.named_steps['preprocessor']
regressor = model.model.named_steps['regressor']

feature_names = model.feature_names()

X_transformed = preprocessor.transform(X)
explainer = shap.Explainer(regressor, X_transformed, feature_names=feature_names)
shap_values = explainer(X_transformed,  check_additivity=False)

shap_df = pd.DataFrame(shap_values.values, columns=feature_names)
shap_df['prediction_i'] = range(len(shap_df))

# Derreter para o formato longo (long format)
shap_df_long = shap_df.melt(id_vars='prediction_i', var_name='feature', value_name='contribution')
shap_df_long.sort_values(by = 'prediction_i', ignore_index=True, inplace=True)


In [11]:
preprocessor.get_feature_names_out()

array(['cat__SeoulBike__functioning_day_Yes',
       'cat__SeoulBike__holiday_Holiday',
       'cat__SeoulBike__holiday_No Holiday',
       'cat__SeoulBike__seasons_Winter',
       'remainder__SeoulBike__dew_point_temperature',
       'remainder__SeoulBike__humidity', 'remainder__SeoulBike__rainfall',
       'remainder__SeoulBike__snowfall',
       'remainder__SeoulBike__solar_radiation',
       'remainder__SeoulBike__temperature',
       'remainder__SeoulBike__visibility',
       'remainder__SeoulBike__wind_speed'], dtype=object)

In [54]:
explainer = shap.Explainer(pipeline.named_steps['regressor'], preprocessor.transform(df), feature_names=model.feature_names())
shap_values = explainer(preprocessor.transform(df[2342:2343]), check_additivity=False)

In [46]:
preprocessor.transform(df[2342:2343])

array([[1.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 8.000e-01, 6.200e+01,
        0.000e+00, 0.000e+00, 1.640e+00, 7.700e+00, 1.758e+03, 2.400e+00]])

In [47]:
preprocessor.transform(df[2343:2344]) 

array([[1.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 1.000e-01, 5.300e+01,
        0.000e+00, 0.000e+00, 2.230e+00, 9.200e+00, 1.896e+03, 3.300e+00]])

In [55]:
shap_df = pd.DataFrame({
    'variável': explainer.feature_names,
    'contribuição': shap_values.data[0]
})

In [56]:
shap_df

Unnamed: 0,variável,contribuição
0,SeoulBike__functioning_day_Yes,1.0
1,SeoulBike__holiday_Holiday,0.0
2,SeoulBike__holiday_No Holiday,1.0
3,SeoulBike__seasons_Winter,0.0
4,SeoulBike__dew_point_temperature,0.8
5,SeoulBike__humidity,62.0
6,SeoulBike__rainfall,0.0
7,SeoulBike__snowfall,0.0
8,SeoulBike__solar_radiation,1.64
9,SeoulBike__temperature,7.7


In [8]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import shap

# 1. Gerar dados de exemplo com categóricas e numéricas
np.random.seed(42)
df = pd.DataFrame({
    'idade': np.random.randint(18, 60, 100),
    'renda': np.random.normal(5000, 1500, 100),
    'sexo': np.random.choice(['masculino', 'feminino'], 100),
    'cidade': np.random.choice(['SP', 'RJ', 'BH'], 100),
})
df['gasto'] = df['renda'] * 0.3 + df['idade'] * 2 + (df['sexo'] == 'feminino') * 200 + np.random.normal(0, 50, 100)

# 2. Separar variáveis
X = df.drop('gasto', axis=1)
y = df['gasto']

# 3. Definir colunas
cat_cols = ['sexo', 'cidade']
num_cols = ['idade', 'renda']

# 4. Criar pipeline
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(sparse_output=False), cat_cols)
])

pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

# 5. Treinar modelo
pipeline.fit(X, y)

# 6. Fazer predição para uma instância
instance = X.iloc[[0]]
prediction = pipeline.predict(instance)[0]

# 7. Explicação com SHAP
explainer = shap.Explainer(pipeline.named_steps['model'], preprocessor.transform(X), feature_names=preprocessor.get_feature_names_out())
shap_values = explainer(preprocessor.transform(instance))

# 8. Formatando resultado
shap_df = pd.DataFrame({
    'variável': explainer.feature_names,
    'contribuição': shap_values.values[0]
})
shap_df['predição'] = prediction
shap_df = shap_df[['predição', 'variável', 'contribuição']]

# 9. Exibir
print(shap_df)

      predição             variável  contribuição
0  1682.774811           num__idade     19.387550
1  1682.774811           num__renda    -85.417889
2  1682.774811   cat__sexo_feminino     23.256057
3  1682.774811  cat__sexo_masculino     25.483806
4  1682.774811       cat__cidade_BH      3.131381
5  1682.774811       cat__cidade_RJ     -1.623889
6  1682.774811       cat__cidade_SP     -0.205007
