In [1]:
from Core.DTO import *
from Core.Relations import *
import pandas as pd
import numpy as np


## Input de dados

In [2]:
# Database configuration (the session can be reused by any repository/DTO).
# Credentials come from the environment so no password is stored in the notebook:
#   MYDB_URL       full SQLAlchemy URL; used as-is when set
#   MYDB_PASSWORD  password for root@localhost/mydb, used when MYDB_URL is unset
import os
from getpass import getpass
from urllib.parse import quote_plus

db_url = os.environ.get("MYDB_URL")
if not db_url:
    db_password = os.environ.get("MYDB_PASSWORD")
    if db_password is None:
        # Interactive fallback; the typed value is not saved in the notebook.
        db_password = getpass("MySQL password for root@localhost/mydb: ")
    # quote_plus keeps characters such as '@' or '/' from breaking the URL.
    db_url = f"mysql+pymysql://root:{quote_plus(db_password)}@localhost/mydb"

db_manager = DatabaseManager(db_url)
session = db_manager.get_session()
dataset_repo = DatasetRepository(session)


In [3]:
from sklearn import datasets

# Look up the stored 'Iris' dataset; when the lookup returns nothing, build it
# from scikit-learn's bundled copy and save it through dataset_repo.
dataset_dto = dataset_repo.filter_by({'name' : 'Iris'}).first()
if not dataset_dto:
    dataset_iris = datasets.load_iris()
    # Feature matrix plus the values of dataset_iris.target in a 'target' column.
    df = pd.DataFrame(
        dataset_iris.data,
        columns=dataset_iris.feature_names,
    ).assign(target=dataset_iris.target)
    dataset = Dataset(targetFeature='target', df=df, name='Iris')
    dataset_dto = DatasetDTO(dataset=dataset)
    dataset_repo.save(dataset_dto)


In [None]:
import re
from datetime import datetime

def remove_parentheses_content(text):
    """Return ``text`` with every ``(...)`` group removed.

    A group starts at ``(`` and ends at the first following ``)``; the
    parentheses themselves are removed too. Whitespace around a group is
    left untouched, and an unmatched ``(`` is kept as-is.
    """
    parenthesized_group = re.compile(r'\([^)]*\)')
    return parenthesized_group.sub('', text)

# Look up the stored 'SeoulBike' dataset; when the lookup returns nothing, load
# the CSV, normalise it and save it through dataset_repo.
dataset_dto = dataset_repo.filter_by({'name' : 'SeoulBike'}).first()
if not dataset_dto:
    df = pd.read_csv('data//SeoulBikeData.csv', encoding='latin1')
    # Column names: lower-cased, "(...)" parts removed, trimmed, spaces -> '_'.
    df.columns = [
        remove_parentheses_content(col.lower()).strip().replace(' ', '_')
        for col in df.columns
    ]
    # Vectorised parse of the day/month/year strings (replaces a per-row
    # datetime.strptime call).
    df['date'] = pd.to_datetime(df['date'], format="%d/%m/%Y")
    dataset = Dataset(targetFeature='rented_bike_count', df=df, name='SeoulBike')
    dataset_dto = DatasetDTO(dataset=dataset)
    dataset_repo.save(dataset_dto)

### Teste Iris

In [None]:

# Execute IrisClassificationTask with RandomForestClassifierModel on the stored
# 'Iris' dataset, then save the resulting run.
dataset = dataset_repo.filter_by({'name' : 'Iris'}).first().dataset
model = RandomForestClassifierModel()

task = IrisClassificationTask(dataset=dataset)
run = Run()
# Hyperparameters handed to the run as `modelParameters`.
rf_parameters = {"n_estimators": 100, "max_depth": 5, "criterion": "gini"}
run.execute(task=task, model=model, modelParameters=rf_parameters)

# Convert the run object to its DTO and save it through the run repository.
conversor = ConverterDTO(session=session)
run_dto = conversor.converter_object_to_dto(run)
run_repo = RunRepository(session=session)
run_repo.save(run_dto)

### Teste SeoulBike

In [10]:
# Fetch the stored 'SeoulBike' dataset and expose its DataFrame as `df`.
seoul_bike_dto = dataset_repo.filter_by({'name' : 'SeoulBike'}).first()
dataset = seoul_bike_dto.dataset
df = dataset.df

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Features and target: the target column and the date column are left out of X.
X = df.drop(columns=["rented_bike_count", "date"])
y = df["rented_bike_count"].astype(int)

# Column groups selected by dtype.
categorical_cols = X.select_dtypes(include=["category", "object"]).columns.tolist()
numeric_cols = X.select_dtypes(include=["number"]).columns.tolist()

# One-hot encode the categorical columns; every other column is passed through.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_cols)
    ],
    remainder="passthrough"
)

# Pipeline: preprocessing followed by a DecisionTreeRegressor.
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", DecisionTreeRegressor(random_state=42))
])

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training
model.fit(X_train, y_train)

# Prediction
y_pred = model.predict(X_test)

# Evaluation. RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# while this form gives the same value on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.2f}")

RMSE: 345.05
R²: 0.71




In [None]:
# Show the columns selected as categorical (the ones given to the OneHotEncoder).
categorical_cols

['seasons', 'holiday', 'functioning_day']

In [None]:
# Display the pipeline's 'regressor' step.
# NOTE(review): set_params() is called without arguments, so no hyperparameter
# is changed here; if the goal was to list the current hyperparameters,
# get_params() was presumably intended — confirm.
model.named_steps['regressor'].set_params()

In [None]:
# Steps of the pipeline, looked up by name.
fitted_steps = model.named_steps

# Trained one-hot encoder and the names of the columns it generated.
ohe = fitted_steps["preprocessor"].named_transformers_["cat"]
ohe_feature_names = ohe.get_feature_names_out(categorical_cols)

# Encoded column names first, then the numeric column names.
# NOTE(review): this pairing with the importances assumes the passthrough
# columns are exactly `numeric_cols`, in that order — confirm if dtypes change.
feature_names = [*ohe_feature_names, *numeric_cols]

# Importances read straight from the regressor step.
importances = fitted_steps["regressor"].feature_importances_

# One row per feature, most important first.
importancias_df = pd.DataFrame(
    {"feature": feature_names, "importance": importances}
).sort_values(by="importance", ascending=False)

print("\nTop 15 features mais importantes:")
print(importancias_df.head(15))


Top 15 features mais importantes:
                  feature  importance
9             temperature    0.297462
8                    hour    0.283113
10               humidity    0.099838
14        solar_radiation    0.093880
6      functioning_day_No    0.084870
13  dew_point_temperature    0.039355
0          seasons_Autumn    0.025672
12             visibility    0.020564
11             wind_speed    0.017793
15               rainfall    0.017707
3          seasons_Winter    0.008987
7     functioning_day_Yes    0.003540
1          seasons_Spring    0.001876
2          seasons_Summer    0.001812
5      holiday_No Holiday    0.001741
