In [1]:
from Core.DTO import *
from Core.Relations import *
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import re
from datetime import datetime 

## Conexão com banco

In [None]:
# 🔹 Configuração do banco (pode ser reutilizada para qualquer ModelDTOo)
mongo_url = "mongodb://localhost:27017/"
db_manager = DatabaseManager('mysql+pymysql://root:000000000@localhost/mydb', mongo_url = mongo_url)
session = db_manager.get_session()
dataset_repo = DatasetRepository(session)
conversor = ConverterDTO(session=session)
mongo_db = db_manager.get_mongo_db()


In [None]:
project_name = 'LoanDefaultPrediction'
target_feature_name = 'default'
name_space= project_name


def process_raw_loan_data():

    columns = ['id', 'loan_amnt', 'term', 'int_rate', 'installment', 'emp_length',
       'home_ownership', 'annual_inc', 'verification_status', 'issue_d',
       'loan_status', 'purpose', 'addr_state', 'dti']

    df = pd.read_csv('data//Loan.csv', usecols = columns )
    
    df['term'] = df['term'].str.strip().str.split(' ').map(lambda x: x[0]).astype('int')
    df['timestamp'] = pd.to_datetime(df['issue_d'], format="%b-%y")

    dict_emp = {'10+ years': 10,
    '< 1 year': 0,
    '1 year': 1,
    '3 years': 3,
    '8 years': 8,
    '9 years': 9,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '2 years': 2,
    '7 years': 7}

    ix = df['emp_length'].notnull()
    df.loc[ix,'emp_length'] = df.loc[ix,'emp_length'].map(lambda x : dict_emp[x])
    df['emp_length'] = df['emp_length'].astype('float')
    df['int_rate'] = df['int_rate'].str.replace('%','', regex=False).astype('float')

    df = df[df['loan_status'] != 'Current'].reset_index(drop=True)
    df['default'] = (df['loan_status'] == 'Charged Off').astype('int')
    df.drop(columns = ['issue_d','loan_status'], inplace=True)
    df.rename(columns={'id':'idEntity'}, inplace=True)
    df['idEntity'] = df['idEntity'].astype('string')
    df = df[df['timestamp']<'2011-02-01'].reset_index(drop=True)

    df_melt = df.melt(id_vars = ['timestamp','idEntity'], value_vars = df.drop(columns = 'timestamp').columns)
    df_melt['type'] = df_melt['value'].map(lambda x : type(x).__name__)
    df_melt.rename(columns ={'variable':'name'}, inplace=True)

    return df_melt

In [None]:

dataset = Dataset(name = project_name)
item_exists, dataset_dto = conversor.get_if_exists(dataset)

if not(item_exists):
    df = process_raw_loan_data()

    dataset_dto = DatasetDTO(name = project_name ) 
    lst_features= df['name'].drop_duplicates().to_list()
    dataset_dto.process_feature_list(lst_features= lst_features, name_space=name_space)
    dataset_repo.save(dataset_dto)
    item_exists, dataset_dto = conversor.get_if_exists(dataset)
    dataset_dto.save_data_mongo(mongo_db ,df = df)

dataset_dto.load_data_from_mongo(mongo_db)
dataset = dataset_dto.dataset 

In [None]:
targetFeature = Feature(name = target_feature_name, nameSpace = FeatureNameSpace(name = name_space))
project = Project(name  = project_name, projectType = ProjectType(name = 'Classification'), targetFeature = targetFeature)

item_exists,project_dto = conversor.get_if_exists(project)

if not(item_exists):
	targetFeature = dataset_dto.get_feature_by_name(name = target_feature_name)
	project_dto =ProjectDTO(name  = project_name, projectType = ProjectTypeDTO(name = 'Classification'), targetFeature = targetFeature)
	ProjectRepository(session=session).save(project_dto)


In [None]:
datas = pd.date_range(start="2009-01-01", end="2011-11-01", freq="MS")
for data_inicio in tqdm(datas):
    data_fim = pd.date_range(start=data_inicio, periods=1, freq="ME")[0]

    #treinando
    model = OHERandomForestClassifier()
    task = ClassificationTrainingTask (dataset = dataset) 
    run = Run(project = project, task = task,  model = model)
    run.execute( task_parameters={'end_date':data_inicio})
    run_dto = conversor.converter_object_to_dto(run)
    run_repo = RunRepository(session=session)   
    run_repo.save(run_dto)

    #predição
        
    slices = [{'condition':"addr_state == 'NY'", 'description':'NY'},
            {'condition':"addr_state == 'CA'", 'description':'CA'},
            {'condition':"addr_state == 'FL'", 'description':'FL'},
            {'condition':"addr_state == 'TX'", 'description':'TX'},
            {'condition':"addr_state == 'IL'", 'description':'IL'}]

    model.idModel =  run_dto.model.idModel
    task = ClassificationPredictionTask (dataset = dataset) 
    run = Run(project = project, task = task,  model = model)
    run.execute( task_parameters={'start_date':data_inicio,'end_date':data_fim, 'slices' :slices})

    run_dto = conversor.converter_object_to_dto(run)
    run_repo = RunRepository(session=session)
    run_repo.save(run_dto)

    #drift features
    task = FeatureDriftCheckTask (dataset = dataset) 
    run = Run(project = project, task = task,  model = None)
    run.execute( task_parameters={'end_reference_date':data_inicio,
                                  'start_current_date':data_inicio,'end_current_date':data_fim})
    run_dto = conversor.converter_object_to_dto(run)
    run_repo = RunRepository(session=session)
    run_repo.save(run_dto)

    #missing features
    task = FeatureDriftCheckTask (dataset = dataset) 
    run = Run(project = project, task = task,  model = None)
    run.execute( task_parameters={'end_reference_date':data_inicio,
                                  'start_current_date':data_inicio,'end_current_date':data_fim})
    run_dto = conversor.converter_object_to_dto(run)
    run_repo = RunRepository(session=session)
    run_repo.save(run_dto)


    #outlier detection
    task = OutlierDetectionTask (dataset = dataset) 
    run = Run(project = project, task = task,  model = None)
    run.execute( task_parameters={'end_reference_date':data_inicio,
                                    'start_current_date':data_inicio,'end_current_date':data_fim})
    run_dto = conversor.converter_object_to_dto(run)
    run_repo = RunRepository(session=session)
    run_repo.save(run_dto)


  0%|          | 0/2 [00:00<?, ?it/s]

  stat, p_value = ks_2samp(pos_scores, neg_scores)
  stat, p_value = ks_2samp(pos_scores, neg_scores)
  stat, p_value = ks_2samp(pos_scores, neg_scores)
  stat, p_value = ks_2samp(pos_scores, neg_scores)
  stat, p_value = ks_2samp(pos_scores, neg_scores)
  stat, p_value = ks_2samp(pos_scores, neg_scores)
  stat, p_value = ks_2samp(pos_scores, neg_scores)
  top_k_rate = positives_top_k / cutoff
  stat, p_value = ks_2samp(pos_scores, neg_scores)
  stat, p_value = ks_2samp(pos_scores, neg_scores)
  stat, p_value = ks_2samp(pos_scores, neg_scores)
  stat, p_value = ks_2samp(pos_scores, neg_scores)
  stat, p_value = ks_2samp(pos_scores, neg_scores)
  stat, p_value = ks_2samp(pos_scores, neg_scores)
  stat, p_value = ks_2samp(pos_scores, neg_scores)
  top_k_rate = positives_top_k / cutoff


In [9]:
df = dataset.df.drop(columns = ['idEntity'], errors='ignore') 
name_space_map = {}
for feature in dataset.features:
    name_space_map[feature.name] = feature.nameSpace

In [13]:
df_current = dataset.df.drop(columns=['timestamp','idEntity'])

serie_missing = df_current.isnull().mean() * 100
df_missing = pd.DataFrame(serie_missing.values, serie_missing.index).T
dict_missing = df_missing.to_dict(orient = 'records')[0]
measureValues = []
for feature_name, missing_pc in dict_missing.items():
    name_space_obj = name_space_map[feature_name]
    name_space_id = name_space_obj.idFeatureNameSpace
    feature_reference = Feature(name = feature_name, nameSpace=name_space_obj, idFeatureNameSpace = name_space_id)
    measure = Measure(name = 'Missing PC' +' ' + feature_reference.nameSpace.name + ' ' +feature_reference.name)
    measure.subjectFeatures = [SubjectFeature(feature=feature_reference)]
    measureValue = MeasureValue(measure=measure, value = missing_pc, evaluationProcedure= None) # TODO
    measureValues.append(measureValue)

    print(feature_name, missing_pc)

addr_state 0.0
annual_inc 0.0
default 0.0
dti 0.0
emp_length 2.6777613603960906
home_ownership 0.0
installment 0.0
int_rate 0.0
loan_amnt 0.0
purpose 0.0
term 0.0
verification_status 0.0


### Seoul

In [3]:
project_name = 'SeoulBike'
target_feature_name = 'rented_bike_count'
name_space= project_name

def process_raw_seoul_data():

    def remove_parentheses_content(text):
        return re.sub(r'\([^)]*\)', '', text)

    df = pd.read_csv('data//SeoulBikeData.csv', encoding='latin1')
    df.columns = [remove_parentheses_content(i.lower()).strip().replace(' ','_') for i in df.columns]
    df['timestamp'] = df['date'].map(lambda x : datetime.strptime(x,  "%d/%m/%Y"))
    df = df.drop(columns = 'date')
    df['timestamp'] = df['timestamp'] + pd.to_timedelta(df['hour'], unit='h')
    df_melt = df.melt(id_vars = ['timestamp'], value_vars = df.drop(columns = 'timestamp').columns)
    df_melt['idEntity'] = '1'
    df_melt['type'] = df_melt['value'].map(lambda x : type(x).__name__)
    df_melt.rename(columns ={'variable':'name'}, inplace=True)
    return df_melt

In [4]:

dataset = Dataset(name = project_name)
item_exists, dataset_dto = conversor.get_if_exists(dataset)

if not(item_exists):
    df = process_raw_seoul_data()

    dataset_dto = DatasetDTO(name = project_name ) 
    lst_features= df['name'].drop_duplicates().to_list()
    dataset_dto.process_feature_list(lst_features= lst_features, name_space=name_space)
    dataset_repo.save(dataset_dto)
    item_exists, dataset_dto = conversor.get_if_exists(dataset)
    dataset_dto.save_data_mongo(mongo_db ,df = df)

dataset_dto.load_data_from_mongo(mongo_db)
dataset = dataset_dto.dataset 

In [5]:
targetFeature = Feature(name = target_feature_name, nameSpace = FeatureNameSpace(name = name_space))
project = Project(name  = project_name, projectType = ProjectType(name = 'Regression'), targetFeature = targetFeature)

item_exists,project_dto = conversor.get_if_exists(project)

if not(item_exists):
	targetFeature = dataset_dto.get_feature_by_name(name = target_feature_name)
	project_dto =ProjectDTO(name  = project_name, projectType = ProjectTypeDTO(name = 'Regression'), targetFeature = targetFeature)
	ProjectRepository(session=session).save(project_dto)


In [None]:
datas = pd.date_range(start="2018-01-01", end="2018-11-30", freq="MS")  
for data_inicio in tqdm(datas):
    data_fim = pd.date_range(start=data_inicio, periods=1, freq="ME")[0]

    #treinando
    model = OHEDecisionTreeRegressor()
    task = RegressionTrainingTask (dataset = dataset) 
    run = Run(project = project, task = task,  model = model)
    run.execute( task_parameters={'end_date':data_inicio})
    run_dto = conversor.converter_object_to_dto(run)
    run_repo = RunRepository(session=session)   
    run_repo.save(run_dto)

    slices = [{'condition':"hour >= 9 and hour <= 18", 'description':'business hours'},
            {'condition':"hour < 9 or hour > 18", 'description':'not business hours'}]

    #predição
    model.idModel =  run_dto.model.idModel
    task = RegressionPredictionTask (dataset = dataset) 
    run = Run(project = project, task = task,  model = model)
    run.execute( task_parameters={'start_date':data_inicio,'end_date':data_fim, 'slices':slices})
    run_dto = conversor.converter_object_to_dto(run)
    run_repo = RunRepository(session=session)
    run_repo.save(run_dto)

    #drift features
    task = FeatureDriftCheckTask (dataset = dataset) 
    run = Run(project = project, task = task,  model = None)
    run.execute( task_parameters={'end_reference_date':data_inicio,
                                  'start_current_date':data_inicio,'end_current_date':data_fim})
    run_dto = conversor.converter_object_to_dto(run)
    run_repo = RunRepository(session=session)
    run_repo.save(run_dto)

    #missing features
    task = FeatureDriftCheckTask (dataset = dataset) 
    run = Run(project = project, task = task,  model = None)
    run.execute( task_parameters={'end_reference_date':data_inicio,
                                  'start_current_date':data_inicio,'end_current_date':data_fim})
    run_dto = conversor.converter_object_to_dto(run)
    run_repo = RunRepository(session=session)
    run_repo.save(run_dto)

    #outlier detection
    task = OutlierDetectionTask (dataset = dataset) 
    run = Run(project = project, task = task,  model = None)
    run.execute( task_parameters={'end_reference_date':data_inicio,
                                    'start_current_date':data_inicio,'end_current_date':data_fim})
    run_dto = conversor.converter_object_to_dto(run)
    run_repo = RunRepository(session=session)
    run_repo.save(run_dto)


  0%|          | 0/11 [00:00<?, ?it/s]

In [2]:
# 1. Imports
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from fairlearn.metrics import MetricFrame, selection_rate, true_positive_rate, false_positive_rate
import plotly.express as px

# 2. Carregar dataset Adult
adult = fetch_openml("adult", version=2, as_frame=True)
df = adult.frame.copy()

# 3. Pré-processamento
df = df.dropna()
df = df[df['sex'].isin(['Male', 'Female'])]  # manter apenas sexos binários
df = df[df['race'] != 'Other']               # remover grupo pequeno

# Alvo e features
X = df.drop(columns=['class'])
y = (df['class'] == '>50K').astype(int)  # 1 se renda > 50k

# Dummies + normalização
X = pd.get_dummies(X, drop_first=True)
X = pd.DataFrame(StandardScaler().fit_transform(X), columns=X.columns)

# Separar conjuntos
X_train, X_test, y_train, y_test, sexo_train, sexo_test = train_test_split(
    X, y, df['sex'], test_size=0.3, random_state=42
)

# 4. Treinar modelo
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# 5. Métricas com Fairlearn
metricas = {
    "accuracy": accuracy_score,
    "selection_rate": selection_rate,
    "TPR": true_positive_rate,
    "FPR": false_positive_rate,
}

mf = MetricFrame(
    metrics=metricas,
    y_true=y_test,
    y_pred=y_pred,
    sensitive_features=sexo_test
)

print("📊 Métricas por grupo (sexo):")
print(mf.by_group)

# 6. Preparar dados para Plotly
df_plot = mf.by_group.reset_index(names='grupo').melt(id_vars='grupo', var_name='métrica', value_name='valor')

# 7. Plotar com Plotly
fig = px.bar(
    df_plot,
    x="grupo",
    y="valor",
    color="métrica",
    barmode="group",
    text_auto=".2f",
    title="📊 Métricas por grupo sensível (sexo) com Fairlearn",
    labels={"grupo": "Sexo", "valor": "Valor da Métrica"}
)

fig.update_layout(yaxis=dict(tickformat=".2f"))
fig.show()


📊 Métricas por grupo (sexo):
        accuracy  selection_rate       TPR       FPR
sex                                                 
Female  0.927972        0.074592  0.511983  0.022187
Male    0.807436        0.274888  0.631487  0.112381


In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

from fairlearn.metrics import (
    demographic_parity_difference,
    equalized_odds_difference,
    true_positive_rate,
    false_positive_rate,
    selection_rate,
    MetricFrame
)

# 1. Carregar o dataset "Adult" (renda >50K)
adult = fetch_openml(data_id=1590, as_frame=True)  # Adult Income dataset
df = adult.frame

# 2. Preprocessamento simples
df = df.dropna()
df = df[df["race"].isin(["White", "Black"])]  # Simplificar para binário
df["sex"] = df["sex"].map({"Male": 1, "Female": 0})
df["race"] = df["race"].map({"White": 1, "Black": 0})

# Atributo sensível: "sex"
A = df["sex"]
y = (df["class"] == ">50K").astype(int)

X = df.drop(columns=["class"])

# Encode categoricals
X = pd.get_dummies(X, drop_first=True)

# 3. Split
X_train, X_test, y_train, y_test, A_train, A_test = train_test_split(
    X, y, A, test_size=0.3, random_state=42
)

# 4. Treinar modelo
model = DecisionTreeClassifier(max_depth=5, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# 5. Avaliar fairness com Fairlearn
metrics = {
    "selection_rate": selection_rate,
    "true_positive_rate": true_positive_rate,
    "false_positive_rate": false_positive_rate,
}

frame = MetricFrame(metrics=metrics, y_true=y_test, y_pred=y_pred, sensitive_features=A_test)

print("=== Métricas por grupo (sexo) ===")
print(frame.by_group)

print("\n=== Diferenças entre grupos (fairness gaps) ===")
print("Demographic Parity Difference:", demographic_parity_difference(y_test, y_pred, sensitive_features=A_test))
print("Equalized Odds Difference:", equalized_odds_difference(y_test, y_pred, sensitive_features=A_test))

# Predictive parity (precisão positiva por grupo)
def predictive_parity(y_true, y_pred, sensitive_features):
    df = pd.DataFrame({
        'y_true': y_true,
        'y_pred': y_pred,
        'group': sensitive_features
    })
    pp = df[df['y_pred'] == 1].groupby('group')['y_true'].mean()
    return pp

print("\nPredictive Parity por grupo:")
print(predictive_parity(y_test, y_pred, A_test))

# Disparate impact
sr = frame.by_group["selection_rate"]
disparate_impact = min(sr) / max(sr)
print("\nDisparate Impact (min/max selection rate):", disparate_impact)


=== Métricas por grupo (sexo) ===
     selection_rate  true_positive_rate  false_positive_rate
sex                                                         
0          0.063552            0.463617             0.012045
1          0.236730            0.574861             0.084746

=== Diferenças entre grupos (fairness gaps) ===
Demographic Parity Difference: 0.17317819427547032
Equalized Odds Difference: 0.11124390421987457

Predictive Parity por grupo:
group
0    0.832090
1    0.753027
Name: y_true, dtype: float64

Disparate Impact (min/max selection rate): 0.26845840742618515


  pp = df[df['y_pred'] == 1].groupby('group')['y_true'].mean()


In [2]:
from scipy.stats import kruskal
import numpy as np

# Três grupos com distribuições diferentes
g1 = np.random.normal(loc=50, scale=5, size=100)
g2 = np.random.normal(loc=55, scale=5, size=100)
g3 = np.random.normal(loc=50, scale=5, size=100)

# Aplicando Kruskal-Wallis
stat, p = kruskal(g1, g2, g3)
print("Estatística H:", round(stat, 3))
print("p-valor:", round(p, 4))

Estatística H: 47.033
p-valor: 0.0


In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from scipy.stats import ks_2samp

from scipy.stats import ks_2samp

def ks_statistic(y_true, y_score):
    pos_scores = y_score[y_true == 1]
    neg_scores = y_score[y_true == 0]
    stat, p_value = ks_2samp(pos_scores, neg_scores)
    return stat

# Gerar dados sintéticos
X, y = make_classification(n_samples=1000, n_features=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# Treinar modelo
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Obter scores (probabilidades para classe 1)
y_score = clf.predict_proba(X_test)[:, 1]

# Calcular KS
ks = ks_statistic(y_test, y_score)
print(f"KS statistic: {ks:.4f}")

KS statistic: 0.8000
