In [1]:
# Wildcard project imports stay first so the explicit imports below win any name clash.
from Core.DTO import *
from Core.Relations import *

import os
import re
from datetime import datetime

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

## Conexão com banco

In [2]:
# Database configuration (can be reused for any ModelDTO).
# Connection strings are read from the environment (MONGO_URL / MYSQL_URL) so that real
# credentials do not have to be written into the notebook; when a variable is not set,
# the original local-development value is used, so existing setups keep working.
mongo_url = os.environ.get("MONGO_URL", "mongodb://localhost:27017/")
mysql_url = os.environ.get("MYSQL_URL", 'mysql+pymysql://root:000000000@localhost/mydb')

db_manager = DatabaseManager(mysql_url, mongo_url=mongo_url)
session = db_manager.get_session()
dataset_repo = DatasetRepository(session)
conversor = ConverterDTO(session=session)
mongo_db = db_manager.get_mongo_db()


In [3]:
# Loan-default experiment: identifiers read by the dataset/project cells below.
# The feature namespace reuses the project name.
project_name = 'LoanDefaultPrediction'
name_space = project_name
target_feature_name = 'default'


def process_raw_loan_data(csv_path='data//Loan.csv'):
    """Load the raw loan CSV and reshape it into long format (one row per feature value).

    Steps, in order:
      * read only the columns used by the experiment;
      * keep the first token of ``term`` (e.g. ``' 36 months'`` -> ``'36'``);
      * parse ``issue_d`` (format ``"%b-%y"``) into ``timestamp``;
      * map ``emp_length`` labels to a float number of years and strip the ``%``
        sign from ``int_rate`` before converting it to float;
      * drop loans whose status is ``'Current'`` and derive the binary ``default``
        column (1 when ``loan_status == 'Charged Off'``, else 0);
      * rename ``id`` to ``idEntity`` and melt every remaining column.

    Parameters
    ----------
    csv_path : str, optional
        Location of the loan CSV. Defaults to the path the notebook has always read.

    Returns
    -------
    pandas.DataFrame
        Columns ``timestamp``, ``idEntity``, ``name``, ``value`` and ``type``, where
        ``type`` is the Python type name of each ``value``.

    Raises
    ------
    KeyError
        If ``emp_length`` contains a non-null label that is not in the mapping table.
    """
    columns = ['id', 'loan_amnt', 'term', 'int_rate', 'installment', 'emp_length',
               'home_ownership', 'annual_inc', 'verification_status', 'issue_d',
               'loan_status', 'purpose', 'addr_state', 'dti']

    df = pd.read_csv(csv_path, usecols=columns)

    # `.str[0]` leaves a missing term as NaN; indexing inside a lambda would fail on it.
    df['term'] = df['term'].str.strip().str.split(' ').str[0]
    df['timestamp'] = pd.to_datetime(df['issue_d'], format="%b-%y")

    dict_emp = {'< 1 year': 0,
                '1 year': 1,
                '2 years': 2,
                '3 years': 3,
                '4 years': 4,
                '5 years': 5,
                '6 years': 6,
                '7 years': 7,
                '8 years': 8,
                '9 years': 9,
                '10+ years': 10}

    # Map the whole column at once (missing values stay NaN), but keep failing loudly
    # on a label the table does not know instead of silently turning it into NaN.
    emp_years = df['emp_length'].map(dict_emp)
    unknown = df.loc[df['emp_length'].notnull() & emp_years.isnull(), 'emp_length']
    if not unknown.empty:
        raise KeyError(unknown.iloc[0])
    df['emp_length'] = emp_years.astype('float')

    df['int_rate'] = df['int_rate'].str.replace('%', '', regex=False).astype('float')

    df = df[df['loan_status'] != 'Current'].reset_index(drop=True)
    df['default'] = (df['loan_status'] == 'Charged Off').astype('int')
    df = df.drop(columns=['issue_d', 'loan_status']).rename(columns={'id': 'idEntity'})

    # idEntity is already an id column, so it is kept out of value_vars: listing it in
    # both places leaves the outcome to how the installed pandas resolves the overlap.
    id_cols = ['timestamp', 'idEntity']
    feature_cols = [col for col in df.columns if col not in id_cols]

    df_melt = df.melt(id_vars=id_cols, value_vars=feature_cols)
    df_melt['type'] = df_melt['value'].map(lambda x: type(x).__name__)
    return df_melt.rename(columns={'variable': 'name'})

In [4]:

# Look the dataset up by name; build and persist it only when it is not found.
dataset = Dataset(name=project_name)
item_exists, dataset_dto = conversor.get_if_exists(dataset)

if not item_exists:
    df = process_raw_loan_data()
    lst_features = df['name'].unique().tolist()

    dataset_dto = DatasetDTO(name=project_name)
    dataset_dto.process_feature_list(lst_features=lst_features, name_space=name_space)
    dataset_repo.save(dataset_dto)

    # Fetch the DTO again after saving (presumably to pick up what the database
    # assigned - TODO confirm), then hand the melted frame to save_data_mongo.
    item_exists, dataset_dto = conversor.get_if_exists(dataset)
    dataset_dto.save_data_mongo(mongo_db, df=df)

# From here on `dataset` is the object exposed by the DTO, not the lookup stub above.
dataset_dto.load_data_from_mongo(mongo_db)
dataset = dataset_dto.dataset

In [5]:
# Objects describing the classification project; `project` is used for the lookup below.
targetFeature = Feature(
    name=target_feature_name,
    nameSpace=FeatureNameSpace(name=name_space),
)
project = Project(
    name=project_name,
    projectType=ProjectType(name='Classification'),
    targetFeature=targetFeature,
)

item_exists, project_dto = conversor.get_if_exists(project)

if not item_exists:
    # Not found: build the DTO around the target feature taken from the dataset DTO
    # and save it through a ProjectRepository.
    targetFeature = dataset_dto.get_feature_by_name(name=target_feature_name)
    project_dto = ProjectDTO(
        name=project_name,
        projectType=ProjectTypeDTO(name='Classification'),
        targetFeature=targetFeature,
    )
    ProjectRepository(session=session).save(project_dto)


In [None]:
# One iteration per month start between 2009-01 and 2011-11: a training run, a
# prediction run and a feature-drift run, each converted to a DTO and saved.
datas = pd.date_range(start="2009-01-01", end="2011-11-01", freq="MS")
for data_inicio in tqdm(datas):
    # Month-end timestamp of the month that starts at data_inicio.
    data_fim = pd.date_range(start=data_inicio, periods=1, freq="ME")[0]

    # training
    model = OHERandomForestClassifier()
    task = ClassificationTrainingTask(dataset=dataset)
    run = Run(project=project, task=task, model=model)
    training_parameters = {'end_date': data_inicio}
    run.execute(task_parameters=training_parameters)
    run_dto = conversor.converter_object_to_dto(run)
    run_repo = RunRepository(session=session)
    run_repo.save(run_dto)

    # prediction: the model object is reused, carrying the id from the saved training run
    model.idModel = run_dto.model.idModel
    task = ClassificationPredictionTask(dataset=dataset)
    run = Run(project=project, task=task, model=model)
    prediction_parameters = {'start_date': data_inicio, 'end_date': data_fim}
    run.execute(task_parameters=prediction_parameters)
    run_dto = conversor.converter_object_to_dto(run)
    run_repo = RunRepository(session=session)
    run_repo.save(run_dto)

    # feature drift: no model is attached to this run
    task = FeatureDriftCheckTask(dataset=dataset)
    run = Run(project=project, task=task, model=None)
    drift_parameters = {
        'end_reference_date': data_inicio,
        'start_current_date': data_inicio,
        'end_current_date': data_fim,
    }
    run.execute(task_parameters=drift_parameters)
    run_dto = conversor.converter_object_to_dto(run)
    run_repo = RunRepository(session=session)
    run_repo.save(run_dto)


  0%|          | 0/35 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Seoul

In [3]:
# Seoul bike-sharing experiment: identifiers read by the dataset/project cells below.
# The feature namespace reuses the project name.
project_name = 'SeoulBike'
name_space = project_name
target_feature_name = 'rented_bike_count'

def process_raw_seoul_data():
    """Read the Seoul bike CSV and return it in long format (one row per feature value).

    Column headers are lower-cased, stripped of any parenthesised part, trimmed and
    have spaces replaced by underscores. ``date`` (``"%d/%m/%Y"``) plus ``hour`` (as
    hours) becomes ``timestamp``; ``date`` itself is dropped while ``hour`` stays as a
    feature. Every column except ``timestamp`` is melted.

    Returns
    -------
    pandas.DataFrame
        Columns ``timestamp``, ``name``, ``value``, ``idEntity`` (always ``'1'``) and
        ``type`` (the Python type name of each ``value``).
    """
    raw = pd.read_csv('data//SeoulBikeData.csv', encoding='latin1')

    # Normalise the headers one by one.
    normalised = []
    for header in raw.columns:
        without_parentheses = re.sub(r'\([^)]*\)', '', header.lower())
        normalised.append(without_parentheses.strip().replace(' ', '_'))
    raw.columns = normalised

    day = raw['date'].map(lambda text: datetime.strptime(text, "%d/%m/%Y"))
    hourly = raw.drop(columns='date')
    hourly['timestamp'] = day + pd.to_timedelta(hourly['hour'], unit='h')

    feature_cols = hourly.columns.drop('timestamp')
    long_df = hourly.melt(id_vars=['timestamp'], value_vars=feature_cols)
    long_df['idEntity'] = '1'
    long_df['type'] = long_df['value'].map(lambda item: type(item).__name__)
    return long_df.rename(columns={'variable': 'name'})

In [4]:

# Look the dataset up by name; build and persist it only when it is not found.
dataset = Dataset(name=project_name)
item_exists, dataset_dto = conversor.get_if_exists(dataset)

if not item_exists:
    df = process_raw_seoul_data()
    lst_features = df['name'].unique().tolist()

    dataset_dto = DatasetDTO(name=project_name)
    dataset_dto.process_feature_list(lst_features=lst_features, name_space=name_space)
    dataset_repo.save(dataset_dto)

    # Fetch the DTO again after saving (presumably to pick up what the database
    # assigned - TODO confirm), then hand the melted frame to save_data_mongo.
    item_exists, dataset_dto = conversor.get_if_exists(dataset)
    dataset_dto.save_data_mongo(mongo_db, df=df)

# From here on `dataset` is the object exposed by the DTO, not the lookup stub above.
dataset_dto.load_data_from_mongo(mongo_db)
dataset = dataset_dto.dataset

In [5]:
# Objects describing the regression project; `project` is used for the lookup below.
targetFeature = Feature(
    name=target_feature_name,
    nameSpace=FeatureNameSpace(name=name_space),
)
project = Project(
    name=project_name,
    projectType=ProjectType(name='Regression'),
    targetFeature=targetFeature,
)

item_exists, project_dto = conversor.get_if_exists(project)

if not item_exists:
    # Not found: build the DTO around the target feature taken from the dataset DTO
    # and save it through a ProjectRepository.
    targetFeature = dataset_dto.get_feature_by_name(name=target_feature_name)
    project_dto = ProjectDTO(
        name=project_name,
        projectType=ProjectTypeDTO(name='Regression'),
        targetFeature=targetFeature,
    )
    ProjectRepository(session=session).save(project_dto)


In [None]:
# One iteration per month start in 2018-01 .. 2018-11: a training run, a prediction
# run and a feature-drift run, each converted to a DTO and saved.
datas = pd.date_range(start="2018-01-01", end="2018-11-30", freq="MS")
for data_inicio in tqdm(datas):
    # Month-end timestamp (at 00:00) of the month that starts at data_inicio.
    # NOTE(review): the Seoul timestamps are hourly; if the tasks treat the end date as
    # an inclusive timestamp, hours 01-23 of the last day are left out - confirm.
    data_fim = pd.date_range(start=data_inicio, periods=1, freq="ME")[0]

    # training
    # Fix: this step used to build a RegressionPredictionTask, so the "training" run
    # was a second prediction run. The loan loop uses ClassificationTrainingTask at the
    # same point. RegressionTrainingTask is assumed to be exported by the Core package
    # next to RegressionPredictionTask - TODO confirm the exact class name.
    model = OHEDecisionTreeRegressor()
    task = RegressionTrainingTask(dataset=dataset)
    run = Run(project=project, task=task, model=model)
    run.execute(task_parameters={'end_date': data_inicio})
    run_dto = conversor.converter_object_to_dto(run)
    run_repo = RunRepository(session=session)
    run_repo.save(run_dto)

    # prediction: the model object is reused, carrying the id from the saved training run
    model.idModel = run_dto.model.idModel
    task = RegressionPredictionTask(dataset=dataset)
    run = Run(project=project, task=task, model=model)
    run.execute(task_parameters={'start_date': data_inicio, 'end_date': data_fim})
    run_dto = conversor.converter_object_to_dto(run)
    run_repo = RunRepository(session=session)
    run_repo.save(run_dto)

    # feature drift: no model is attached to this run
    task = FeatureDriftCheckTask(dataset=dataset)
    run = Run(project=project, task=task, model=None)
    run.execute(task_parameters={'end_reference_date': data_inicio,
                                 'start_current_date': data_inicio,
                                 'end_current_date': data_fim})
    run_dto = conversor.converter_object_to_dto(run)
    run_repo = RunRepository(session=session)
    run_repo.save(run_dto)


  0%|          | 0/11 [00:00<?, ?it/s]