# Completo

In [2]:
# Importing the required libraries
import pandas as pd 
import json
import psycopg2
from sqlalchemy import create_engine
from datetime import datetime
#sklearn
from sklearn.model_selection import cross_validate
#Internos
import sys
sys.path.append('../src/6.modeling')
from models import *

import mlflow
import mlflow.sklearn

# Paths to the JSON configuration files (relative to the working directory)
conf_data_path = '../config/data_config.json'
conf_model_path = '../config/model_config.json'

# Load the data configuration. The context manager closes the handle even if
# JSON parsing raises, which the manual open()/close() pair did not guarantee.
with open(conf_data_path, encoding='utf-8') as config_data_file:
    config_data = json.load(config_data_file)

# Load the model configuration the same way.
with open(conf_model_path, encoding='utf-8') as config_model_file:
    config_model = json.load(config_model_file)

# Assemble the database URL from the 'connection' section of the data config,
# in the form driver://user:password@host:port/database.
# NOTE(review): user and password are interpolated as-is; if they can contain
# characters such as '@' or '/', they presumably need URL-escaping — confirm.
connect_data = config_data['connection']
credentials = f"{connect_data['user']}:{connect_data['password']}"
location = f"{connect_data['host']}:{connect_data['port']}/{connect_data['database']}"
textEngine = f"{connect_data['driver']}://{credentials}@{location}"

# Read the 'master_table' table into a pandas DataFrame
master_table = pd.read_sql_table('master_table', textEngine)

#########
# PrepData
#########

# Drop the 'index' column
master_table = master_table.drop(columns='index')

# Number of leading rows to keep: the configured fraction of the table length,
# truncated to an integer
percent_sample = config_model['percent_sample']
ponto_corte = int(percent_sample * len(master_table))

# Take the first `ponto_corte` rows, then discard rows containing NaN
master_part = master_table.iloc[:ponto_corte].dropna()

##########
# Models
##########
# Maps each estimator class name to the full parameter set it was built with.
output_dict = {}

# Target column and feature columns
y = master_part['Revenue']
x = master_part.drop(['Revenue'], axis=1)

# Selecting the experiment does not depend on the model, so do it once
# instead of on every loop iteration.
experiment_id = mlflow.set_experiment('Teste_piloto')

# One cross-validation score frame per configured model, in config order.
list_result = []
for model_name, model_config in config_model['models'].items():
    print(f"Treinando modelo {model_name}")
    # SECURITY NOTE(review): eval() turns the config key into a class object.
    # This executes arbitrary expressions, so the model config must come from
    # a trusted source; consider an explicit name -> class mapping instead.
    model_class = eval(model_name)
    paramns = model_config['params']
    model = model_class(**paramns)

    # 5-fold cross-validation with the configured scoring.
    # cross_validate fits clones of `model`; `model` itself stays unfitted.
    score = cross_validate(model, x, y, scoring=config_model['scoring'], cv=5, return_train_score=False)
    result = pd.DataFrame(score)
    list_result.append(result)
    name = model.__class__.__name__
    output_dict[name] = model.get_params()

    # Fit on the whole sample so the estimator persisted below is a trained
    # model rather than an unfitted one.
    model.fit(x, y)

    # Record the run in MLflow. The context manager ends the run even when
    # logging or saving raises, so no run is left active after a failure.
    with mlflow.start_run(run_name=model_name) as run:
        model_id = run.info.run_id
        # Parameters taken from the config
        mlflow.log_params(paramns)
        # One metric per score column: its mean across the folds
        for column in result:
            mlflow.log_metric(column, result[column].mean())
        mlflow.set_tag('type', 'model/' + model_name)
        # Save a copy under ./model/<run_id> and log one as a run artifact
        mlflow.sklearn.save_model(model, 'model/' + str(model_id))
        mlflow.sklearn.log_model(model, artifact_path='model/' + str(model_id) + '_log')

import os

# Build the results table. With type 'normal' the per-model score frames are
# placed side by side under a top-level column key per model; otherwise
# `out_type` is passed to DataFrame.apply on each frame and the results are
# stacked as one row per model.
out_type = config_model['model_output']['type']
model_names = list(config_model["models"].keys())
if out_type == 'normal':
    output_table = pd.concat(list_result, axis=1, keys=model_names)
else:
    out_list = [result.apply(out_type) for result in list_result]
    output_table = pd.DataFrame(out_list, index=model_names)

# Save the table as CSV.
# NOTE(review): the id keeps only the last five digits of the epoch seconds,
# so file names repeat every 100000 seconds (about 28 hours).
time_stamp_id = str(int(datetime.now().timestamp()))[-5:]
output_table_name = f"model_output_{time_stamp_id}.csv"
output_table_path = config_model['model_output']['path']
# Join directory and file name with a separator. Plain concatenation turned a
# path of "." into the hidden file ".model_output_<id>.csv" while the message
# reported "./model_output_<id>.csv".
output_table_file = os.path.join(output_table_path, output_table_name)
output_table.to_csv(output_table_file)
print(f"Tabela salva em {output_table_file}")

# Save the model parameters, plus the table location, as JSON and report the
# file that was actually written.
output_dict['table_path'] = output_table_file
output_dict_path = 'output_dict.json'
with open(output_dict_path, 'w') as outfile:
    json.dump(output_dict, outfile, indent=2)
print(f"dados dos modelos salvos em {output_dict_path}")


Treinando modelo LinearRegression
Treinando modelo KNeighborsRegressor
Treinando modelo DecisionTreeRegressor
Treinando modelo RandomForestRegressor
Tabela salva em ./model_output_67191.csv
dados dos modelos salvos em .model_output_67191.csv


In [24]:
# End the currently active MLflow run, if there is one.
mlflow.end_run()

## Testes

In [7]:
# Scratch check: log two dummy parameters through mlflow.log_params.
mlflow.log_params({"Teste":1,"Teste2":2})

In [30]:
# Mean of the 'fit_time' column of `result`.
# NOTE(review): `result` is presumably the frame left over from the training cell — confirm.
result['fit_time'].mean()

0.24627256393432617

In [33]:
# Create an MLflow experiment named 'Test'; the call returns the new experiment's id.
mlflow.create_experiment('Test') 

'1'

In [3]:
# Capture the current local time as a naive datetime (no tz argument is passed).
date = datetime.now()

In [4]:
# Render the captured datetime as an ISO 8601 string.
date.isoformat()

'2020-05-30T00:47:24.186762'

In [2]:
# End the currently active MLflow run, if there is one.
mlflow.end_run()

In [5]:
# Build a three-row frame with two identical integer columns, 'z' and 'x'.
df = pd.DataFrame({'z':[1,2,3],'x':[1,2,3]})

In [7]:
# Apply the aggregation named by the string 'mean' to each column and
# convert the resulting Series into a {column: value} dict.
df.apply('mean').to_dict()

{'z': 2.0, 'x': 2.0}

In [1]:
import mlflow