<a href="https://colab.research.google.com/github/tenoriolms/Others/blob/main/_module.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://docs.python.org/pt-br/3/tutorial/modules.html

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import missingno as msno


import sklearn.metrics

In [None]:
'''
import sklearn.datasets
import sklearn.model_selection

import sklearn.ensemble #bibliotecas de aprendizado de máquina
'''

'\nimport sklearn.datasets\nimport sklearn.model_selection\n\nimport sklearn.ensemble #bibliotecas de aprendizado de máquina\n'

#Declaração de Variáveis

In [None]:
data = '1672' # ID of the databank
git_import_file = '' # name of the variant of the original databank
data_url = ''


# dict that stores the strings produced by the ID function (ID fills it in,
# keyed by the row index it was called with)
ID_dict = {}

# Gases with permeabilities present in the databank,
# LISTED IN INCREASING ORDER OF KINETIC DIAMETER, AS IN THE DATABANK:
# NOTE(review): 'Ar' appears in the property tables below but not in this tuple.
gases = ('He','H2','CO2','O2','H2S','CO','N2','CH4','C2H4','C2H6','C3H6','C3H8','SF6') # refers to the permeability data
# Kinetic diameter of each gas; the 'none' entry maps to 0.
gases_kinetic_diameter = {'He':2.551, # Angstrom, https://doi.org/10.1039/B802426J
                          'H2':2.8585,
                          'CO2':3.3,
                          'O2':3.467,
                          'Ar':3.542,
                          'H2S':3.623,
                          'CO':3.69,
                          'N2':3.72,
                          'CH4':3.758,
                          'C2H4':4.163,
                          'C2H6':4.443,
                          'C3H6':4.678,
                          'C3H8':4.709,
                          'SF6':5.128,
                          'none':0}
# Effective diameter of each gas; H2S and SF6 have no value (NaN).
# Note that CO and N2 share the same value (0.304).
gases_effec_diameter = {'He':0.178, # nm, T>Tg
                        'H2':0.214,
                        'CO2':0.302,
                        'O2':0.289,
                        'Ar':0.297,
                        'H2S':np.nan,
                        'CO':0.304,
                        'N2':0.304,
                        'CH4':0.318,
                        'C2H4':0.338,
                        'C2H6':0.346,
                        'C3H6':0.352,
                        'C3H8':0.367,
                        'SF6':np.nan,
                        'none':0}
# Molar mass of each gas.
# NOTE(review): a few values (e.g. O2, N2, H2S) look slightly off from the usual
# tabulated molar masses - confirm against the source used for this table.
gases_molar_mass = {'He':4.00, # g/mol
                    'H2':2.02,
                    'CO2':44.01,
                    'O2':31.98,
                    'Ar':39.95,
                    'H2S':34.10,
                    'CO':28.01,
                    'N2':28.00,
                    'CH4':16.04,
                    'C2H4':28.05,
                    'C2H6':30.07,
                    'C3H6':42.08,
                    'C3H8':44.09,
                    'SF6':146.06,
                    'none':0.}
# Polarizability of each gas.
gases_polarizability = {'He':2.04956, # unit noted by the author as "1e-25.cm-3" (NOTE(review): presumably 1e-25 cm^3 - confirm), https://doi.org/10.1039/B802426J, (other: https://cccbdb.nist.gov/pollistx.asp)
                        'H2':8.042,
                        'CO2':29.11,
                        'O2':15.812,
                        'Ar':16.411,
                        'H2S':38.66,
                        'CO':19.5,
                        'N2':17.403,
                        'CH4':25.93,
                        'C2H4':42.52,
                        'C2H6':44.5,
                        'C3H6':62.6,
                        'C3H8':63.3,
                        'SF6':65.4,
                        'none':0.}


# Reverse look-up tables: diameter value -> gas name.
# When several gases share one diameter value, the gas listed last in the
# source dictionary is the one kept for that value.
gases_kinetic_diameter_inverse = dict(zip(gases_kinetic_diameter.values(),
                                          gases_kinetic_diameter.keys()))

gases_effec_diameter_inverse = dict(zip(gases_effec_diameter.values(),
                                        gases_effec_diameter.keys()))
                        
# Frequency of data for each gas in the "pure" and "mixture" situations
# (both start empty here)
count_pure = {}
count_mixture = {}


# Names of the columns that describe the membrane and the process, and
# (in the *_index dicts below) their respective column indices.
# IN ORDER:
columns_membrane = ['type', 'description', 'support_material', 'configuration', 'subtype',
                    'filler_loading', 'mean_thickness', 'mean_pore_size', 'pore_size_type',
                    'total_pore_volume', 'micropore_volume', 'specific_surface_area',
                      'aging']
columns_process = ['surface_area', 'temperature', 'feed_pressure', 'permeate_pressure',
                   'delta_pressure', 'feed_flow_rate', 'sweep_gas', 'sweep_gas_flow',
                   'stage_cut']
columns_others = ['provided_data_type', 'in_reference_data_location', 'reference', 'url']

# Start empty here; presumably filled in elsewhere with name -> index - TODO confirm
columns_membrane_index = {}
columns_process_index = {}
columns_others_index = {}

# The performance of each gas is represented by the following variables:
prefix1='x_' # Mass/molar/volumetric fraction
prefix2='Py_' # Permeability
prefix3='Pe_' # Permeance

# AUXILIARY VARIABLES:
dados = pd.DataFrame() # DataFrame to be used for the prediction. Refined version.

##GitHub info

In [None]:
git_username = 'tenoriolms' # GitHub username
git_repository = 'databank_CH4' # Name of the repository

git_token = 'não pode' # token used to access the repository.
# GitHub scans every imported/committed file for the (secret) access token that
# was created. If the token is found, it is revoked.
# Since this notebook is exported to GitHub, the token cannot be written here.

# Identity used by the git commits made from this notebook
!git config --global user.email "lhucas_tenorio@hotmail.com"
!git config --global user.name "tenoriolms"

#Funções

##ID(index,df)

In [None]:
#ID = string utilizada para identificar uma membrana e suas circunstâncias de utilização no banco de dados
#A ID é a soma das variáveis (em forma de strings) que podem variar em uma dada referência
def ID(i, df):
  '''
  Return the identification string of row `i` of `df`.

  The string is the plain concatenation of the `str()` of the membrane and
  process columns listed below, in that order. The result is stored in the
  module-level `ID_dict` under `i`, and a later call with the same `i` returns
  the stored string without reading `df` again.
  '''
  try:
    return ID_dict[i]
  except KeyError:
    pass

  id_columns = ('type', 'description',
                'subtype', 'filler_loading',
                'mean_thickness', 'mean_pore_size',
                'specific_surface_area', 'aging',
                'temperature', 'feed_pressure',
                'permeate_pressure', 'delta_pressure',
                'feed_flow_rate', 'stage_cut',
                'reference', 'in_reference_data_location')
  identifier = ''.join(str(df[column][i]) for column in id_columns)

  ID_dict[i] = identifier
  return identifier

##get_key(val,my_dict)

In [None]:
#Dado um determinado "valor" de um dicionário, qual é a "chave" associada a ele?
def get_key(val,my_dict): #dict = key : value
    '''
    Return the first key of `my_dict` whose value equals `val`.

    When no value matches, the string
    "get_key function: There is no such Key" is returned instead.
    '''
    matching_keys = (key for key, value in my_dict.items() if val == value)
    return next(matching_keys, "get_key function: There is no such Key")

##submit_file(git_export_file)

In [None]:
#exportar para o GitHub
def submit_file(git_export_file):
  !git clone https://{git_token}@github.com/{git_username}/{git_repository}
  !cp {git_export_file} {git_repository}
  %cd {git_repository}
  !git add {git_export_file}
  !git commit -m 'Add/Atualizar arquivo {input_file}'
  !git push -u origin
  %cd ..
  !rm -rf {git_repository}

##import_file(git_import_file)

In [None]:
#importar um arquivo do GitHub
def import_file(git_import_file):
  # Import a file from GitHub (IPython/Colab only: relies on `!` magics).
  # Clones the repository (the access token is embedded in the clone URL) ...
  !git clone https://{git_token}@github.com/{git_username}/{git_repository}
  # ... copies the requested file into the current working directory ...
  !cp {git_repository}/{git_import_file} .
  # ... and removes the local clone.
  !rm -rf {git_repository}

##Zscores(df_for_scaled, df_reference)

In [None]:
#Escalonar cada coluna de um DataFrame utilizando o "z score"
def Zscores(df_for_scaled, df_reference):
  '''
  Scale, IN PLACE, each column of `df_for_scaled` using the "z score":
  (value - mean) / std, with mean and std taken from the same column of
  `df_reference`. Columns whose dtype is `object` are left untouched.

  df_for_scaled = DataFrame to be scaled (modified in place)
  df_reference = DataFrame that provides the mean and the standard deviation

  Returns None. If the two DataFrames do not have the same columns (same
  labels, same order) a message is printed and nothing is changed.
  '''
  # Index.equals also copes with a different NUMBER of columns, where the
  # element-wise "!=" raises "Lengths must match to compare".
  if not df_for_scaled.columns.equals(df_reference.columns):
    print('Zscores function: Dataframes com colunas diferentes')
    return

  print(f'Zscores function: columns_reference: {df_reference.columns}')
  for column in df_for_scaled.columns:
    if (df_for_scaled[column].dtype!=object):
      reference = df_reference[column]
      df_for_scaled[column] = (df_for_scaled[column] - reference.mean()) / reference.std()

##undo_Zscores(df_scaled, df_reference)

In [None]:
#desfazer o escalonamento realizado para cada coluna de um DataFrame utilizando
#o "z score"
def undo_Zscores(df_scaled, df_reference):
  '''
  Undo, IN PLACE, the "z score" scaling of each column of `df_scaled`:
  value * std + mean, with mean and std taken from the same column of
  `df_reference`. Columns whose dtype is `object` are left untouched.

  df_scaled = scaled DataFrame (modified in place)
  df_reference = DataFrame that provides the mean and the standard deviation

  Returns None. If the two DataFrames do not have the same columns (same
  labels, same order) a message is printed and nothing is changed.
  '''
  # Index.equals also copes with a different NUMBER of columns, where the
  # element-wise "!=" raises "Lengths must match to compare".
  if not df_scaled.columns.equals(df_reference.columns):
    print('undo_Zscores function: Dataframes com colunas diferentes')
    return

  print(f'undo_Zscores function: columns_reference: {df_reference.columns}')
  for column in df_scaled.columns:
    if (df_scaled[column].dtype!=object):
      reference = df_reference[column]
      df_scaled[column] = df_scaled[column]*reference.std() + reference.mean()

##normalize(df_for_norm, df_reference)

In [None]:
#Normalizar cada coluna de um DataFrame
def normalize(df_for_norm, df_reference):
  '''
  Normalize, IN PLACE, each column of `df_for_norm`:
  (value - min) / (max - min), with min and max taken from the same column of
  `df_reference`. Columns whose dtype is `object` are left untouched.

  df_for_norm = DataFrame to be normalized (modified in place)
  df_reference = DataFrame that provides the minimum and the maximum

  Returns None. If the two DataFrames do not have the same columns (same
  labels, same order) a message is printed and nothing is changed.
  '''
  # Index.equals also copes with a different NUMBER of columns, where the
  # element-wise "!=" raises "Lengths must match to compare".
  if not df_for_norm.columns.equals(df_reference.columns):
    print('normalize function: Dataframes com colunas diferentes')
    return

  print(f'normalize function: columns_reference: {df_reference.columns}')
  for column in df_for_norm.columns:
    if (df_for_norm[column].dtype!=object):
      lowest = df_reference[column].min()
      highest = df_reference[column].max()
      df_for_norm[column] = (df_for_norm[column] - lowest) / (highest - lowest)

##undo_normalize(df_normalized, df_reference)

In [None]:
#desfazer a normalização realizada para cada coluna de um DataFrame
def undo_normalize(df_normalized, df_reference):
  '''
  Undo, IN PLACE, the normalization of each column of `df_normalized`:
  value * (max - min) + min, with min and max taken from the same column of
  `df_reference`. Columns whose dtype is `object` are left untouched.

  df_normalized = normalized DataFrame (modified in place)
  df_reference = DataFrame that provides the minimum and the maximum

  Returns None. If the two DataFrames do not have the same columns (same
  labels, same order) a message is printed and nothing is changed.
  '''
  # Index.equals also copes with a different NUMBER of columns, where the
  # element-wise "!=" raises "Lengths must match to compare".
  if not df_normalized.columns.equals(df_reference.columns):
    print('undo_normalize function: Dataframes com colunas diferentes')
    return

  print(f'undo_normalize function: columns_reference: {df_reference.columns}')
  for column in df_normalized.columns:
    if (df_normalized[column].dtype!=object):
      lowest = df_reference[column].min()
      highest = df_reference[column].max()
      df_normalized[column] = df_normalized[column]*(highest - lowest) + lowest

##str2int_simple_encoder(df,columns='all')

In [None]:
def str2int_simple_encoder(df,columns='all'):
  '''
  Replace, IN PLACE, the values of categorical (dtype `object`) columns of `df`
  by integer codes 1, 2, 3, ... assigned in order of first appearance.

  df = DataFrame to be encoded (modified in place)
  columns = 'all' to encode every `object` column, or an iterable with the
            names of the columns to encode

  Returns a dict {column: {original value: integer code}}.
  If a requested column does not exist or is not of dtype `object`, a message
  is printed and None is returned (columns processed before it stay encoded).
  '''
  if isinstance(columns, str) and columns == 'all':
    targets = [name for name in df.columns if df[name].dtype == object]
  else:
    targets = columns

  id_dict = {}
  for name in targets:
    # Existence is tested BEFORE the dtype, so that a missing column gives the
    # message below instead of a KeyError.
    if (name not in df.columns) or (df[name].dtype != object):
      print('str2int_simple_encoder: coluna especificada não é do tipo "object" ou não existe no dataframe')
      return

    codes = {value: code + 1 for code, value in enumerate(df[name].unique())}
    id_dict[name] = codes
    df[name] = df[name].apply(codes.__getitem__)

  return id_dict

In [None]:
#como era feito anteriormente:
'''
df = dados

#Criar os dicionários para os valores únicos das colunas categóricas
type_id = {}
aux = df['type'].unique()
for i in aux:
  type_id[i] = np.where(aux==i)[0][0]+1
print(type_id)

#converter os valores categóricos da coluna "type" por numéricos
df['type'] = df['type'].apply(lambda row, value : value[row],
                                                    value = type_id )

'''

'\ndf = dados\n\n#Criar os dicionários para os valores únicos das colunas categóricas\ntype_id = {}\naux = df[\'type\'].unique()\nfor i in aux:\n  type_id[i] = np.where(aux==i)[0][0]+1\nprint(type_id)\n\n#converter os valores categóricos da coluna "type" por numéricos\ndf[\'type\'] = df[\'type\'].apply(lambda row, value : value[row],\n                                                    value = type_id )\n\n'

##str2int_hot_encoder(df,columns='all')

In [None]:
def str2int_hot_encoder(df,columns='all'):
  '''
  One-hot encode, IN PLACE, categorical (dtype `object`) columns of `df`.

  Each encoded column "col" is replaced, at the same position, by one column
  "col_<category>" per unique value (in order of first appearance) holding
  1.0 where the row had that category and 0.0 otherwise.

  df = DataFrame to be encoded (modified in place)
  columns = 'all' to encode every `object` column, or an iterable with the
            names of the columns to encode

  Returns a dict {column: {category: one-hot list of 0/1}}.
  If a requested column does not exist or is not of dtype `object`, a message
  is printed and None is returned (columns processed before it stay encoded).
  '''
  if isinstance(columns, str) and columns == 'all':
    columns = [name for name in df.columns if df[name].dtype == object]

  id_dict = {}
  for name in columns:
    # Existence is tested BEFORE the dtype, so that a missing column gives the
    # message below instead of a KeyError.
    if (name not in df.columns) or (df[name].dtype != object):
      print('str2int_hot_encoder: coluna especificada não é do tipo "object" ou não existe no dataframe')
      return

    categories = df[name].unique()
    id_dict[name] = {category: [int(k == position) for k in range(len(categories))]
                     for position, category in enumerate(categories)}

    source = df[name]
    first_position = df.columns.get_loc(name) + 1
    for offset, category in enumerate(categories):
      # One vectorized comparison per category; float dtype keeps the columns
      # as 1.0/0.0 values.
      df.insert(first_position + offset, name + '_' + category,
                (source == category).astype(float))

    df.drop([name], axis=1, inplace=True)

  return id_dict

##validation_curve_change_param(model,x_train,y_train,parameters = {})

In [None]:
def validation_curve_change_param(model,
                                  x_train,
                                  y_train,
                                  parameters = None, # values of the parameters to be tested
                                  ylim=None
                                  ):
  '''
  Plot one validation curve (R2 of train and of cross-validation test, mean
  with a +/- one standard deviation band) for each hyperparameter given in
  `parameters`. Each hyperparameter gets its own matplotlib figure.

  model = estimator passed to sklearn.model_selection.validation_curve
  x_train, y_train = data passed to validation_curve
  parameters = dict {parameter name: values to be tested}; None means no curve
  ylim = forwarded to plt.ylim for every figure

  Example:
  parameters = {'C': np.arange(10000, 100000, 10000),
              'epsilon': [ 1, 5, 10,100,200,300,400,500,600],
              'tol': [0.001,0.01,0.1,1,5,10,100,1000,2000],
              'gamma': np.arange(0.01, 1.2, 0.05), 
              }
  validation_curve_change_param(model = SVR(kernel='rbf'),
                                x_train = x_train,
                                y_train = y_train,
                                parameters = parameters
                                )
  '''
  # Imported here because the notebook only imports "sklearn.metrics" at the top;
  # "sklearn.model_selection" is not guaranteed to be loaded by that import.
  from sklearn.model_selection import validation_curve

  if parameters is None:
    parameters = {}

  for position, (param_name, param_range) in enumerate(parameters.items()):

    # computing the validation curve
    train_scores, test_scores = validation_curve(
        model, x_train, y_train,
        param_name=param_name,
        param_range=param_range,
        scoring="r2",
        n_jobs=-1
        )

    # mean and standard deviation of the cross-validation results (for each point of the curve)
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    ## PLOT ##
    plt.subplots(1,1)

    # curve of the training set
    plt.plot(param_range, train_mean, '.-', label='Treino')
    plt.fill_between(param_range, train_mean - train_std, train_mean + train_std, alpha=0.1)

    # curve of the test set
    plt.plot(param_range, test_mean, '.-', label='Teste')
    plt.fill_between(param_range, test_mean - test_std, test_mean + test_std, alpha=0.1)

    # formatting; only the first figure gets the title
    if (position==0): plt.title('Curva de Validação')
    plt.xlabel(param_name)
    plt.ylabel("$R^2$")
    plt.legend(loc="best")
    plt.ylim(ylim)
    ## PLOT ##

##display_score(m,x_train,x_test,y_train,y_test)

Out-of-bag parameter:

https://towardsdatascience.com/what-is-out-of-bag-oob-score-in-random-forest-a7fa23d710

https://www.analyticsvidhya.com/blog/2020/12/out-of-bag-oob-score-in-the-random-forest-algorithm/

https://stats.stackexchange.com/questions/88980/why-on-average-does-each-bootstrap-sample-contain-roughly-two-thirds-of-observat

https://stats.stackexchange.com/questions/198839/evaluate-random-forest-oob-vs-cv

In [None]:
def c_coeff(v_real = 'class numpy.ndarray',
            v_pred = 'class numpy.ndarray'
            ): #https://www.sciencedirect.com/science/article/abs/pii/S0376738817311572?via%3Dihub
  '''
  Coefficient proposed by Wessling et al (1997) (https://doi.org/10.1016/0376-7388(93)E0168-J)

  The neural network works predictively if C is smaller than 1. For C=1, the
  predicted permeability for an unknown polymer would be equal to the average
  permeability of all polymers presented in the set (which is, in fact, useless).

  C = sum(|v_pred - v_real|) / sum(|mean(v_real) - v_real|)

  v_real = real values (array-like; flattened before use)
  v_pred = predicted values (array-like; flattened before use)

  Returns C, or NaN when the denominator is zero (all real values equal).
  (The string defaults in the signature are only type hints left by the author;
  both arguments must be given.)
  '''
  # BOTH inputs are flattened: a (n,1) prediction against a (n,) target would
  # otherwise broadcast to a (n,n) matrix. np.asarray also accepts lists,
  # Series and DataFrames.
  real = np.asarray(v_real, dtype=float).ravel()
  predicted = np.asarray(v_pred, dtype=float).ravel()

  denominador = np.sum(np.abs(real.mean() - real))
  if denominador!=0:
    return np.sum(np.abs(predicted - real))/denominador
  else:
    return np.nan

In [None]:
def rmse(v_real,v_pred):
  '''Root mean squared error: square root of sklearn.metrics.mean_squared_error.'''
  squared_error = sklearn.metrics.mean_squared_error(v_real,v_pred)
  return np.sqrt(squared_error)
def r2(v_real,v_pred):
  '''Coefficient of determination, as given by sklearn.metrics.r2_score.'''
  score = sklearn.metrics.r2_score(v_real,v_pred)
  return score
def mape(v_real,v_pred):
  '''
  Mean absolute percentage error, as given by
  sklearn.metrics.mean_absolute_percentage_error
  (https://scikit-learn.org/stable/modules/model_evaluation.html#mean-absolute-percentage-error).
  '''
  error = sklearn.metrics.mean_absolute_percentage_error(v_real,v_pred)
  return error


##função para avaliar RMSE, R2 e OOB_score
def display_score(m,x_train,x_test,y_train,y_test,delog_y=False, base=10):
  '''
  Display a table with RMSE, R2, MAPE and C_coeff of model `m` for the
  training set ('Treino') and the test set ('Teste').

  If `m` has the attribute `oob_score_`, a row 'OOB' is added, computed from
  `m.oob_prediction_`; its R2 entry is `m.oob_score_` itself.

  delog_y = when it is not equal to False, predictions and targets are
            transformed with base**value before the metrics are computed
  base = base used for that transformation
  '''
  keep_scale = (delog_y==False)

  train_prediction = m.predict(x_train)
  test_prediction = m.predict(x_test)
  if not keep_scale:
    train_prediction = np.power(base, train_prediction)
    test_prediction = np.power(base, test_prediction)
    y_train = np.power(base,y_train)
    y_test = np.power(base,y_test)

  def metrics_of(real, predicted):
    # same column order as the table below
    return [rmse( real,predicted ), r2( real,predicted ),
            mape( real,predicted ), c_coeff( real,predicted )]

  score = pd.DataFrame([ metrics_of(y_train, train_prediction),
                         metrics_of(y_test, test_prediction) ],
                       columns=['RMSE','R2','MAPE','C_coeff'],
                       index = ['Treino','Teste'])

  if hasattr(m, 'oob_score_'): #https://www.programiz.com/python-programming/methods/built-in/hasattr
    if keep_scale:
      oob_prediction = m.oob_prediction_
    else:
      oob_prediction = np.power(base, m.oob_prediction_)
    score.loc['OOB'] = [rmse(y_train, oob_prediction), m.oob_score_,
                        mape(y_train, oob_prediction), c_coeff(y_train,oob_prediction)]

  display(score)

##plot_permutation_importance( model, x_val, y_val, x_val_columns )

In [None]:
def plot_permutation_importance( model, x_val, y_val, x_val_columns ):
  '''
  Compute the permutation importance of `model` on (x_val, y_val) with
  sklearn.inspection.permutation_importance (30 repeats, R2 scoring,
  random_state=0) and return it as a DataFrame.

  The returned DataFrame has the columns 'mean' and 'std' and one row per
  feature, labelled with x_val_columns[position], in decreasing order of mean
  importance. Only features with mean - 2*std > 0 are included.
  '''
  #https://medium.com/horadecodar/gr%C3%A1ficos-de-barra-com-matplotlib-85628bfc4351#:~:text=barh()%3A,os%20seguintes%20par%C3%A2metros%3A
  from sklearn.inspection import permutation_importance

  result = permutation_importance(model, x_val, y_val,
                                  n_repeats=30,
                                  scoring='r2',
                                  random_state=0)
  means = result.importances_mean
  stds = result.importances_std

  table = pd.DataFrame( columns=['mean','std'] )

  for position in means.argsort()[::-1]:
    if not (means[position] - 2 * stds[position] > 0):
      continue
    table.loc[x_val_columns[position]] = [means[position], stds[position]]

  return table

##predictions_separate_by_a_variable(model, x_train, x_test, y_test, variable_in_original_databank, original_databank_train, original_databank_test, variable_subgroup = 'all' )

In [None]:
def _ordered_unique_values(*columns):
  # Distinct values of the given Series, in order of first appearance
  # (a plain set would give a different row order from one run to another).
  return list(dict.fromkeys(value
                            for column in columns
                            for value in column.unique().tolist()))


def _test_scores(model, x_values, y_values, delog_y, base):
  # [R2, RMSE, MAPE, C_coeff] of the model for the given test values.
  valor_real = y_values
  valor_predito = model.predict(x_values)
  if delog_y:
    # undo the logarithm of Y
    valor_real = np.power( base, valor_real )
    valor_predito = np.power( base, valor_predito )
  return [r2( valor_real, valor_predito ),
          rmse( valor_real, valor_predito ),
          mape( valor_real, valor_predito ),
          c_coeff( valor_real, valor_predito )]


def _group_row(model, x_test, y_test, test_index, train_count,
               train_total, test_total, delog_y, base):
  # Formatted table row (train fraction, test fraction and the four scores)
  # for the test rows selected by `test_index`.
  separate_x_test = x_test.loc[ test_index ]
  separate_y_test = y_test.loc[ test_index ]
  train_count = int(train_count)
  test_count = int(separate_x_test.shape[0])

  fracao_treino = np.nan if train_total==0 else (train_count/train_total)
  fracao_teste  = np.nan if test_total==0 else (test_count/test_total)

  if (test_index.size==0):
    # no test row in this group: nothing to evaluate
    resultados_teste = [np.nan]*4
  else:
    resultados_teste = _test_scores(model, separate_x_test.values,
                                    separate_y_test.values, delog_y, base)

  return [f'{fracao_treino:.2%} ({train_count})',
          f'{fracao_teste:.2%} ({test_count})'] \
         + [f'{resultado:.5}' for resultado in resultados_teste]


def predictions_separate_by_a_variable(model,
                                       x_train,
                                       x_test,
                                       y_test,
                                       variable_in_original_databank, #column
                                       original_databank_train,
                                       original_databank_test,
                                       variable_subgroup = 'all', #subgroup column
                                       delog_y = False,
                                       base = 10
                                       ):
  '''
  Display a table with the test scores (R2, RMSE, MAPE, C_coeff) of `model`
  separated by the unique values of a categorical column, plus the fraction
  (and number) of train and test rows that belong to each value. The last row,
  'all', refers to the whole test set.

  DEFINITIONS AND ASSUMPTIONS:
  The "original databank" is the one whose values did not go through
  transformations (encoders, standardization, normalization etc.), i.e. they
  are equal/similar to what is written in the reference source.
  The connection between the train/test sets and the original databanks is
  established through their indices.

  model = model
  x_train = training set with the input variables
  x_test = test set with the input variables
  y_test = test set with the target variables
  variable_in_original_databank = CATEGORICAL COLUMN whose unique values are
                                  the reference used to separate the
                                  predictions
  original_databank_train = original databank, with the original values, NOT
                            scaled and without transformations, of the
                            training set
  original_databank_test = original databank, with the original values, NOT
                           scaled and without transformations, of the test set
  variable_subgroup = CATEGORICAL COLUMN whose unique values are the reference
                      used to separate each group into subgroups (OPTIONAL;
                      'all' means no subgroups). Subgroup fractions are
                      relative to the size of their group.
  delog_y = if true, real and predicted values are transformed with
            base**value before the scores are computed
  base = base used for that transformation

  Rows appear in order of first appearance of each value (test set first, then
  training set).
  '''
  tabela = pd.DataFrame([], columns=[ 'fração_treino (%)','fração_teste (%)', 'R2_teste', 'RMSE_teste', 'MAPE_teste', 'C_coeff_teste' ] )

  # ORIGINAL COLUMN of "variable" (original values straight from the source databank)
  original_column_x_test = original_databank_test.loc[ x_test.index.values, variable_in_original_databank ]
  original_column_x_train = original_databank_train.loc[ x_train.index.values, variable_in_original_databank ]

  x_train_size = int(x_train.shape[0])
  x_test_size = int(x_test.shape[0])

  for group in _ordered_unique_values(original_column_x_test, original_column_x_train):
    test_index = original_column_x_test.loc[ original_column_x_test==group ].index
    train_index = original_column_x_train.loc[ original_column_x_train==group ].index

    tabela.loc[ group ] = _group_row(model, x_test, y_test, test_index,
                                     train_index.size, x_train_size, x_test_size,
                                     delog_y, base)

    # subgroups inside the current group - same logic as above
    if variable_subgroup!='all':
      subgroup_column_x_test = original_databank_test.loc[ test_index, variable_subgroup ]
      subgroup_column_x_train = original_databank_train.loc[ train_index, variable_subgroup ]

      for subgroup in _ordered_unique_values(subgroup_column_x_test, subgroup_column_x_train):
        sub_test_index = subgroup_column_x_test.loc[ subgroup_column_x_test==subgroup ].index
        sub_train_index = subgroup_column_x_train.loc[ subgroup_column_x_train==subgroup ].index

        tabela.loc[ str(group)+' / '+str(subgroup) ] = _group_row(
            model, x_test, y_test, sub_test_index,
            sub_train_index.size, int(train_index.size), int(test_index.size),
            delog_y, base)

  # scores of the whole test set
  resultados_all = _test_scores(model, x_test.values, y_test.values, delog_y, base)
  tabela.loc[ 'all' ] = [f'100% ({x_train_size})',
                         f'100% ({x_test_size})'] \
                        + [f'{resultado:.5}' for resultado in resultados_all]
  display(tabela)

##Gráficos

### plt_valuecounts_by(df,variable,by)

In [None]:
def plt_valuecounts_by(df = pd.DataFrame(),
                       variable = [],
                       by = '',
                       consider_none = True,
                       consider_zeros = True,
                       library = 'plotly or matplotlib',
                       matplotlib_figsize = 'default', #[width, height]
                       matplotlib_bar_space = 0.8,
                       matplotlib_colors = 'default'):
  '''
  Plot the number of existing (non-missing) values of each column listed in
  "variable" for each class of the column "by".

  df = DataFrame with the data (a copy is used; the caller's frame is not changed)
  variable = list with the names of the columns to be counted
  by = name of the column whose classes go on the x axis
  consider_none = if False, the strings 'none' and 'None' are treated as missing
  consider_zeros = if False, zeros are treated as missing
  library = 'plotly' (shows the figure, returns None) or
            'matplotlib' (returns fig, ax); anything else only prints a message
  matplotlib_figsize = 'default' or [width, height]
  matplotlib_bar_space = space between the groups of bars
  matplotlib_colors = 'default' (Tableau palette) or list of colors, one per
                      column of "variable" (repeated cyclically if too short)

  NOTES:
  - The thickness of the bars is controlled by the width of the figure
  - The default arguments are never modified by this function
  '''
  df = df.copy()

  def discard_ignored_values(column):
    # Turn into NaN the values the caller asked not to count
    if (consider_none==False):
      df.loc[ df[column]=='none', column ] = np.nan
      df.loc[ df[column]=='None', column ] = np.nan
    if (consider_zeros==False):
      df.loc[ df[column]==0, column ] = np.nan

  if (library=='plotly'):
    #https://plotly.com/python/histograms/
    import plotly.graph_objects as go

    fig = go.Figure()

    for i in variable:
      discard_ignored_values(i)

      # keep only the rows that have data for column "i"
      df_aux = df.loc[ df[i].notna(), by ]

      fig.add_trace(go.Histogram(
          x=df_aux,
          histnorm='',
          name=i, # name used in legend and hover labels
          ))

    fig.update_layout(
        title_text=f'Quantity of data by each {by}', # title of plot
        xaxis_title_text=by, # xaxis label
        yaxis_title_text='Count', # yaxis label
        bargap=0.2, # gap between bars of adjacent location coordinates
        bargroupgap=0.1 # gap between bars of the same location coordinates
        )

    fig.show()

  elif (library=='matplotlib'): #https://matplotlib.org/stable/gallery/lines_bars_and_markers/barchart.html#sphx-glr-gallery-lines-bars-and-markers-barchart-py

    matplotlib_width_bar = 1
    unique_by = df[by].unique()

    # One COLOR per column of "variable". If there are more columns than
    # colors, the colors are repeated:
    if (isinstance(matplotlib_colors, str) and matplotlib_colors=='default'):
      matplotlib_colors = ['tab:blue','tab:orange','tab:green','tab:red','tab:purple','tab:brown','tab:pink','tab:gray','tab:olive','tab:cyan'] #Tableau Palette
    colors = [matplotlib_colors[k % len(matplotlib_colors)] for k in range(len(variable))]
    print(colors)

    # size of the figure: the width grows with the number of bars
    if (isinstance(matplotlib_figsize, str) and matplotlib_figsize=='default'):
      figsize_width = ( 0.35*len(variable) + matplotlib_bar_space )*len(unique_by) + 1
      matplotlib_figsize = [figsize_width, 4.8]
    print('figsize=',matplotlib_figsize)
    fig, ax = plt.subplots(figsize=matplotlib_figsize)

    x = np.arange(len(unique_by))*( len(variable)+matplotlib_bar_space )*matplotlib_width_bar # the label locations

    for multiplier, i in enumerate(variable):
      # same treatment of 'none'/zeros as in the plotly branch
      discard_ignored_values(i)

      measurement = [df.loc[ df[by]==j, i ].dropna().size for j in unique_by]

      offset = matplotlib_width_bar * multiplier
      graph = ax.bar(x + offset,
                     measurement,
                     matplotlib_width_bar,
                     label=i,
                     color=colors[multiplier])
      ax.bar_label(graph,
                   padding=3)

    ax.set_ylabel('Frequência')
    ax.set_xticks(x + (len(variable) - 1)*matplotlib_width_bar*0.5, unique_by)
    ax.legend(loc='best')
    return fig, ax

  else:
    print('Escolha uma biblioteca')

NameError: ignored

### plt_hist_of_columns(df)

In [None]:
def plt_hist_of_columns(df):
  '''
  Plot the histograms of the columns of `df` that can be converted to float,
  in figures of at most 4 columns each (pandas `DataFrame.hist`).

  Columns that cannot be converted are reported with a message and skipped.
  The conversion is done on a copy, so the caller's DataFrame is not changed.
  If no column can be converted, nothing is plotted.
  '''
  df = df.copy()

  # convert the numeric columns to "float"
  float_df_columns = []
  for column in df.columns:
    try:
      df[column] = df[column].astype(float)
    except (ValueError, TypeError):
      print(f'plt_hist_of_columns function: X column "{column}" is a {df[column].dtype}')
    else:
      float_df_columns.append(column)

  # one figure for each group of up to 4 columns; an empty group is never
  # plotted (DataFrame.hist raises when it is given no column)
  for start in range(0, len(float_df_columns), 4):
    df[ float_df_columns[start:start + 4] ].hist()

### plot_stacked_hist_or_bar_by(df,variable='',by='',mode='bar or hist')

In [None]:
# Histogramas empilhados por classe "by"
def plot_stacked_hist_or_bar_by(df, variable = '',
                                by = '',
                                mode = 'bar or hist', 
                                alpha = 0.3,
                                bins_hist = 1, 
                                width_bar = 'default',
                                bar_norm = False,
                                colors_reference = ['b','g','r','c','m','y','k'],
                                figsize = [6.4, 4.8],
                                bar_labels=False,
                                x_log_scale_hist = False):
  '''
  Stacked bar chart or stacked histogram of "df[variable]", split by "df[by]".

  Similar bar-chart examples: https://matplotlib.org/stable/gallery/lines_bars_and_markers/bar_stacked.html#sphx-glr-gallery-lines-bars-and-markers-bar-stacked-py

  The function builds dictionaries for the base of each bar (Matplotlib's
  "bottom") and for the frequencies (Matplotlib's "height"). The keys of these
  dictionaries are the unique values of "df[variable]" and the values are the
  frequencies obtained from "value_counts".

  Parameters:
    df               = DataFrame; it is copied, and rows whose "variable" is
                       NaN are dropped from the copy.
    variable         = column plotted on the x axis.
    by               = column whose unique values define the stacked groups
                       (one colour and one legend entry per value).
    mode             = 'bar' (one bar per unique value of "variable") or
                       'hist' (values grouped into "bins_hist" bins). Only
                       meaningful for numeric "variable" in 'hist' mode. Any
                       other value, including the default placeholder,
                       matches neither branch, so no bars are drawn.
    alpha            = transparency passed to "ax.bar".
    bins_hist        = number of bins; the bin width is (max - min)/bins_hist.
    width_bar        = bar width in 'bar' mode. 'default' means
                       (max - min)/number of unique values; it is forced to
                       0.8 when the maximum of "variable" is a string.
    bar_norm         = if True, each count is divided by the total count of
                       that value of "variable" over all groups.
    colors_reference = colours assigned to the groups, reused cyclically.
    figsize          = figure size passed to "plt.subplots".
    bar_labels       = if True (and mode=='bar'), annotate each x position
                       with its total count.
    x_log_scale_hist = if True (mode=='hist'), bins are equally spaced in
                       log10 and the x axis is set to log scale.

  Returns "fig" and "ax" from Matplotlib, so the chart can still be edited
  afterwards (with some limitations). Returns None, after printing an error,
  when "x_log_scale_hist" is neither True nor False in 'hist' mode.
  Also prints the minimum and maximum of "df[variable]".
  '''
  import copy
  import math
  df = df.copy()
  
  fig, ax = plt.subplots(figsize=figsize)

  df.dropna(subset=[variable], inplace=True)
  x_range = [df[variable].min(), df[variable].max()]

  unique_by = df[by].unique()
  # Pick the COLOUR of each unique value of "by".
  # If there are more values than entries in "colors_reference", colours repeat:
  count, colors = (0, [])
  for i in range(len(unique_by)):
    if (count==len(colors_reference)):
      count = 0
    colors += [colors_reference[count]]
    count += 1
  
  # dictionary holding the base ("bottom") of each bar:
  unique_variable = df[variable].unique()
  bottoms = dict(zip(unique_variable,len(unique_variable)*[0]))
  # reference variable - dictionary with all values set to zero:
  values_0 = copy.copy(bottoms) 

  # Histogram - the dictionaries for "x", "height" and "bottom" are different
  #             when a histogram is wanted: they must be keyed by numbers/floats
  #             (the left edge of each bin)
  if (mode=='hist'):
    if (x_log_scale_hist==False):
      histogram_width = (x_range[1]-x_range[0])/bins_hist
      # NOTE(review): np.arange with a float step can yield one edge more than
      # "bins_hist" because of rounding - confirm whether that matters here.
      histogram_x = np.arange( x_range[0], x_range[1], histogram_width ).tolist()
      histogram_bottoms = dict(zip(histogram_x,len(histogram_x)*[0])) ## base ("bottom") of each bin
      histogram_0 = copy.copy(histogram_bottoms) # reference variable - dictionary with all values set to zero
    elif(x_log_scale_hist==True):
      # NOTE(review): math.log10 raises for values <= 0, so this branch
      # presumably expects strictly positive data - confirm with callers.
      x_range_log = [0,0]
      x_range_log[0] = math.log10(x_range[0])
      x_range_log[1] = math.log10(x_range[1])

      histogram_width_log = (x_range_log[1]-x_range_log[0])/bins_hist
      histogram_x_log = np.arange( x_range_log[0], x_range_log[1], histogram_width_log ).tolist()
      
      # left edge of each bin, back in linear units
      histogram_x = []
      for log_number in histogram_x_log:
        histogram_x += [10**log_number]
      
      # width of each bin in linear units (one width per bin)
      histogram_width = []
      for i in range(len(histogram_x)):
        histogram_width += [ 10**(x_range_log[0]+histogram_width_log*(i+1)) - 10**(x_range_log[0]+histogram_width_log*i) ]
      
      histogram_bottoms = dict(zip(histogram_x,len(histogram_x)*[0]))
      histogram_0 = copy.copy(histogram_bottoms)
    else:
      print('ERRO: x_log_scale_hist')
      return



  # Total frequency of each value of "variable" (used to normalise the bars)
  if (bar_norm==True):
    total_value_counts = df[variable].value_counts()

  count = 0
  for i in unique_by:
    values = copy.copy(values_0) # will hold the "value_counts" of each unique value of the variable
    
    # Get the value counts of "variable" for the current value of "unique_by"
    df_filtrado = df.loc[df[by]==i, variable]
    hist_aux_df = df_filtrado.value_counts()
    for j in hist_aux_df.index:
      if (bar_norm==True):
        values[j] += hist_aux_df[j]/total_value_counts[j] # Normalise "values" by the total count
      else:
        values[j] += hist_aux_df[j]


    if (mode=='bar'):
      ## Plot ##
      # "width_bar" is resolved on the first pass and reused afterwards
      if (width_bar=='default') and not(isinstance(x_range[1], str)): width_bar = (x_range[1]-x_range[0])/len(unique_variable)
      if isinstance(x_range[1], str): width_bar=0.8

      graph = ax.bar(x=list(values.keys()), #https://matplotlib.org/stable/api/_as_gen/matplotlib.axes.Axes.bar.html#matplotlib.axes.Axes.bar
            height=list(values.values()),
            width=width_bar,
            bottom=list(bottoms.values()),
            align = 'center',
            color = colors[count],
            #edgecolor='black',
            #hatch='//',
            alpha=alpha,
            label= i
            )

      ## Plot ##
      # raise the base so that the next group is stacked on top of this one
      for j in bottoms.keys():
        bottoms[j] += values[j]
    
    elif (mode=='hist'):
      histogram_values = copy.copy(histogram_0)
      # each value goes to the bin with the largest left edge that is <= the value
      for j in values.keys():
        for k in histogram_x[::-1]:
          if (j>=k):
            histogram_values[k] += values[j]
            break
      ## Plot ##
      ax.bar(x=list(histogram_values.keys()), #https://matplotlib.org/stable/api/_as_gen/matplotlib.axes.Axes.bar.html#matplotlib.axes.Axes.bar
            height=list(histogram_values.values()),
            width=histogram_width,
            bottom=list(histogram_bottoms.values()),
            align = 'edge',
            color = colors[count],
            #edgecolor='black',
            #hatch='//',
            alpha=alpha,
            label= i
            )
      if (x_log_scale_hist==True): plt.xscale('log')
      ## Plot ##
      # raise the base so that the next group is stacked on top of this one
      for j in histogram_bottoms.keys():
        histogram_bottoms[j] += histogram_values[j]
    
    count += 1
  
  # Bar labels
  # NOTE(review): the label text and height are the raw totals from
  # "value_counts"; with bar_norm=True the stacked bars are fractions, so the
  # labels would not sit on top of the bars - confirm the intended behaviour.
  if (mode=='bar') and (bar_labels==True):
    value_counts = df[variable].value_counts()
    for height,x in zip(value_counts, value_counts.index):
      ax.annotate('{}'.format(height),
                  xy=(x, height),
                  xytext=(0, 5), # 5 points vertical offset
                  textcoords="offset points",
                  ha='center', va='bottom',
                  #fontsize=15,
                  rotation=0)


  ## Plot ##
  ax.set(#title=f'Frequência de {variable} por tipo de membrana',
         xlabel=variable,
         ylabel='Frequência')
  ax.legend();
  ax.margins(0.05)
  ## Plot ##

  print('min =',df[variable].min())
  print('max =',df[variable].max())
  
  return fig, ax

In [None]:
# #### BACKUP ####

# import copy

# # Histogramas empilhados por classe "by"
# def plot_stacked_hist_or_bar_by(df, variable = '',
#                                 by = '',
#                                 mode = 'bar or hist', 
#                                 alpha = 0.3,
#                                 bins_hist = 1, 
#                                 width_bar = 'default',
#                                 bar_norm = False,
#                                 colors_reference = ['b','g','r','c','m','y','k'],
#                                 figsize = [6.4, 4.8]):
#   '''
#   Exemplos semelhantes de grafico de barras: https://matplotlib.org/stable/gallery/lines_bars_and_markers/bar_stacked.html#sphx-glr-gallery-lines-bars-and-markers-bar-stacked-py
  
#   Essa função se baseia na criação de dicionários para os valores da base (variável "bottom"
#   do matplotlib) e das frequencias (variável "height" do matplotlib). As chaves(keys) desses
#   dicionários são os valores únicos de "df[variable]" e os valores são referentes à frequencia
#   (obtidas a partir da função "value_counts").

#   Essa função retorna "fig" e "ax" do Matplotlib. Portanto, o gráfico criado pode ser editado
#   posteriormente, mesmo com certa limitação.

#   O modo 'hist' serve apenas para quando "variable" possui valores numéricos.
#   '''
#   fig, ax = plt.subplots(figsize=figsize)

#   df.dropna(subset=[variable], inplace=True)
#   x_range = [df[variable].min(), df[variable].max()]

#   unique_by = df[by].unique()
#   #Definir as CORES para cada valor único da variável "by".
#   #Caso houver mais valores que o tamanho de "colors_reference", as cores serão repetidas:
#   count, colors = (0, [])
#   for i in range(len(unique_by)):
#     if (count==len(colors_reference)):
#       count = 0
#     colors += [colors_reference[count]]
#     count += 1
  
#   #dicionario com os valores da base:
#   unique_variable = df[variable].unique()
#   bottoms = dict(zip(unique_variable,len(unique_variable)*[0]))
#   #variavel referencia - dicionario com valores zerados:
#   values_0 = copy.copy(bottoms) 

#   #Histograma - o dicionário para o "x", "height" e "bottom" é diferente quando se deseja construir um histograma:
#   #             faz-se necessário trabalhar com numeros/floats
#   if (mode=='hist'):
#     histogram_width = (x_range[1]-x_range[0])/bins_hist
#     histogram_x = np.arange( x_range[0], x_range[1], histogram_width ).tolist()
#     histogram_bottoms = dict(zip(histogram_x,len(histogram_x)*[0])) ##valores com o valor da base
#     histogram_0 = copy.copy(histogram_bottoms) #variavel referencia - dicionario com valores zerados

#   #Definir variável com o valor total de frequência para cada "variable"
#   if (bar_norm==True):
#     total_value_counts = df[variable].value_counts()

#   count = 0
#   for i in unique_by:
#     values = copy.copy(values_0) #armazenará os "value_counts" referentes a vada valor unico de uma variavel
    
#     #Obter os value counts de cada "variable" para cada "unique_by"
#     df_filtrado = df.loc[df[by]==i, variable]
#     hist_aux_df = df_filtrado.value_counts()
#     for j in hist_aux_df.index:
#       if (bar_norm==True):
#         values[j] += hist_aux_df[j]/total_value_counts[j] #Normalizar "values" pelo valor total
#       else:
#         values[j] += hist_aux_df[j]


#     if (mode=='bar'):
#       ## Grafico ##
#       if (width_bar=='default') and not(isinstance(x_range[1], str)): width_bar = (x_range[1]-x_range[0])/len(unique_variable)
#       if isinstance(x_range[1], str): width_bar=0.8

#       ax.bar(x=list(values.keys()), #https://matplotlib.org/stable/api/_as_gen/matplotlib.axes.Axes.bar.html#matplotlib.axes.Axes.bar
#             height=list(values.values()),
#             width=width_bar,
#             bottom=list(bottoms.values()),
#             align = 'center',
#             color = colors[count],
#             #edgecolor='black',
#             #hatch='//',
#             alpha=alpha,
#             label= i
#             )
#       ## Grafico ##
#       for j in bottoms.keys():
#         bottoms[j] += values[j]
    
#     elif (mode=='hist'):
#       histogram_values = copy.copy(histogram_0)
#       for j in values.keys():
#         for k in histogram_x[::-1]:
#           if (j>=k):
#             histogram_values[k] += values[j]
#             break
#       ## Grafico ##
#       ax.bar(x=list(histogram_values.keys()), #https://matplotlib.org/stable/api/_as_gen/matplotlib.axes.Axes.bar.html#matplotlib.axes.Axes.bar
#             height=list(histogram_values.values()),
#             width=histogram_width,
#             bottom=list(histogram_bottoms.values()),
#             align = 'edge',
#             color = colors[count],
#             #edgecolor='black',
#             #hatch='//',
#             alpha=alpha,
#             label= i
#             )
#       ## Grafico ##
#       for j in histogram_bottoms.keys():
#         histogram_bottoms[j] += histogram_values[j]
    
#     count += 1
  

#   ## Grafico ##
#   ax.set(#title=f'Frequência de {variable} por tipo de membrana',
#          xlabel=variable,
#          ylabel='Frequência')
#   plt.legend();
#   ax.margins(0.05)
#   ## Grafico ##

#   print('min =',df[variable].min())
#   print('max =',df[variable].max())
  
#   return fig, ax

###compare_hists_by(df1, df2, variable = '', by = '')

In [None]:
import copy
def compare_hists_by(df1, df2,
                     variable = '',
                     by = '',
                     df1_name = 'default',
                     df2_name = 'default',
                     alpha = 0.7,
                     bins_hist = 1,
                     colors_reference = ['b','g','r','c','m','y','k'],
                     figsize = [12.8, 7.2]):
  '''
  Compare the distribution of "variable" in two DataFrames.

  The left column of the figure holds one stacked histogram per DataFrame
  (df1 on top, df2 below), stacked by the unique values of "by". The right
  half holds a single plot where the total histograms of both DataFrames are
  overlaid. Both DataFrames share the same bins, computed from the overall
  minimum and maximum of "variable".

  "variable" must hold numeric values.

  Parameters:
    df1, df2           = DataFrames to compare. They are not modified: rows
                         with NaN in "variable" are dropped from local copies.
    variable           = numeric column to be binned.
    by                 = column whose unique values define the stacked groups.
    df1_name, df2_name = titles/labels; 'default' means 'df1' and 'df2'.
    alpha              = transparency passed to "ax.bar".
    bins_hist          = number of bins; bin width is (max - min)/bins_hist.
    colors_reference   = colours assigned to the groups, reused cyclically.
    figsize            = figure size passed to "plt.subplots".

  Returns "fig" and the Axes of the combined (right-hand) plot, so the chart
  can still be edited afterwards, with some limitations.

  The binning logic mirrors the function "plot_stacked_hist_or_bar_by".
  '''
  fig, ax = plt.subplots( 2, 2, figsize=figsize )

  # Filter into new objects instead of dropping rows from the caller's frames.
  df1 = df1.dropna(subset=[variable])
  df2 = df2.dropna(subset=[variable])

  names = ('df1' if df1_name=='default' else df1_name,
           'df2' if df2_name=='default' else df2_name)

  x_range_min = min(df1[variable].min(), df2[variable].min())
  x_range_max = max(df1[variable].max(), df2[variable].max())

  # Unique values of "by" found in either DataFrame, in sorted order.
  unique_by = sorted(dict.fromkeys(df1[by].unique().tolist() + df2[by].unique().tolist()))
  # One colour per group; colours repeat when there are more groups than colours.
  colors = [colors_reference[k % len(colors_reference)] for k in range(len(unique_by))]

  # Left edge of every bin, shared by all plots.
  histogram_width = (x_range_max-x_range_min)/bins_hist
  histogram_x = np.arange( x_range_min, x_range_max, histogram_width ).tolist()

  ## Individual plots ##
  histogram_df = {} # total height of every bin, per DataFrame
  for position, df in enumerate((df1, df2)):
    panel = ax[position,0]
    histogram_bottoms = dict.fromkeys(histogram_x, 0)

    for color, i in zip(colors, unique_by):
      counts = df.loc[df[by]==i, variable].value_counts()

      # Each value goes to the bin with the largest left edge that is <= the value.
      histogram_values = dict.fromkeys(histogram_x, 0)
      for value, frequency in counts.items():
        for edge in reversed(histogram_x):
          if (value>=edge):
            histogram_values[edge] += frequency
            break

      panel.bar(x=histogram_x, #https://matplotlib.org/stable/api/_as_gen/matplotlib.axes.Axes.bar.html#matplotlib.axes.Axes.bar
                height=list(histogram_values.values()),
                width=histogram_width,
                bottom=list(histogram_bottoms.values()),
                align = 'edge',
                color = color,
                alpha=alpha,
                label= i
                )

      # Raise the base so that the next group is stacked on top of this one.
      for edge in histogram_x:
        histogram_bottoms[edge] += histogram_values[edge]

    panel.set_title(names[position])
    panel.set_ylabel('Frequência')
    if (position==0): panel.set_xticks([]) # hide the x axis of the top plot
    if (position==1): panel.set_xlabel(variable)
    panel.legend()

    histogram_df[position] = histogram_bottoms # kept to plot df1 and df2 together below

  ## Combined plot ##
  # The combined plot takes the whole right half of the figure. Remove the two
  # empty Axes it covers explicitly: recent Matplotlib versions no longer
  # delete overlapped Axes automatically, which would leave them visible.
  ax[0,1].remove()
  ax[1,1].remove()
  ax = fig.add_subplot(1, 2, 2)
  for position, color in enumerate(('darkred', 'cornflowerblue')):
    ax.bar(x=list(histogram_df[position].keys()), #https://matplotlib.org/stable/api/_as_gen/matplotlib.axes.Axes.bar.html#matplotlib.axes.Axes.bar
           height=list(histogram_df[position].values()),
           width=histogram_width,
           align = 'edge',
           color = color,
           alpha=alpha,
           label= names[position]
           )
  ax.legend()
  ax.set(xlabel=variable, ylabel='Frequência')

  return fig, ax

### heatmap_corr(df, x='all', y='all', method='pearson', min_periods=1, color='di')

In [None]:
def heatmap_corr(df,
                 x='all', 
                 y='all',
                 method='pearson', 
                 min_periods=1,
                 color='di'):
  '''
  Draw an annotated heatmap of the correlations of "df" and return them.

  Parameters:
    df          = DataFrame; the correlations come from "df.corr".
    x and y     = lists of columns shown on the heatmap columns (x) and
                  rows (y). 'all' means every column of the correlation
                  matrix.
    method      = passed to "df.corr".
    min_periods = passed to "df.corr".
    color       = "di" (red for negative, blue for positive) or "mono"
                  (blue on both sides).

  Returns the slice of the correlation matrix that was plotted.

  Raises ValueError if "color" is not one of the supported palettes; the
  check happens before any figure is created.
  '''
  palettes = {
    'mono': ('#00076e', '#1b00ff', '#d0cbff', '#FFFFFF', '#d0cbff', '#1b00ff', '#00076e'),
    'di': ('#7e0000', '#ff0000', '#fecfcf', '#FFFFFF', '#d0cbff', '#1b00ff', '#00076e'),
  }
  if not isinstance(color, str) or color not in palettes:
    raise ValueError(f'heatmap_corr: color must be "mono" or "di", got {color!r}')

  corr_matrix = df.corr( min_periods=min_periods, method=method )

  # The isinstance test keeps the comparison safe when x / y are array-like
  # (an element-wise "==" result has no single truth value).
  if isinstance(x, str) and x=='all':
    x = corr_matrix.columns.tolist()
  if isinstance(y, str) and y=='all':
    y = corr_matrix.columns.tolist()
  heatmap_values = corr_matrix.loc[y,x]

  ## PLOT ##
  f, ax = plt.subplots(figsize=( 1*len(x)+3, 1*len(y) ))
  cmap = sns.blend_palette(palettes[color], input='rgb', as_cmap=True)
  sns.heatmap(heatmap_values, annot=True, cmap=cmap, ax=ax, center=0)

  return heatmap_values

### heatmap_pearson(df='pandas_DataFrame', x='all', y='all', allow_duplicates=True, color='di', graphic='coeff')

In [None]:
def heatmap_pearson(df='pandas_DataFrame',
                    x='all',
                    y='all',
                    allow_duplicates=True,
                    color='di', 
                    graphic='coeff'):
  '''
  Plot a heatmap of Pearson's coefficients or of their p-values.

  Parameters:
    df               = DataFrame. It is not modified: the float conversion
                       is done on a local copy.
    x and y          = lists of columns for the heatmap columns (x) and
                       rows (y); 'all' means every column of "df". Columns
                       that cannot be cast to float are reported with
                       "print" and left out.
    allow_duplicates = if False, repeated (x, y) points are counted only
                       once when computing each coefficient.
    color            = "di" or "mono"
    graphic          = "coeff" or "pvalue" (which matrix is drawn)

  Each coefficient uses only the rows where both columns are not NaN. Pairs
  with fewer than two such rows get NaN for both coefficient and p-value.

  Returns the tuple (coefficients, p-values), two DataFrames indexed by "y"
  with columns "x".

  Raises ValueError if "color" or "graphic" is not a supported option; the
  check happens before any computation or plotting.
  '''
  palettes = {
    'mono': ('#00076e', '#1b00ff', '#d0cbff', '#FFFFFF', '#d0cbff', '#1b00ff', '#00076e'),
    'di': ('#7e0000', '#ff0000', '#fecfcf', '#FFFFFF', '#d0cbff', '#1b00ff', '#00076e'),
  }
  if not isinstance(color, str) or color not in palettes:
    raise ValueError(f'heatmap_pearson: color must be "mono" or "di", got {color!r}')
  if not isinstance(graphic, str) or graphic not in ('coeff', 'pvalue'):
    raise ValueError(f'heatmap_pearson: graphic must be "coeff" or "pvalue", got {graphic!r}')

  from scipy.stats import pearsonr

  # The float conversion below assigns columns; do it on a copy so the
  # caller's DataFrame keeps its original dtypes.
  df = df.copy()

  def keep_float_columns(names, axis_name):
    # Cast each column to float; report and skip the ones that cannot be cast.
    kept = []
    for i in names:
      try:
        df[i] = df[i].astype(float)
      except (ValueError, TypeError):
        print(f'heatmap_pearson function: {axis_name} column "{i}" is a {df[i].dtype}')
      else:
        kept += [i]
    return kept

  if isinstance(x, str) and x=='all':
    x = df.columns
  x = keep_float_columns(x, 'X')

  if isinstance(y, str) and y=='all':
    y = df.columns
  y = keep_float_columns(y, 'Y')

  pear_heatmap = pd.DataFrame( columns=x, index=y, dtype=float )
  pvalue_heatmap = pd.DataFrame( columns=x, index=y, dtype=float )
  for i in x: #columns
    for j in y: #index
      # Fixed labels keep the two series apart even when i == j.
      pair = pd.DataFrame({'x': df[i], 'y': df[j]}).dropna()

      # Drop repeated (x, y) points, keeping the first occurrence
      if not allow_duplicates:
        pair = pair.drop_duplicates()

      # pearsonr needs at least two points
      if (len(pair)<2):
        pear_heatmap.loc[j,i], pvalue_heatmap.loc[j,i] = (np.nan, np.nan)
        continue
      pear_heatmap.loc[j,i], pvalue_heatmap.loc[j,i] = pearsonr( pair['x'], pair['y'] )

  ## PLOT ##
  shown = pear_heatmap if graphic=='coeff' else pvalue_heatmap

  f, ax = plt.subplots(figsize=( 1*len(x)+3, 1*len(y) ))
  cmap = sns.blend_palette(palettes[color], input='rgb', as_cmap=True)
  sns.heatmap(shown, annot=True, cmap=cmap, ax=ax, center=0)

  return (pear_heatmap, pvalue_heatmap)

###draw_tree(t, dados, size=10, ratio=1, precision=0) - Modelo RF

In [None]:
def draw_tree(t, dados, size=10, ratio=1, precision=0):
    '''
    Display the decision tree "t" as a Graphviz diagram in the notebook.

    t         = fitted tree, exported with "sklearn.tree.export_graphviz"
    dados     = DataFrame whose columns are used as the feature names
    size      = Graphviz "size" attribute written into the graph header
    ratio     = Graphviz "ratio" attribute written into the graph header
    precision = forwarded to "export_graphviz"
    '''
    import re
    import graphviz
    import sklearn.tree
    import IPython.display

    dot_source = sklearn.tree.export_graphviz(
        t,
        out_file=None,
        feature_names=dados.columns,
        filled=True,
        special_characters=True,
        rotate=True,
        precision=precision,
    )

    # Inject the size/ratio attributes right after the opening of the graph.
    graph_header = f'Tree {{ size={size}; ratio={ratio}'
    sized_source = re.sub('Tree {', graph_header, dot_source)

    diagram = graphviz.Source(sized_source)
    IPython.display.display(diagram)

###plotar_importancias(modelo, tags, n=10) - modelo RF

In [None]:
def plotar_importancias(modelo, tags, n=10):
    '''
    Plot and return the relative importance of each variable of a model.

    modelo = fitted model exposing "coef_" or "feature_importances_"
             ("coef_" wins when both exist)
    tags   = variable names, used as the index of the importances
    n      = number of top variables printed and drawn in the bar chart

    Two figures are produced: one with the raw importances next to the
    normalised absolute importances in decreasing order, and one horizontal
    bar chart with the "n" largest normalised importances.

    Returns a Series named 'Importancias' with the normalised absolute
    importances sorted in decreasing order. If the model exposes neither
    attribute, a message is printed and None is returned; no figure is
    created in that case.
    '''
    # Check the model first so that a failure does not leave an empty figure.
    if hasattr(modelo,'coef_'):
        imp = modelo.coef_
    elif hasattr(modelo,'feature_importances_'):
        imp = modelo.feature_importances_
    else:
        print('sorry, nao vai rolar!')
        return

    fig, ax = plt.subplots(1,2, figsize = (20,4))

    coefs = pd.Series(imp, index = tags)
    coefs.plot(use_index=False, ax=ax[0]);
    # Share of each variable in the sum of absolute importances.
    abs_coefs = abs(coefs)/(abs(coefs).sum())
    abs_coefs.sort_values(ascending=False).plot(use_index=False, ax=ax[1],marker='.')

    ax[0].set_title('Importâncias relativas das variáveis')
    ax[1].set_title('Importâncias relativas das variáveis - ordem decrescente')

    df = abs_coefs.rename('Importancias').sort_values(ascending=False)

    print(df.iloc[0:n])
    # Open a new figure so the bar chart is not drawn on the axes above.
    plt.figure()
    df.iloc[0:n].plot(kind='barh', figsize=(15,0.25*n), legend=False)

    return df

###plot_predictions(y_real = 'list', y_pred = 'list', error_axis = 'absolute or relative')

In [None]:
def plot_predictions(y_real = 'list',
                     y_pred = 'list',
                     figsize = [15., 6.],
                     y_scale = 'default', #log or linear
                     put_major_label_tick_every = 'default',
                     put_minor_label_tick_every = 'default',
                     show_error_axis = True,
                     error_axis = 'absolute or relative',
                     font_size = 11.,
                     report_big_relative_errors = True,
                     big_errors_limit = 100
                     ):
  '''
  Plot predicted and real values sample by sample, optionally with an error
  panel underneath.

  Parameters:
    y_real, y_pred             = sequences of the same length.
    figsize                    = figure size.
    y_scale                    = 'log' or 'linear'; 'default' means 'log'.
    put_major_label_tick_every = spacing of the major x ticks; 'default'
                                 derives it from the number of samples.
    put_minor_label_tick_every = spacing of the minor x ticks; 'default'
                                 is half of the major spacing.
    show_error_axis            = if True, add a panel with the error of each
                                 sample below the main plot.
    error_axis                 = 'absolute' (pred-real, with the RMSE line)
                                 or 'relative' ((pred-real)/real in %, with
                                 the MAPE line). Required when
                                 "show_error_axis" is True.
    font_size                  = written to matplotlib.rcParams['font.size']
                                 (a global setting).
    report_big_relative_errors = if True, print every sample whose error is
                                 larger in magnitude than "big_errors_limit".
    big_errors_limit           = threshold used by the report above.

  Returns (fig, (ax, ax2)) with the error panel, or (fig, ax) without it.
  Returns None, after printing an error message, when the lengths differ or
  when "error_axis" is not a valid option; nothing is plotted in that case.
  '''
  if len(y_real)!=len(y_pred):
    print('ERRO: tamanho de y_real (',len(y_real),') é diferente do tamanho de y_pred (',len(y_pred),')')
    return

  # Reject an unusable error type before anything is drawn.
  if show_error_axis and error_axis not in ('absolute', 'relative'):
    print("ERRO: error_axis deve ser 'absolute' ou 'relative'")
    return

  from matplotlib.ticker import MultipleLocator
  import matplotlib

  matplotlib.rcParams['font.size'] = font_size
  
  if show_error_axis:
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(3, 1, (1, 2)) # main plot takes the top two thirds
  else:
    fig, ax = plt.subplots(figsize=figsize)

  len_x = len(y_real)
  x = np.arange(1, len_x+1, 1)
  
  # Rounding to the nearest ten gives 0 for 50 samples or fewer, and a tick
  # spacing of 0 is not accepted by MultipleLocator; fall back to a positive
  # spacing in those cases.
  if put_major_label_tick_every=='default':
    put_major_label_tick_every = round( len_x/10, -1 ) or max( round(len_x/10), 1 )
  if put_minor_label_tick_every=='default':
    put_minor_label_tick_every = round( put_major_label_tick_every/2, 0 ) or put_major_label_tick_every

  ax.plot(x, y_pred, 
          linewidth=2.8, 
          label='Predito')
  
  ax.plot(x, y_real, 
          linewidth=1.0, 
          ls='--',
          marker='o',
          label='real', 
          )

  ax.set(xlim=[0, len_x+1])
  ax.grid(True, which='both', axis='x', linestyle='-', color='lightgray')
  ax.xaxis.set_major_locator(MultipleLocator(put_major_label_tick_every))
  ax.xaxis.set_minor_locator(MultipleLocator(put_minor_label_tick_every))

  ax.set_ylabel('Permeabilidade (barrer)')
  
  plt.legend(loc='best')

  if y_scale=='default': y_scale='log'
  plt.yscale(y_scale)
  
  if not show_error_axis:
    ax.set_xlabel('Amostra')
    return fig, ax

  if (error_axis == 'absolute'):
    error_name = 'Erro absoluto'
    print('Erro=pred-real')
    error_values = [pred-real for real,pred in zip(y_real,y_pred)]
  else:
    error_name = 'Erro relativo (%)'
    print('Erro=(pred-real)/real')
    error_values = [(pred-real)/real*100 for real,pred in zip(y_real,y_pred)]
  
  if (report_big_relative_errors==True):
    print('Da amostra 1 a',len_x)
    for i, error in zip(x, error_values):
      if (error>big_errors_limit) or (error<-big_errors_limit):
        print('Amostra',i,':',error)

  # The x tick labels are shown only on the error panel below.
  plt.tick_params('x', labelbottom=False)

  ax2 = fig.add_subplot(3, 1, 3, sharex=ax)
  ax2.set_xlabel('Amostra')
  ax2.set_ylabel(error_name)

  ax2.bar(x,error_values, color='tomato')
  ax2.plot(x,error_values, color='firebrick', label='Erro')
  
  # Horizontal reference line with the overall error metric.
  if (error_axis == 'absolute'):
    ax2.plot(x, [rmse(y_real,y_pred)]*len_x, 
              linewidth=1.0, 
              ls='--',
              label='RMSE',
              color='firebrick' 
              )
  else:
    ax2.plot(x, [mape(y_real,y_pred)]*len_x, 
              linewidth=1.0, 
              ls='--',
              label='MAPE',
              color='firebrick' 
              )
  
  plt.legend(loc='best')

  return fig, (ax, ax2)

#Como exportar e importar com pickle

```
import pickle
```

Exportar:

```
with open( 'example_file.pkl', 'wb' ) as f:
  pickle.dump( "objetos/variáveis", f )
```

Importar

```
with open( 'example_file.pkl', 'rb' ) as f:
  "objetos/variáveis" = pickle.load( f )
```



#Paletas de Cores

In [None]:
sns.color_palette() #https://seaborn.pydata.org/tutorial/color_palettes.html

In [None]:
sns.color_palette("tab10") #https://seaborn.pydata.org/tutorial/color_palettes.html

In [None]:
sns.color_palette("bright") #https://seaborn.pydata.org/tutorial/color_palettes.html

In [None]:
## MATPLOTLIB -> https://matplotlib.org/stable/gallery/color/named_colors.html

In [None]:
# Print three of the colour sequences found in px.colors.qualitative
# (each one is a list of hex colour strings).
print(px.colors.qualitative.Plotly)
print(px.colors.qualitative.D3)
print(px.colors.qualitative.G10)

# Build and display the figure returned by px.colors.qualitative.swatches().
fig = px.colors.qualitative.swatches()
fig.show()

['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A', '#19D3F3', '#FF6692', '#B6E880', '#FF97FF', '#FECB52']
['#1F77B4', '#FF7F0E', '#2CA02C', '#D62728', '#9467BD', '#8C564B', '#E377C2', '#7F7F7F', '#BCBD22', '#17BECF']
['#3366CC', '#DC3912', '#FF9900', '#109618', '#990099', '#0099C6', '#DD4477', '#66AA00', '#B82E2E', '#316395']


In [None]:
def show_named_plotly_colours():
    """
    Show a one-column Plotly table listing the named CSS colours, where
    every cell is filled with the colour it names.

    Reference:
        #https://community.plotly.com/t/plotly-colours-list/11730/3

    The table is displayed with "fig.show()"; nothing is returned.
    """
    import pandas as pd
    import plotly.graph_objects as go

    names_blob='aliceblue, antiquewhite, aqua, aquamarine, azure,\
        beige, bisque, black, blanchedalmond, blue,\
        blueviolet, brown, burlywood, cadetblue,\
        chartreuse, chocolate, coral, cornflowerblue,\
        cornsilk, crimson, cyan, darkblue, darkcyan,\
        darkgoldenrod, darkgray, darkgrey, darkgreen,\
        darkkhaki, darkmagenta, darkolivegreen, darkorange,\
        darkorchid, darkred, darksalmon, darkseagreen,\
        darkslateblue, darkslategray, darkslategrey,\
        darkturquoise, darkviolet, deeppink, deepskyblue,\
        dimgray, dimgrey, dodgerblue, firebrick,\
        floralwhite, forestgreen, fuchsia, gainsboro,\
        ghostwhite, gold, goldenrod, gray, grey, green,\
        greenyellow, honeydew, hotpink, indianred, indigo,\
        ivory, khaki, lavender, lavenderblush, lawngreen,\
        lemonchiffon, lightblue, lightcoral, lightcyan,\
        lightgoldenrodyellow, lightgray, lightgrey,\
        lightgreen, lightpink, lightsalmon, lightseagreen,\
        lightskyblue, lightslategray, lightslategrey,\
        lightsteelblue, lightyellow, lime, limegreen,\
        linen, magenta, maroon, mediumaquamarine,\
        mediumblue, mediumorchid, mediumpurple,\
        mediumseagreen, mediumslateblue, mediumspringgreen,\
        mediumturquoise, mediumvioletred, midnightblue,\
        mintcream, mistyrose, moccasin, navajowhite, navy,\
        oldlace, olive, olivedrab, orange, orangered,\
        orchid, palegoldenrod, palegreen, paleturquoise,\
        palevioletred, papayawhip, peachpuff, peru, pink,\
        plum, powderblue, purple, red, rosybrown,\
        royalblue, saddlebrown, salmon, sandybrown,\
        seagreen, seashell, sienna, silver, skyblue,\
        slateblue, slategray, slategrey, snow, springgreen,\
        steelblue, tan, teal, thistle, tomato, turquoise,\
        violet, wheat, white, whitesmoke, yellow,\
        yellowgreen'

    # One clean colour name per comma-separated piece of the text above.
    colour_names = [
        piece.replace('\n','').replace(' ','')
        for piece in names_blob.split(',')
    ]

    palette = pd.DataFrame.from_dict({'colour': colour_names})
    colour_column = palette['colour']

    header_style = dict(
        values=["Plotly Named CSS colours"],
        line_color='black', fill_color='white',
        align='center', font=dict(color='black', size=14)
    )
    # Border and fill of every cell use the colour written in that cell.
    cells_style = dict(
        values=[colour_column],
        line_color=[colour_column], fill_color=[colour_column],
        align='center', font=dict(color='black', size=11)
    )

    table = go.Table(header=header_style, cells=cells_style)
    fig = go.Figure(data=[table])
    fig.show()
# Call the helper right away so the colour table is displayed when this cell runs.
show_named_plotly_colours()

#API Matplotlib

In [None]:
'''

matplotlib.rcParams['font.size'] = 15.
matplotlib.rcParams['font.family'] = "serif"

graph = ax.bar(count_pure_keys, count_pure.values(), width=0.8, color=sns.color_palette())
ax.bar_label(graph, padding=0)

ax.set( ylim=(min,max), xlim=(min,max), autoscale_on=False )
plt.ylim(min,max)
plt.xlim(min,max)


ax.set_title('(a)')
ax.set_ylabel('titulo do eixo y')
ax.set_xlabel('titulo do eixo x')

ax.set_xticks(posição dos labels, labels=[valores dos labels]) 
ax.tick_params(which='minor major or both', 
               length=4,
               width=3, 
               color='r',
               labelcolor='black',
               labelsize='large',
               grid_color='black',
               grid_linestyle='dotted'
               )
plt.tick_params('x',
                labelsize=6,
                labelbottom=False) 

ax.grid(True, 
        which='both major or minor',
        axis='x y or both', 
        linestyle='dotted', # ou '-.'
        color='black'
        )



#Eixos em escala logarítima
plt.yscale('log')
plt.xscale('log')


#Add legenda
plt.legend(loc='lower right')
# Remover legenda:
ax.get_legend().remove()
# Alterar legenda:
ax.legend(loc='best', ncols=3)
'''