https://docs.python.org/pt-br/3/tutorial/modules.html

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import missingno as msno


import sklearn.metrics

In [None]:
'''
import sklearn.datasets
import sklearn.model_selection

import sklearn.ensemble #bibliotecas de aprendizado de máquina
'''

'\nimport sklearn.datasets\nimport sklearn.model_selection\n\nimport sklearn.ensemble #bibliotecas de aprendizado de máquina\n'

#Declaração de Variáveis

In [None]:
data = '962' # databank ID
git_import_file = '' # name of the variant of the original databank
data_url = ''


# dict that stores the strings already built by the ID function (used as its cache)
ID_dict = {}

# gases with permeabilities present in the database, in increasing order of kinetic diameter
# IN INCREASING ORDER OF KINETIC DIAMETER, AS IN THE DATABANK:
# NOTE(review): the property tables below also contain 'Ar' and 'none', which are not in this tuple.
gases = ('He','H2','CO2','O2','H2S','N2','CO','CH4','C2H6','SF6') # refer to the permeability data
# kinetic diameter per gas (units presumably picometres - TODO confirm)
gases_kinetic_diameter = {'He':260,
                          'H2':289,
                          'CO2':330,
                          'Ar':340,
                          'O2':346,
                          'H2S':360,
                          'N2':364,
                          'CO':376,
                          'CH4':380,
                          'C2H6':444.3,
                          'SF6':550,
                          'none':0}
# effective diameter per gas; np.nan marks gases without a value here
gases_effec_diameter = {'He':178, #T>Tg
                        'H2':214,
                        'CO2':302,
                        'O2':289,
                        'Ar':297,
                        'H2S':np.nan,
                        'N2':304,
                        'CO':304,
                        'CH4':318,
                        'C2H6':346,
                        'SF6':np.nan,
                        'none':0}
# molar mass per gas (presumably g/mol - TODO confirm)
gases_molar_mass = {'He':4.00, 
                    'H2':2.02,
                    'CO2':44.01,
                    'O2':31.98,
                    'Ar':39.95,
                    'H2S':34.10,
                    'N2':28.00,
                    'CO':28.01,
                    'CH4':16.04,
                    'C2H6':30.07,
                    'SF6':146.06,
                    'none':0.}
# polarizability per gas
gases_polarizability = {'He':0.208, #https://cccbdb.nist.gov/pollistx.asp
                        'H2':0.787,
                        'CO2':2.507,
                        'O2':1.562,
                        'Ar':1.664,
                        'H2S':3.631,
                        'N2':1.710,
                        'CO':1.953,
                        'CH4':2.448,
                        'C2H6':4.226,
                        'SF6':4.490,
                        'none':0.}


# Reverse lookup tables: diameter value -> gas name.
gases_kinetic_diameter_inverse = {
    diameter: gas for gas, diameter in gases_kinetic_diameter.items()
}

# NOTE: several gases share the same effective diameter value (e.g. N2 and CO are
# both 304), so this inverse table keeps only the last gas listed for such a value.
gases_effec_diameter_inverse = {
    diameter: gas for gas, diameter in gases_effec_diameter.items()
}
                        
# frequency of data for each gas in the "pure" and "mixture" situations
count_pure = {}
count_mixture = {}


# names of the columns corresponding to the membrane and process characteristics and
# their respective indices in the column
# IN ORDER:
columns_membrane = ['type', 'description', 'support_material', 'subtype', 'filler_loading',
                    'previous_thickness', 'mean_thickness', 'mean_pore_size', 'pore_volume',
                    'specific_surface_area',  'aging']
columns_process = ['surface_area', 'temperature', 'feed_pressure', 'permeate_pressure',
                   'delta_pressure', 'feed_flow_rate', 'sweep_gas', 'sweep_gas_flow',
                   'stage_cut']
columns_others = ['provided_data_type', 'in_reference_data_location', 'reference', 'url']

# index lookups for the three column groups above (filled in elsewhere)
columns_membrane_index = {}
columns_process_index = {}
columns_others_index = {}

# The performance of each gas is represented by the following variables:
prefix1='x_' # mass/molar/volumetric fraction
prefix2='Py_' # permeability
prefix3='Pe_' # permeance

# AUXILIARY VARIABLES:
dados = pd.DataFrame() # DataFrame to be used for the prediction. Refined version.

##GitHub info

In [None]:
git_username = 'tenoriolms' # GitHub username
git_repository = 'databank_CH4' # repository name

git_token = 'não pode' # access token for the repository.
# GitHub scans every imported/committed file for the (secret) access token that
# was created. If the token is found, it is revoked.
# Since this notebook is exported to GitHub, the token cannot be written here.

# IPython shell commands: set the global git identity used for commits.
!git config --global user.email "lhucas_tenorio@hotmail.com"
!git config --global user.name "tenoriolms"

#Funções

##ID(index)

In [None]:
# ID = string used to identify a membrane and its conditions of use in the database.
# The ID is the concatenation (as strings) of the variables that may vary within a given reference.
def ID(i, df):
  '''
  Return the identification string of row "i" of "df".

  The string is cached in the module-level "ID_dict" under the key "i"; when "i"
  is already cached, the stored string is returned and "df" is not read.
  '''
  try:
    return ID_dict[i]
  except KeyError:
    pass

  id_columns = ('description', 'filler_loading',
                'mean_thickness', 'mean_pore_size',
                'temperature', 'feed_pressure',
                'delta_pressure', 'feed_flow_rate',
                'stage_cut', 'aging',
                'reference')
  identifier = ''.join(str(df[column][i]) for column in id_columns)
  ID_dict[i] = identifier

  return identifier

##get_key(val,my_dict)

In [None]:
def get_key(val,my_dict): #dict = key : value
    '''
    Return the first key of "my_dict" whose value equals "val".

    When no value matches, the message string
    "get_key function: There is no such Key" is returned instead.
    '''
    matching_keys = (key for key, value in my_dict.items() if val == value)
    return next(matching_keys, "get_key function: There is no such Key")

##submit_file(git_export_file)

In [None]:
#exportar para o GitHub
def submit_file(git_export_file):
  !git clone https://{git_token}@github.com/{git_username}/{git_repository}
  !cp {git_export_file} {git_repository}
  %cd {git_repository}
  !git add {git_export_file}
  !git commit -m 'Add/Atualizar arquivo {input_file}'
  !git push -u origin
  %cd ..
  !rm -rf {git_repository}

##import_file(git_import_file)

In [None]:
# import a file from GitHub
def import_file(git_import_file):
  # Uses IPython shell magics, so it only runs inside a notebook/IPython session.
  # Steps: clone the repository (authenticating with git_token), copy the
  # requested file into the current directory, then delete the local clone.
  !git clone https://{git_token}@github.com/{git_username}/{git_repository}
  !cp {git_repository}/{git_import_file} .
  !rm -rf {git_repository}

##Zscores(df_for_scaled, df_reference)

In [None]:
# Scale each column using the "z score"
def Zscores(df_for_scaled, df_reference):
  '''
  Standardize, in place, every column of "df_for_scaled" using the mean and
  standard deviation of the same column in "df_reference".

  Both DataFrames must have the same column labels in the same order;
  otherwise a message is printed and nothing is changed.
  Returns None (the result is written into "df_for_scaled").
  '''
  columns = df_for_scaled.columns
  reference_columns = df_reference.columns
  # Compare the lengths first: the element-wise "!=" raises when they differ.
  if len(columns) != len(reference_columns) or any(columns != reference_columns):
    print('Zscores function: Dataframes com colunas diferentes')
    return

  print(f'Zscores function: columns_reference: {df_reference.columns}')
  for column in columns:
    reference = df_reference[column]
    df_for_scaled[column] = (df_for_scaled[column] - reference.mean()) / reference.std()

##undo_Zscores(df_scaled, df_reference)

In [None]:
# undo the scaling applied to each column with the "z score"
def undo_Zscores(df_scaled, df_reference):
  '''
  Revert, in place, the z-score scaling of every column of "df_scaled" using the
  mean and standard deviation of the same column in "df_reference".

  Both DataFrames must have the same column labels in the same order;
  otherwise a message is printed and nothing is changed.
  Returns None (the result is written into "df_scaled").
  '''
  columns = df_scaled.columns
  reference_columns = df_reference.columns
  # Compare the lengths first: the element-wise "!=" raises when they differ.
  if len(columns) != len(reference_columns) or any(columns != reference_columns):
    print('undo_Zscores function: Dataframes com colunas diferentes')
    return

  print(f'undo_Zscores function: columns_reference: {df_reference.columns}')
  for column in columns:
    reference = df_reference[column]
    df_scaled[column] = df_scaled[column]*reference.std() + reference.mean()

##normalize(df_for_norm, df_reference)

In [None]:
def normalize(df_for_norm, df_reference):
  '''
  Min-max normalize, in place, every column of "df_for_norm" using the minimum
  and maximum of the same column in "df_reference".

  Both DataFrames must have the same column labels in the same order;
  otherwise a message is printed and nothing is changed.
  Returns None (the result is written into "df_for_norm").
  '''
  columns = df_for_norm.columns
  reference_columns = df_reference.columns
  # Compare the lengths first: the element-wise "!=" raises when they differ.
  if len(columns) != len(reference_columns) or any(columns != reference_columns):
    print('normalize function: Dataframes com colunas diferentes')
    return

  print(f'normalize function: columns_reference: {df_reference.columns}')
  for column in columns:
    lowest = df_reference[column].min()
    highest = df_reference[column].max()
    df_for_norm[column] = (df_for_norm[column] - lowest) / (highest - lowest)

##undo_normalize(df_normalized, df_reference)

In [None]:
def undo_normalize(df_normalized, df_reference):
  '''
  Revert, in place, the min-max normalization of every column of "df_normalized"
  using the minimum and maximum of the same column in "df_reference".

  Both DataFrames must have the same column labels in the same order;
  otherwise a message is printed and nothing is changed.
  Returns None (the result is written into "df_normalized").
  '''
  columns = df_normalized.columns
  reference_columns = df_reference.columns
  # Compare the lengths first: the element-wise "!=" raises when they differ.
  if len(columns) != len(reference_columns) or any(columns != reference_columns):
    print('undo_normalize function: Dataframes com colunas diferentes')
    return

  print(f'undo_normalize function: columns_reference: {df_reference.columns}')
  for column in columns:
    lowest = df_reference[column].min()
    highest = df_reference[column].max()
    df_normalized[column] = df_normalized[column]*(highest - lowest) + lowest

##str2int_simple_encoder(df,columns='all')

In [None]:
def str2int_simple_encoder(df,columns='all'):
  '''
  Replace, in place, the values of "object" columns of "df" by integer codes.

  Each unique value of a column receives a code starting at 1, in order of
  first appearance.

  columns = 'all' to encode every column whose dtype is object, or an iterable
            of column names. Every listed column must exist in "df" and have
            dtype object; otherwise a message is printed, "df" is left
            untouched and None is returned.

  Returns a dict {column name: {original value: integer code}}.
  '''
  if isinstance(columns, str) and columns == 'all':
    targets = [name for name in df.columns if df[name].dtype == object]
  else:
    targets = list(columns)
    # Validate everything before touching "df". Membership is tested first so a
    # missing column is reported instead of raising KeyError.
    for name in targets:
      if name not in df.columns or not (df[name].dtype == object):
        print('str2int_simple_encoder: coluna especificada não é do tipo "object" ou não existe no dataframe')
        return

  id_dict = {}
  for name in targets:
    codes = {value: code for code, value in enumerate(df[name].unique(), start=1)}
    id_dict[name] = codes
    df[name] = df[name].apply(lambda row, value : value[row], value = codes )

  return id_dict

In [None]:
#como era feito anteriormente:
'''
df = dados

#Criar os dicionários para os valores únicos das colunas categóricas
type_id = {}
aux = df['type'].unique()
for i in aux:
  type_id[i] = np.where(aux==i)[0][0]+1
print(type_id)

#converter os valores categóricos da coluna "type" por numéricos
df['type'] = df['type'].apply(lambda row, value : value[row],
                                                    value = type_id )

'''

'\ndf = dados\n\n#Criar os dicionários para os valores únicos das colunas categóricas\ntype_id = {}\naux = df[\'type\'].unique()\nfor i in aux:\n  type_id[i] = np.where(aux==i)[0][0]+1\nprint(type_id)\n\n#converter os valores categóricos da coluna "type" por numéricos\ndf[\'type\'] = df[\'type\'].apply(lambda row, value : value[row],\n                                                    value = type_id )\n\n'

##str2int_hot_encoder(df,columns='all')

In [None]:
def str2int_hot_encoder(df,columns='all'):
  '''
  Replace, in place, the values of "object" columns of "df" by one-hot lists.

  Each unique value of a column is mapped to a list with one position per
  unique value: 1 at the position of that value (order of first appearance)
  and 0 elsewhere. Every cell of the column then holds such a list.

  columns = 'all' to encode every column whose dtype is object, or an iterable
            of column names. Every listed column must exist in "df" and have
            dtype object; otherwise a message is printed, "df" is left
            untouched and None is returned.

  Returns a dict {column name: {original value: one-hot list}}.
  '''
  if isinstance(columns, str) and columns == 'all':
    targets = [name for name in df.columns if df[name].dtype == object]
  else:
    targets = list(columns)
    # Validate everything before touching "df". Membership is tested first so a
    # missing column is reported instead of raising KeyError.
    for name in targets:
      if name not in df.columns or not (df[name].dtype == object):
        print('str2int_hot_encoder: coluna especificada não é do tipo "object" ou não existe no dataframe')
        return

  id_dict = {}
  for name in targets:
    unique_values = df[name].unique()
    size = len(unique_values)
    vectors = {value: [1 if k == position else 0 for k in range(size)]
               for position, value in enumerate(unique_values)}
    id_dict[name] = vectors
    df[name] = df[name].apply(lambda row, value : value[row], value = vectors )

  return id_dict

##validation_curve_change_param(model,x_train,y_train,parameters = {})

In [None]:
def validation_curve_change_param(model,
                                  x_train,
                                  y_train,
                                  parameters = None, # values of the parameters to be tested
                                  ylim=None
                                  ):
  '''
  Plot one validation curve (R2 of train and test versus parameter value) for
  each hyperparameter listed in "parameters".

  parameters = dict {parameter name: sequence of values to test}. When it is
               empty or None nothing is plotted.
  ylim       = passed to "plt.ylim" for every figure.

  A new figure is created per parameter. Returns None.

  Example:
  parameters = {'C': np.arange(10000, 100000, 10000),
              'epsilon': [ 1, 5, 10,100,200,300,400,500,600],
              'tol': [0.001,0.01,0.1,1,5,10,100,1000,2000],
              'gamma': np.arange(0.01, 1.2, 0.05), 
              }
  validation_curve_change_param(model = SVR(kernel='rbf'),
                                x_train = x_train,
                                y_train = y_train,
                                parameters = parameters
                                )
  '''
  if not parameters:
    return

  # Imported here because the file only imports "sklearn.metrics" at the top.
  from sklearn.model_selection import validation_curve

  for position, (param_name, param_range) in enumerate(parameters.items()):

    # compute the validation curve
    train_scores, test_scores = validation_curve(
        model, x_train, y_train,
        param_name=param_name,
        param_range=param_range,
        scoring="r2",
        n_jobs=-1
        )

    # mean and standard deviation of the cross-validation results (for each point of the curve)
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    ## PLOT ##
    plt.subplots(1,1)

    # curve of the training scores
    plt.plot(param_range, #https://matplotlib.org/stable/tutorials/introductory/pyplot.html
             train_mean,
             '.-',
             label='Treino')
    plt.fill_between(param_range,
                     train_mean - train_std,
                     train_mean + train_std,
                     alpha=0.1)

    # curve of the test scores
    plt.plot(param_range,
             test_mean,
             '.-',
             label='Teste')
    plt.fill_between(param_range,
                     test_mean - test_std,
                     test_mean + test_std,
                     alpha=0.1)

    # formatting; only the first figure receives the title
    if (position==0): plt.title('Curva de Validação')
    plt.xlabel(param_name)
    plt.ylabel("$R^2$")
    plt.legend(loc="best")
    plt.ylim(ylim)
    ## PLOT ##

##display_score(m,x_train,x_test,y_train,y_test)

Out-of-bag parameter:

https://towardsdatascience.com/what-is-out-of-bag-oob-score-in-random-forest-a7fa23d710

https://www.analyticsvidhya.com/blog/2020/12/out-of-bag-oob-score-in-the-random-forest-algorithm/

https://stats.stackexchange.com/questions/88980/why-on-average-does-each-bootstrap-sample-contain-roughly-two-thirds-of-observat

https://stats.stackexchange.com/questions/198839/evaluate-random-forest-oob-vs-cv

In [None]:
def rmse(v_real,v_pred):
    '''Return the square root of sklearn's mean squared error between "v_real" and "v_pred".'''
    mean_squared = sklearn.metrics.mean_squared_error(v_real,v_pred) # see sklearn.metrics.mean_squared_error
    return np.sqrt(mean_squared)
def r2(v_real,v_pred):
    '''Return sklearn's R2 score between "v_real" and "v_pred".'''
    score = sklearn.metrics.r2_score(v_real,v_pred) # see sklearn.metrics.r2_score
    return score

## function to evaluate RMSE, R2 and the OOB score
def display_score(m,x_train,x_test,y_train,y_test):
    '''
    Display a table with the RMSE and R2 of model "m" on the train and test sets.

    When "m" has the attribute "oob_score_", an extra "OOB" row is added with the
    RMSE of "m.oob_prediction_" against "y_train" and the value of "m.oob_score_".
    The table is shown with "display" (available in IPython/notebooks); nothing is returned.
    '''
    # Predict once per data set and reuse the result for both metrics.
    pred_train = m.predict(x_train)
    pred_test = m.predict(x_test)

    res = [[rmse( y_train,pred_train ), r2( y_train,pred_train )],
           [rmse( y_test,pred_test ), r2( y_test,pred_test )]]

    score = pd.DataFrame(res, columns=['RMSE','R2'], index = ['Treino','Teste'])

    if hasattr(m, 'oob_score_'): #https://www.programiz.com/python-programming/methods/built-in/hasattr
        score.loc['OOB'] = [rmse(y_train, m.oob_prediction_), m.oob_score_]

    display(score)

##plot_permutation_importance( model, x_val, y_val, x_val_columns )

In [None]:
def plot_permutation_importance( model, x_val, y_val, x_val_columns ):
  '''
  Compute the permutation importance (30 repeats, R2 scoring, random_state=0) of
  "model" on "x_val"/"y_val" and return it as a DataFrame.

  The DataFrame has the columns "mean" and "std" and one row per feature, labelled
  with "x_val_columns" and ordered by decreasing mean importance. Only features
  whose mean minus two standard deviations is greater than zero are kept.
  '''
  #https://medium.com/horadecodar/gr%C3%A1ficos-de-barra-com-matplotlib-85628bfc4351#:~:text=barh()%3A,os%20seguintes%20par%C3%A2metros%3A
  from sklearn.inspection import permutation_importance

  result = permutation_importance(model, x_val, y_val,
                                  n_repeats=30,
                                  scoring='r2',
                                  random_state=0)
  means = result.importances_mean
  stds = result.importances_std

  table = pd.DataFrame( columns=['mean','std'] )

  # visit the features from the highest to the lowest mean importance
  for position in means.argsort()[::-1]:
    is_relevant = means[position] - 2 * stds[position] > 0
    if not is_relevant:
      continue
    table.loc[x_val_columns[position]] = [means[position], stds[position]]

  return table

##Gráficos

### plt_valuecounts_by(df,variable,by)

In [None]:
def plt_valuecounts_by(df = None,
                       variable = (),
                       by = '',
                       consider_none = False,
                       consider_zeros = False):
  '''
  Plot, for each column listed in "variable", a histogram of the "by" values of
  the rows that have data in that column.

  df             = DataFrame with the data (it is not modified). Default: empty DataFrame.
  variable       = iterable with the names of the columns to be counted.
  by             = name of the column whose classes form the x axis.
  consider_none  = when False, cells equal to 'none' or 'None' count as missing.
  consider_zeros = when False, cells equal to 0 count as missing.

  The figure is shown with "fig.show()"; nothing is returned.
  '''

  #https://plotly.com/python/histograms/
  import plotly.graph_objects as go

  if df is None:
    df = pd.DataFrame()

  fig = go.Figure()

  # Keep only the rows that have data for each "variable"
  for i in variable:
    column = df[i]

    # Build a row mask instead of overwriting cells of the caller's DataFrame.
    has_data = column.notna()
    if not consider_none:
      has_data = has_data & ~(column == 'none') & ~(column == 'None')
    if not consider_zeros:
      has_data = has_data & ~(column == 0)

    df_aux = df.loc[ has_data, by]

    fig.add_trace(go.Histogram(
        x=df_aux,
        histnorm='',
        name=i, # name used in legend and hover labels
        ))

  fig.update_layout(
      title_text=f'Quantity of data by each {by}', # title of plot
      xaxis_title_text=by, # xaxis label
      yaxis_title_text='Count', # yaxis label
      bargap=0.2, # gap between bars of adjacent location coordinates
      bargroupgap=0.1 # gap between bars of the same location coordinates
      )

  fig.show()

### plt_hist_of_columns(df)

In [None]:
def plt_hist_of_columns(df):
  '''
  Plot histograms of every column of "df" that can be converted to float,
  four columns per figure.

  Columns that cannot be converted are reported with a printed message and
  skipped. "df" itself is not modified. When no column is numeric, nothing is
  plotted. Returns None.
  '''
  # find the columns that can be converted to "float"
  float_df_columns = []
  for name in df.columns:
    try:
      df[name].astype(float)
    except (ValueError, TypeError):
      print(f'plt_hist_of_columns function: column "{name}" is a {df[name].dtype}')
    else:
      float_df_columns += [name]

  # one figure per group of up to four columns; an empty group is never plotted
  for start in range(0, len(float_df_columns), 4):
    group = float_df_columns[start:start + 4]
    df[group].astype(float).hist()

### heatmap_corr(df, x='all', y='all')

In [None]:
def heatmap_corr(df,
                 x='all', 
                 y='all',
                 method='pearson', 
                 min_periods=1,
                 color='di'):
  '''
  Plot a heatmap of the correlation matrix of "df" (computed with "df.corr")
  and return the plotted sub-matrix.

  x and y     = 'all' or list of the columns shown on the x axis (columns) and
                y axis (rows) of the heatmap.
  method      = correlation method passed to "df.corr".
  min_periods = minimum number of observations passed to "df.corr".
  color       = "di" (two-colour palette) or "mono" (single-colour palette).

  Raises ValueError when "color" is neither "di" nor "mono".
  '''
  palettes = {
      'mono': ('#00076e', '#1b00ff', '#d0cbff', '#FFFFFF', '#d0cbff', '#1b00ff', '#00076e'),
      'di': ('#7e0000', '#ff0000', '#fecfcf', '#FFFFFF', '#d0cbff', '#1b00ff', '#00076e'),
      }
  # Reject an unknown palette before any work is done.
  if color not in palettes:
    raise ValueError(f'heatmap_corr function: color must be "di" or "mono", got {color!r}')

  corr_matrix = df.corr( min_periods=min_periods, method=method )

  # The isinstance test keeps the comparison safe when a list/array of names is given.
  if isinstance(x, str) and x == 'all':
    x = corr_matrix.columns.tolist()
  if isinstance(y, str) and y == 'all':
    y = corr_matrix.columns.tolist()

  heatmap_pearson = corr_matrix.loc[y,x]

  ## PLOT ##
  f, ax = plt.subplots(figsize=( 1*len(x)+3, 1*len(y) ))
  cmap = sns.blend_palette(palettes[color], input='rgb', as_cmap=True)
  sns.heatmap(heatmap_pearson, annot=True, cmap=cmap, ax=ax, center=0) 

  return heatmap_pearson

### heatmap_pearson(df='pandas_DataFrame', x='all', y='all', allow_duplicates=True, color='di', graphic='coeff')

In [None]:
def _heatmap_pearson_float_columns(df, names, axis_label, converted):
  '''
  Return the entries of "names" whose column of "df" can be converted to float.
  The converted Series are stored in the dict "converted" (key = column name);
  the other columns are reported with a printed message.
  '''
  kept = []
  for name in names:
    try:
      converted[name] = df[name].astype(float)
    except (ValueError, TypeError):
      print(f'heatmap_pearson function: {axis_label} column "{name}" is a {df[name].dtype}')
    else:
      kept += [name]
  return kept


def heatmap_pearson(df='pandas_DataFrame',
                    x='all',
                    y='all',
                    allow_duplicates=True,
                    color='di', 
                    graphic='coeff'):
  '''
  Returns a heatmap plot with pearson's coefficients or their p-values.

  df               = Dataframe (it is not modified)
  x and y          = 'all' or list of x and y heatmap columns/axis. Columns that
                     cannot be converted to float are reported and left out.
  allow_duplicates = when False, repeated (x, y) points are counted only once
  color            = "di" or "mono"
  graphic          = "coeff" or "pvalue"

  Each coefficient uses only the rows where both columns have data. Pairs with
  fewer than two such rows receive NaN.

  Returns the tuple (coefficients DataFrame, p-values DataFrame), both with
  "x" as columns and "y" as index.
  Raises ValueError when "color" or "graphic" has an unknown value.
  '''
  from scipy.stats import pearsonr

  palettes = {
      'mono': ('#00076e', '#1b00ff', '#d0cbff', '#FFFFFF', '#d0cbff', '#1b00ff', '#00076e'),
      'di': ('#7e0000', '#ff0000', '#fecfcf', '#FFFFFF', '#d0cbff', '#1b00ff', '#00076e'),
      }
  # Reject unknown options before any work is done.
  if color not in palettes:
    raise ValueError(f'heatmap_pearson function: color must be "di" or "mono", got {color!r}')
  if graphic not in ('coeff', 'pvalue'):
    raise ValueError(f'heatmap_pearson function: graphic must be "coeff" or "pvalue", got {graphic!r}')

  # The isinstance tests keep the comparisons safe when a list/array of names is given.
  if isinstance(x, str) and x == 'all':
    x = df.columns
  if isinstance(y, str) and y == 'all':
    y = df.columns

  # keep only the columns that can be converted to "float"
  converted = {}
  x = _heatmap_pearson_float_columns(df, x, 'X', converted)
  y = _heatmap_pearson_float_columns(df, y, 'Y', converted)

  pear_heatmap = pd.DataFrame( columns=x, index=y, dtype=float)
  pvalue_heatmap = pd.DataFrame( columns=x, index=y, dtype=float )
  for i in x: #columns
    for j in y: #index
      # Fixed internal labels keep the pair well-formed even when i == j.
      pair = pd.DataFrame({'x': converted[i], 'y': converted[j]}).dropna()

      # Remove repeated coordinates
      if not allow_duplicates:
        pair = pair.drop_duplicates()

      # pearsonr needs at least two points
      if (pair.shape[0] < 2):
        pear_heatmap.loc[j,i], pvalue_heatmap.loc[j,i] = (np.nan, np.nan)
        continue
      pear_heatmap.loc[j,i], pvalue_heatmap.loc[j,i] = pearsonr( pair['x'], pair['y'] )

  ## PLOT ##
  if (graphic=='coeff'):
    table = pear_heatmap
  else:
    table = pvalue_heatmap

  f, ax = plt.subplots(figsize=( 1*len(x)+3, 1*len(y) ))
  cmap = sns.blend_palette(palettes[color], input='rgb', as_cmap=True)
  sns.heatmap(table, annot=True, cmap=cmap, ax=ax, center=0)

  return (pear_heatmap, pvalue_heatmap)

### plot_stacked_hist_or_bar_by(df,variable='',by='',mode='bar or hist')

In [None]:
import copy

# Stacked histograms/bars by class "by"
def plot_stacked_hist_or_bar_by(df, variable = '',
                                by = '',
                                mode = 'bar or hist', 
                                alpha = 0.3,
                                bins_hist = 1, 
                                width_bar = 'default',
                                bar_norm = False,
                                colors_reference = ('b','g','r','c','m','y','k'),
                                figsize = (6.4, 4.8)):
  '''
  Similar examples of bar charts: https://matplotlib.org/stable/gallery/lines_bars_and_markers/bar_stacked.html#sphx-glr-gallery-lines-bars-and-markers-bar-stacked-py
  
  Plot the frequency of "df[variable]" as stacked bars (mode='bar') or as a
  stacked histogram (mode='hist'), one layer per unique value of "df[by]".
  Any other value of "mode" draws no bars.

  The function builds dictionaries for the base values ("bottom" in matplotlib)
  and for the frequencies ("height" in matplotlib). The keys of these
  dictionaries are the unique values of "df[variable]" and the values are the
  frequencies (obtained with "value_counts").

  Rows without a value in "variable" are ignored; the DataFrame received from
  the caller is not modified.

  bins_hist        = number of bins used when mode='hist'.
  width_bar        = bar width when mode='bar'; 'default' derives it from the data.
  bar_norm         = when True, each frequency is divided by the total frequency
                     of that value of "variable".
  colors_reference = colours assigned to the "by" classes (repeated cyclically).

  Returns "fig" and "ax" from Matplotlib, so the plot can still be edited
  afterwards, with some limitations.

  The 'hist' mode only works when "variable" holds numeric values.
  '''
  fig, ax = plt.subplots(figsize=figsize)

  # Filtered copy: the caller's DataFrame is left untouched.
  df = df.dropna(subset=[variable])
  x_range = [df[variable].min(), df[variable].max()]

  unique_by = df[by].unique()
  # COLOURS for each unique value of "by".
  # When there are more values than colours, the colours are repeated:
  colors = [colors_reference[k % len(colors_reference)] for k in range(len(unique_by))]
  
  # dictionary with the base values:
  unique_variable = df[variable].unique()
  bottoms = dict.fromkeys(unique_variable, 0)
  # reference variable - dictionary with zeroed values:
  values_0 = dict(bottoms)

  # Histogram - the dictionaries for "x", "height" and "bottom" are different
  #             because they must be keyed by numbers/floats (left bin edges)
  if (mode=='hist'):
    histogram_width = (x_range[1]-x_range[0])/bins_hist
    histogram_x = np.arange( x_range[0], x_range[1], histogram_width ).tolist()
    histogram_bottoms = dict.fromkeys(histogram_x, 0) # base values
    histogram_0 = dict(histogram_bottoms) # reference variable - zeroed values

  # total frequency of each value of "variable"
  if bar_norm:
    total_value_counts = df[variable].value_counts()

  for count, i in enumerate(unique_by):
    values = dict(values_0) # "value_counts" of each unique value of the variable for this class
    
    # value counts of "variable" for the current "by" class
    hist_aux_df = df.loc[df[by]==i, variable].value_counts()
    for j, frequency in hist_aux_df.items():
      if bar_norm:
        values[j] += frequency/total_value_counts[j] # normalize by the total frequency
      else:
        values[j] += frequency

    if (mode=='bar'):
      ## Plot ##
      # the width is resolved here (not before the loop) so that empty data never reaches the division
      if (width_bar=='default') and not(isinstance(x_range[1], str)): width_bar = (x_range[1]-x_range[0])/len(unique_variable)
      if isinstance(x_range[1], str): width_bar=0.8

      ax.bar(x=list(values.keys()), #https://matplotlib.org/stable/api/_as_gen/matplotlib.axes.Axes.bar.html#matplotlib.axes.Axes.bar
            height=list(values.values()),
            width=width_bar,
            bottom=list(bottoms.values()),
            align = 'center',
            color = colors[count],
            alpha=alpha,
            label= i
            )
      ## Plot ##
      for j in bottoms.keys():
        bottoms[j] += values[j]
    
    elif (mode=='hist'):
      histogram_values = dict(histogram_0)
      # each value goes to the last bin whose left edge is not greater than it
      for j in values.keys():
        for k in histogram_x[::-1]:
          if (j>=k):
            histogram_values[k] += values[j]
            break
      ## Plot ##
      ax.bar(x=list(histogram_values.keys()), #https://matplotlib.org/stable/api/_as_gen/matplotlib.axes.Axes.bar.html#matplotlib.axes.Axes.bar
            height=list(histogram_values.values()),
            width=histogram_width,
            bottom=list(histogram_bottoms.values()),
            align = 'edge',
            color = colors[count],
            alpha=alpha,
            label= i
            )
      ## Plot ##
      for j in histogram_bottoms.keys():
        histogram_bottoms[j] += histogram_values[j]

  ## Plot ##
  ax.set(xlabel=variable,
         ylabel='Frequência')
  plt.legend()
  ax.margins(0.05)
  ## Plot ##

  print('min =',df[variable].min())
  print('max =',df[variable].max())
  
  return fig, ax

###compare_hists_by(df1, df2, variable = '', by = '')

In [None]:
import copy
def compare_hists_by(df1, df2,
                     variable = '',
                     by = '',
                     df1_name = 'default',
                     df2_name = 'default',
                     alpha = 0.7,
                     bins_hist = 1,
                     colors_reference = ('b','g','r','c','m','y','k'),
                     figsize = (12.8, 7.2)):
  '''
  Compare the distribution of "variable" in two DataFrames.

  The left column shows one stacked histogram per DataFrame (df1 on top, df2
  below), with one layer per unique value of "by". The right side shows the
  total histograms of both DataFrames overlaid. Both use the same bins.

  "variable" must hold numeric values.
  Rows without a value in "variable" are ignored; the DataFrames received from
  the caller are not modified.

  df1_name / df2_name = titles/labels of the DataFrames ('default' -> 'df1'/'df2').
  bins_hist           = number of bins.
  colors_reference    = colours assigned to the "by" classes (repeated cyclically).

  Returns "fig" and "ax" from Matplotlib ("ax" is the axes of the overlaid
  plot), so the plot can still be edited afterwards, with some limitations.
  '''
  fig, ax = plt.subplots( 2, 2, figsize=figsize )
  # Filtered copies: the caller's DataFrames are left untouched.
  df1 = df1.dropna(subset=[variable])
  df2 = df2.dropna(subset=[variable])
  
  x_range_min = min(df1[variable].min(), df2[variable].min())
  x_range_max = max(df1[variable].max(), df2[variable].max())

  # unique values of "by" present in either DataFrame (df1 and df2)
  unique_by = (df1[by].unique().tolist() + df2[by].unique().tolist())
  unique_by = list(dict.fromkeys(unique_by))
  unique_by.sort()
  # Colours for each unique value of "by".
  # When there are more values than colours, the colours are repeated:
  colors = [colors_reference[k % len(colors_reference)] for k in range(len(unique_by))]
  
  # Histogram - dictionary keyed by the left edge of each bin, with zeroed values
  histogram_width = (x_range_max-x_range_min)/bins_hist
  histogram_x = np.arange( x_range_min, x_range_max, histogram_width ).tolist()
  histogram_0 = dict.fromkeys(histogram_x, 0)

  titles = ('df1' if df1_name=='default' else df1_name,
            'df2' if df2_name=='default' else df2_name)

  ## Individual plots ##
  histogram_df = {}
  for position, df in enumerate((df1, df2)):
    histogram_bottoms = dict(histogram_0) # base values, restarted for each DataFrame

    for count, i in enumerate(unique_by):
      # value counts of "variable" for the current "by" class
      hist_aux_df = df.loc[df[by]==i, variable].value_counts()

      ## Histogram ##
      # each value goes to the last bin whose left edge is not greater than it
      histogram_values = dict(histogram_0)
      for j, frequency in hist_aux_df.items():
        for k in histogram_x[::-1]:
          if (j>=k):
            histogram_values[k] += frequency
            break

      ## Plot ##
      ax[position,0].bar(x=list(histogram_values.keys()), #https://matplotlib.org/stable/api/_as_gen/matplotlib.axes.Axes.bar.html#matplotlib.axes.Axes.bar
                        height=list(histogram_values.values()),
                        width=histogram_width,
                        bottom=list(histogram_bottoms.values()),
                        align = 'edge',
                        color = colors[count],
                        alpha=alpha,
                        label= i
                        )
      ax[position,0].set_title(titles[position])
      ax[position,0].set_ylabel('Frequência')
      if (position==0): ax[position,0].set_xticks([]) # hide the x axis
      if (position==1): ax[position,0].set_xlabel(variable)
      ax[position,0].legend()
      ## Plot ##

      for j in histogram_bottoms.keys():
        histogram_bottoms[j] += histogram_values[j]
    
    # totals of this DataFrame, used below to plot df1 and df2 together
    histogram_df[position] = histogram_bottoms

  ## Joint plot ##
  ax = plt.subplot(122)
  for position, color in enumerate(('darkred', 'cornflowerblue')):
    ax.bar(x=list(histogram_df[position].keys()), #https://matplotlib.org/stable/api/_as_gen/matplotlib.axes.Axes.bar.html#matplotlib.axes.Axes.bar
           height=list(histogram_df[position].values()),
           width=histogram_width,
           align = 'edge',
           color = color,
           alpha=alpha,
           label= titles[position]
           )
    ax.legend()
    ax.set(xlabel=variable, ylabel='Frequência')
  ## Joint plot ##

  return fig, ax

###draw_tree(t, dados, size=10, ratio=1, precision=0) - Modelo RF

In [None]:
def draw_tree(t, dados, size=10, ratio=1, precision=0):
    '''
    Display tree "t" as a graphviz drawing in the notebook.

    The feature names come from "dados.columns". "size" and "ratio" are written
    into the graph header and "precision" is passed to "export_graphviz".
    '''
    import re
    import graphviz
    import sklearn.tree
    import IPython.display

    dot_source = sklearn.tree.export_graphviz(
        t,
        out_file=None,
        feature_names=dados.columns,
        filled=True,
        special_characters=True,
        rotate=True,
        precision=precision)

    # insert the size/ratio attributes right after the opening of the graph
    graph_header = f'Tree {{ size={size}; ratio={ratio}'
    sized_source = re.sub('Tree {', graph_header, dot_source)

    drawing = graphviz.Source(sized_source)
    IPython.display.display(drawing)

###plotar_importancias(modelo, tags, n=10) - modelo RF

In [None]:
def plotar_importancias(modelo, tags, n=10):
    '''
    Plot the importances of the variables of "modelo" and return them.

    The importances are read from "modelo.coef_" or, when that attribute does
    not exist, from "modelo.feature_importances_". When the model has neither,
    a message is printed, nothing is plotted and None is returned.

    tags = labels of the variables, in the same order as the importances.
    n    = number of variables printed and shown in the horizontal bar chart.

    Returns a Series named "Importancias" with the absolute importances
    divided by their sum, in decreasing order.
    '''
    # Check the model before creating any figure.
    if hasattr(modelo,'coef_'):
        imp = modelo.coef_
    elif hasattr(modelo,'feature_importances_'):
        imp = modelo.feature_importances_
    else:
        print('sorry, nao vai rolar!')
        return

    fig, ax = plt.subplots(1,2, figsize = (20,4))

    coefs = (pd.Series(imp, index = tags))
    coefs.plot(use_index=False, ax=ax[0])
    abs_coefs = (abs(coefs)/(abs(coefs).sum()))
    abs_coefs.sort_values(ascending=False).plot(use_index=False, ax=ax[1],marker='.')

    ax[0].set_title('Importâncias relativas das variáveis')
    ax[1].set_title('Importâncias relativas das variáveis - ordem decrescente')

    abs_coefs_df = pd.DataFrame(np.array(abs_coefs).T,
                                columns = ['Importancias'],
                                index = tags)

    df = abs_coefs_df['Importancias'].sort_values(ascending=False)
    
    print(df.iloc[0:n])
    # new figure for the bar chart of the "n" most important variables
    plt.figure()
    df.iloc[0:n].plot(kind='barh', figsize=(15,0.25*n), legend=False)
    
    return df

#Como exportar e importar com pickle

```
import pickle
```

Exportar:

```
with open( 'example_file.pkl', 'wb' ) as f:
  pickle.dump( "objetos/variáveis", f )
```

Importar

```
with open( 'example_file.pkl', 'rb' ) as f:
  "objetos/variáveis" = pickle.load( f )
```

