<a href="https://colab.research.google.com/github/yuriborg/dsmkt/blob/main/churn_feature_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AWS Creds

# Bibliotecas

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os, sys
import time
import datetime
import statistics
import re
import scipy
import unicodedata
import json

from scipy.stats import chi2_contingency

import plotly.express as px
from IPython.display import HTML
import plotly.figure_factory as ff

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

plt.rcParams['axes.facecolor'] = 'white'
dt = datetime.datetime.now().strftime('%d%m%Y_%H%M%S')


import boto3
import pandas as pd
from io import StringIO

# Feature Engineering Functions

In [None]:
def get_missing_cols(
    df,
    apply_filter = True,
    threshold = 50,
    cols_to_remove = None,
    verbose = False
):
    """Summarise the missing values of every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect; it is not modified.
    apply_filter : bool
        When True, only columns whose missing percentage is >= ``threshold``
        are returned.
    threshold : float
        Minimum percentage (0-100) of missing values used by the filter.
    cols_to_remove : iterable of str, optional
        Columns left out of the report.
    verbose : bool
        When True, print the columns being left out.

    Returns
    -------
    pandas.DataFrame
        One row per column with 'coluna', 'qtd_faltantes', '%_faltantes' and
        'qtd_preenchida', sorted by missing count (descending). Columns with
        the same count keep their original relative order.
    """
    if cols_to_remove is not None:
        if verbose:
            print('removing columns: {}'.format(cols_to_remove))
        excluded = set(cols_to_remove)
        # A list comprehension (instead of a set difference) keeps the
        # original column order, so the report is reproducible across runs.
        df = df[[col for col in df.columns if col not in excluded]]

    n_rows = len(df)
    n_missing = df.isna().sum()  # computed once and reused below
    if n_rows:
        pct_missing = n_missing.values * 100 / n_rows
    else:
        # No rows: the percentage is undefined, report NaN without dividing by 0.
        pct_missing = np.full(len(n_missing), np.nan)

    t_missing = pd.DataFrame({
        'coluna': n_missing.index,
        'qtd_faltantes': n_missing.values,
        '%_faltantes': pct_missing,
        'qtd_preenchida': n_rows - n_missing.values,
    }).sort_values(by=['qtd_faltantes'], ascending=False, kind='stable')

    if apply_filter:
        t_missing = t_missing[t_missing['%_faltantes'] >= threshold]
    return t_missing.reset_index(drop=True)

def findcol(val, data=None):
    """Return the column names containing ``val`` (case-insensitive).

    Parameters
    ----------
    val : str
        Substring to look for.
    data : pandas.DataFrame, optional
        Frame whose columns are searched. When omitted, the notebook-level
        ``df`` is used, which is what this helper always did before.

    Returns
    -------
    list of str
        Matching column names, in column order.
    """
    if data is None:
        data = df  # notebook-level dataframe (defined in a later cell)
    needle = val.lower()
    return [col for col in data.columns if needle in col.lower()]

def cramers_V(var1,var2) :
    """Return Cramér's V between two categorical variables.

    V = sqrt(chi2 / (n * min(rows - 1, cols - 1))), computed from the
    contingency table of ``var1`` against ``var2``.

    Parameters
    ----------
    var1, var2 : array-like
        Categorical observations, passed to ``pandas.crosstab``.

    Returns
    -------
    float
        Cramér's V, or NaN when one of the variables has a single level
        (the statistic is undefined for a one-row/one-column table).

    Raises
    ------
    ValueError
        Propagated from ``chi2_contingency`` (e.g. an empty table).
    """
    crosstab = np.array(pd.crosstab(var1, var2, rownames=None, colnames=None))  # contingency table
    mini = min(crosstab.shape) - 1  # min(rows, cols) - 1
    if mini == 0:
        # Single-level variable: the denominator would be 0.
        return np.nan
    # NOTE: chi2_contingency applies Yates' correction on 2x2 tables by default.
    stat = chi2_contingency(crosstab)[0]  # chi-squared statistic
    obs = np.sum(crosstab)  # number of observations
    # The square root was missing: without it the value is V squared.
    return np.sqrt(stat / (obs * mini))

def CramerV_table(df, cols, verbose = True):
    """Build the pairwise Cramér's V matrix for ``cols``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the columns listed in ``cols``.
    cols : list of str
        Columns to compare with each other (all ordered pairs are computed).
    verbose : bool
        When True (default, the previous behaviour), print progress for
        every pair.

    Returns
    -------
    (pandas.DataFrame, list of str)
        A ``len(cols)`` x ``len(cols)`` frame indexed and labelled by
        ``cols``, and the list of 'var1,var2' pairs whose computation failed.
        Failed pairs hold NaN in the matrix.
    """
    exc_cols = []
    rows = []
    c = 0
    N = len(cols) * len(cols)
    for var1 in cols:
        row = []
        for var2 in cols:
            c += 1
            if verbose:
                print('({:.3f}%) ({}/{}) ({}, {})'.format(c*100/N, c, N, var1, var2))

            try:
                row.append(round(cramers_V(df[var1], df[var2]), 6))
            except Exception:
                # Best effort: record the failing pair and keep a NaN
                # placeholder so every row has the same length. Without the
                # placeholder the rows become ragged and the DataFrame
                # cannot be built.
                row.append(np.nan)
                exc_cols.append('{},{}'.format(var1, var2))

        rows.append(row)

    cramers_results = np.array(rows)

    return pd.DataFrame(cramers_results,
                        columns = cols,
                        index = cols), exc_cols

def CramerV_vector(df, target, cols, verbose = True):
    """Score every column of ``cols`` against ``target`` with ``cramers_V``.

    Returns a tuple ``(scores, failed)``: ``scores`` is a one-column frame
    ('CramersV') indexed by column name and sorted in descending order, with
    NaN scores replaced by 0.0; ``failed`` lists the columns whose
    computation raised.
    """
    failed = []
    names = []
    scores = []
    total = len(cols)
    for position, column in enumerate(cols, start=1):
        if verbose:
            print('({:.3f}%) ({}/{}) ({}, {})'.format(position*100/total, position, total, column, target))

        try:
            value = cramers_V(df[column], df[target])
            names.append(column)
            scores.append(round(value, 6))
        except Exception:
            failed.append('{}'.format(column))

    res = pd.DataFrame({'col': names, 'CramersV': scores}).fillna(0.0)
    res.index = res['col']
    res = res.sort_values(by=['CramersV'], ascending=False)
    return res[['CramersV']], failed


def save_csv_in_bucket(
    df,
    filename,
    my_bucket = "sami-data-platform-s3-dev-sandbox-data-science",
    access_key = None,
    secret_access_key = None,
):
    """Upload ``df`` as a CSV object to an S3 bucket.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to serialise with ``DataFrame.to_csv`` (index included).
    filename : str
        Object key inside the bucket.
    my_bucket : str
        Destination bucket name.
    access_key, secret_access_key : str, optional
        AWS credentials. Any credential left as None is asked for
        interactively with ``getpass``.
    """
    # Imported here because the notebook never imports getpass at the top.
    from getpass import getpass

    if access_key is None:
        access_key = getpass('Enter the access key.\n')
    if secret_access_key is None:
        # The prompted value used to be stored under a misspelled name, so
        # the secret actually sent to boto3 stayed None.
        secret_access_key = getpass('Enter the secret acess key: \n')

    s3 = boto3.resource(
        service_name='s3',
        region_name='us-east-1',
        aws_access_key_id=access_key,
        aws_secret_access_key=secret_access_key
    )
    csv_buffer = StringIO()
    df.to_csv(csv_buffer)
    s3.Bucket(my_bucket).put_object(Key=filename,
                                    Body=csv_buffer.getvalue())

def load_csv_from_bucket(
    filename,
    access_key = None,
    secret_access_key = None,
    my_bucket = "sami-data-platform-s3-dev-sandbox-data-science"
):
    """Read a CSV object from an S3 bucket into a DataFrame.

    Parameters
    ----------
    filename : str
        Object key inside the bucket.
    access_key, secret_access_key : str, optional
        AWS credentials. When omitted, the notebook-level variables of the
        same name are looked up at call time. They used to be captured when
        the function was defined, which raised NameError if the credentials
        cell had not run yet and ignored later changes.
    my_bucket : str
        Source bucket name.

    Returns
    -------
    pandas.DataFrame
        The CSV content, using its first column as the index.
    """
    notebook_vars = globals()
    if access_key is None:
        access_key = notebook_vars.get('access_key')
    if secret_access_key is None:
        secret_access_key = notebook_vars.get('secret_access_key')
    # NOTE(review): if both stay None the storage backend presumably falls
    # back to its default credential resolution — confirm.
    return pd.read_csv(
        f's3://{my_bucket}/{filename}',
        storage_options={
            'key': access_key,
            'secret': secret_access_key,
        },
        index_col=0,
    )

def get_cat_num_vars(df):
    """Split the columns of ``df`` by dtype.

    Returns ``(cat_cols, num_cols)``: the non-numeric column labels first,
    then the numeric ones, both as pandas Index objects.
    """
    numeric = df.select_dtypes(include='number').columns
    non_numeric = df.select_dtypes(exclude='number').columns
    return non_numeric, numeric

# Data Extraction

In [None]:
def list_files_in_bucket(
    aws_access_key_id,
    aws_secret_access_key,
    service_name='s3',
    region_name='us-east-1',
    bucket_name = "sami-data-platform-s3-dev-sandbox-data-science"
):
    """Print every object found in ``bucket_name``.

    A boto3 resource is created with the given credentials, service and
    region, and each object summary of the bucket is printed in turn.
    """
    resource = boto3.resource(
        service_name=service_name,
        region_name=region_name,
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )
    bucket = resource.Bucket(bucket_name)
    for entry in bucket.objects.all():
        print(entry)

# List every object of the sandbox bucket with the notebook-level credentials.
# This repeats inline what list_files_in_bucket() does with its defaults.
# NOTE(review): `access_key` / `secret_access_key` are not assigned anywhere in
# the visible notebook (presumably set in the "AWS Creds" cell) — confirm.
s3 = boto3.resource(
    service_name='s3',
    region_name='us-east-1',
    aws_access_key_id= access_key,
    aws_secret_access_key= secret_access_key
)
for obj in s3.Bucket("sami-data-platform-s3-dev-sandbox-data-science").objects.all():
    print(obj)

# Dataset preprocessado (imputation, scaling, encoding)


In [None]:
# Load the preprocessed churn CSV from the bucket and display it.
df_prep = load_csv_from_bucket('churn_preprocessed_21112022.csv')
df_prep

In [None]:
# Remove the two churn dummy columns and print the shape before and after.
# DataFrame.drop keeps the remaining columns in their original order (the
# previous set difference shuffled them); errors='ignore' keeps the old
# tolerance for columns that are already absent.
print(df_prep.shape)
df_prep = df_prep.drop(columns=['churn_churnou', 'churn_não churnou'], errors='ignore')
print(df_prep.shape)

In [None]:
# Columns of the preprocessed data with at least 50% missing values (default threshold).
get_missing_cols(df_prep)

# Dataset sem encoding e scaling

In [None]:
# Load the cleaned churn CSV from the bucket and display it.
df = load_csv_from_bucket('churn_cleaned_21112022.csv')
df

In [None]:
# Columns of the cleaned data with at least 50% missing values (default threshold).
get_missing_cols(df)

In [None]:
# One-hot dummy columns: present in df_prep but not in df.
# Built in df_prep column order so the list is the same on every run
# (a set difference returns an arbitrary order).
onehot_cols = [col for col in df_prep.columns if col not in df.columns]
onehot_cols

# Feature selection
Há diversos métodos para seleção de features. Alguns métodos são mais apropriados para determinados tipos de dados. Exemplos:

Input numérico, output numérico:
correlação de pearson (linear)
índice de Spearman (não-linear)
Input numérico, output categórico:
ANOVA (linear)
Coeficiente de Kendall (não-linear)
Input categórico, output numérico:
Regressão, pode usar os métodos de input numérico e output categórico
Input categórico, output categórico:
Teste chi-quadrado de contingência
Teste de informação mútua
No nosso caso, se a modelagem for feita com a variável dicotômica "e_desligado" (ou "churn"), devemos usar os métodos de output categórico. Para os nossos dados, uma variável contínua pode ser usada na forma original ou discretizada, ex.: nota média HRA (contínua) ou "classe nota HRA" (boa, ruim, média). Contudo, se utilizarmos a variável "tempo de contrato", o output é numérico e valem os métodos de output numérico.

# Select KBest

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif

def select_k_best_features(X_feat, y,
                           #num_cols, one_hot_cols,
                           features,
                           K = 10,
                           cols_to_remove = None,
                           score_func = chi2,
                           filter_data = True,
                           apply_minmax = True
                           ):
    """Score features with scikit-learn's ``SelectKBest``.

    Parameters
    ----------
    X_feat : array-like or pandas.DataFrame
        Feature matrix.
    y : array-like
        Target values.
    features : list of str
        Column names for ``X_feat``.
    K : int or 'all'
        ``k`` passed to ``SelectKBest``.
    cols_to_remove : iterable of str, optional
        Features left out before scoring.
    score_func : callable
        Scoring function passed to ``SelectKBest``.
    filter_data : bool
        When True, sort by score (descending) and drop rows with NaN score.
    apply_minmax : bool
        When True, add a 'minmax_score' column with the score rescaled to
        [0, 1].

    Returns
    -------
    pandas.DataFrame
        Columns 'Specs' (feature name), 'Score' and, optionally,
        'minmax_score'.
    """
    X_df = pd.DataFrame(X_feat, columns=features)

    if cols_to_remove is not None:
        excluded = set(cols_to_remove)
        # Keep the original column order (a set difference would shuffle it).
        X_df = X_df[[col for col in X_df.columns if col not in excluded]]

    # Fit on the same frame the names come from. Fitting on the raw X_feat
    # ignored cols_to_remove and left scores misaligned with feature names.
    selector = SelectKBest(score_func=score_func, k=K)
    selector.fit(X_df, y)

    featureScores = pd.DataFrame({
        'Specs': X_df.columns,
        'Score': selector.scores_,
    })

    if apply_minmax:
        min_score = featureScores['Score'].min()
        score_range = featureScores['Score'].max() - min_score
        if score_range == 0:
            # Every score is identical: rescaling would be 0/0, so report 0
            # (NaN scores stay NaN).
            featureScores['minmax_score'] = featureScores['Score'] * 0.0
        else:
            featureScores['minmax_score'] = (featureScores['Score'] - min_score) / score_range

    if filter_data:
        featureScores = featureScores.sort_values(by=['Score'], ascending=False)
        featureScores = featureScores[~featureScores['Score'].isna()].reset_index(drop=True)

    return featureScores

# Plots

In [None]:

def plot_feat_sel_score(
        feat_sel_tab,
        method_name,
        score_var = 'Score',
        feat_var = 'Specs',
        threshold = 0.01,
        height = 750,
        width = 1500,
        ticks_rotation=0
    ):
    """Show a heatmap and a closed polar line chart of 'minmax_score'.

    Only rows of ``feat_sel_tab`` whose 'minmax_score' is above
    ``threshold`` are drawn; ``feat_var`` names the label column.
    ``score_var`` is accepted but not used: the plots always read
    'minmax_score'.
    """
    chart_title = f'Feature Selection - {method_name} (>{threshold})'

    selected = feat_sel_tab.copy()
    selected.index = selected[feat_var].values
    selected = selected[selected['minmax_score'] > threshold]

    # ------- heatmap (one row, one cell per feature)
    plt.figure(figsize=(32, 3))
    plt.title(chart_title, fontsize=16)
    sns.heatmap(selected[['minmax_score']].T, annot=True, fmt='.2f')
    plt.show()

    # ------ polar plot
    fig = px.line_polar(
        selected,
        r="minmax_score",
        theta=feat_var,
        line_close=True,
        height=height,
        width=width,
    )
    layout_options = dict(
        title=chart_title,
        title_x=0.5,
        xaxis_title="variáveis",
        yaxis_title="minmax score",
        font=dict(family="Lato", size=12),
        polar_angularaxis=dict(rotation=ticks_rotation),
    )
    fig.update_layout(**layout_options)
    fig.show()


def plot_feat_sel_score2(
        feat_sel_tab,
        method_name,
        score_var = 'Score',
        feat_var = 'Specs',
        threshold = 0.01,
        height = 750,
        width = 1500,
        ticks_rotation=0
    ):
    """Show a heatmap and a polar scatter chart of 'minmax_score'.

    Same as ``plot_feat_sel_score`` except that the polar chart uses
    markers (``px.scatter_polar``) instead of a closed line. ``score_var``
    is accepted but not used.
    """
    chart_title = f'Feature Selection - {method_name} (>{threshold})'

    above = feat_sel_tab.copy()
    above.index = above[feat_var].values
    above = above[above['minmax_score'] > threshold]

    # ------- heatmap
    plt.figure(figsize=(32, 3))
    plt.title(chart_title, fontsize=16)
    heat_values = above[['minmax_score']].T
    sns.heatmap(heat_values, annot=True, fmt='.2f')
    plt.show()

    # ------ polar plot
    polar_kwargs = dict(r="minmax_score", theta=feat_var, height=height, width=width)
    fig = px.scatter_polar(above, **polar_kwargs)
    fig.update_layout(
        title=chart_title,
        title_x=0.5,
        xaxis_title="variáveis",
        yaxis_title="minmax score",
        font=dict(family="Lato", size=12),
        polar_angularaxis=dict(rotation=ticks_rotation),
    )
    fig.show()

# Plot Ajustado Utils

In [None]:
# Display labels for the polar plots: maps a feature name to a shorter label
# whose '<br>' marks are line breaks in the plotly tick text.
d_vars_to_adj = {
    'qtd_notas_medias_nps_trans': 'qtd_notas<br>medias<br>nps_trans',
    'classe_nota_media<br>nps_global_muito<br>bom_[9-10]': 'nota_nps<br>global_muito<br>bom',
    'max_tempo_sami_meses': 'max_tempo<br>sami_meses',
    'classe_nota_media<br>nps_trans_ruim<br>[3,_5[': 'nota<br>nps_trans<br>ruim ',
    'soma_dias_atraso_inad': 'soma_dias<br>atraso_inad',
    'qtd_notas_baixas_nps_trans': 'qtd_notas<br>baixas<br>nps_trans',
    'nota_max_nps_trans': 'nota_max<br>nps_trans',
    'nota_media_nps_trans': 'nota<br>nps_trans',
    'classe_nota_media<br>nps_global_médio<br>[5,_7[': 'nota nps<br>global<br>médio',
    'ultimo_metodo_pag_CARTAO': 'ultimo_metodo<br>pag_CARTAO',
    'ultimo_metodo_pag_BOLETO': 'ultimo_metodo<br>pag_BOLETO',
    'classe_nota_media<br>nps_trans_muito<br>ruim_[0,_3[': 'nota_nps<br>trans_muito<br>ruim',
    'classe_uso_guias<br>abaixo_da<br>média_[0,_2.5[': 'uso_guias<br>abaixo_da<br>média',
    'max_t_atend_hora': 'max_t<br>atend_h',
    'media_t_atend_hora': 'med_t<br>atend_h',
    'min_t_atend_hora': 'min_t<br>atend_h',
    'media_tempo_sami_meses': 'media_tempo<br>sami_meses',
    'max_t_atend_s': 'max_t<br>atend_s',
    'classif_tempo_atend<br>hora_tempo_acima<br>da_média_]50.77,_500]': 'temp_atend<br>acima<br>da_média',
    'media_t_atend_s': 'media_t<br>atend_s',
    'qtd_notas_medias_nps_global': 'qtd_notas<br>medias_nps<br>global',
    'qtd_notas_boas_nps_trans': 'qtd_notas<br>boas_nps<br>trans',
    'classe_nota_media<br>nps_global_bom<br>[7,_9[': 'nota nps<br>global_bom',
    'nota_max_nps_global': 'nota_max<br>nps_global',
    'qtd_notas_boas_nps_global': 'qtd_notas<br>boas nps<br>global',
    'nota_media_nps_global': 'nota_media<br>nps_global',
    'classe_nota_media<br>nps_trans_médio<br>[5,_7[': 'nota_nps<br>trans_médio',
    'classe_nota_media<br>nps_global_muito<br>ruim_[0,_3[': 'nota_nps<br>global<br>muito_ruim',
    'min_t_atend_s': 'min_t<br>atend_s',
    'min_tempo_sami_meses': 'min_tempo<br>sami_meses',
    'nota_min_nps_trans': 'nota_min<br>nps_trans',
    'classe_nota_media<br>nps_global_ruim<br>[3,_5[': 'nota nps<br>global_ruim',
    'nota_min_nps_global': 'nota_min<br>nps_global',
    # This "trans" feature used to reuse the "global_bom" label, so two
    # different features were drawn under the same tick text.
    'classe_nota_media<br>nps_trans_bom<br>[7,_9[': 'nota nps<br>trans_bom',
    'qtd_notas_baixas_nps_global': 'qtd_notas<br>baixas_nps<br>global',
    'tipo_contato_nps<br>trans_Membro_-<br>Cancelado': 'tp_contato<br>nps_trans<br>membro-cancel',
    'media_dias_atraso_inad': 'media_dias<br>atraso_inad',
    'tipo_contato_nps<br>global_Membro_-<br>Cancelado': 'tp_contato<br>nps_glob<br>membro-cancel',
    'classe_nota_media<br>nps_trans_muito<br>bom_[9-10]': 'nota_nps<br>trans<br>muito_bom',
    'qtd_tickets_cancelamento': 'qtd_tickets<br>cancel',
    'pct_tickets_cancelamento': 'pct_tickets<br>cancel',
}

# Features that have a display label. Derived from the mapping so the list
# and the dict cannot drift apart (same names, same order as before).
vars_to_adj = list(d_vars_to_adj)
# def adjust_sel_vars(x):
#     if x in vars_to_adj:
#         return x.replace('_', '<br>')
#     else:
#         return x

#plot ajustado - plot_feat_sel_score_adj

In [None]:
def adjust_discretized(x):
    """Shorten a feature name for display.

    Inserts '<br>' breaks in a few known fragments, cuts the name at the
    first bracket that follows an underscore or a space, abbreviates
    'tipo_contato' names, and strips the 'classe_' / 'media' fragments from
    'classe_nota' names.
    """
    initial_swaps = (
        ("global_muito_ruim", 'global<br>muito_ruim'),
        ('global_medio', 'global<br>medio'),
        ('nota_media_nps_', 'nota_nps<br>'),
    )
    for old, new in initial_swaps:
        x = x.replace(old, new)

    # Keep only what comes before the first interval bracket.
    for bracket in ('_]', '_[', ' ]', ' ['):
        x = x.split(bracket)[0]

    x = x.replace('nota<br>media', 'nota')

    if 'tipo_contato' in x:
        contact_swaps = (
            ('tipo_contato', 'tp_cont'),
            ('-', ''),
            ('Membro_', 'membro '),
            ('_Cancelado', '<br>cancel'),
        )
        for old, new in contact_swaps:
            x = x.replace(old, new)

    if 'classe_nota' not in x:
        return x

    for fragment in ('classe_', 'media<br>', 'media '):
        x = x.replace(fragment, '')
    return x.split('_[')[0]

def adjust_sel_vars(x):
    """Return the display label of ``x`` when it is listed in ``vars_to_adj``.

    Names that are not listed are returned unchanged.
    """
    return d_vars_to_adj[x] if x in vars_to_adj else x

def plot_feat_sel_score_adj(
    feat_sel_tab,
    method_name,
    score_var = 'Score',
    feat_var = 'Specs',
    threshold = 0.01,
    height = 900,
    width = 1600,
    ticks_rotation=0,
    title_x = 0.2,
    title_y = 0.1,
    title_fontsize = 15,

    save_plot = True,
#     titulo = '',
    out_path = '/dbfs/FileStore/shared_uploads/pedro.bloss@samisaude.com/plots_churn'
):
    """Plot 'minmax_score' with shortened labels and optionally save it.

    Rows whose name marks a "not filled"/NaN category are dropped, rows with
    'minmax_score' above ``threshold`` are kept, and two charts are shown: a
    one-row heatmap and a closed polar line chart. Labels of the polar chart
    go through ``adjust_sel_vars`` and ``adjust_discretized``.

    Parameters
    ----------
    feat_sel_tab : pandas.DataFrame
        Must contain ``feat_var`` and 'minmax_score'.
    method_name : str
        Used in the titles and in the output file name.
    score_var : str
        Accepted but not used; the plots always read 'minmax_score'.
    feat_var : str
        Column holding the feature names.
    threshold : float
        Minimum 'minmax_score' (exclusive) for a feature to be drawn.
    height, width : int
        Size of the polar chart.
    ticks_rotation : float
        Rotation of the angular axis.
    title_x, title_y : float
        Title position of the polar chart.
    title_fontsize : int
        Title font size of the polar chart.
    save_plot : bool
        When True, write the polar chart as HTML into ``out_path``.
    out_path : str
        Output directory; it is created if missing.
    """
    vars_to_remove = list(filter(lambda x: 'não_preenchido' in x or 'não preenchido' in x or x[-4:] == '_nan',
                                 feat_sel_tab[feat_var].values
                                 ))
    feat_sel_tab = feat_sel_tab[~feat_sel_tab[feat_var].isin(vars_to_remove)]

    t = feat_sel_tab.copy()
    t.index = t[feat_var].values
    t = t[t['minmax_score'] > threshold]

    # ------- heatmap
    plt.figure(figsize=(32, 3))
    plt.title(f'Feature Selection - {method_name} (>{threshold})',
              fontsize = 16)

    # matplotlib does not render HTML breaks: use plain spaces in the heatmap labels
    t.index = [name.replace('<br>', ' ') for name in t.index]
    sns.heatmap(t[['minmax_score']].T,
                annot = True,
                fmt = '.2f')
    plt.show()

    # ------ polar plot

    # better display for labels
    t[feat_var] = [adjust_discretized(adjust_sel_vars(name)) for name in t[feat_var].values]

    # t is already filtered by threshold, so it is passed as is
    fig = px.line_polar(t,
                        r="minmax_score",
                        theta=feat_var,
                        line_close=True,
                        height = height,
                        width = width
                       )
    fig.update_layout(
        title = f'Feature Selection<br>{method_name}<br>(>{threshold})',
        title_font_size = title_fontsize,
        title_x = title_x,
        title_y = title_y,
        xaxis_title="variáveis",
        yaxis_title="minmax score",
        font=dict(
            family="Lato",
            size=12,
        ),
        polar_angularaxis = dict(
            rotation=ticks_rotation
        )
    )
    if save_plot:
        # Create the target directory first: write_html fails when it is
        # missing, and the figure was then never shown either.
        os.makedirs(out_path, exist_ok=True)
        out_file = os.path.join(out_path,
                                'featsel_{}.html'.format(
                                    method_name.replace('+','_').replace(' ','').replace('(','_').replace(')','')
                                    )
                                )
        fig.write_html(out_file)
    fig.show()

# Coeficiente Tau de Kendall

In [None]:
from scipy.stats import kendalltau

def calc_kendall_tau(
                    df,
                    output_var = 'e_desligado',
                    K=20
                    ):
    """Rank the columns of ``df`` by Kendall's tau against ``output_var``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``output_var``; every other column must be convertible
        to float (missing values are replaced by 0 before the test).
    output_var : str
        Name of the target column.
    K : int
        Number of rows returned.

    Returns
    -------
    pandas.DataFrame
        Columns 'var', 'kendall_coef', 'p_val' and 'minmax_score' (the
        coefficient rescaled to [0, 1]), sorted by 'minmax_score' in
        descending order and truncated to the first ``K`` rows.
    """
    # The target itself must never be scored. The exclusion list used to
    # hard-code 'e_desligado', so a different output_var was correlated with
    # itself. The original names stay excluded for backward compatibility.
    excluded = {'contract_id', 'e_desligado', output_var}
    target = df[output_var].values

    names, coefs, p_values = [], [], []
    # Iterate in column order so ties are reported deterministically.
    for var in df.columns:
        if var in excluded:
            continue
        tau, p = kendalltau(df[var].astype(float).fillna(0).values, target)
        names.append(var)
        coefs.append(tau)
        p_values.append(p)

    fs_kendall = pd.DataFrame({
        'var': names,
        'kendall_coef': coefs,
        'p_val': p_values,
    })

    # min/max are computed once instead of once per row.
    coef = fs_kendall['kendall_coef'].astype(float)
    coef_min = coef.min()
    fs_kendall['minmax_score'] = (coef - coef_min) / (coef.max() - coef_min)

    fs_kendall = fs_kendall.sort_values(
        by=['minmax_score'], ascending=False, kind='stable'
    ).reset_index(drop=True)
    return fs_kendall.head(K)

# Dataset pré-processado - scaling, encoding

# Chi2

In [None]:
# Identifier, target and churn-time columns are left out of the feature list.
cols_to_remove = [
    'contract_id',
    'e_desligado',
    'tempo_churn_meses',
#     'tempo_contrato_meses'
]
# Built in df_prep column order so the feature list is the same on every run
# (a set difference returns an arbitrary order).
sel_cols = [col for col in df_prep.columns if col not in cols_to_remove]

len(sel_cols)

In [None]:
# Score the selected df_prep columns against 'e_desligado' with SelectKBest + chi2 (K = 30).
feature_sel_chi2_prep = select_k_best_features(X_feat = df_prep[sel_cols],
                        y = df_prep['e_desligado'].values,
                      features =sel_cols,
                      score_func = chi2,
                      K = 30
                      )
feature_sel_chi2_prep

In [None]:
# Heatmap + polar chart of the chi2 scores (default threshold 0.01).
plot_feat_sel_score(feature_sel_chi2_prep,
                   'SelectKBest+chi2 (prep)')

In [None]:
# Same chi2 scores with shortened labels, a higher threshold and a smaller figure.
# save_plot defaults to True, so the chart is also written to the default out_path.
plot_feat_sel_score_adj(
    feature_sel_chi2_prep,
    'SelectKBest+chi2 (prep)',
    threshold = 0.1,
    height = 500,
    width = 700,
    title_x = 0.05,
    title_y = 0.95
)

#Output Categórico


Input num - Output cat
ANOVA: Analysis of Variance
Esta análise determina a variância das features, i.e., o quanto impacta no desfecho.

Utiliza-se a distribuição F com hipóteses:
$H_0: \sigma^2_x = \sigma^2_y$

$H_1: \sigma^2_x \neq \sigma^2_y$

e as hipóteses da ANOVA são:
$H_0: \mu_i = \mu, \forall i$
(médias iguais)
$H_1: \exists \mu_k : \mu_k \neq \mu$
(pelo menos uma é diferente)

Para a ANOVA, usamos a função de score f_classif().

# ANOVA - 1 numéricas iniciais


In [None]:
# Split the columns of the cleaned data into non-numeric and numeric, then show the numeric ones.
cat_cols, num_cols = get_cat_num_vars(df)
num_cols

In [None]:
# Identifier, target and churn-time columns are left out of the feature list.
cols_to_remove = [
    'contract_id',
    'e_desligado',
    'tempo_churn_meses',
#     'tempo_contrato_meses'
]
# Built in df_prep column order so the feature list is the same on every run
# (a set difference returns an arbitrary order).
df_prep_sel_cols = [col for col in df_prep.columns if col not in cols_to_remove]

df_prep[df_prep_sel_cols].head()

In [None]:
# df_prep columns that are not among the numeric columns of df kept after
# removing cols_to_remove (displayed only; the order of the list is arbitrary).
list(set(df_prep.columns) - set(list(set(num_cols) - set(cols_to_remove))))

In [None]:
# Exact repeat of the previous cell's expression (displayed only).
list(set(df_prep.columns) - set(list(set(num_cols) - set(cols_to_remove))))

In [None]:
# NOTE(review): `fs_anova_num` is not assigned anywhere in the visible
# notebook; the cell that computes it appears to be missing, so this call
# raises NameError on a fresh run — confirm and restore it.
plot_feat_sel_score_adj(
    fs_anova_num,
   'SelectKBest+ANOVA (num)',
    threshold = 0.01
)

# ANOVA - 2 preprocessado

In [None]:
# Identifier, target and churn-time columns are left out of the feature list.
cols_to_remove = [
    'contract_id',
    'e_desligado',
    'tempo_churn_meses',
#     'tempo_contrato_meses'
]
# Built in df_prep column order so the feature list (and the order of tied
# scores) is the same on every run.
df_prep_sel_cols = [col for col in df_prep.columns if col not in cols_to_remove]

# ANOVA F-score (f_classif) of every selected column against 'e_desligado';
# features are cast to float and missing values replaced by 0 first.
fs_anova_prep = select_k_best_features(
    X_feat = df_prep[df_prep_sel_cols].astype(float).fillna(0),
    y = df_prep['e_desligado'].values,
    features = df_prep_sel_cols,
    score_func = f_classif,
    K = 'all'
)
fs_anova_prep.T

In [None]:

# Feature names of the ANOVA result, in score order.
fs_anova_prep['Specs'].values

In [None]:

# Heatmap + polar chart of the ANOVA scores (default threshold 0.01).
plot_feat_sel_score(fs_anova_prep,

                   'SelectKBest+ANOVA (prep)')

In [None]:
# ANOVA scores with shortened labels, full-size figure, title moved to the top.
plot_feat_sel_score_adj(
    fs_anova_prep,
   'SelectKBest+ANOVA (prep)',
    threshold = 0.01,
    title_x = 0.15,
    title_y = 0.95,
    title_fontsize = 18
)

In [None]:
# Same ANOVA chart with a higher threshold and a smaller figure.
# The method name is unchanged, so the saved HTML file of the previous call
# is overwritten.
plot_feat_sel_score_adj(
    fs_anova_prep,
   'SelectKBest+ANOVA (prep)',
    threshold = 0.1,
    title_fontsize = 18,
    title_x = 0.05,
    title_y = 0.95,
    height = 500,
    width = 800
)

#Tau de Kendall 1 - Numéricas Iniciais

In [None]:
# Kendall's tau of the numeric columns against 'e_desligado' (up to 100 rows).
# NOTE(review): `sel_cols_num` is not assigned anywhere in the visible
# notebook, so this cell raises NameError on a fresh run — confirm and
# restore its definition.
fs_kendall_num = calc_kendall_tau(df_prep[sel_cols_num + ['e_desligado']], K = 100)
fs_kendall_num

In [None]:
# Heatmap + polar chart of the Kendall results for the numeric columns.
# score_var is passed but plot_feat_sel_score ignores it: 'minmax_score' is
# always what gets plotted.
plot_feat_sel_score(fs_kendall_num,
                   "SelectKBest+Kendall's Tau (num)",
                   score_var = 'kendall_coef',
                    feat_var = 'var',
                    threshold = 0.1,
                    width = 1700,
                    height = 1000,
                    ticks_rotation=0
                   )

In [None]:
# Exact repeat of the previous cell's plot call.
plot_feat_sel_score(fs_kendall_num,
                   "SelectKBest+Kendall's Tau (num)",
                   score_var = 'kendall_coef',
                    feat_var = 'var',
                    threshold = 0.1,
                    width = 1700,
                    height = 1000,
                    ticks_rotation=0
                   )

# Tau Kendall 2 - preprocessado

In [None]:
# Kendall's tau of every selected df_prep column against 'e_desligado' (up to 100 rows).
fs_kendall_prep = calc_kendall_tau(df_prep[df_prep_sel_cols + ['e_desligado']], K = 100)

In [None]:
# Heatmap + polar chart of the Kendall results for the preprocessed data.
# score_var is passed but plot_feat_sel_score ignores it: 'minmax_score' is
# always what gets plotted.
plot_feat_sel_score(fs_kendall_prep,
                   "SelectKBest+Kendall's Tau (prep)",
                   score_var = 'kendall_coef',
                    feat_var = 'var',
                    threshold = 0.1,
                    width = 1700,
                    height = 1000,
                    ticks_rotation=0
                   )

In [None]:
# Kendall results with shortened labels; only features with minmax_score > 0.5.
plot_feat_sel_score_adj(
    fs_kendall_prep,
   "SelectKBest+Kendall's Tau (prep)",
    score_var = 'kendall_coef',
    feat_var = 'var',
    threshold = 0.5,
    title_x = 0.15,
    title_y = 0.95,
    title_fontsize = 20,

)

In [None]:
# Same Kendall chart with threshold 0.6 and a smaller figure.
# The method name is unchanged, so the saved HTML file of the previous call
# is overwritten.
plot_feat_sel_score_adj(
    fs_kendall_prep,
   "SelectKBest+Kendall's Tau (prep)",
    score_var = 'kendall_coef',
    feat_var = 'var',
    threshold = 0.6,
    title_x = 0.05,
    title_y = 0.95,
    title_fontsize = 20,
    height = 700,
    width = 1000
)

In [None]:
# Display the Kendall results table.
fs_kendall_prep

In [None]:
def adjust_sel_cat_cols(x):
    """Insert up to two spaces in ``x`` at the positions of its underscores.

    Existing spaces are first turned into underscores. Then, counting only
    the underscores present in the original string, the 4th one becomes a
    space when there are at least four, and the 2nd one becomes a space when
    there are at least three.
    """
    # Positions are taken before the space -> underscore replacement, which
    # keeps the string length unchanged.
    underscore_at = [idx for idx, char in enumerate(x) if char == '_']
    n_underscores = len(underscore_at)
    x = x.replace(' ', '_')

    if n_underscores >= 4:
        cut = underscore_at[3]
        x = x[:cut] + ' ' + x[cut + 1:]
    if n_underscores >= 3:
        cut = underscore_at[1]
        x = x[:cut] + ' ' + x[cut + 1:]
    return x

# Re-label the Kendall results in place for plotting: adjust_sel_cat_cols
# inserts up to two spaces per name, then every space becomes an HTML break.
fs_kendall_prep['var'] = list(map(adjust_sel_cat_cols, fs_kendall_prep['var'].values))
fs_kendall_prep['var'] = list(map(lambda x: x.replace(' ', '<br>'), fs_kendall_prep['var'].values))
fs_kendall_prep

In [None]:
# Display the Kendall results again, now with the re-labelled 'var' column.
fs_kendall_prep