<a href="https://colab.research.google.com/github/yuriborg/dsmkt/blob/main/churn_infos_baseipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Engineering Functions

In [None]:
def remove_accents(input_str):
    """Return ``input_str`` reduced to its ASCII characters.

    NFKD normalization splits an accented letter into its base letter plus
    combining marks; encoding to ASCII with errors ignored then drops the
    marks (and every other non-ASCII character) while keeping the base letter.
    """
    return (
        unicodedata.normalize('NFKD', input_str)
        .encode('ASCII', 'ignore')
        .decode('utf-8')
    )

def read_sheet(
    sheet_ID,
    sheet_range_name
):
    """Read a spreadsheet range into a DataFrame, using the first row as header.

    Relies on a module-level ``service`` object (the Sheets API client) that
    must be defined before this function is called.

    Args:
        sheet_ID: spreadsheet identifier, sent to the API as ``spreadsheetId``.
        sheet_range_name: range to read, sent to the API as ``range``.

    Returns:
        pandas.DataFrame whose column names are the first returned row and
        whose rows are the remaining ones. An empty DataFrame is returned when
        the response carries no values.
    """
    # Call the Sheets API
    request = service.spreadsheets().values().get(
        spreadsheetId=sheet_ID,
        range=sheet_range_name,
    )
    values_sheet = request.execute().get('values', [])

    if not values_sheet:
        # No header row to read: indexing values_sheet[0] would raise IndexError.
        return pd.DataFrame()

    header, *rows = values_sheet
    return pd.DataFrame(rows, columns=header)

def apply_standardization(
    data
):
    """Standardize the column names of ``data`` in place and return ``data``.

    Each column name has its accents removed (through ``remove_accents``), is
    converted to lower case, has spaces replaced with underscores and has
    parentheses removed.
    """
    def _standardize(name):
        lowered = remove_accents(name).lower()
        return lowered.replace(' ', '_').replace('(', '').replace(')', '')

    data.columns = [_standardize(column) for column in data.columns]
    return data

def findcol(val, data=None):
    """Return the column labels that contain ``val``, ignoring case.

    Args:
        val: substring to look for.
        data: DataFrame whose columns are searched. When omitted, the
            module-level ``df`` is used, so existing ``findcol(val)`` calls
            keep working.

    Returns:
        List of the matching column labels, in column order.
    """
    frame = df if data is None else data
    needle = val.lower()
    # str() keeps non-string labels (e.g. integer column names) from raising.
    return [col for col in frame.columns if needle in str(col).lower()]


def get_cat_num_vars(df):
    """Split the columns of ``df`` by dtype.

    Returns:
        Tuple ``(cat_cols, num_cols)``: the columns whose dtype is not
        numeric, followed by the columns whose dtype is numeric.
    """
    numeric = df.select_dtypes(include='number').columns
    non_numeric = df.select_dtypes(exclude='number').columns
    return non_numeric, numeric



# Data Extraction


In [None]:
def remove_accents(input_str):
    """Strip accents and any other non-ASCII character from ``input_str``.

    The string is decomposed with NFKD so that accented letters become a base
    letter followed by combining marks; the ASCII encode step (errors ignored)
    discards everything that is not plain ASCII.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    return ascii_bytes.decode('utf-8')

def read_sheet(
    sheet_ID,
    sheet_range_name
):
    """Read a spreadsheet range into a DataFrame, using the first row as header.

    Relies on a module-level ``service`` object (the Sheets API client) that
    must be defined before this function is called.

    Args:
        sheet_ID: spreadsheet identifier, sent to the API as ``spreadsheetId``.
        sheet_range_name: range to read, sent to the API as ``range``.

    Returns:
        pandas.DataFrame whose column names are the first returned row and
        whose rows are the remaining ones. An empty DataFrame is returned when
        the response carries no values.
    """
    # Call the Sheets API
    request = service.spreadsheets().values().get(
        spreadsheetId=sheet_ID,
        range=sheet_range_name,
    )
    values_sheet = request.execute().get('values', [])

    if not values_sheet:
        # No header row to read: indexing values_sheet[0] would raise IndexError.
        return pd.DataFrame()

    header, *rows = values_sheet
    return pd.DataFrame(rows, columns=header)

def apply_standardization(
    data
):
    """Rewrite the column names of ``data`` in place and return ``data``.

    For every column name: remove accents (through ``remove_accents``), lower
    the case, turn spaces into underscores and drop parentheses.
    """
    cleaned = []
    for column in data.columns:
        name = remove_accents(column).lower()
        name = name.replace(' ', '_').replace('(', '').replace(')', '')
        cleaned.append(name)
    data.columns = cleaned
    return data

def findcol(val, data=None):
    """Return the column labels that contain ``val``, ignoring case.

    Args:
        val: substring to look for.
        data: DataFrame whose columns are searched. When omitted, the
            module-level ``df`` is used, so existing ``findcol(val)`` calls
            keep working.

    Returns:
        List of the matching column labels, in column order.
    """
    frame = df if data is None else data
    needle = val.lower()
    # str() keeps non-string labels (e.g. integer column names) from raising.
    return [col for col in frame.columns if needle in str(col).lower()]


def get_cat_num_vars(df):
    """Return ``(cat_cols, num_cols)`` for ``df``.

    ``cat_cols`` holds the columns whose dtype is not numeric and ``num_cols``
    the columns whose dtype is numeric.
    """
    return (
        df.select_dtypes(exclude='number').columns,
        df.select_dtypes(include='number').columns,
    )


# Balanceamento

In [None]:
def get_balance(df, target_label='e_desligado'):
    """Summarize the class distribution of a label column.

    Args:
        df: DataFrame that contains the label column.
        target_label: name of the label column; defaults to 'e_desligado'.

    Returns:
        DataFrame with one row per class, most frequent first, and columns
        'classes' (class value), 'frequência' (row count) and '%' (count as a
        percentage of ``len(df)``).
    """
    # Count once and reuse, instead of recomputing value_counts per column.
    counts = df[target_label].value_counts()
    return pd.DataFrame({
            'classes': counts.index,
            'frequência': counts.values,
            '%': (counts * 100 / len(df)).values
        })
# Class distribution of the whole dataset (`df` must already be defined by an earlier cell).
get_balance(df)

In [None]:
import re
import unicodedata

def remove_accents(input_str):
    """Return ``input_str`` with accents and other non-ASCII characters removed.

    After NFKD decomposition an accented letter is a base letter plus
    combining marks, so ignoring non-ASCII characters while encoding keeps
    the base letter and drops the marks.
    """
    normalized = unicodedata.normalize('NFKD', input_str)
    return normalized.encode('ASCII', 'ignore').decode('utf-8')


def filter_alphanum_str(x):
    """Drop every run of non-word characters from ``x``, then remove accents.

    Word characters are letters, digits and the underscore, so underscores
    are kept.
    """
    word_chars_only = re.sub(r'\W+', '', x)
    return remove_accents(word_chars_only)

def get_features_labels_subset(df,
                               features,
                               p = 0.8,
                               target_label = 'e_desligado',
                               adjust_columns = False
                              ):
    """Split ``df`` sequentially into a training and a test set.

    The first ``round(len(df) * p)`` rows, in their current order, form the
    training set and the remaining rows form the test set. No shuffling is
    done.

    Args:
        df: source DataFrame; it is copied, not modified.
        features: names of the feature columns to keep in ``X_*``.
        p: fraction of rows assigned to the training set.
        target_label: name of the label column.
        adjust_columns: when True, the column names of the copy and the
            entries of ``features`` are rewritten with the same rule
            (symbols such as '>', '<', '+' spelled out, brackets, hyphens and
            other non-word characters removed, accents stripped) before the
            columns are selected. ``target_label`` is not rewritten.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``: the ``X_*`` are
        DataFrames restricted to ``features`` and the ``y_*`` are arrays of
        the label values.
    """
    def _clean(name):
        # Same replacement sequence for frame columns and requested features,
        # so both end up with matching names.
        for old, new in (
            ('.0', ''), ('>', 'maior'), ('<', 'menor'), ('[', ''),
            (']', ''), ('-', ''), (',', '_'), ('+', 'mais'),
        ):
            name = name.replace(old, new)
        return filter_alphanum_str(name)

    t = df.copy()

    if adjust_columns:
        t.columns = [_clean(col) for col in t.columns]
        features = [_clean(col) for col in features]

    N_train = round(len(t) * p)

    # Positional, end-exclusive slicing: exactly N_train training rows, no row
    # shared between the two sets, and no dependence on the index labels.
    X_train, X_test = t[features].iloc[:N_train], t[features].iloc[N_train:]
    y_train = t[target_label].iloc[:N_train].values
    y_test = t[target_label].iloc[N_train:].values

    return X_train, y_train, X_test, y_test

# Sequential 80/20 split with column-name clean-up enabled
# (`df` and `features` must already be defined by earlier cells).
X_train, y_train, X_test, y_test = get_features_labels_subset(df,
                               features,
                               p = 0.8,
                               target_label = 'e_desligado',
                               adjust_columns = True
                              )
# Shapes of both splits, their combined row count and the shape of df,
# displayed together so the totals can be compared.
X_train.shape, X_test.shape, X_train.shape[0]+X_test.shape[0], df.shape

In [None]:
# Put the labels back next to the training features to inspect the class
# distribution of the training split.
t = X_train.copy()
t['e_desligado'] = y_train

get_balance(t)

In [None]:
# Put the labels back next to the test features to inspect the class
# distribution of the test split.
t = X_test.copy()
t['e_desligado'] = y_test

get_balance(t)

# Amostragem com balanceamento

In [None]:
# Preview of the positive-class rows (e_desligado == 1).
# NOTE(review): `.loc[:10]` slices by index label, inclusive — assuming the
# default integer index, this keeps the positive rows whose label is <= 10,
# not the first 10 positive rows. Confirm that is what is wanted.
df[df['e_desligado'] == 1].loc[:10]

In [None]:
# Number of positive-class rows (e_desligado == 1) in df.
len(df[df['e_desligado']==1])

In [None]:
# 80% of the positive-class row count (the same 0.8 fraction used for the train split).
len(df[df['e_desligado']==1])*0.8

In [None]:
import math

def get_features_labels_balanced(
    df,
    features,
    p = 0.8,
    target_label = 'e_desligado',
    adjust_columns = False,
    random_state = None
):
    """Shuffle ``df`` and split it into train/test sets class by class.

    A fraction ``p`` (rounded up) of the rows labelled 1 and of the rows
    labelled 0 goes to the training set; every other row goes to the test
    set. The class counts and their percentages are printed for the whole
    data and for the training set.

    Args:
        df: source DataFrame; it is not modified.
        features: names of the feature columns to keep in ``X_*``.
        p: fraction of each class assigned to the training set.
        target_label: name of the label column; rows are selected by the
            values 1 and 0.
        adjust_columns: when True, the column names of the working copy and
            the entries of ``features`` are rewritten with the same rule
            (symbols such as '>', '<', '+' spelled out, brackets, hyphens and
            other non-word characters removed, accents stripped) before the
            columns are selected. ``target_label`` is not rewritten.
        random_state: forwarded to ``DataFrame.sample`` for the shuffle; pass
            a value to make the split reproducible. ``None`` keeps the
            shuffle random on every call.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``. ``X_train`` lists the
        sampled positive rows followed by the sampled negative rows and
        ``y_train`` is a list in that same order; ``X_test`` is a DataFrame
        and ``y_test`` an array of label values.
    """
    def _clean(name):
        # Same replacement sequence for frame columns and requested features,
        # so both end up with matching names.
        for old, new in (
            ('.0', ''), ('>', 'maior'), ('<', 'menor'), ('[', ''),
            (']', ''), ('-', ''), (',', '_'), ('+', 'mais'),
        ):
            name = name.replace(old, new)
        return filter_alphanum_str(name)

    def _pct(part, total):
        # Avoid ZeroDivisionError when a class total is 0 (empty data, p == 0).
        return part * 100 / total if total else 0.0

    # Shuffle, then renumber the rows: after this the index labels are unique
    # and identify each row, which the train/test separation below relies on.
    t = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    if adjust_columns:
        t.columns = [_clean(col) for col in t.columns]
        features = [_clean(col) for col in features]

    # initial class distribution
    pos_rows = t[t[target_label] == 1]
    neg_rows = t[t[target_label] == 0]
    qtd_pos = len(pos_rows)
    qtd_neg = len(neg_rows)
    print('qtd_pos: {} ({:.3f}%), qtd_neg: {} ({:.3f}%)'.format(
        qtd_pos, _pct(qtd_pos, qtd_pos + qtd_neg),
        qtd_neg, _pct(qtd_neg, qtd_pos + qtd_neg)
    ))

    # number of training rows per class
    N_train_pos = math.ceil(qtd_pos * p)
    N_train_neg = math.ceil(qtd_neg * p)
    N_tot = N_train_pos + N_train_neg
    print('N_train_pos: {} ({:.3f}%), N_train_neg: {} ({:.3f}%)'.format(
        N_train_pos, _pct(N_train_pos, N_tot),
        N_train_neg, _pct(N_train_neg, N_tot)
    ))

    # iloc is positional and end-exclusive, so exactly N rows per class are
    # taken, and the samples keep the index labels of `t`.
    pos_train_data_sample = pos_rows.iloc[:N_train_pos]
    neg_train_data_sample = neg_rows.iloc[:N_train_neg]

    # Training data
    X_train = pd.concat(
        [pos_train_data_sample[features], neg_train_data_sample[features]],
        axis = 0)
    y_train = (list(pos_train_data_sample[target_label].values)
               + list(neg_train_data_sample[target_label].values))

    # Test data: every row of `t` that was not sampled for training. The
    # comparison is valid because the samples still carry `t`'s index labels.
    test_data_sample = t[~t.index.isin(X_train.index)]
    X_test = test_data_sample[features]
    y_test = test_data_sample[target_label].values

    return X_train, y_train, X_test, y_test


# Per-class 80/20 split of df, without column-name clean-up
# (`df` and `features` must already be defined by earlier cells).
# Rebinds X_train / y_train / X_test / y_test from the sequential split above.
X_train, y_train, X_test, y_test = get_features_labels_balanced(
    df,
    features,
    p = 0.8,
    target_label = 'e_desligado',
   adjust_columns = False
)

In [None]:
# Put the labels back next to the training features to inspect the class
# distribution of the training split.
t = X_train.copy()
t['e_desligado'] = y_train

get_balance(t)

In [None]:
# Put the labels back next to the test features to inspect the class
# distribution of the test split.
t = X_test.copy()
t['e_desligado'] = y_test

get_balance(t)

# Preenchimento de Base

In [None]:
import missingno as msno

In [None]:
# Number of columns in df0.
len(df0.columns)

In [None]:
# Bar chart of how filled each of the first 51 columns of df0 is.
# NOTE(review): title/xlabel are set before msno.bar draws the chart, so
# whether they end up on that chart depends on which axes msno.bar draws on —
# confirm the labels actually show up in the output.
plt.title("Preenchimento das variáveis",
         fontsize = 20)
plt.xlabel("% preenchimento",
          fontsize = 18)
msno.bar(df0[list(df0.columns)[:51]])
# NOTE(review): plt.figure() opens a new, empty figure; called here, after the
# chart has been drawn, it does not resize the chart above. If (5, 8) is the
# intended chart size it probably belongs in the msno.bar call — confirm.
plt.figure(figsize=(5,8))
plt.show()

In [None]:
# Per-column summary of missing values in df0: absolute count and percentage of rows.
df_missinng = pd.DataFrame({
    'col': list(df0.columns),
    'qtd_faltantes': [df0[col].isna().sum() for col in df0.columns],
    'pct_faltantes': [df0[col].isna().sum() * 100 / len(df0) for col in df0.columns],
})
# Show only the columns with more than 90% of their values missing, emptiest first.
(
    df_missinng
    .loc[df_missinng['pct_faltantes'] > 90]
    .sort_values(by=['pct_faltantes'], ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Bar chart of how filled each column of df0 from position 51 onwards is.
# NOTE(review): title/xlabel are set before msno.bar draws the chart, so
# whether they end up on that chart depends on which axes msno.bar draws on —
# confirm the labels actually show up in the output.
plt.title("Preenchimento das variáveis",
         fontsize = 20)
plt.xlabel("% preenchimento",
          fontsize = 18)
msno.bar(df0[list(df0.columns)[51:]])
# NOTE(review): plt.figure() opens a new, empty figure; called here, after the
# chart has been drawn, it does not resize the chart above. If (5, 8) is the
# intended chart size it probably belongs in the msno.bar call — confirm.
plt.figure(figsize=(5,8))
plt.show()