<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Checando-os-dados-que-foram-carregados" data-toc-modified-id="Checando-os-dados-que-foram-carregados-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Checando os dados que foram carregados</a></span></li><li><span><a href="#Checando-criação-de-Features" data-toc-modified-id="Checando-criação-de-Features-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Checando criação de Features</a></span></li><li><span><a href="#Checando-premissas-para-o-moldelo" data-toc-modified-id="Checando-premissas-para-o-moldelo-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Checando premissas para o moldelo</a></span></li></ul></div>

In [84]:
import pandas as pd
import numpy as np
import pytest
from statsmodels.tsa.stattools import adfuller

In [4]:
def load_data():
    """Read the training set from ``src/train.csv`` and return it as a DataFrame."""
    return pd.read_csv('src/train.csv')

In [21]:
# Deliberately faulty example: instead of returning the whole frame it
# returns only its last row, so shape checks against the input fail.
def some_function_error(df):
    """Return the last row of ``df`` (selected by position).

    Used by the shape tests as an example of a transformation that does
    not preserve the shape of its input.
    """
    last_position = len(df) - 1
    return df.iloc[last_position]


# Example transformation step: replaces ``age`` by its natural logarithm.
def some_function(df):
    """Return a copy of ``df`` whose ``age`` column is log-transformed.

    The input frame is left untouched so that callers (and tests) can still
    compare the result against the original data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``age`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, ``age`` replaced by ``np.log(age)``.
    """
    # ``assign`` builds a new frame instead of writing into the caller's object.
    return df.assign(age=np.log(df['age']))

# Example feature-creation step: adds one derived column.
def some_function_add_columns(df_added):
    """Return a copy of ``df_added`` with an extra ``new_column`` feature.

    ``new_column`` holds the natural logarithm of ``age``. The input frame is
    not modified; writing the column in place would also change the frame the
    tests use as their "before" reference and make shape comparisons vacuous.

    Parameters
    ----------
    df_added : pandas.DataFrame
        Frame containing an ``age`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with all original columns plus ``new_column``.
    """
    return df_added.assign(new_column=np.log(df_added['age']))

### Checando os dados que foram carregados

In [65]:
def test_load_data():
    """The loaded dataset must not be empty."""
    loaded = load_data()
    message = 'O dataset não foi carregado'
    assert not loaded.empty, message

In [19]:
def test_primary_key():
    """``surveyid`` must have one distinct value per row (no duplicates)."""
    df = load_data()
    message = "Existem duplicatas na base"
    # dropna=False keeps NaN counted as a value, like ``unique()`` does.
    distinct_ids = df['surveyid'].nunique(dropna=False)
    assert distinct_ids == len(df), message

In [79]:
def test_missing():
    """The ``cons_tobacco`` column must contain no missing values."""
    column = 'cons_tobacco'
    allowed_missing = 0
    found_missing = load_data()[column].isna().sum()
    message = "Expected return value: {0}, Actual return value: {1}".format(
        allowed_missing, found_missing
    )
    assert found_missing == allowed_missing, message

### Checando criação de Features

In [42]:
def test_expected_shape():
    """The output of ``some_function_error`` must have the same shape as its input."""
    frame = load_data()

    expected, actual = frame.shape, some_function_error(frame).shape
    message = "Expected return value: {0}, Actual return value: {1}".format(expected, actual)
    assert actual == expected, message


In [50]:
def test_expect_shape_lines():
    """Feature creation must not add or drop rows.

    Compares the row count of the loaded data with the row count of the
    frame returned by ``some_function_add_columns``.
    """
    df = load_data()

    # Capture the reference row count before the function under test runs.
    expected = df.shape[0]
    # The message must describe the function that is actually asserted on.
    actual = some_function_add_columns(df).shape[0]
    message = "Expected return value: {0}, Actual return value: {1}".format(expected, actual)
    assert actual == expected, message

In [52]:
def test_expect_shape_columns():
    """Feature creation must add exactly the one new feature column.

    The reference column count is taken before the function under test runs,
    and the function receives a copy, so an in-place modification of its
    argument cannot make the comparison trivially true.
    """
    df = load_data()

    # some_function_add_columns adds a single column ('new_column').
    expected = df.shape[1] + 1
    actual = some_function_add_columns(df.copy()).shape[1]
    message = "Expected return value: {0}, Actual return value: {1}".format(expected, actual)
    assert actual == expected, message

In [59]:
def test_var_range():
    """Every value of ``age`` must lie in the expected range (non-negative).

    The failure message reports out-of-range values, so the assertion has to
    hold when ages are valid; the comparison is therefore ``>= 0`` and not
    ``< 0``. Missing ages compare as False and make the test fail.
    """
    df = load_data()
    message = "Existem valores fora do range esperado"
    assert (df['age'] >= 0).all(), message


In [137]:
def test_future_vars():
    """Compare the value(s) from ``create_future_vars(df)`` with ``df[-1]``.

    NOTE(review): ``create_future_vars`` is not defined in the visible part of
    the notebook; this test raises NameError unless it is defined elsewhere.
    """
    df = load_data()
    expected = create_future_vars(df)
    # NOTE(review): on a DataFrame, ``df[-1]`` selects a column labelled -1, not
    # the last row; presumably the last date was intended (e.g. via ``.iloc[-1]``
    # on the date column) - confirm which column holds the dates.
    last = df[-1]
    message = "Data futura:  {0} menor que a ultima data do dataframe {1}".format(expected, last)
    # NOTE(review): the message complains when the future date is smaller than
    # the last date, yet the assertion requires ``expected <= last`` - the
    # comparison direction looks inverted; confirm the intended invariant.
    assert all (expected <= last), message

In [111]:
# check dummies
def check_data_range(data, lower=0, upper=1):
    """Assert that every value in ``data`` lies within ``[lower, upper]``.

    Raises AssertionError naming the violated bound; the lower bound is
    checked first.
    """
    smallest = min(data)
    assert smallest >= lower, f"minimum value less than {lower}"
    largest = max(data)
    assert largest <= upper, f"maximum value greater than {upper}"

def test_range():
    """Each rate column listed below must only hold values between 0 and 1."""
    df = load_data()

    zero_one_cols = ['labor_force_part_rate', 'hotel_occup_rate',
                     'hotel_avg_daily_rate', 'unemp_rate']
    for col in zero_one_cols:
        # Check the column of the current iteration; the loop previously
        # re-checked 'labor_force_part_rate' on every pass.
        check_data_range(df[col])

### Checando premissas para o moldelo

In [83]:
def test_vazamento_feature():
    """Feature-leakage check: every value from ``correlation(df)`` must be below 90.

    ``correlation`` is expected to be defined elsewhere in the notebook.
    """
    threshold = 90
    observed = correlation(load_data())
    message = "Expected return value: {0}, Actual return value: {1}".format(threshold, observed)
    below_threshold = observed < threshold
    assert all(below_threshold), message

In [107]:
def test_stationary():
    """The ``age`` series must pass the augmented Dickey-Fuller test.

    The second element of ``adfuller``'s result is compared against 0.05;
    the test fails when it is larger.
    """
    significance = 0.05
    age_values = load_data().age.values
    adf_result = adfuller(age_values)
    message = 'A serie não é estacionaria'
    assert adf_result[1] <= significance, message