This notebook is run once to create the dataframe used for testing: it stores the first 1500 rows of the raw census data as a CSV file in the tests directory.

In [1]:
# imports
import os
import yaml
import time
import pandas as pd
from pathlib import Path

In [2]:
# coding

def get_config(config_path='../src/config/config.yml'):
    '''
    Loads the project configuration from a YAML file.

    The path is resolved against the current working directory; the default
    points from the tests dir to the config file in the src dir.

    Args:
        config_path: path of the YAML file, relative to the current working
            directory (an absolute path is used as is).

    Returns:
        The parsed YAML content, or None if the file does not exist or
        could not be read or parsed (a message is printed in these cases).
    '''
    # we are in tests dir, but need src
    config_file_path = os.path.join(os.getcwd(), config_path)
    if not os.path.exists(config_file_path):
        print(f"Configuration file not found, CONFIG_FILE: {config_file_path}")
        return None

    # explicit encoding, so reading does not depend on the platform default
    with open(config_file_path, 'r', encoding='utf-8') as f:
        try:
            # safe_load accepts the open stream directly, no f.read() needed
            config_dict = yaml.safe_load(f)
            print(f'Configuration yml file content is:\n {config_dict}')
            return config_dict
        except yaml.YAMLError as e:
            print(f"YAMLError parsing config.yml: {e}")
        except Exception as e:
            print(f"Exception reading config file: {e}")

    # reading or parsing failed, the reason has been printed above
    return None

In [3]:
# load the configuration; get_config() returns None if the file is missing or could not be parsed
config_file = get_config()
if config_file is None:
    # fail fast with a clear message instead of a TypeError when the config is subscripted later
    raise RuntimeError('Configuration could not be loaded, see the printed message for the reason.')
print(f"config_file:\n {config_file}")

Configuration yml file content is:
config_file:


In [4]:
# df creation and storage
N_TEST_ROWS = 1500  # number of leading rows kept for the test subset
data = '../' + config_file['etl']['orig_census_dvc_url']
# read only the rows that are needed instead of loading the whole file and slicing it
df_test_raw = pd.read_csv(data, nrows=N_TEST_ROWS)

# store test dataframe as .csv file in tests dir,
# reason for that workflow:
# dvc handling with github actions with 'local remote' dvc setting is not working as expected,
# issue with creating fixtures for unit tests
# NOTE(review): the file name is kept fixed, presumably the unit tests refer to it - confirm
filepath = Path('./df_test_1500raw.csv')
filepath.parent.mkdir(parents=True, exist_ok=True)
# to_csv writes and closes the file before it returns, so no sleep is needed afterwards
df_test_raw.to_csv(filepath, index=False)
print(f'Subset csv file with {len(df_test_raw)} rows is stored as {filepath}.')

Sleep time is over. Subset csv file with 1500 rows shall be stored.


In [5]:
# preview the first rows of the test subset as a quick sanity check
df_test_raw.head()

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
# column overview of the test subset: row count, non-null counts and dtypes
# NOTE(review): the recorded output suggests that all column names except 'age'
# carry a leading space - confirm before selecting columns by name
df_test_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              1500 non-null   int64 
 1    workclass       1500 non-null   object
 2    fnlgt           1500 non-null   int64 
 3    education       1500 non-null   object
 4    education-num   1500 non-null   int64 
 5    marital-status  1500 non-null   object
 6    occupation      1500 non-null   object
 7    relationship    1500 non-null   object
 8    race            1500 non-null   object
 9    sex             1500 non-null   object
 10   capital-gain    1500 non-null   int64 
 11   capital-loss    1500 non-null   int64 
 12   hours-per-week  1500 non-null   int64 
 13   native-country  1500 non-null   object
 14   salary          1500 non-null   object
dtypes: int64(6), object(9)
memory usage: 175.9+ KB
