In [None]:
# default_exp core

# dproc

> Core data-processing API: the `Meta` column-definition registry and the `step_*` DataFrame pipeline methods.

In [None]:
#hide
from nbdev.showdoc import *
from fastcore.test import *

In [None]:
#export
import json
import os
import glob
import io
import re
import zipfile
from datetime import date
from datetime import datetime
from functools import wraps
from pathlib import Path
import logging
import numpy as np
import pandas as pd
import pandas_profiling
import pandas_flavor as pf
import janitor
from dataclasses import dataclass
from typing import List, Dict, Tuple, Sequence

In [4]:
#export
@dataclass
class Meta:
    """Container for column metadata used by the ``step_*`` pipeline methods.

    The central piece is a *definition* frame with one row per column of one
    or more entities. Selecting an entity (``meta.entity = "..."``) filters
    the definition to that entity and derives the id columns, the mandatory
    columns and a name/description table from it.

    Columns of the definition frame that this class reads: ``entity``,
    ``name_clean``, ``is_unique_id``, ``is_mandatory`` and ``description``.
    """

    _datatypes: pd.DataFrame = None
    _definition: pd.DataFrame = None
    _description: pd.DataFrame = None
    _entity_definition: pd.DataFrame = None
    _entity: str = None
    _is_unique_id: List = None
    _is_mandatory: List = None
    _name_clean: List = None
    _name_raw: List = None

    @staticmethod
    def _is_unset(value) -> bool:
        """Return True for ``None`` and for empty sized values ([], (), "", empty array).

        Scalars such as ``False`` or ``0`` count as *set*, and array-likes are
        never evaluated for truthiness (which would raise for numpy arrays).
        """
        if value is None:
            return True
        try:
            return len(value) == 0
        except TypeError:
            # No len() -> a scalar that the caller passed on purpose.
            return False

    @property
    def definition(self) -> pd.DataFrame:
        """Full definition frame covering all entities."""
        return self._definition

    @definition.setter
    def definition(self, df: pd.DataFrame) -> None:
        self._definition = df

    @property
    def entity(self) -> str:
        """Name of the currently selected entity."""
        return self._entity

    @entity.setter
    def entity(self, value: str) -> None:
        """Select an entity and refresh everything derived from the definition.

        Requires ``definition`` to be set beforehand.
        """
        self._entity = value
        self._entity_definition = self._definition[self._definition.entity == self._entity]
        entity_definition = self._entity_definition
        self._is_unique_id = list(
            entity_definition[entity_definition.is_unique_id == True]['name_clean'].values
        )
        self._is_mandatory = list(
            entity_definition[entity_definition.is_mandatory == True]['name_clean'].values
        )
        self._description = pd.DataFrame(
            entity_definition[['name_clean', 'description']].values,
            columns=['name', 'description'],
        )

    @property
    def entity_definition(self) -> pd.DataFrame:
        """Definition rows belonging to the selected entity."""
        return self._entity_definition

    @property
    def name_clean(self):
        """List of clean column names."""
        return self._name_clean

    @name_clean.setter
    def name_clean(self, names: List) -> None:
        # Fixed: the original referenced the undefined name `name` here,
        # which raised NameError on every assignment.
        self._name_clean = list(names)

    @property
    def name_raw(self):
        """List of raw column names."""
        return self._name_raw

    @name_raw.setter
    def name_raw(self, names: List) -> None:
        self._name_raw = list(names)

    @property
    def is_unique_id(self):
        """Clean names of the columns flagged as unique id."""
        return self._is_unique_id

    @is_unique_id.setter
    def is_unique_id(self, cols: List) -> None:
        self._is_unique_id = cols

    @property
    def is_mandatory(self):
        """Clean names of the columns flagged as mandatory."""
        return self._is_mandatory

    @is_mandatory.setter
    def is_mandatory(self, cols: List) -> None:
        self._is_mandatory = cols

    @property
    def description(self):
        """Frame with the columns ``name`` and ``description``."""
        return self._description

    @description.setter
    def description(self, description: Dict) -> None:
        """Store a ``{name: description}`` mapping as a two-column frame."""
        df = pd.DataFrame()
        df['name'] = list(description.keys())
        df['description'] = list(description.values())
        self._description = df

    @property
    def datatypes(self):
        """Frame with the columns ``name`` and ``datatype``."""
        return self._datatypes

    @datatypes.setter
    def datatypes(self, datatypes: Dict) -> None:
        """Store a ``{name: datatype}`` mapping as a two-column frame."""
        df = pd.DataFrame()
        df['name'] = list(datatypes.keys())
        df['datatype'] = list(datatypes.values())
        self._datatypes = df

    def make_definition(self, df=None, name_raw=None, name_clean=None, name_dashboard=None, is_unique_id=None, is_mandatory=None, datatype=None):
        """Build a definition frame from explicit properties and/or a dataframe.

        Parameters
        ----------
        df : pd.DataFrame, optional
            Source of the raw names (its columns) and of the inferred
            datatypes when these are not passed explicitly.
        name_raw : list, optional
            Raw column names. Defaults to ``df.columns``.
        name_clean : list, optional
            Clean names. Defaults to the raw names passed through the
            ``clean_names`` dataframe method.
        name_dashboard : list, optional
            Display names. Defaults to the clean names with every
            ``_``-separated part capitalized and joined without separator.
        is_unique_id : list or scalar, optional
            Assigned to the ``is_unique_id`` column as given.
        is_mandatory : list or scalar, optional
            Assigned to the ``is_mandatory`` column; defaults to ``True``
            for every row.
        datatype : list, optional
            Datatype names. Defaults to the dtypes pandas infers for ``df``,
            or to ``None`` when no dataframe is available.

        Returns
        -------
        pd.DataFrame or None
            The definition with the columns ``name_raw``, ``name_clean``,
            ``name_dashboard``, ``is_unique_id``, ``is_mandatory`` and
            ``datatype``; ``None`` (after printing a hint) when neither raw
            names nor a dataframe were supplied.
        """
        dd = pd.DataFrame()

        # Resolve the raw names explicitly instead of relying on a bare
        # `except` to detect a missing dataframe.
        if self._is_unset(name_raw):
            columns = getattr(df, 'columns', None)
            if columns is None:
                print('No raw names to start with. Please provide a list of raw names or a dataframe.')
                return None
            name_raw = list(columns)
        dd['name_raw'] = name_raw

        if self._is_unset(name_clean):
            # `clean_names` is the dataframe method this file obtains from its imports.
            name_clean = list(pd.DataFrame(columns=name_raw).clean_names().columns)
        dd['name_clean'] = name_clean

        if self._is_unset(name_dashboard):
            name_dashboard = [
                ''.join(part.capitalize() for part in name.split('_'))
                for name in name_clean
            ]
        dd['name_dashboard'] = name_dashboard

        dd['is_unique_id'] = is_unique_id

        # An explicit scalar False is honoured; only None/empty falls back to True.
        dd['is_mandatory'] = True if self._is_unset(is_mandatory) else is_mandatory

        if not self._is_unset(datatype):
            dd['datatype'] = datatype
        elif df is not None:
            dd['datatype'] = df.infer_objects().dtypes.astype(str).values
        else:
            # Nothing to infer from when only names were supplied.
            dd['datatype'] = None

        return dd

In [5]:
#export
# Module-level Meta instance; the step_* functions in this module read their
# default column selections from `meta.entity_definition`.
meta = Meta()

#### Example: make_definition

In [6]:
# Build the small mixed-dtype example frame explicitly: the deprecated
# `pd.util.testing.makeMixedDataFrame()` helper is not available in current
# pandas releases, which breaks this cell on a fresh kernel.
# The column names are deliberately messy to demonstrate name cleaning.
df = pd.DataFrame(
    {
        'UNIQUE ID': [0.0, 1.0, 2.0, 3.0, 4.0],
        'Basic value': [0.0, 1.0, 0.0, 1.0, 0.0],
        'textual DEscription': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        # Five consecutive business days starting 2009-01-01.
        'DATE-OF-OCCURENCE': pd.bdate_range('2009-01-01', periods=5),
    }
)
df

Unnamed: 0,UNIQUE ID,Basic value,textual DEscription,DATE-OF-OCCURENCE
0,0.0,0.0,foo1,2009-01-01
1,1.0,1.0,foo2,2009-01-02
2,2.0,0.0,foo3,2009-01-05
3,3.0,1.0,foo4,2009-01-06
4,4.0,0.0,foo5,2009-01-07


In [7]:
# Derive a definition table from the example frame: raw names come from its
# columns, the remaining fields fall back to make_definition's defaults.
dd = meta.make_definition(df)
dd

Unnamed: 0,name_raw,name_clean,name_dashboard,is_unique_id,is_mandatory,datatype
0,UNIQUE ID,unique_id,UniqueId,,True,float64
1,Basic value,basic_value,BasicValue,,True,float64
2,textual DEscription,textual_description,TextualDescription,,True,object
3,DATE-OF-OCCURENCE,date_of_occurence,DateOfOccurence,,True,datetime64[ns]


In [8]:
#export
@pf.register_dataframe_method
def step_rename_cols(df, mapping=None):
    """Rename columns (raw column names -> clean column names).

    With no (or an empty) ``mapping`` the raw -> clean pairs are taken from
    the ``name_raw`` / ``name_clean`` columns of ``meta.entity_definition``.
    Returns the renamed frame.
    """
    if mapping:
        return df.rename(columns=mapping)

    raw_to_clean = {
        raw: clean
        for raw, clean in zip(
            meta.entity_definition["name_raw"].values,
            meta.entity_definition["name_clean"].values,
        )
    }
    return df.rename(columns=raw_to_clean)

In [9]:
#export
@pf.register_dataframe_method
def step_replace_missing_with_nan(df, missing_values=None):
    """Replace all missing values with nan"""
    if missing_values is None:
        missing_values = ["nan", "NAN", "NaN", "none", "None", "NONE", '""', "''", ""]
    for col in df.select_dtypes(include="object").columns:
        df.loc[df[col].isin(missing_values), col] = np.nan
    return df

In [10]:
#export
@pf.register_dataframe_method
def step_remove_not_needed_cols(df, cols=None):
    """Remove not needed columns"""
    if not cols:
        cols = list(meta.entity_definition[meta.entity_definition.is_mandatory == False]["name_clean"].values)    
    df = df.drop(columns=cols)
    return df

In [11]:
#export
@pf.register_dataframe_method
def step_remove_rows_with_missing_ids(df, id_cols=None):
    """Remove rows with invalid ids"""
    if not id_cols:
        id_cols = list(meta.entity_definition[meta.entity_definition.is_unique_id == True]["name_clean"].values)        
    df = df.dropna(subset=id_cols)
    return df

In [12]:
#export
@pf.register_dataframe_method
def step_remove_duplicate_rows(df, id_cols=None, keep="last"):
    """Remove duplicate rows"""
    if not id_cols:
        id_cols = meta.entity_definition[meta.entity_definition.is_unique_id == True]["name_clean"].values
    df.drop_duplicates(subset=id_cols, keep=keep, inplace=True)
    return df

In [13]:
#export
@pf.register_dataframe_method
def step_format_dates(df, cols=None, date_format=None):
    """Format date columns"""
    if not cols:
        cols = meta.entity_definition[meta.entity_definition.datatype.isin(["date", "datetime"])]["name_clean"].values
    for col in cols:
        if date_format:
            df[col] = pd.to_datetime(df[col], format=date_format)
        else:
            df[col] = pd.to_datetime(df[col], infer_datetime_format=True)
    return df

In [14]:
#export
def _round_float(value, decimal_places=2):
    '''Helper function to round s if s is float'''
    m = re.match("(\d+\.\d+)",value.__str__())
    try:
        r = round(float(m.groups(0)[0]),decimal_places)
    except:
        r = value
    return r

In [15]:
#export
@pf.register_dataframe_method
def step_format_round_numeric_cols(df, cols=None, decimal_places=2):
    """Round columns to defined number of decimal places"""
    if not cols:
        cols = meta.entity_definition[meta.entity_definition["dtype"].isin(["float"])]["name_clean"].values
    for col in cols:    
        df[col] = df[col].apply(_round_float, args=(decimal_places,))
    return df

In [16]:
#export
@pf.register_dataframe_method
def step_change_dtypes(df, date_format="%Y-%m-%d %H:%M:%S"):
    """Change datatypes"""
    _entity_definition = meta.entity_definition[meta.entity_definition.is_mandatory == True]
    cols = list(
        zip(
            _entity_definition["name_clean"].values,
            _entity_definition["dtype"].values,
        )
    )
    for col in cols:
        if col[1] == "int":
            df[col[0]] = pd.to_numeric(df[col[0]], errors='coerce').astype('Int64')
        elif col[1] == "float":
            df[col[0]] = df[col[0]].astype(float)
        elif col[1] == "str":
            df[col[0]] = df[col[0]].astype(str)
        elif col[1] == "category":
            df[col[0]] = df[col[0]].astype("category")
        elif col[1] == "bool":
            df[col[0]] = df[col[0]].astype(bool)
    return df

In [17]:
#hide
# nbdev: convert the project's notebooks into library modules; the cell output
# lists each converted notebook.
from nbdev.export import *
notebook2script()

Converted 00_core.ipynb.
Converted index.ipynb.
