Safe Room
When should you use defensive programming?
Should you use structured exception handling for data cleaning?

Name, Dob, Salary
Alice, '06/29/1999', 50000
Bob, '10/15/200', 60000
Charlie, '04/04/2002', 45000
Dave, , 55000

How much is code for code's sake?

In [7]:
from datetime import datetime

def calculate_age(dob, today):
    """Return the age in whole years on ``today`` for a birth date string.

    Args:
        dob: Birth date as a non-empty string in MM/DD/YYYY format.
        today: Reference date; any object exposing ``year``, ``month`` and
            ``day`` attributes (for example a ``datetime``).

    Returns:
        The number of full years between ``dob`` and ``today``.

    Raises:
        ValueError: If ``dob`` is not a non-empty string, or if it cannot be
            parsed as MM/DD/YYYY.
    """
    if not isinstance(dob, str) or not dob:
        raise ValueError("dob must be a non-empty string")
    try:
        birthdate = datetime.strptime(dob, "%m/%d/%Y")
    except ValueError as err:
        # Chain the parser's error so its details stay in the traceback.
        raise ValueError("Invalid date format. Please use MM/DD/YYYY.") from err
    # A bool subtracts as 0/1: take one year off when this year's birthday
    # has not happened yet.
    birthday_pending = (today.month, today.day) < (birthdate.month, birthdate.day)
    return today.year - birthdate.year - birthday_pending

# Example call: age as of today for someone born on 06/29/1999.
calculate_age('06/29/1999', datetime.today())

24

In [None]:
from datetime import datetime

def is_valid_date(date_str):
    """Report whether ``date_str`` parses as an MM/DD/YYYY date.

    Args:
        date_str: Candidate value, normally a string.

    Returns:
        True when ``date_str`` matches the MM/DD/YYYY format, False
        otherwise. Non-string values are reported as invalid rather than
        raising.
    """
    try:
        datetime.strptime(date_str, "%m/%d/%Y")
    except (TypeError, ValueError):
        # TypeError: strptime was handed a non-string (e.g. None or a float).
        # ValueError: the text does not match the expected format.
        return False
    return True

def get_clean_data(raw_data):
    """Return the rows of ``raw_data`` whose 'Dob' is present and valid.

    Rows with a missing 'Dob' are dropped first; of the remaining rows, only
    those whose 'Dob' value is accepted by ``is_valid_date`` are kept.
    """
    rows_with_dob = raw_data.dropna(subset=['Dob'])
    keep = rows_with_dob['Dob'].apply(is_valid_date)
    return rows_with_dob[keep]

def calculate_age(birthdate, today):
    """Return the number of full years between ``birthdate`` and ``today``.

    Both arguments must expose ``year``, ``month`` and ``day`` attributes.
    """
    year_gap = today.year - birthdate.year
    # True (counted as 1) when this year's birthday is still ahead of ``today``.
    birthday_pending = (today.month, today.day) < (birthdate.month, birthdate.day)
    return year_gap - birthday_pending


Try using an object-oriented approach?

In [4]:
from datetime import datetime

class MyClass:
    """A named person with a birth date, validated at construction time.

    Args:
        name: Non-empty string.
        birthdate: Birth date as a string in MM/DD/YYYY format.

    Raises:
        ValueError: If ``name`` is not a non-empty string, or ``birthdate``
            cannot be parsed as MM/DD/YYYY (including non-string values).
    """

    _DATE_FORMAT = "%m/%d/%Y"

    def __init__(self, name, birthdate):
        self._validate_name(name)
        # The validator returns the parsed value, so the string is parsed once.
        parsed_birthdate = self._validate_birthdate(birthdate)
        self._name = name
        self._birthdate = parsed_birthdate

    def _validate_name(self, name):
        """Raise ValueError unless ``name`` is a non-empty string."""
        if not isinstance(name, str) or not name:
            raise ValueError("Name must be a non-empty string")

    def _validate_birthdate(self, birthdate):
        """Parse ``birthdate`` as MM/DD/YYYY and return the ``datetime``.

        Raises:
            ValueError: If ``birthdate`` is not a string in that format.
        """
        try:
            return datetime.strptime(birthdate, self._DATE_FORMAT)
        except (TypeError, ValueError) as err:
            # strptime raises TypeError for non-strings (e.g. None); report it
            # as the same ValueError so callers have one exception to catch.
            raise ValueError("Invalid date format. Please use MM/DD/YYYY.") from err

    def get_name(self):
        """Return the name given at construction."""
        return self._name

    def get_age(self, today):
        """Return the age in full years on ``today``.

        ``today`` must expose ``year``, ``month`` and ``day`` attributes.
        """
        born = self._birthdate
        # True (counted as 1) when this year's birthday has not happened yet.
        birthday_pending = (today.month, today.day) < (born.month, born.day)
        return today.year - born.year - birthday_pending


# Demo: build a MyClass instance and print its name and its age as of today.
# Construction raises ValueError on a bad name or birth date; that is reported
# as a one-line message instead of a traceback.
try:
    obj = MyClass("Jane Doe", '06/29/1999')
    today = datetime.today()
    print(f"Name: {obj.get_name()}, Age: {obj.get_age(today)}")
except ValueError as e:
    print(f"Error: {e}")


Name: Jane Doe, Age: 24


Imperative way

In [None]:
import pandas as pd

def main():
    """Run the pipeline: read the input CSV, clean, analyze, write the output.

    The stages run in order with no error handling; each stage's result is
    passed straight to the next.
    """
    source_csv, target_csv = 'input_data.csv', 'output_data.csv'
    raw = get_data(source_csv)
    tidy = clean_data(raw)
    write_data(analyze_data(tidy), target_csv)


get_data - file does not exist
clean_data - data is uncleanable
analyze_data - data is unanalyzable
write_data - folder does not exist

Where should we validate arguments?
Everywhere? Nowhere?

Does analyze_data care where the data comes from?
Does analyze_data care if there is a missing field?

In [None]:
import pandas as pd

def get_data(file_path):
    """Load the CSV file at ``file_path`` into a pandas DataFrame.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        ValueError: If ``file_path`` is None or empty, or if no file exists
            at that path.
    """
    # Imported locally so the function does not depend on a module-level
    # ``import os`` being present.
    import os

    if file_path is None or file_path == "":
        raise ValueError("File path cannot be None or empty.")
    if not os.path.exists(file_path):
        raise ValueError(f"The file '{file_path}' does not exist.")
    # Read the path that was just validated.
    return pd.read_csv(file_path)

#etc...

In [None]:
def main():
    """Run the whole pipeline inside a single error-handling boundary.

    All four stages run inside one ``try`` so that a ValueError raised by any
    of them is reported in one place instead of propagating as a traceback.
    """
    input_file_path = 'input_data.csv'
    output_file_path = 'output_data.csv'
    try:
        data = get_data(input_file_path)
        cleaned_data = clean_data(data)
        results = analyze_data(cleaned_data)
        write_data(results, output_file_path)
    except ValueError as e:
        # etc... add handlers for other failure modes as the stages need them.
        print(f"Error: {e}")

In [None]:
def main():
    """Run the pipeline, treating only the data-loading step as validated.

    The first stage is the one marked for validation; the later stages are
    marked as running without it.
    """
    # Validate: the file paths and the load step.
    input_file_path, output_file_path = 'input_data.csv', 'output_data.csv'
    data = get_data(input_file_path)

    # No validation from here on.
    cleaned_data = clean_data(data)
    write_data(analyze_data(cleaned_data), output_file_path)
