In [1]:
import pandas as pd
from datetime import datetime, date



In [13]:
# Demo frame: the same birthday (14 May 1995) stored in five different
# representations, so the "Date of Birth" column ends up with mixed types.
data = {
    "User ID": [f"U{n}" for n in range(1, 6)],
    "Date of Birth": [
        "1995-05-14",                # string, YYYY-MM-DD
        "14-05-1995",                # string, DD-MM-YYYY
        "14/05/1995",                # string, DD/MM/YYYY
        pd.Timestamp("1995-05-14"),  # pandas Timestamp
        date(1995, 5, 14),           # plain datetime.date
    ],
}

df = pd.DataFrame.from_dict(data)

# Show the frame followed by its column dtypes.
print(df, df.dtypes, sep="\n")


  User ID        Date of Birth
0      U1           1995-05-14
1      U2           14-05-1995
2      U3           14/05/1995
3      U4  1995-05-14 00:00:00
4      U5           1995-05-14
User ID          object
Date of Birth    object
dtype: object


In [20]:
import pandas as pd
from datetime import datetime, date
import re

def preprocess_date(value):
    """
    Normalize a date-like value to a ``pd.Timestamp`` at midnight.

    Supported inputs:
    - ``datetime.date``, ``datetime.datetime`` and ``pd.Timestamp`` objects.
      Any time-of-day (and timezone) is dropped; only the calendar date of
      the value is kept.
    - Strings in ``YYYY-MM-DD``, ``DD-MM-YYYY`` or ``MM-DD-YYYY`` order.
      ``/`` is accepted in place of ``-``, and an optional ``HH:MM:SS`` time
      (separated by a space or ``T``) is accepted and discarded.

    Ambiguous strings are resolved day-first: ``"03-04-1995"`` is read as
    3 April 1995, and month-first is only used when day-first is impossible
    (e.g. ``"05-14-1995"``).

    Parameters:
    - value: The input value to convert.

    Returns:
    - A ``pd.Timestamp`` (time 00:00:00) if conversion is successful,
      otherwise None. None is returned for missing values (None, NaN,
      pd.NaT, blank strings), unparseable strings and unsupported types,
      including list-like objects.
    """
    # 1. Strings are handled first: a string is only "missing" when blank.
    if isinstance(value, str):
        # Normalize the separator so one set of formats covers "/" and "-".
        cleaned_value = value.strip().replace("/", "-")
        if not cleaned_value:
            return None

        # Order matters: ISO first, then day-first, then month-first.
        date_formats = ("%Y-%m-%d", "%d-%m-%Y", "%m-%d-%Y")
        # Optional time-of-day part; it is parsed only to be thrown away.
        time_suffixes = ("", " %H:%M:%S", "T%H:%M:%S")

        for date_fmt in date_formats:
            for time_fmt in time_suffixes:
                try:
                    parsed = datetime.strptime(cleaned_value, date_fmt + time_fmt)
                    # Conversion stays inside the try so that a date pandas
                    # rejects with a ValueError also yields None.
                    return pd.to_datetime(parsed.date())
                except ValueError:
                    # Format did not match; try the next one.
                    continue
        return None

    # 2. date / datetime / pd.Timestamp (the latter two subclass date).
    if isinstance(value, date):
        # pd.NaT is itself a datetime subclass, so it must be filtered here.
        if pd.isna(value):
            return None
        if isinstance(value, datetime):
            # Drop the time-of-day so the result is always at midnight.
            value = value.date()
        return pd.to_datetime(value)

    # 3. Everything else (None, NaN, pd.NA, numbers, list-likes, ...) is
    #    unsupported. No pd.isna() call is made on arbitrary objects because
    #    its array result for list-likes has no single truth value.
    return None

In [21]:
# Run preprocess_date (defined in an earlier cell) over every raw
# "Date of Birth" entry of `df` and keep the result in a new column
# alongside the original values.
df["DOB_normalized"] = df["Date of Birth"].apply(preprocess_date)

# Show the column dtypes first, then the full frame.
print(df.dtypes, df, sep="\n")

User ID                   object
Date of Birth             object
DOB_normalized    datetime64[ns]
dtype: object
  User ID        Date of Birth DOB_normalized
0      U1           1995-05-14     1995-05-14
1      U2           14-05-1995     1995-05-14
2      U3           14/05/1995     1995-05-14
3      U4  1995-05-14 00:00:00     1995-05-14
4      U5           1995-05-14     1995-05-14
