In [6]:
import pandas as pd
# Load the date-parser test cases. The preview below shows one free-text
# "Input" sentence and one "Expected Output" date string per row.
# NOTE(review): the "Unnamed: 0" column in the preview suggests the CSV was
# saved together with its index; index_col=0 may be intended -- confirm.
df = pd.read_csv("date_parser_testcases.csv")
# Last expression of the cell: preview the first rows.
df.head()


Unnamed: 0,Input,Expected Output
0,"The event will take place on March 5, 2023.",05/03/2023
1,Her birthday is on 07/08/1990.,07/08/1990
2,The deadline is 2022-12-31.,31/12/2022
3,We met on 1st of January 2000.,01/01/2000
4,"The concert is scheduled for 15th September, 2...",15/09/2021


In [15]:
import re

# Lookup from lowercase English month name to its zero-padded month number.
# The first pass (width=None) inserts the full names, the second pass
# (width=3) inserts the three-letter abbreviations. 'may' is its own
# abbreviation, so the second pass just re-assigns the same value to it.
month_map = {
    name[:width]: f"{number:02d}"
    for width in (None, 3)
    for number, name in enumerate(
        (
            'january', 'february', 'march', 'april', 'may', 'june',
            'july', 'august', 'september', 'october', 'november', 'december',
        ),
        start=1,
    )
}

def clean_day(day_str):
    """Remove an English ordinal suffix (st, nd, rd, th) from a day string.

    The suffix is removed only when it directly follows a digit and ends a
    word, and the match is case-insensitive. Letters elsewhere in the string
    (for example the "st" in "august") are left untouched.

    Args:
        day_str: Text such as "1st", "22ND" or "15".

    Returns:
        ``day_str`` with any digit-attached ordinal suffix removed.
    """
    # The look-behind keeps the digit itself and restricts the removal to
    # genuine ordinals such as "3rd"; \b stops at the end of the suffix.
    return re.sub(r'(?<=\d)(?:st|nd|rd|th)\b', '', day_str, flags=re.IGNORECASE)

def fix_year(year, pivot=50):
    """Return ``year`` as a string, expanding two-digit years to four digits.

    Args:
        year: Year as an int or a string accepted by ``int()``.
        pivot: Two-digit years below this value are placed in the 2000s,
            the remaining two-digit years in the 1900s. Defaults to 50.

    Returns:
        The year as a string. Values from 0 to 99 are expanded to a
        four-digit year; every other value is returned unchanged.

    Raises:
        ValueError: If ``year`` cannot be converted by ``int()``.
    """
    year = int(year)
    # Only 0..99 is a two-digit year; anything else is passed through as-is.
    if 0 <= year < 100:
        century = 2000 if year < pivot else 1900
        return str(century + year)
    return str(year)

def parse_date(text):
    """Extract a date from free text and return it formatted as DD/MM/YYYY.

    Patterns are tried in a fixed priority order (not by position in the
    text), and the first pattern that matches decides the result:

    1. numeric day-month-year with ``/``, ``.`` or ``-`` separators
    2. numeric year-month-day with the same separators
    3. ``25th December 2023`` / ``25 Dec 2023``
    4. ``December 25, 2023``
    5. ``1st of January 2023``

    Numeric dates from pattern 1 are read day-first whatever the width of the
    year. The one exception: when the second field cannot be a month (> 12)
    but the first field can (1-12), the date is taken to be US-style
    month-first and the two fields are swapped. No other calendar validation
    is performed.

    Args:
        text: Free text to search. Matching is case-insensitive.

    Returns:
        The date as ``DD/MM/YYYY`` (the year is passed through ``fix_year``),
        or the string ``"DATE_NOT_FOUND"`` when no pattern matches or when
        ``text`` is not a string.
    """
    # Missing values (None, float NaN) have no .lower(); report them as
    # "no date" instead of raising inside Series.apply.
    if not isinstance(text, str):
        return "DATE_NOT_FOUND"

    text = text.lower()

    sep = r'[\/\.\-]'
    ordinal = r'(?:st|nd|rd|th)?'
    # Full or abbreviated month name. "sept" is accepted as well; the lookup
    # below only uses the first three letters.
    month_name = (
        r'(jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|'
        r'jul(?:y)?|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|'
        r'nov(?:ember)?|dec(?:ember)?)'
    )

    # Each entry pairs a regex with the meaning of its three capture groups
    # (d = day, m = month, y = year), in capture order.
    patterns = [
        # DD/MM/YYYY, DD-MM-YYYY, DD.MM.YYYY (also 2-digit years)
        (r'(\b\d{1,2})' + sep + r'(\d{1,2})' + sep + r'(\d{2,4})', 'dmy'),
        # YYYY/MM/DD, YYYY-MM-DD, YYYY.MM.DD
        (r'(\b\d{4})' + sep + r'(\d{1,2})' + sep + r'(\d{1,2})', 'ymd'),
        # 25th December 2023 or 25 Dec 2023
        (r'(\d{1,2})' + ordinal + r'\s+' + month_name + r'\s*,?\s*(\d{4})', 'dmy'),
        # December 25, 2023
        (month_name + r'\s+(\d{1,2})' + ordinal + r',?\s*(\d{4})', 'mdy'),
        # 1st of January 2023
        (r'(\d{1,2})' + ordinal + r'\s+of\s+' + month_name + r'\s*,?\s*(\d{4})', 'dmy'),
    ]

    for pattern, order in patterns:
        match = re.search(pattern, text)
        if match is None:
            continue

        parts = dict(zip(order, match.groups()))
        day, month, year = parts['d'], parts['m'], parts['y']

        if month.isdigit():
            # Purely numeric date.
            day_num, month_num = int(day), int(month)
            # A "month" above 12 with a plausible month in the first field
            # means the text was written month-first (US style): swap them.
            if order == 'dmy' and month_num > 12 and 1 <= day_num <= 12:
                day_num, month_num = month_num, day_num
            month_code = f"{month_num:02d}"
        else:
            # Month given by name: map its first three letters to a number.
            day_num = int(clean_day(day))
            month_code = month_map[month[:3]]

        return f"{day_num:02d}/{month_code}/{fix_year(year)}"

    return "DATE_NOT_FOUND"


In [16]:
# Run parse_date on every "Input" sentence and store the result in a new column.
df["Parsed Output"] = df["Input"].apply(parse_date)


In [17]:
# Row-wise flag: True where the parsed string equals the expected string exactly.
df["Correct"] = df["Parsed Output"] == df["Expected Output"]


In [18]:
# Render the comparison table; pandas elides the middle rows in the output below.
display(df)

Unnamed: 0,Input,Expected Output,Parsed Output,Correct
0,"The event will take place on March 5, 2023.",05/03/2023,05/03/2023,True
1,Her birthday is on 07/08/1990.,07/08/1990,07/08/1990,True
2,The deadline is 2022-12-31.,31/12/2022,31/12/2022,True
3,We met on 1st of January 2000.,01/01/2000,01/01/2000,True
4,"The concert is scheduled for 15th September, 2...",15/09/2021,15/09/2021,True
...,...,...,...,...
95,"We celebrate Independence Day on 2023-07-04, a...",04/07/2023,04/07/2023,True
96,The final date for submission is 30th November...,30/11/2022,30/11/2022,True
97,"The annual conference is on 15th October 2023,...",15/10/2023,15/10/2023,True
98,"His birthdate, noted as 1990-05-20, is in the ...",20/05/1990,20/05/1990,True


In [19]:
# Save the frame, including the "Parsed Output" and "Correct" columns added above;
# index=False leaves the DataFrame's row index out of the file.
df.to_csv("parsed_dates_output.csv", index=False)
