In [1]:
"""
Build a date parser using basic text processing and rules. (No ML models)

- Given a piece of text, extract the day, month and year info and present it in DD/MM/YYYY format.
    - Example: “I went to London on 21st June, 2024” → 21/06/2024
- Use only default python packages and regex (no ML models OR external libraries)"""

'\nBuild a date parser using basic text processing and rules. (No ML models)\n\n- Given a piece of text, extract the day, month and year info and present it in DD/MM/YYYY format.\n    - Example: “I went to London on 21st June, 2024” → 21/06/2024\n- Use only default python packages and regex (no ML models OR external libraries)'

In [13]:
import re
import pandas as pd

def parse_date(text):
    """Extract the first recognisable date in ``text`` as ``DD/MM/YYYY``.

    Supported layouts (four-digit years only), tried in this order:

    * day + month name: ``1st January 2000``, ``21st June, 2024``,
      ``1st of January 2000``
    * month name + day: ``January 1, 2000``
    * numeric day-first: ``01/01/2000``, ``01-01-2000``, ``01.01.2000``
    * numeric year-first: ``2000-01-01``, ``2000/01/01``, ``2000.01.01``

    Month names are matched case-insensitively. A candidate whose month word
    is not a month name, or whose day/month numbers are out of range, is
    skipped and the search continues with the next candidate.

    Args:
        text: The sentence to scan.

    Returns:
        The zero-padded ``DD/MM/YYYY`` string, or ``None`` when ``text`` is not
        a string or contains no plausible date.
    """
    month_mapping = {
        "January": "01", "February": "02", "March": "03", "April": "04",
        "May": "05", "June": "06", "July": "07", "August": "08",
        "September": "09", "October": "10", "November": "11", "December": "12",
        "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "Jun": "06",
        "Jul": "07", "Aug": "08", "Sep": "09", "Sept": "09", "Oct": "10",
        "Nov": "11", "Dec": "12"
    }
    # Lower-cased keys so "june", "June" and "JUNE" all resolve.
    month_lookup = {name.lower(): number for name, number in month_mapping.items()}

    # Each regex is paired with the order of its three captured fields
    # (d = day, m = month, y = year), so no pattern-string comparison is needed.
    # (?<!\d) / (?!\d) stop a match from starting or ending inside a longer number.
    patterns = [
        (r"(?<!\d)(\d{1,2})(?:st|nd|rd|th)?\s+(?:of\s+)?([A-Za-z]+)\.?,?\s+(\d{4})(?!\d)", "dmy"),
        (r"([A-Za-z]+)\.?\s+(\d{1,2})(?:st|nd|rd|th)?,?\s+(\d{4})(?!\d)", "mdy"),
        (r"(?<!\d)(\d{1,2})[-/.](\d{1,2})[-/.](\d{4})(?!\d)", "dmy"),
        (r"(?<!\d)(\d{4})[-/.](\d{1,2})[-/.](\d{1,2})(?!\d)", "ymd"),
    ]

    # Empty CSV cells arrive as NaN (a float); there is nothing to parse.
    if not isinstance(text, str):
        return None

    for pattern, field_order in patterns:
        for match in re.finditer(pattern, text):
            fields = dict(zip(field_order, match.groups()))
            month = fields["m"]
            if not month.isdigit():
                month = month_lookup.get(month.lower())
                if month is None:
                    # e.g. "5 apples 2020": the word is not a month name.
                    continue
            day_number, month_number = int(fields["d"]), int(month)
            if 1 <= day_number <= 31 and 1 <= month_number <= 12:
                return f"{day_number:02d}/{month_number:02d}/{fields['y']}"

    return None


# Run the parser over every test sentence and save the results next to the inputs.
input_file = '/content/date_parser_testcases.csv'
df = pd.read_csv(input_file)
# Apply parse_date to each value of the 'Input' column; unparsed rows get None.
df['parsed_date'] = df['Input'].apply(parse_date)
output_file = '/content/drive/MyDrive/SNLP/output2.csv'
df.to_csv(output_file, index=False)
# Report the file that was actually written instead of a hard-coded name.
print(f"Date parsing complete. Check {output_file} for results.")


Date parsing complete. Check output.csv for results.


In [15]:
import re
import pandas as pd

def parse_date(text):
    """Extract the first recognisable date in ``text`` as ``DD/MM/YYYY``.

    Supported layouts, tried in this order:

    * day + month name: ``1st January 2000``, ``21st June, 2024``,
      ``1st of January 2000``
    * month name + day: ``January 1, 2000``
    * numeric day-first, four-digit year: ``01/01/2000``, ``31-12-2022``,
      ``31.12.2022``
    * numeric year-first: ``2000-01-01``, ``2000/01/01``, ``2000.01.01``
    * numeric day-first, two-digit year: ``31/12/22``

    Two-digit years are expanded with a pivot at 50: ``00``-``49`` become
    ``20xx`` and ``50``-``99`` become ``19xx``. Month names are matched
    case-insensitively. Candidates with an unknown month word or with
    out-of-range day/month numbers are skipped.

    Args:
        text: The sentence to scan.

    Returns:
        The zero-padded ``DD/MM/YYYY`` string, or ``None`` when ``text`` is not
        a string or contains no plausible date.
    """
    # Month mapping for textual months
    month_mapping = {
        "January": "01", "February": "02", "March": "03", "April": "04",
        "May": "05", "June": "06", "July": "07", "August": "08",
        "September": "09", "October": "10", "November": "11", "December": "12",
        "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "Jun": "06",
        "Jul": "07", "Aug": "08", "Sep": "09", "Sept": "09", "Oct": "10",
        "Nov": "11", "Dec": "12"
    }
    # Lower-cased keys make the month lookup case-insensitive.
    month_lookup = {name.lower(): number for name, number in month_mapping.items()}

    # (regex, order of the captured fields). Keeping the order next to the regex
    # replaces the index-based dispatch, which referenced a pattern slot that
    # did not exist and therefore never applied the two-digit-year rule.
    # (?<!\d) / (?!\d) keep a match from starting or ending inside a longer number.
    patterns = [
        # Match patterns like '1st January 2000' or '1st of January 2000'
        (r"(?<!\d)(\d{1,2})(?:st|nd|rd|th)?\s+(?:of\s+)?([A-Za-z]+)\.?,?\s+(\d{4})(?!\d)", "dmy"),
        # Match patterns like 'January 1, 2000'
        (r"([A-Za-z]+)\.?\s+(\d{1,2})(?:st|nd|rd|th)?,?\s+(\d{4})(?!\d)", "mdy"),
        # Match patterns like '01/01/2000', '31-12-2022' or '31.12.2022'
        (r"(?<!\d)(\d{1,2})[-/.](\d{1,2})[-/.](\d{4})(?!\d)", "dmy"),
        # Match patterns like '2000-01-01' or '2000.01.01'
        (r"(?<!\d)(\d{4})[-/.](\d{1,2})[-/.](\d{1,2})(?!\d)", "ymd"),
        # Match patterns like '31/12/22'
        (r"(?<!\d)(\d{1,2})/(\d{1,2})/(\d{2})(?!\d)", "dmy"),
    ]

    # Empty CSV cells arrive as NaN (a float); there is nothing to parse.
    if not isinstance(text, str):
        return None

    for pattern, field_order in patterns:
        for match in re.finditer(pattern, text):
            fields = dict(zip(field_order, match.groups()))

            month = fields["m"]
            if not month.isdigit():
                month = month_lookup.get(month.lower())
                if month is None:
                    continue  # the captured word is not a month name

            day_number, month_number = int(fields["d"]), int(month)
            if not (1 <= day_number <= 31 and 1 <= month_number <= 12):
                continue

            year = fields["y"]
            if len(year) == 2:
                # Pivot at 50: '19' -> 2019, '75' -> 1975.
                year = ("20" if int(year) < 50 else "19") + year

            return f"{day_number:02d}/{month_number:02d}/{year}"

    return None

# Run the parser over every test sentence and save the results next to the inputs.
input_file = '/content/date_parser_testcases.csv'
df = pd.read_csv(input_file)
# Apply parse_date to each value of the 'Input' column; unparsed rows get None.
df['parsed_date'] = df['Input'].apply(parse_date)
output_file = '/content/drive/MyDrive/SNLP/output3.csv'
df.to_csv(output_file, index=False)
# Report the file that was actually written instead of a hard-coded name.
print(f"Date parsing complete. Check {output_file} for results.")


Date parsing complete. Check output.csv for results.


In [16]:
import re
import pandas as pd

def parse_date(text):
    """Extract the first recognisable date in ``text`` as ``DD/MM/YYYY``.

    Supported layouts, tried in this order:

    * day + month name: ``1st January 2000``, ``21st June, 2024``,
      ``1st of January 2000``
    * month name + day: ``January 1, 2000``
    * numeric, four-digit year last: ``01/01/2000``, ``31-12-2022``,
      ``31.12.2022``, ``08/31/2021``
    * numeric, year first: ``2000-01-01``, ``2021.12.31``, ``1987/11/23``
    * numeric, two-digit year: ``31/12/22``

    Numeric dates are read day-first. When the month field is greater than 12
    and the day field is not (``08/31/2021``), the two fields are swapped so
    month-first input still parses. Two-digit years use a pivot at 50
    (``00``-``49`` -> ``20xx``, ``50``-``99`` -> ``19xx``). Month names are
    matched case-insensitively.

    Args:
        text: The sentence to scan.

    Returns:
        The zero-padded ``DD/MM/YYYY`` string, or ``None`` when ``text`` is not
        a string or contains no plausible date.
    """
    # Month mapping for textual months
    month_mapping = {
        "January": "01", "February": "02", "March": "03", "April": "04",
        "May": "05", "June": "06", "July": "07", "August": "08",
        "September": "09", "October": "10", "November": "11", "December": "12",
        "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "Jun": "06",
        "Jul": "07", "Aug": "08", "Sep": "09", "Sept": "09", "Oct": "10",
        "Nov": "11", "Dec": "12"
    }
    month_lookup = {name.lower(): number for name, number in month_mapping.items()}

    # Named groups (d, m, y) carry the field meaning inside each regex, so the
    # list needs no index-based dispatch and no duplicate entries.
    # (?P=sep) requires both separators of a numeric date to be the same, and
    # (?<!\d) / (?!\d) keep a match from starting or ending inside a longer number.
    patterns = [
        # Match patterns like '1st January 2000' or '1st of January 2000'
        r"(?<!\d)(?P<d>\d{1,2})(?:st|nd|rd|th)?\s+(?:of\s+)?(?P<m>[A-Za-z]+)\.?,?\s+(?P<y>\d{4})(?!\d)",
        # Match patterns like 'January 1, 2000'
        r"(?P<m>[A-Za-z]+)\.?\s+(?P<d>\d{1,2})(?:st|nd|rd|th)?,?\s+(?P<y>\d{4})(?!\d)",
        # Match patterns like '01/01/2000', '31-12-2022', '31.12.2022', '08/31/2021'
        r"(?<!\d)(?P<d>\d{1,2})(?P<sep>[-/.])(?P<m>\d{1,2})(?P=sep)(?P<y>\d{4})(?!\d)",
        # Match patterns like '2000-01-01', '2021.12.31', '1987/11/23'
        r"(?<!\d)(?P<y>\d{4})(?P<sep>[-/.])(?P<m>\d{1,2})(?P=sep)(?P<d>\d{1,2})(?!\d)",
        # Match patterns like '31/12/22'
        r"(?<!\d)(?P<d>\d{1,2})/(?P<m>\d{1,2})/(?P<y>\d{2})(?!\d)",
    ]

    # Empty CSV cells arrive as NaN (a float); there is nothing to parse.
    if not isinstance(text, str):
        return None

    for pattern in patterns:
        for match in re.finditer(pattern, text):
            day, month, year = match.group("d", "m", "y")

            if month.isdigit():
                day_number, month_number = int(day), int(month)
                # '08/31/2021': 31 cannot be a month, so read it month-first.
                if month_number > 12 and day_number <= 12:
                    day_number, month_number = month_number, day_number
            else:
                month_code = month_lookup.get(month.lower())
                if month_code is None:
                    continue  # the captured word is not a month name
                day_number, month_number = int(day), int(month_code)

            if not (1 <= day_number <= 31 and 1 <= month_number <= 12):
                continue

            # Correct two-digit years with a pivot at 50.
            if len(year) == 2:
                year = ("20" if int(year) < 50 else "19") + year

            return f"{day_number:02d}/{month_number:02d}/{year}"

    return None


# Run the parser over every test sentence and save the results next to the inputs.
input_file = '/content/date_parser_testcases.csv'
df = pd.read_csv(input_file)
# Apply parse_date to each value of the 'Input' column; unparsed rows get None.
df['parsed_date'] = df['Input'].apply(parse_date)
output_file = '/content/drive/MyDrive/SNLP/output4.csv'
df.to_csv(output_file, index=False)
# Report the file that was actually written instead of a hard-coded name.
print(f"Date parsing complete. Check {output_file} for results.")


Date parsing complete. Check output.csv for results.


In [17]:
import re
import pandas as pd

def parse_date(text):
    """Extract the first valid calendar date in ``text`` as ``DD/MM/YYYY``.

    Supported layouts, tried in this order:

    * day + month name: ``1st January 2000``, ``21st June, 2024``,
      ``1st of January 2000``
    * month name + day: ``January 1, 2000``
    * numeric, four-digit year last: ``01/01/2000``, ``31-12-2022``,
      ``31.12.2022``, ``08/31/2021``
    * numeric, year first: ``2000-01-01``, ``2021.12.31``, ``1987/11/23``
    * numeric, two-digit year: ``31/12/22``

    Numeric dates are read day-first; if the month field exceeds 12 while the
    day field does not (``08/31/2021``), the two are swapped. Two-digit years
    use a pivot at 50 (``00``-``49`` -> ``20xx``, ``50``-``99`` -> ``19xx``).
    Month names and their three-letter abbreviations (plus ``Sept``) are
    matched case-insensitively. Every candidate is checked against the real
    calendar, so ``31/02/2021`` or ``29/02/2023`` is rejected.

    Args:
        text: The sentence to scan.

    Returns:
        The zero-padded ``DD/MM/YYYY`` string, or ``None`` when ``text`` is not
        a string or contains no valid date.
    """
    # Standard-library only; imported here so the function is self-contained.
    from datetime import date

    # Month mapping for textual months: full name and 3-letter abbreviation.
    month_names = (
        "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    )
    month_numbers = {}
    for number, name in enumerate(month_names, start=1):
        month_numbers[name.lower()] = number
        month_numbers[name[:3].lower()] = number
    month_numbers["sept"] = 9

    # Named groups (d, m, y) say which field is which, so no index-based
    # dispatch is needed and each layout appears exactly once.
    # (?P=sep) forces both separators of a numeric date to match, and
    # (?<!\d) / (?!\d) keep a match from starting or ending inside a longer number.
    patterns = [
        # Match patterns like '1st January 2000' or '1st of January 2000'
        r"(?<!\d)(?P<d>\d{1,2})(?:st|nd|rd|th)?\s+(?:of\s+)?(?P<m>[A-Za-z]+)\.?,?\s+(?P<y>\d{4})(?!\d)",
        # Match patterns like 'January 1, 2000'
        r"(?P<m>[A-Za-z]+)\.?\s+(?P<d>\d{1,2})(?:st|nd|rd|th)?,?\s+(?P<y>\d{4})(?!\d)",
        # Match patterns like '01/01/2000', '31-12-2022', '31.12.2022', '08/31/2021'
        r"(?<!\d)(?P<d>\d{1,2})(?P<sep>[-/.])(?P<m>\d{1,2})(?P=sep)(?P<y>\d{4})(?!\d)",
        # Match patterns like '2000-01-01', '2021.12.31', '2022-03-03'
        r"(?<!\d)(?P<y>\d{4})(?P<sep>[-/.])(?P<m>\d{1,2})(?P=sep)(?P<d>\d{1,2})(?!\d)",
        # Match patterns like '31/12/22'
        r"(?<!\d)(?P<d>\d{1,2})/(?P<m>\d{1,2})/(?P<y>\d{2})(?!\d)",
    ]

    # Empty CSV cells arrive as NaN (a float); there is nothing to parse.
    if not isinstance(text, str):
        return None

    for pattern in patterns:
        for match in re.finditer(pattern, text):
            day_text, month_text, year_text = match.group("d", "m", "y")
            day_number = int(day_text)

            if month_text.isdigit():
                month_number = int(month_text)
                # '03/14/2022': 14 cannot be a month, so read it month-first.
                if month_number > 12 and day_number <= 12:
                    day_number, month_number = month_number, day_number
            else:
                month_number = month_numbers.get(month_text.lower())
                if month_number is None:
                    continue  # the captured word is not a month name

            # Correct two-digit years with a pivot at 50.
            if len(year_text) == 2:
                year_text = ("20" if int(year_text) < 50 else "19") + year_text

            try:
                # Raises ValueError for impossible dates (month 13, 30 February, year 0).
                date(int(year_text), month_number, day_number)
            except ValueError:
                continue

            return f"{day_number:02d}/{month_number:02d}/{year_text}"

    return None


# Run the parser over every test sentence and save the results next to the inputs.
input_file = '/content/date_parser_testcases.csv'
df = pd.read_csv(input_file)
# Apply parse_date to each value of the 'Input' column; unparsed rows get None.
df['parsed_date'] = df['Input'].apply(parse_date)
output_file = '/content/drive/MyDrive/SNLP/output5.csv'
df.to_csv(output_file, index=False)
# Report the file that was actually written instead of a hard-coded name.
print(f"Date parsing complete. Check {output_file} for results.")


Date parsing complete. Check output.csv for results.


In [20]:
# Of the first 20 test cases, show only the rows for which a date was parsed.
print(df.head(20).dropna(subset=['parsed_date']))

                                                Input Expected Output  \
0         The event will take place on March 5, 2023.      05/03/2023   
1                      Her birthday is on 07/08/1990.      07/08/1990   
2                         The deadline is 2022-12-31.      31/12/2022   
5                       Let's catch up on 02.04.2022.      02/04/2022   
6                      The project started on 5/6/19.      05/06/2019   
7                          He was born on 1987/11/23.      23/11/1987   
8                      Christmas is on 25th Dec 2024.      25/12/2024   
9              The meeting is set for April 03, 2020.      03/04/2020   
10  Her birthdate, noted as 1997-05-20, is in the ...      20/05/1997   
12                       The exam date is 2021.11.10.      10/11/2021   
13                      They got married on 12/12/12.      12/12/2012   
15                  Submit your report by 08/31/2021.      31/08/2021   
16                The course starts on 1st July 202