# Assignment 1
Date parser

In [100]:
import re

class DateExtractor:

    # Regular expression pattern to match dates in various formats
    # dd/mm/yyyy
    date_pattern = r'(\d{1,2})/(\d{1,2})/(\d{4})|'
    # yyyy month dd
    date_pattern += r'(\d{4})\s+(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)[,|\.]?\s+(\d{1,2})|'
    # dd month yyyy
    date_pattern += r'(\d{1,2})[?:st|nd|rd|st]?\s+(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)[,|\.]?\s+(\d{4})|'
    # dd month
    date_pattern += r'(\d{1,2})[?:st|nd|rd|st]?\s+(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)[,|\.]?|'
    # dd(st|nd|rd|th) of month
    date_pattern += r'(\d{1,2})(?:st|nd|rd|th) of (January|February|March|April|May|June|July|August|September|October|November|December)|'
    # month dd yyyy
    date_pattern += r'(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?[,|\.]?|Nov(?:ember)?|Dec(?:ember)?)[,|\.]?\s+(\d{1,2})[,\.]?\s+(\d{4})|'
    # month dd
    date_pattern += r'(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?[,|\.]?|Nov(?:ember)?|Dec(?:ember)?)[,|\.]?\s+(\d{1,2})'

    # Conversion from month number to name
    month_numeric_to_name_dict = {
        "01": "January",
        "02": "February",
        "03": "March",
        "04": "April",
        "05": "May",
        "06": "June",
        "07": "July",
        "08": "August",
        "09": "September",
        "10": "October",
        "11": "November",
        "12": "December"
    }

    # Conversion from month abbreviation to name
    month_abbreviation_to_name_dict = {
        "Jan": "January",
        "Feb": "February",
        "Mar": "March",
        "Apr": "April",
        "May": "May",
        "Jun": "June",
        "Jul": "July",
        "Aug": "August",
        "Sep": "September",
        "Oct": "October",
        "Nov": "November",
        "Dec": "December"
    }

    def ExtractDates(input_string : str) -> list:
        # Find all dates in the input string, using the pattern defined above
        result = re.findall(DateExtractor.date_pattern, input_string)

        # Extrace tuples of date data from the result of the regular expression parsing
        result = [tuple(x for x in pattern if x != "") for pattern in result]

        # return in standard format
        return [DateExtractor.DateTupleParser(pattern) for pattern in result] 
    
    def DateTupleParser(extraction : tuple) -> str:
        day = ""
        month = ""
        year = ""

        # Guess what each entry in the extracted tuple is
        for entry in extraction:
            if entry.isnumeric():   # Date or year
                if len(entry) < 3:  # Day
                    day = entry
                else:
                    year = entry
            else:                   # Month
                month = entry

        if month == "":             # Month is numeric
            month = extraction[1]   # Second entry is month

        # Month to full name
        if month.isnumeric():
            month = DateExtractor.month_numeric_to_name_dict[month]
        elif len(month) == 3:
            month = DateExtractor.month_abbreviation_to_name_dict[month]

        if year == "":              # default year
            year = "2023"
        return f"{day}-{month}-{year}"  # return in required format

In [104]:

# Input string containing dates
input_string = """Chandrayaan 3, India's third lunar exploration mission, was officially announced by ISRO Chairman, S. Somanath (https://www.isro.gov.in/Secretaryisro.html) on January 1, 2023 . The ambitious project aimed to build upon the successes of its predecessors, Chandrayaan 1 (launched on 22 Oct.  2008) and Chandrayaan 2 (launched on July 22, 2019). Chandrayaan-3 was launched on 14 July 2023, at 2:35 pm IST as scheduled, from Satish Dhawan Space Centre Second Launch Pad in Sriharikota, Andhra Pradesh, India. TransLunar Injection was done on 2023 August 1. The spacecraft entered lunar orbit on 5 Aug 2023, with an expected landing near the lunar South Pole on the 23rd of August. On 5/08/2023, ISRO performed a lunar-orbit insertion (LOI), successfully placing the Chandrayaan-3 spacecraft into orbit around the Moon. The LOI operation was carried out from the ISRO Telemetry, Tracking, and Command Network (ISTRAC) located in Bengaluru.
On 17 August, after a series of lunar-bound maneuvers, the Vikram lander separated from the propulsion module to begin the last phase of the mission. Chandrayaan-3 lander and rover landed near the lunar south pole region on August 23, making India the first nation to successfully land a spacecraft near the lunar south pole, and the fourth country to soft-land on the Moon."""

# use the above defined functions
_ = [print(date) for date in DateExtractor.ExtractDates(input_string)]


1-January-2023
22-October-2008
22-July-2019
14-July-2023
1-August-2023
5-August-2023
23-August-2023
08-August-2023
17-August-2023
23-August-2023
