In [None]:
### Part 2 - Build a Date Parser

In [4]:
import re
from datetime import datetime

month_map = {
    'january': '01', 'jan': '01',
    'february': '02', 'feb': '02',
    'march': '03', 'mar': '03',
    'april': '04', 'apr': '04',
    'may': '05',
    'june': '06', 'jun': '06',
    'july': '07', 'jul': '07',
    'august': '08', 'aug': '08',
    'september': '09', 'sep': '09', 'sept': '09',
    'october': '10', 'oct': '10',
    'november': '11', 'nov': '11',
    'december': '12', 'dec': '12'
}


In [5]:
def normalize_text(text):
    # Lowercase, remove suffixes (st, nd, rd, th), remove 'of'
    text = text.lower()
    text = re.sub(r'(\d+)(st|nd|rd|th)', r'\1', text)
    text = text.replace(" of ", " ")
    return text

In [6]:
def extract_date(text):
    text = normalize_text(text)
    
    # Patterns to match
    patterns = [
        # Match: March 5, 2023 / April 03, 2020
        r'(?P<month>[a-z]+)[\s\-]*(?P<day>\d{1,2}),?\s*(?P<year>\d{2,4})',
        
        # Match: 1 January 2000 / 15 September 2021
        r'(?P<day>\d{1,2})[\s\-]*(?P<month>[a-z]+),?\s*(?P<year>\d{2,4})',
        
        # Match: 2022-12-31 / 2021.11.10 / 1997/05/20
        r'(?P<year>\d{4})[\-./](?P<month>\d{1,2})[\-./](?P<day>\d{1,2})',
        
        # Match: 5/6/19 or 12/12/12 (assume DD/MM/YY or fallback to MM/DD/YY)
        r'(?P<d1>\d{1,2})[\/.\-](?P<d2>\d{1,2})[\/.\-](?P<d3>\d{2,4})',
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            parts = match.groupdict()

            # Case 1: Month name present
            if 'month' in parts and parts['month'].isalpha():
                month = month_map.get(parts['month'].lower())
                if not month:
                    continue
                day = parts['day'].zfill(2)
                year = parts['year']
            # Case 2: YYYY-MM-DD or similar
            elif 'year' in parts and 'month' in parts and 'day' in parts:
                day = parts['day'].zfill(2)
                month = parts['month'].zfill(2)
                year = parts['year']
            # Case 3: Ambiguous numeric (d1/d2/d3)
            elif 'd1' in parts and 'd2' in parts and 'd3' in parts:
                d1, d2, d3 = parts['d1'], parts['d2'], parts['d3']
                year = d3 if len(d3) == 4 else ('20' + d3 if int(d3) <= 25 else '19' + d3)
                # Try MM/DD/YYYY first (common US format)
                for first_try in [(d2, d1), (d1, d2)]:
                    month, day = first_try[0].zfill(2), first_try[1].zfill(2)
                    try:
                        datetime.strptime(f"{day}/{month}/{year}", "%d/%m/%Y")
                        break
                    except:
                        continue
                else:
                    continue
            else:
                continue

            try:
                datetime.strptime(f"{day}/{month}/{year}", "%d/%m/%Y")
                return f"{day}/{month}/{year}"
            except:
                continue

    return "INVALID"

In [7]:
import csv

with open('date_parser_testcases.csv', newline='') as f:
    reader = csv.DictReader(f)
    results = []

    for i, row in enumerate(reader):
        input_text = row['Input']
        expected = row['Expected Output']
        parsed = extract_date(input_text)
        status = "PASS" if parsed == expected else f"FAIL (Got: {parsed})"
        results.append((input_text, expected, parsed, status))
        
        if i < 5:
            print(f"Input:    {input_text}")
            print(f"Expected: {expected}")
            print(f"Parsed:   {parsed}")
            print(f"Status:   {status}")
            print("-" * 50)


Input:    The event will take place on March 5, 2023.
Expected: 05/03/2023
Parsed:   05/03/2023
Status:   PASS
--------------------------------------------------
Input:    Her birthday is on 07/08/1990.
Expected: 07/08/1990
Parsed:   07/08/1990
Status:   PASS
--------------------------------------------------
Input:    The deadline is 2022-12-31.
Expected: 31/12/2022
Parsed:   31/12/2022
Status:   PASS
--------------------------------------------------
Input:    We met on 1st of January 2000.
Expected: 01/01/2000
Parsed:   01/01/2000
Status:   PASS
--------------------------------------------------
Input:    The concert is scheduled for 15th September, 2021.
Expected: 15/09/2021
Parsed:   15/09/2021
Status:   PASS
--------------------------------------------------


In [42]:
num_passes = sum(1 for r in results if r[3].startswith("PASS"))
total_cases = len(results)

accuracy = (num_passes / total_cases) * 100
print(f"\nAccuracy: {accuracy:.2f}%")



Accuracy: 97.00%


In [None]:
##Part 3

In [23]:
import re
import unicodedata

In [10]:
import pandas as pd 
df = pd.read_csv("pronoun_testcases.csv")

In [12]:
import spacy 
nlp = spacy.load("en_core_web_sm")



In [32]:
pronoun_map = {
    "female": {
        "he": "she",
        "him": "her",
        "his": "her",
        "himself": "herself"
    },
    "male": {
        "she": "he",
        "her": "him",     
        "hers": "his",
        "herself": "himself"
    }
}

In [33]:
def normalize_quotes(text):
    return text.replace("’", "'").replace("‘", "'")


In [34]:
def transform_gender(text, target_gender):
    text = normalize_quotes(text)
    doc = nlp(text)
    new_tokens = []

    for token in doc:
        word_lower = token.text.lower()
        if word_lower in pronoun_map[target_gender]:
            replacement = pronoun_map[target_gender][word_lower]
            if token.text[0].isupper():
                replacement = replacement.capitalize()
            new_tokens.append(replacement + token.whitespace_)
        else:
            new_tokens.append(token.text_with_ws)

    return ''.join(new_tokens).strip()


In [35]:
df["input_text"] = df["input_text"].apply(normalize_quotes)
df["expected_output"] = df["expected_output"].apply(normalize_quotes)

In [36]:
df["actual_output"] = df.apply(lambda row: transform_gender(row["input_text"], row["target_gender"]), axis=1)

In [38]:
df["match"] = df["expected_output"] == df["actual_output"]

In [40]:
pd.set_option("display.max_colwidth", None)
display(df[["input_text", "target_gender", "expected_output", "actual_output", "match"]].head(10))

Unnamed: 0,input_text,target_gender,expected_output,actual_output,match
0,He is going to the market.,female,She is going to the market.,She is going to the market.,True
1,His book is on the table.,female,Her book is on the table.,Her book is on the table.,True
2,I saw him yesterday.,female,I saw her yesterday.,I saw her yesterday.,True
3,He hurt himself.,female,She hurt herself.,She hurt herself.,True
4,I called him last night.,female,I called her last night.,I called her last night.,True
5,That is his car.,female,That is her car.,That is her car.,True
6,He told me about his trip.,female,She told me about her trip.,She told me about her trip.,True
7,The teacher gave him a warning.,female,The teacher gave her a warning.,The teacher gave her a warning.,True
8,He blames himself for the mistake.,female,She blames herself for the mistake.,She blames herself for the mistake.,True
9,He brought his laptop.,female,She brought her laptop.,She brought her laptop.,True


In [41]:
accuracy = df["match"].mean() * 100
print(f"Accuracy: {accuracy:.2f}%")


Accuracy: 80.77%


In [None]:
#Possibly because of her being transformed to him or his. 