In [None]:
import datetime
import re

import pandas as pd

## Part 2


In [None]:
def normalize_date_text(text):
    """Normalize free text so that date patterns are easier to match.

    Strips English ordinal suffixes that directly follow a number
    ("1st" -> "1", "23RD" -> "23"), removes commas, and lower-cases the result.

    Parameters
    ----------
    text : str
        Raw input sentence.

    Returns
    -------
    str
        The normalized, lower-cased text.
    """
    # IGNORECASE: the suffix is removed before lower-casing, so "1ST" must match too.
    # Trailing \b: only strip a suffix that ends the token, so digits glued to a
    # longer word (e.g. "5thx") are left untouched.
    text = re.sub(r'(\d+)(?:st|nd|rd|th)\b', r'\1', text, flags=re.IGNORECASE)
    return text.replace(',', '').lower()

In [None]:
def improved_extract_date(text):
    """Extract the first recognizable date from free text as ``DD/MM/YYYY``.

    The text is first passed through ``normalize_date_text``; the patterns
    below are then tried in priority order:

    1. numeric day-month-year   (``07/08/1990``, ``02.04.2022``, ``5/6/19``)
    2. numeric year-month-day   (``2022-12-31``, ``1987/11/23``)
    3. day [of] month-name year (``1 of january 2000``, ``25 dec 2024``)
    4. month-name day year      (``march 5 2023``)

    Numeric dates in pattern 1 are always read day-first, and two-digit years
    are expanded to 20xx. A candidate is accepted only when its month word is
    a known month name/abbreviation and the result is a real calendar date;
    otherwise the next candidate or pattern is tried.

    Parameters
    ----------
    text : str
        Sentence that may contain a date. Non-string values (for example a
        missing value coming from a DataFrame column) are treated as
        unparseable instead of raising.

    Returns
    -------
    str
        The date formatted as ``DD/MM/YYYY``, or ``"Could not parse"`` when no
        valid date is found.
    """
    unparsed = "Could not parse"
    if not isinstance(text, str):
        return unparsed

    text = normalize_date_text(text)

    months = {
        'jan': '01', 'january': '01',
        'feb': '02', 'february': '02',
        'mar': '03', 'march': '03',
        'apr': '04', 'april': '04',
        'may': '05',
        'jun': '06', 'june': '06',
        'jul': '07', 'july': '07',
        'aug': '08', 'august': '08',
        'sep': '09', 'sept': '09', 'september': '09',
        'oct': '10', 'october': '10',
        'nov': '11', 'november': '11',
        'dec': '12', 'december': '12'
    }

    # Each formatter maps the regex groups to (day, month, year) strings.
    # The month word is looked up whole: matching only its first three letters
    # would turn any word such as "marbles" or "decades" into a month.
    patterns = [
        (r'\b(\d{1,2})[./-](\d{1,2})[./-](\d{2,4})\b', lambda d: (
            d[0], d[1], '20' + d[2] if len(d[2]) == 2 else d[2])),

        (r'\b(\d{4})[/-](\d{1,2})[/-](\d{1,2})\b', lambda d: (
            d[2], d[1], d[0])),

        # The optional "of" also covers the plain "<day> <month> <year>" form.
        (r'\b(\d{1,2})\s+(?:of\s+)?([a-zA-Z]+)\s+(\d{4})\b', lambda d: (
            d[0], months.get(d[1].lower()), d[2])),

        (r'\b([a-zA-Z]+)\s+(\d{1,2})\s+(\d{4})\b', lambda d: (
            d[1], months.get(d[0].lower()), d[2])),
    ]

    for pattern, formatter in patterns:
        # Try every match of a pattern, not just the first, so an invalid
        # candidate early in the sentence does not hide a valid one later.
        for match in re.finditer(pattern, text):
            d, m, y = formatter(match.groups())
            if m is None or len(y) != 4:
                continue
            try:
                # datetime.date rejects impossible dates (day 0, month 13, 31 Feb, ...).
                parsed = datetime.date(int(y), int(m), int(d))
            except ValueError:
                continue
            return f"{parsed.day:02d}/{parsed.month:02d}/{y}"
    return unparsed

In [None]:
# Load the date-parser test cases; later cells read its 'Input' and 'Expected Output' columns.
df = pd.read_csv('date_parser_testcases.csv')


In [None]:
# Run the parser over every input sentence, then flag the rows whose
# parsed string equals the expected string exactly.
df['Parsed Output'] = df['Input'].map(improved_extract_date)
df['Correct'] = df['Parsed Output'].eq(df['Expected Output'])


In [None]:
# Share of test cases parsed correctly (mean of the boolean 'Correct' column).
accuracy = df['Correct'].mean()
print(f"Accuracy: {accuracy:.2%}")
# Show the first ten cases to compare parsed and expected output side by side.
df[['Input', 'Parsed Output', 'Expected Output', 'Correct']].head(10)


Accuracy: 85.00%


Unnamed: 0,Input,Parsed Output,Expected Output,Correct
0,"The event will take place on March 5, 2023.",05/03/2023,05/03/2023,True
1,Her birthday is on 07/08/1990.,07/08/1990,07/08/1990,True
2,The deadline is 2022-12-31.,31/12/2022,31/12/2022,True
3,We met on 1st of January 2000.,01/01/2000,01/01/2000,True
4,"The concert is scheduled for 15th September, 2...",15/09/2021,15/09/2021,True
5,Let's catch up on 02.04.2022.,02/04/2022,02/04/2022,True
6,The project started on 5/6/19.,05/06/2019,05/06/2019,True
7,He was born on 1987/11/23.,23/11/1987,23/11/1987,True
8,Christmas is on 25th Dec 2024.,25/12/2024,25/12/2024,True
9,"The meeting is set for April 03, 2020.",03/04/2020,03/04/2020,True


In [None]:
# Ad-hoc example sentence used for a quick manual check of the parser.
text = "My birthdays on March 2, 2024"

In [None]:
# Spot-check: parse the ad-hoc sentence stored in `text`.
improved_extract_date(text)


'02/03/2024'

In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy
import pandas as pd


In [None]:
# Load spaCy's "en_core_web_sm" pipeline; swap_pronouns_with_dep reads the
# dependency label (token.dep_) of each token it produces.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Swaps applied when a token is NOT labelled as possessive by the parser
# (subjects, objects, reflexives and standalone possessive pronouns).
pronoun_map = {
    'he': 'she',
    'she': 'he',
    'him': 'her',
    'her': 'him',        # object "her"; possessive "her" goes through poss_adj_map
    'his': 'hers',       # standalone "his" ("the book is his") -> "hers"
    'hers': 'his',
    'himself': 'herself',
    'herself': 'himself'
}

# Swaps applied when the dependency label marks the token as possessive
# ("his book" <-> "her book").
poss_adj_map = {
    'his': 'her',
    'her': 'his'
}

In [None]:
def swap_pronouns_with_dep(text):
    """Swap gendered third-person pronouns in ``text`` using dependency labels.

    Every token whose lower-cased form is a key of ``pronoun_map`` is replaced:
    tokens labelled ``poss``/``possessive`` use ``poss_adj_map``, all others use
    ``pronoun_map``. Both directions (male -> female and female -> male) are
    swapped. The casing of the original token is kept (ALL-CAPS, Capitalized or
    lower-case), and the whitespace that followed each token in the input is
    reproduced, so punctuation stays attached to the preceding word.

    Parameters
    ----------
    text : str
        Sentence to transform.

    Returns
    -------
    tuple[str, list[tuple[str, str]]]
        The swapped sentence, and a ``(token text, dependency label)`` pair for
        every token of the original sentence.
    """
    doc = nlp(text)
    pieces = []
    dep_info = []

    for token in doc:
        dep_info.append((token.text, token.dep_))
        tok_lower = token.text.lower()
        swapped = token.text

        if tok_lower in pronoun_map:
            if token.dep_ in ("poss", "possessive"):
                # Possessive determiner: "his book" <-> "her book".
                swapped = poss_adj_map.get(tok_lower, token.text)
            else:
                swapped = pronoun_map[tok_lower]

            # Mirror the casing of the token being replaced.
            if len(token.text) > 1 and token.text.isupper():
                swapped = swapped.upper()
            elif token.text[0].isupper():
                swapped = swapped.capitalize()

        # token.whitespace_ is the trailing whitespace from the input; joining
        # with it avoids the "market ." spacing produced by re-joining all
        # tokens with single spaces.
        pieces.append(swapped + token.whitespace_)

    return "".join(pieces), dep_info



In [None]:
# Load the pronoun-swap test cases.
# NOTE: this rebinds `df`, replacing the date-parser frame loaded earlier in the notebook.
df = pd.read_csv('pronoun_testcases (1).csv')


In [None]:
# Preview the first rows of the pronoun test cases.
df.head()

Unnamed: 0,input_text,target_gender,expected_output
0,He is going to the market.,female,She is going to the market.
1,His book is on the table.,female,Her book is on the table.
2,I saw him yesterday.,female,I saw her yesterday.
3,He hurt himself.,female,She hurt herself.
4,I called him last night.,female,I called her last night.


In [None]:
# Swap pronouns in every input sentence, keeping the swapped text together
# with the per-token dependency labels returned by the helper.
results = []
for input_text in df['input_text']:
    swapped_text, dep_info = swap_pronouns_with_dep(input_text)
    record = {
        'original_text': input_text,
        'swapped_text': swapped_text,
        'dependency_info': dep_info,
    }
    results.append(record)

In [None]:
# One row per test sentence: original text, swapped text and dependency info.
result_df = pd.DataFrame(results)


In [None]:
# Display the original sentences next to their swapped versions.
result_df[['original_text', 'swapped_text']]

Unnamed: 0,original_text,swapped_text
0,He is going to the market.,She is going to the market .
1,His book is on the table.,Her book is on the table .
2,I saw him yesterday.,I saw her yesterday .
3,He hurt himself.,She hurt herself .
4,I called him last night.,I called her last night .
5,That is his car.,That is her car .
6,He told me about his trip.,She told me about her trip .
7,The teacher gave him a warning.,The teacher gave her a warning .
8,He blames himself for the mistake.,She blames herself for the mistake .
9,He brought his laptop.,She brought her laptop .


In [None]:
# Put the test cases and the swap results side by side (columns concatenated, rows aligned on the index).
# Mind the names: `results_df` is the combined frame, `result_df` holds the swap outputs only.
results_df = pd.concat([df, result_df], axis=1)

In [None]:
from sklearn.metrics import accuracy_score

# Exact-match accuracy of the swapped sentences. Before comparing, leading and
# trailing periods are trimmed from the expected text, and periods plus spaces
# from the swapped text.
accuracy_score(
    results_df['expected_output'].str.strip('.'),
    results_df['swapped_text'].str.strip(' .'),
)

0.9230769230769231