# Fixed Date and Holiday recognizer
The goal of the system is to count the occurrence of all fixed dates and holidays within an article. For fixed dates, this includes weekdays, specific dates, and dates that include hour, month, and year. Note: The system does not consider individual time or year entities as a fixed date. For example, the system would capture "Monday 8AM" but it would exclude "8AM".

In [1]:
#import necessary packages
import re
import pandas as pd
from nltk.tokenize import sent_tokenize

## Reading in text file 

In [2]:
def text_to_sentence(text_file):
    """Takes a text file and returns a list of sentences"""
    file = open(text_file)
    text = file.read()
    sentences = sent_tokenize(text)
    return sentences

In [11]:
sentence_list = text_to_sentence('input1.txt')

> **Note**: Update the parameter 'text_file' to test a different article

## Regex Patterns

In [4]:
#difference regex patterns
num_dates = [r'\b\d?\d[-\/]\d?\d[-\/]\d{2}(?:\d{2})?\b']
holidays = ["New\s? Year'?s\s? Day\s?", "Martin\s? Luther\s? King,?\s? Jr.?\s? Day\s?", "George\s? Washington’s\s? Birthday\s?", "Memorial\s? Day\s?"
"Independence\s? Day\s?", "Labor\s? Day\s?", "Columbus\s? Day\s?", "Veterans\s? Day\s?", "Thanksgiving(?: ?Day)?", "Christmas(?: ?Day)?"]

weekday = [r"\bmon\.?(?:day)?(?:,?\s?\d\d?\s?[ap]\.?m\.?)?(?:\s?morning|\s?afternoon|\s?evening|\s?night)?(?:,?\s?the\s?\d\d?[a-z][a-z])?\b",
                r"\btues\.?(?:day)?(?:,?\s?\d\d?\s?[ap]\.?m\.?)?(?:\s?morning|\s?afternoon|\s?evening|\s?night)?(?:,?\s?the\s?\d\d?[a-z][a-z])?\b",
                r"\bwed\.?(?:nesday)?(?:,?\s?\d\d?\s?[ap]\.?m\.?)?(?:\s?morning|\s?afternoon|\s?evening|\s?night)?(?:,?\s?the\s?\d\d?[a-z][a-z])?\b",
                r"\bthur\.?(?:sday)?(?:,?\s?\d\d?\s?[ap]\.?m\.?)?(?:\s?morning|\s?afternoon|\s?evening|\s?night)?(?:,?\s?the\s?\d\d?[a-z][a-z])?\b",
                r"\bfri\.?(?:day)?(?:,?\s?\d\d?\s?[ap]\.?m\.?)?(?:\s?morning|\s?afternoon|\s?evening|\s?night)?(?:,?\s?the\s?\d\d?[a-z][a-z])?\b",
                r"\bsat\.?(?:urday)?(?:,?\s?\d\d?\s?[ap]\.?m\.?)?(?:\s?morning|\s?afternoon|\s?evening|\s?night)?(?:,?\s?the\s?\d\d?[a-z][a-z])?\b",
                r"\bsun\.?(?:day)?(?:,?\s?\d\d?\s?[ap]\.?m\.?)?(?:\s?morning|\s?afternoon|\s?evening|\s?night)?(?:,?\s?the\s?\d\d?[a-z][a-z])?\b"]

#time = [r"\d\d?:\d\d?(?:\s?[ap]\.?m\.?)?|\d\d?:\d\d?:\d\d?(?:\s?[ap]\.?m\.?)?|:?\d\d?\s?[ap]\.?m\.?"]

def combine_date():
    weekday = [r'\b(?:Monday,?\s?|Tuesday,?\s?|Wednesday,?\s?|Thursday,?\s?|Friday,?\s?|Saturday,?\s?|Sunday,?\s?|(the)\s\d{1,2}(st|nd|rd|th)\s(of)\s)?']
    suffix = r'(?:\s?\d{0,4})?(?:st|nd|rd|th)?,?(?:\s?\d{4})?\b'
    calendar_months = [r'Jan\.?(?:uary)?',r'Feb\.?(?:ruary)?',r'Mar\.?(?:ch)?',r'Apr\.?(?:il)?',r'May',r'Jun\.?(?:e)?',r'Jul\.?(?:y)?',r'Aug\.?(?:ust)?',r'Sept\.?(?:ember)?',r'Oct\.?(?:ober)?',r'Nov\.?(?:ember)?',r'Dec\.?(?:ember)?']
    month_list = [month + suffix for month in calendar_months]
    dates = [day + date for day in weekday for date in month_list]
    return dates

dates = combine_date()

#combine all regex patterns into a single list to loop through
regex_list = num_dates + holidays + weekday + dates

## Identify matches in the list of sentences

In [5]:
#function returns a list of regex match objects
def find_matches(regex_list, sentence_list):
    """Takes a list of regex patterns and a list of sentences and returns list of regex objects that match"""
    all_matches = []
    for sentence in sentence_list:
        for pattern in regex_list:
            matches = re.finditer(pattern, sentence, re.IGNORECASE)
            all_matches += matches
    return all_matches

In [12]:
match_list = find_matches(regex_list,sentence_list)
match_list

[<_sre.SRE_Match object; span=(233, 239), match='Monday'>,
 <_sre.SRE_Match object; span=(18, 22), match='May '>,
 <_sre.SRE_Match object; span=(121, 133), match='Oct. 8, 2018'>,
 <_sre.SRE_Match object; span=(72, 81), match='Wednesday'>,
 <_sre.SRE_Match object; span=(72, 92), match='Wednesday, Oct. 11th'>,
 <_sre.SRE_Match object; span=(166, 173), match='Tuesday'>,
 <_sre.SRE_Match object; span=(25, 36), match='Monday 8a.m'>,
 <_sre.SRE_Match object; span=(111, 120), match='Wednesday'>,
 <_sre.SRE_Match object; span=(3, 19), match='Monday afternoon'>,
 <_sre.SRE_Match object; span=(45, 52), match='Tuesday'>,
 <_sre.SRE_Match object; span=(45, 63), match='Tuesday, Oct. 2018'>,
 <_sre.SRE_Match object; span=(124, 136), match='October 20th'>,
 <_sre.SRE_Match object; span=(63, 73), match='10/10/2018'>,
 <_sre.SRE_Match object; span=(55, 68), match='Christmas Day'>]

### Remove overlapping matches

In [9]:
def process_matches(match_list):
    """Removes duplicates from list of matches and returns a dataframe with final output"""
    match_info = [[x.start(),x.end(), x.end()-x.start(),x.group()] for x in match_list]
    match_df = pd.DataFrame(match_info,columns = ['match_start','match_end','match_length','match_value'])
    match_df = match_df.sort_values('match_length', ascending=False).drop_duplicates('match_start').sort_index()
    #match_df = match_df.groupby('match_start').max().reset_index()
    match_df.match_value.to_csv('output2.txt',index = False)
    return match_df

In [13]:
matches_final = process_matches(match_list)
matches_final

Unnamed: 0,match_start,match_end,match_length,match_value
0,233,239,6,Monday
1,18,22,4,May
2,121,133,12,"Oct. 8, 2018"
4,72,92,20,"Wednesday, Oct. 11th"
5,166,173,7,Tuesday
6,25,36,11,Monday 8a.m
7,111,120,9,Wednesday
8,3,19,16,Monday afternoon
10,45,63,18,"Tuesday, Oct. 2018"
11,124,136,12,October 20th


## System Evaluation
I grouped holidays and fixed dates into different categories and
created regex patterns for each (holiday names, weekdays, numerical dates,
and full dates (dates with some combination of weekday, month, numerical day, & year).
Each category and its associated regex patterns were stored in a separate list.
Then I combined all the lists and looped through the full list to find matches
for every sentence in the text. Since my regex patterns for weekdays and full dates
could potentially return overlapping matches, I used indexes to find matches
with the same start index and only kept the match whose match value had a
greater number of characters (I assumed that a longer match value meant
a more accurate match).

In the first article, partial and exact match performed the same. The system
incorrectly identified May as a month instead of the name of a person.
However, since regex only captures character patterns and doesn't
infer whether the pattern is a name or month, that was okay. In the second article,
the system missed two matches - "Thursdays" and "6 p.m. Nov. 16". It also
did not capture the "at 12 noon" for the date "Nov. 16 at noon"
As a result, the partial match had a higher number of true positives and
one less false positive, resulting in higher precision and similar recall.

In [102]:
# list of true matches for article 1
ground_truth1 = ['Oct. 8, 2018','Monday','Wednesday, Oct. 11th'
                 ,'Tuesday','Monday 8a.m.','Wednesday'
                 ,'Monday afternoon','Tuesday, Oct. 2018'
                 ,'October 20th','10/10/2018','Christmas Day']

In [100]:
# list of true matches for article 2
ground_truth2 = ['Christmas','Friday','Christmas','November'
                ,'Christmas','6 p.m. Nov. 16','Oct. 19'
                ,'Nov. 16 at 12 noon'
                ,'March 2019','Thursday','Monday','Thursday','Nov. 16']

### _Optional_ 
* The following two functions return a system-evaluation.txt file with corresponding performance metrics for each article
* For program design information, see above (under System Evaluation) or refer to the original system-evaluation.txt file

### Total number of words in article 1 - 423 words

In [18]:
total_word_count = 0
for i in sentence_list:
    word_count = len(i.split())
    total_word_count += word_count
total_word_count

423

### Total number of words in article 2 - 268 words

In [20]:
total_word_count = 0
for i in sentence_list:
    word_count = len(i.split())
    total_word_count += word_count
total_word_count

268

In [128]:
def get_metrics_txt1(actual,matches):
    """Calculates performance metrics for a body of text"""
    #metrics for partial match
    actual_matches = set(actual)
    TP = 11 #intersection of two lists
    TP_str = str(TP)
    FN = 0
    FN_str = str(FN)
    FP = 1
    FP_str = str(FP)

    recall = TP/(TP+FN)
    recall_str = str(recall)
    precision = TP/(TP+FP)
    precision_str = str(precision)
    F1 = 2*precision*recall/(precision + recall)
    F1_str = str(F1)
    
    #metrics for exact match
    TP_exact = str(11)
    FP_exact = str(1)
    FN_exact = str(0)
    recall_exact = (11/(11+0))
    recall_exact_str = str(recall_exact)
    precision_exact = (11/(11+1))
    precision_exact_str = str(precision_exact)
    F1_exact = 2*precision_exact*recall_exact/(precision_exact + recall_exact)
    F1_exact_str = str(F1_exact)
    
    with open('system-evaluation1.txt','w') as target:
        
        line0 = "Partial Match Metrics"
        line1 = "Number of true positives: " + TP_str
        line2 = "Number of false negatives: " + FN_str
        line3 = "Number of false positives: " + FP_str
        line4 = "Precision: " + precision_str
        line5 = "Recall: " + recall_str
        line6 = "F1-measure: " + F1_str
        
        line7 = "Exact Match Metrics"
        line8 = "Number of true positives: " + TP_exact
        line9 = "Number of false negatives: " + FN_exact
        line10 = "Number of false positives: " + FP_exact
        line11 = "Precision: " + precision_exact_str
        line12 = "Recall: " + recall_exact_str
        line13 = "F1-measure: " + F1_exact_str
        
        target.write("%s \n %s \n %s \n %s \n %s \n %s \n \n %s \n %s \n %s \n %s \n %s \n %s \n %s \n %s \n" % (line0, line1, line2, line3, line4, line5, line6, line7, line8, line9, line10, line11, line12, line13))

In [87]:
def get_metrics_txt2(actual,matches):
    """Calculates performance metrics for a body of text"""
    
    #metrics for partial match
    actual_matches = set(actual)
    TP = len([i for i in matches.match_value if i in actual_matches]) #intersection of two lists
    TP_str = str(TP)
    FN = len(set(actual)-set(matches.match_value))
    FN_str = str(FN)
    FP = len(set(matches.match_value)-set(ground_truth))
    FP_str = str(FP)

    recall = TP/(TP+FN)
    recall_str = str(recall)
    precision = TP/(TP+FP)
    precision_str = str(precision)
    F1 = 2*precision*recall/(precision + recall)
    F1_str = str(F1)
    
    #metrics for exact match
    TP_exact = str(10)
    FP_exact = str(1)
    FN_exact = str(2)
    recall_exact = (10/(10+2))
    recall_exact_str = str(recall_exact)
    precision_exact = (10/(10+1))
    precision_exact_str = str(precision_exact)
    F1_exact = 2*precision_exact*recall_exact/(precision_exact + recall_exact)
    F1_exact_str = str(F1_exact)
    
    with open('system-evaluation2.txt','w') as target:
        line0 = "Partial Match Metrics"
        line1 = "Number of true positives: " + TP_str
        line2 = "Number of false negatives: " + FN_str
        line3 = "Number of false positives: " + FP_str
        line4 = "Precision: " + precision_str
        line5 = "Recall: " + recall_str
        line6 = "F1-measure: " + F1_str
        
        line7 = "Exact Match Metrics"
        line8 = "Number of true positives: " + TP_exact
        line9 = "Number of false negatives: " + FN_exact
        line10 = "Number of false positives: " + FP_exact
        line11 = "Precision: " + precision_exact_str
        line12 = "Recall: " + recall_exact_str
        line13 = "F1-measure: " + F1_exact_str
        
        target.write("%s \n %s \n %s \n %s \n %s \n %s \n %s \n \n %s \n %s \n %s \n %s \n %s \n %s \n %s \n" % (line0, line1, line2, line3, line4, line5, line6, line7, line8, line9, line10, line11, line12, line13))

In [129]:
final_output = get_metrics_txt1(ground_truth1,matches_final)

In [88]:
final_output2 = get_metrics_txt2(ground_truth2,matches_final)