In [1]:
from datetime import timedelta, datetime
import datetime as dt
import time

from dateutil.parser import parse
from dateutil import rrule, relativedelta
import re

from IPython.display import display

In [10]:
# Scratch cell: work out the first and last calendar day of the current month.
today = dt.date.today()
# date.weekday() is 0 for Monday, so this is the offset back to Monday.
# It is not used anywhere else in this cell.
start_delta = timedelta(days=today.weekday())

# First day of the current month.
display(today.replace(day=1))

# Day 28 exists in every month and 28 + 4 > 31, so this always lands in the
# following month.
next_month = today.replace(day=28) + dt.timedelta(days=4)
# Stepping back by that date's day-of-month gives the last day of the current
# month (shown as the cell's output).
next_month - timedelta(days=next_month.day)

datetime.date(2021, 5, 1)

datetime.date(2021, 5, 31)

In [2]:
# print("""
# 28th February 2021
# 28 February 2021
# 1 February
# first February 2020
# February 2021
# second March
# third May 2021
# on the 11
# on the first
# on the third of January last year
# on the third of January this year
# 12 of this month
# first sunday of this week
# on thursday
# """)

# TEXT 2 INTEGER

In [3]:
# Regex fragments shared by DateExtractor. They are spliced into larger
# patterns, so none of them contains a capturing group.

# Month name, full or three-letter abbreviation, first letter in either case.
# Every alternative consumes ONE trailing whitespace character; that whitespace
# is what stops "mar" from matching inside words such as "margin".
# Fix: "ember" is now optional for December, like every other month, so the
# abbreviation "dec" is recognised.
month_of_year = r"""[jJ]an(?:uary)?\s|[fF]eb(?:ruary)?\s|[mM]ar(?:ch)?\s|[aA]pr(?:il)?\s|[mM]ay\s|[jJ]un(?:e)?\s|[jJ]ul(?:y)?\s|[aA]ug(?:ust)?\s|[sS]ep(?:tember)?\s|[oO]ct(?:ober)?\s|[nN]ov(?:ember)?\s|[dD]ec(?:ember)?\s"""
# Lower-case weekday names ("sunday" ... "saturday").
days_of_week = r"""(?:sun|mon|tues|wednes|thurs|fri|satur)day"""
# Ordinal suffix after a day number: st, nd, rd, th.
day_surfix = r"""(?:st|[nr]d|th)"""
# Calendar units that a relative expression can refer to.
year_period = r"""(?:day|week|month|quarter|year)"""
# Words that anchor a relative expression in time.
tense_period = r"""(?:this|last|next|past|previous|future)"""

In [8]:
class DateExtractor:
    """Pull a calendar date or a date range out of a free-text English phrase.

    ``extract()`` is the entry point. It returns ``None`` when nothing is
    recognised, otherwise a ``(start_date, end_date, matched_text)`` tuple whose
    first two items are ``datetime.date`` objects.

    Two kinds of expressions are recognised:

    * explicit dates such as "28th February 2021", "February 2021" or
      "on the 11" (``_extract_date``, parsed with ``dateutil.parser.parse``);
    * relative expressions such as "last 3 months", "this week" or
      "6 months ago" (``get_various_date``).

    Spelled-out numbers ("third", "twenty-one") are converted to digits first;
    the original wording is kept in ``self.replaced_values`` so that the
    returned ``matched_text`` can show what the user actually wrote.

    The methods read the module-level regex fragments ``month_of_year``,
    ``days_of_week``, ``day_surfix``, ``year_period`` and ``tense_period``.
    """

    # Lookup tables for spelled-out numbers (lower-case only).
    _UNITS = ("zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
              "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen",
              "eighteen", "nineteen")
    _TENS = ("twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety")
    # Ordinals that cannot be derived by stripping a suffix.
    _ORDINAL_WORDS = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5, 'eighth': 8, 'ninth': 9, 'twelfth': 12}
    # (suffix, replacement) pairs that turn a regular ordinal into its cardinal.
    _ORDINAL_ENDINGS = (('ieth', 'y'), ('th', ''))
    _WEEKDAY_INDEX = {"monday": 0, "tuesday": 1, "wednesday": 2, "thursday": 3,
                      "friday": 4, "saturday": 5, "sunday": 6}
    _QUARTER_START_MONTH = {1: 1, 2: 4, 3: 7, 4: 10}

    def __init__(self, doc):
        """Store the text to analyse.

        Args:
            doc: the phrase to search for a date.
        """
        self.doc = doc
        # digit string -> original wording, e.g. {"21": "twenty-one"}
        self.replaced_values = {}

    @staticmethod
    def _shift_month(year, month, delta):
        """Return ``(year, month)`` moved by ``delta`` calendar months."""
        index = year * 12 + (month - 1) + delta
        return index // 12, index % 12 + 1

    @classmethod
    def _days_in_month(cls, year, month):
        """Return the number of days in the given month."""
        next_year, next_month = cls._shift_month(year, month, 1)
        return (dt.date(next_year, next_month, 1) - timedelta(days=1)).day

    def _restore_words(self, text):
        """Put the original number words back into ``text``.

        Only stand-alone numbers are replaced, so a recorded "1" does not
        rewrite the last digit of "2021" or the first digit of "11".
        """
        for key, val in self.replaced_values.items():
            pattern = r"(?<!\w)" + re.escape(str(key)) + r"(?!\w)"
            # A callable replacement keeps ``val`` literal (no escape handling).
            text = re.sub(pattern, lambda _match, word=val: word, text)
        return text

    def _text2int_(self, textnum, numwords=None):
        """Replace spelled-out numbers in ``textnum`` with digits.

        Cardinals and ordinals from zero to ninety-nine are recognised
        ("three", "third", "twenty one", "twenty-first"). Matching is
        case-sensitive and works on whole whitespace-separated words. Every
        hyphen in the text is turned into a space before matching.

        Each conversion is recorded in ``self.replaced_values``.

        Args:
            textnum: text to convert.
            numwords: optional custom ``{word: (scale, value)}`` table; the
                built-in English table is used when omitted or empty.

        Returns:
            The converted text with leading and trailing whitespace removed.
        """
        if not numwords:
            # Built per call: a shared mutable default would leak between calls.
            numwords = {word: (1, idx) for idx, word in enumerate(self._UNITS)}
            numwords.update((word, (1, idx * 10)) for idx, word in enumerate(self._TENS, start=2))

        def get_numeric(word):
            if word in self._ORDINAL_WORDS:
                return (1, self._ORDINAL_WORDS[word])

            for ending, replacement in self._ORDINAL_ENDINGS:
                if word.endswith(ending):
                    word = "%s%s" % (word[:-len(ending)], replacement)

            return numwords.get(word, (None, None))

        # A hyphen anywhere in the input decides how compound numbers are
        # written back ("twenty-one" vs "twenty one").
        joiner = "-" if "-" in textnum else " "

        # Words sit at even positions and the whitespace between them at odd
        # positions, so the original spacing is preserved and only whole words
        # are ever replaced.
        pieces = re.split(r"(\s+)", textnum.replace("-", " "))

        # (position, value, word, mapping it overwrote) of a tens word that may
        # still be completed by a unit ("twenty" waiting for "one").
        pending_tens = None

        for pos in range(0, len(pieces), 2):
            word = pieces[pos]
            scale, num = get_numeric(word)

            if scale is None:
                pending_tens = None
                continue

            if pending_tens is not None and 0 < num < 10:
                tens_pos, tens_num, tens_word, shadowed = pending_tens
                combined = str(tens_num + num)

                pieces[tens_pos] = combined
                pieces[pos - 1] = ""  # whitespace between the two words
                pieces[pos] = ""

                # Undo the provisional entry made for the tens word alone.
                tens_key = str(tens_num)
                if shadowed is None:
                    self.replaced_values.pop(tens_key, None)
                else:
                    self.replaced_values[tens_key] = shadowed

                self.replaced_values[combined] = tens_word + joiner + word
                pending_tens = None
                continue

            value = str(num)
            shadowed = self.replaced_values.get(value)
            pieces[pos] = value
            self.replaced_values[value] = word

            # Only twenty, thirty, ... ninety can start a compound number.
            pending_tens = (pos, num, word, shadowed) if num >= 20 and num % 10 == 0 else None

        return "".join(pieces).strip()

    def _extract_date(self, text):
        """Find the first explicit date in ``text``.

        Returns:
            ``(parsable, display)`` or ``None``. For a "day month [year]" or
            "month [day] [year]" match both items are based on the matched
            text, ``display`` having the original number words restored. For a
            bare "on the 11" / "of the 3rd" match ``parsable`` is just the day
            digits and ``display`` is the matched phrase.
        """
        padded = f" {text} "

        day = r"[\d]{1,2}" + day_surfix + r"?"
        month = r"(?:" + month_of_year + r")"
        # Every month alternative already consumes the whitespace after the
        # month name, so the whitespace in front of what follows is optional.
        year = r"(?:\s?[\d]{4})?"

        pattern = (
            r"\s(?:" + day + r"\s)(?:of\s)?" + month + year
            + r"|" + month + r"(?:\s?" + day + r"\s)?" + year
        )

        found = re.findall(pattern, padded)
        if found:
            date = found[0]
            return date, self._restore_words(date)

        found = re.findall(r"(?:(?:of|on)\sthe\s)" + day, padded)
        if found:
            return re.findall(r"[\d]{1,2}", found[0])[0], found[0]

        return None

    def get_various_date(self, text):
        """Resolve a relative date expression against today's date.

        Handles "today", "yesterday", "tomorrow", weekday names and
        "<this|last|next|...> [N] <day|week|month|quarter|year>[s] [ago]".

        Returns:
            ``(start_date, end_date, matched_text)`` or ``None`` when no
            relative expression is found or it names a quarter outside 1-4.
        """
        text = self._text2int_(textnum=text)

        pattern = "".join((
            r"(?:", tense_period, r"(?:\s[\d]{1,3})?(?:\s", year_period,
            r"(?:s)?)(?:\sago)?(?:\s", days_of_week, r")?)",
            r"|(?:(?:to|yester)day|tomorrow)",
            r"|(?:(?:", tense_period, r"\s)?(?:week\s)?", days_of_week, r")",
            # "(?:^|\s)" lets "6 months ago" match at the very start of the text.
            r"|(?:(?:^|\s)[\d]{1,3})(?:\s", year_period, r"(?:s)?)(?:\sago)",
            r"|(?:(?:[\d]{1,2}", day_surfix, r"?)?\squarter)",
        ))
        reg = " ".join(re.findall(pattern, text.lower()))
        if not reg:
            return None

        is_past = bool(re.findall(r"(?:last|past|ago|previous)", reg))
        numbers = re.findall(r"[\d]{1,3}", reg)
        period = int(numbers[0]) if numbers else 1

        return_string = self._restore_words(reg)
        today = dt.date.today()

        if reg == "today":
            return today, today, return_string

        elif reg == "yesterday":
            yesterday = today - timedelta(days=1)
            return yesterday, yesterday, return_string

        elif reg == "tomorrow":
            tomorrow = today + timedelta(days=1)
            return tomorrow, tomorrow, return_string

        elif re.search(r"(?:\sday)", reg):
            if is_past:
                # ``period`` defaults to 1, so "last day" no longer raises.
                return today - timedelta(days=period), today, return_string

            day_range = int(numbers[0]) if numbers else 0
            return today, today + timedelta(days=day_range), return_string

        elif re.search(r"\sweek(?:\s" + days_of_week + r")?(?:\?)?", reg) or re.search(days_of_week, reg):
            weekday_match = re.search(days_of_week, reg)
            idx_weekday = self._WEEKDAY_INDEX[weekday_match.group(0)] if weekday_match else None

            if is_past:
                # Monday of the week ``period`` weeks back.
                start_date = today - timedelta(days=today.weekday(), weeks=period)

                if weekday_match:
                    day = start_date + timedelta(days=idx_weekday)
                    return day, day, return_string

                return start_date, (start_date + timedelta(days=7 * period)) - timedelta(days=1), return_string

            # NOTE(review): every non-past week expression ("this week",
            # "next week", ...) ends today and, unless a weekday and a number
            # are both given, starts in the current week. Confirm this is the
            # intended meaning of "next week".
            start_date = today - timedelta(days=today.weekday())

            if weekday_match:
                if re.search(r"(?:\s[\d]{1,2})", reg):
                    return (start_date + timedelta(days=7 * period)) + timedelta(days=idx_weekday), today, return_string

                return start_date + timedelta(days=idx_weekday), today, return_string

            return start_date, today, return_string

        elif re.search(r"\smonth", reg):
            if is_past:
                # First day of the month ``period`` months back through the
                # last day of the previous month. Pure calendar arithmetic, so
                # January and the 29th-31st need no special casing.
                start_year, start_month = self._shift_month(today.year, today.month, -period)
                first_day = dt.date(start_year, start_month, 1)
                last_day = today.replace(day=1) - timedelta(days=1)

                return first_day, last_day, return_string

            if re.search(r"(?:this(?:\s1)?\smonth)", reg):
                # Month to date.
                return today.replace(day=1), today, return_string

            # First day of next month through today's day-of-month ``period``
            # months ahead; a day that does not exist in the target month
            # rolls over into the month after it.
            start_year, start_month = self._shift_month(today.year, today.month, 1)
            end_year, end_month = self._shift_month(today.year, today.month, period)
            first_day = dt.date(start_year, start_month, 1)
            last_day = dt.date(end_year, end_month, 1) + timedelta(days=today.day - 1)

            return first_day, last_day, return_string

        elif re.search(r"\squarter", reg):
            # NOTE(review): without an explicit number ("this quarter",
            # "last quarter") the quarter defaults to 1, not to the current or
            # previous quarter.
            quarter_text = re.search(
                r"(?:(?:[\d]{1,2}(?:st|[nr]d|th)?)?\squarter(?:\s[\d]{1,2}(?:st|[nr]d|th)?)?)", reg)
            quarter_digits = re.findall(r"[\d]{1,2}", quarter_text.group(0)) if quarter_text else []
            quarter_period = int(quarter_digits[0]) if quarter_digits else 1

            start_month = self._QUARTER_START_MONTH.get(quarter_period)
            if start_month is None:
                # There is no such quarter ("5th quarter", "last 6 quarters").
                return None

            first_day = dt.date(today.year, start_month, 1)
            end_year, end_month = self._shift_month(today.year, start_month, 3)
            last_day = dt.date(end_year, end_month, 1) - timedelta(days=1)

            if re.search(r"\syear", reg):
                year_text = re.search(
                    r"(?:(?:[\d]{1,2}(?:st|[nr]d|th)?)?\syear(?:\s[\d]{1,2}(?:st|[nr]d|th)?)?)", reg)
                year_digits = re.findall(r"[\d]{1,2}", year_text.group(0)) if year_text else []
                quarter_year_period = int(year_digits[0]) if year_digits else 0

                if is_past:
                    year_shift = -(quarter_year_period or 1)
                elif re.search(r"(?:this\syear)", reg):
                    year_shift = 0
                else:
                    year_shift = quarter_year_period

                # Quarter boundaries never fall on Feb 29, so this is safe.
                first_day = first_day.replace(year=first_day.year + year_shift)
                last_day = last_day.replace(year=last_day.year + year_shift)

            return first_day, last_day, return_string

        elif re.search(r"\syear", reg):
            if is_past:
                return dt.date(today.year - period, 1, 1), dt.date(today.year - 1, 12, 31), return_string

            if re.search(r"(?:this\syear)", reg):
                # Year to date.
                return today.replace(month=1, day=1), today, return_string

            # NOTE(review): the range ends on today's month/day in the target
            # year rather than on December 31.
            end_year = today.year + period
            # Feb 29 has no counterpart in a non-leap year; use Feb 28 there.
            end_day = min(today.day, self._days_in_month(end_year, today.month))

            return dt.date(today.year + 1, 1, 1), dt.date(end_year, today.month, end_day), return_string

        return None

    def extract(self):
        """Find a date or date range in ``self.doc``.

        An explicit date and a relative expression found together are combined:
        the year (and for "last month"-style phrases also the month) comes from
        the relative expression, the day and month from the explicit date.

        Returns:
            ``(start_date, end_date, matched_text)`` or ``None``.

        Raises:
            ValueError: when the combination of an explicit date and a relative
                expression names a day that does not exist.
        """
        text = self._text2int_(textnum=self.doc)

        date_2 = self.get_various_date(text)
        date_1 = None
        string_date_1 = None
        try:
            explicit = self._extract_date(text=text)

            if explicit:
                date_1 = parse(explicit[0]).date()
                string_date_1 = explicit[1].strip()

        except ValueError:
            # The explicit part could not be parsed on its own (for example
            # "31" while the current month has 30 days). If a relative range
            # was found, pin its start date to the requested day instead.
            date_1 = None

            if date_2:
                txt = re.findall(r"(?:of|on)\sthe\s[\d]{1,2}" + day_surfix + r"?", text)
                if txt:
                    day = int(re.findall(r"[\d]{1,3}", txt[0])[0])
                    try:
                        pinned = date_2[0].replace(day=day)
                    except ValueError:
                        # That day does not exist in the target month either.
                        date_2 = None
                    else:
                        # Same 3-tuple shape as every other result.
                        date_2 = (pinned, pinned, txt[0] + " " + date_2[-1])

        if date_1 and date_2:
            string_date_2 = date_2[-1]
            anchor = date_2[0]

            is_past = bool(re.findall(r"(?:last|past|ago|previous)", text))

            if not is_past or not re.search(r"month", string_date_2):
                # Month and day are set in one step so that an intermediate,
                # never-returned date cannot raise.
                resolved = anchor.replace(month=date_1.month, day=date_1.day)
            else:
                resolved = anchor.replace(day=date_1.day)

            return resolved, resolved, string_date_1 + " " + string_date_2

        elif date_1 and not date_2:
            # A month with an optional year but no day means the whole month.
            # The trailing space is needed because every month alternative
            # expects whitespace after the month name.
            month_only = (
                r"^(?:(?:[\d]{4})\s)?(?:" + month_of_year + r")(?:\s?[\d]{4})?\s*$"
            )
            if re.search(month_only, string_date_1 + " "):
                first_day_of_month = date_1.replace(day=1)
                next_month = date_1.replace(day=28) + timedelta(days=4)
                last_day_of_month = next_month - timedelta(days=next_month.day)

                return first_day_of_month, last_day_of_month, string_date_1

            return date_1, date_1, string_date_1

        elif date_2 and not date_1:
            return date_2

        return None
    
# Smoke test: "this week" is a relative expression, so the printed result is
# whatever DateExtractor.get_various_date resolves it to.
text = "how was my margin this week"
res = DateExtractor(text).extract()
print(res)

(datetime.date(2021, 5, 3), datetime.date(2021, 5, 5), 'this week')


In [5]:
# "Load DateExtractor Class"

In [6]:
# text = "6 months ago"
# res = DateExtractor(text).extract()
# print(res)