In [276]:
import PyPDF2
import spacy
from spacy.matcher import Matcher
import re
import stanza
import dateparser
import os

In [277]:
# Path of the plain-text dump that the rest of the notebook analyses.
file_path = 'extracted_text.txt'

# Read the entire file content. The context manager guarantees the handle is
# closed even if reading raises.
# NOTE(review): no encoding is passed, so the platform default is used --
# confirm it matches how extracted_text.txt was written.
with open(file_path, 'r') as file:
    text = file.read()

In [278]:
# Container for the fields extracted further down the notebook
info = dict()

# Small English spaCy pipeline; its vocabulary is shared by every Matcher
nlp = spacy.load('en_core_web_sm')

# First-pass, keyword-level matcher bound to the pipeline's vocabulary
matcher = Matcher(nlp.vocab)

# Run the pipeline over the full text
doc = nlp(text)

In [279]:
# When a keyword matches, extract the value with a regex. match_func is the second-stage matcher for the specific rule,
# doc is the whole document, and reg_pattern is the regex pattern for the result.
def second_match(match_func, doc, start, end, reg_pattern):
    """Confirm a keyword hit with a second matcher, then pull the value out by regex.

    Args:
        match_func: callable applied to the span ``doc[start:end]``; a truthy
            result means the span satisfies the rule.
        doc: the whole document; it is sliced with ``doc[start:end]`` and the
            resulting span must expose a ``text`` attribute.
        start: first token index of the window to inspect.
        end: token index one past the end of the window.
        reg_pattern: regular expression searched for in the span's text.

    Returns:
        The first substring of the span text matching ``reg_pattern``, or
        ``None`` when either ``match_func`` or the regex finds nothing.
    """
    span = doc[start:end]

    if not match_func(span):
        return None

    # The matcher works on tokens and the regex on raw text, so the regex can
    # still miss after a matcher hit; return None instead of calling .group()
    # on a failed search.
    found = re.search(reg_pattern, span.text)
    return found.group() if found else None

In [280]:
# First-pass keyword patterns: each one only locates a keyword or heading in
# the document.
commencement_date_pattern = [
    {"LOWER": "term"},
    {"LOWER": "commencement"},
    {"LOWER": "date"},
]
term_pattern = [{"LOWER": "term"}]
rent_value_pattern = [
    {"LOWER": {"IN": ["annual", "initial"]}, "OP": "?"},
    {"LOWER": "rent"},
]

# Register every pattern under its label on the shared matcher
for key, token_pattern in (
    ("RENT_VALUE", rent_value_pattern),
    ("TERM", term_pattern),
    ("COMMENCEMENT_DATE", commencement_date_pattern),
):
    matcher.add(key, [token_pattern])

# Run the matcher over the whole document
matches = matcher(doc)

In [281]:
# Second-pass matchers for the rent amount. They are run on a window of tokens
# around a rent keyword.
match_rent_after = Matcher(nlp.vocab)
match_rent_before = Matcher(nlp.vocab)

# A currency-symbol token followed by a token matching the regex [\d,.]+
currency_amount = [
    {"LOWER": {"IN": ["£", "$", "€"]}},
    {"TEXT": {"REGEX": r"[\d,.]+"}},
]

# Rent keyword, any number of tokens, then the currency amount
rent_after_pattern = rent_value_pattern + [{"OP": "*"}] + currency_amount

# Any number of tokens, the currency amount, then the rent keyword.
# NOTE(review): the wildcard sits before the amount, so the amount must be
# directly followed by the rent keyword -- confirm this is the intent.
rent_before_pattern = [{"OP": "*"}] + currency_amount + rent_value_pattern

match_rent_after.add("rent_after", [rent_after_pattern])
# Registered under its own key (it was previously also added as "rent_after")
match_rent_before.add("rent_before", [rent_before_pattern])



In [282]:
# Second-pass matcher for the length of the term
match_term = Matcher(nlp.vocab)

# Building blocks for durations written as "x years" or "x (x) years",
# with optional whitespace tokens in between.
number_run = {"LIKE_NUM": True, "OP": "+"}
any_spaces = {"IS_SPACE": True, "OP": "*"}
maybe_punct = {"IS_PUNCT": True, "OP":"?"}
maybe_number = {"LIKE_NUM": True, "OP":"?"}
time_unit = {"LOWER": {"IN": ["year", "years", "month", "months"]}}

term_length_pattern = [
    number_run,
    any_spaces,
    maybe_punct,
    maybe_number,
    maybe_punct,
    any_spaces,
    time_unit,
]

match_term.add("TERM_LENGTH", [term_length_pattern])

In [283]:
def extract_dates_with_sutime_and_dateparser(text):
    """Return the first date found in ``text`` as a one-element list.

    Despite the name, the first pass uses a stanza ``tokenize,ner`` pipeline
    rather than SUTime: the text of the first entity typed ``DATE`` is returned
    with newlines replaced by spaces. If stanza finds no date, the whole text
    is handed to ``dateparser.parse`` and a successful parse is returned
    formatted as ``%Y-%m-%d``.

    Args:
        text: the text to search for a date.

    Returns:
        ``[date_string]`` when a date is found, otherwise ``[]``.
    """
    # Nothing to analyse: skip both the NER pipeline and dateparser.
    if not text or not text.strip():
        return []

    # Building the pipeline loads the stanza models, so create it on first use
    # and keep it on the function object for later calls.
    cache_owner = extract_dates_with_sutime_and_dateparser
    pipeline = getattr(cache_owner, "_pipeline", None)
    if pipeline is None:
        pipeline = stanza.Pipeline(processors='tokenize,ner', lang='en')
        cache_owner._pipeline = pipeline

    analysed = pipeline(text)
    for sentence in analysed.sentences:
        for entity in sentence.ents:
            if entity.type == 'DATE':
                # Only the first DATE entity is reported
                return [entity.text.replace('\n', ' ')]

    # dateparser.parse returns a single datetime (or None), not a sequence,
    # so it must not be indexed.
    fallback = dateparser.parse(text, settings={'STRICT_PARSING': False})
    if fallback is not None:
        return [fallback.strftime('%Y-%m-%d')]

    return []

In [284]:
# Fields to extract; each keeps the first truthy value found
rent = None
term_length = None
commencement_date = None

# Currency symbol immediately followed by digits, commas or periods
amount_regex = r'[£$€][\d,.]+'

# Walk the first-pass keyword hits and run the second-pass extraction on a
# window of tokens around each one.
for match_id, start, end in matches:
    label = nlp.vocab.strings[match_id]

    if label == "RENT_VALUE" and not rent:
        # Try the amount after the keyword first, then the amount before it
        rent = second_match(match_rent_after, doc, start, min(end + 50, len(doc)), amount_regex)
        if not rent:
            rent = second_match(match_rent_before, doc, max(start - 15, 0), end, amount_regex)

    elif label == "TERM" and not term_length:
        span = doc[start: min(end + 20, len(doc))]
        matches_terms = match_term(span)
        if matches_terms:
            # The last match reported by the matcher is the one kept
            term_id, start_t, end_t = matches_terms[-1]
            term_length = span[start_t:end_t].text

    elif label == "COMMENCEMENT_DATE" and not commencement_date:
        commencement_date = extract_dates_with_sutime_and_dateparser(doc[start:end + 25].text)

info.update({
    'Annual Rent': rent,
    'Term': term_length,
    'Term Commencement Date': commencement_date,
})
print(info)

2023-08-03 09:51:48 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

2023-08-03 09:51:49 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2023-08-03 09:51:49 INFO: Using device: cpu
2023-08-03 09:51:49 INFO: Loading: tokenize
2023-08-03 09:51:49 INFO: Loading: ner
2023-08-03 09:51:50 INFO: Done loading processors!


{'Annual Rent': '£15,625', 'Term': 'ten (10) years', 'Term Commencement Date': ['24 December 2016']}
