In [1]:
from contract import ContractPipeline
from ner.clf_ner import CLF_NER
from ner.regex_ner import RegexNER
from ner.normalization.date_normalizer import DateNorm
import pandas as pd


  hasattr(torch, "has_mps")
  and torch.has_mps  # type: ignore[attr-defined]
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Create the contract pipeline and register a classifier-based extractor under
# the name "governing_law".
contract_pipeline = ContractPipeline()

# Fix: the second keyword was misspelled "jurisdicition", so it could never equal
# the word "jurisdiction".
# NOTE(review): presumably CLF_NER uses these keywords to pick candidate text for
# the classifier — confirm against ner/clf_ner.py.
gov_law_ner = CLF_NER(
    keywords=["law", "jurisdiction", "governing"],
    model="sguarnaccio/gov_law_clf_ner",
)
contract_pipeline.add_pipe(name="governing_law", component=gov_law_ner)




In [3]:
# Regex fragments shared by the effective-date rules below. They are combined
# with plain string concatenation (not f-strings / str.format) because the
# fragments contain literal braces such as {3}.

# Optional lead-in word, a space, an optional "as of"/"on", and another space.
DATE_PREFIX = r"(?:effective|dated|made)*? (?:as of|on)*? "
# Day of month 1-31 without a leading zero, optionally followed by an ordinal
# suffix (st/nd/rd/th chosen by the preceding digit), a double quote, or "°".
DATE_DAY = r"((?<!\d)([1-9]|([12][0-9])|(3[01]))(?!\d))((?<=1)st|(?<=2)nd|(?<=3)rd|(?<=[0456789])th|\"|°)?"
# Month name: full or three-letter form, capitalised or upper-case.
DATE_MONTH = r"(January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER|JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)"
# Four-digit year starting with 1 or 2 that is not part of a longer digit run.
DATE_YEAR = r"(?<!\d)([12][0-9]{3})(?!\d)"
# A comma or full stop with optional whitespace on either side.
DATE_SEP = r"\s*[,\.]\s*"

# Label attached to every rule in this list.
EFFECTIVE_DATE_LABEL = "EFFECTIVE_DATE"

# List of (pattern, label) pairs, in the same order as before.
effective_date_rules = [
    # Loose catch-all: optional day, optional month name, filler text, then one
    # or more digit groups ending in a 2-4 digit number.
    # Fix: the last month alternative was misspelled "Decmebter".
    # NOTE(review): "[-/th|st|nd|rd\s]" and "[-/th|st|nd|rd)\s,]" are character
    # classes, not alternations — they match any single listed character.
    # Left unchanged; confirm whether that is intended.
    (
        DATE_PREFIX
        + r"((?:\d{1,2}[-/th|st|nd|rd\s]*)?(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April|May|June|July|August|September|October|November|December)?[a-z\s,.]*(?:\d{1,2}[-/th|st|nd|rd)\s,]*)+(?:\d{2,4})+)",
        EFFECTIVE_DATE_LABEL,
    ),
    # Day, month, year with only whitespace between them, e.g. "31 December 2019".
    (
        DATE_PREFIX + DATE_DAY + r"\s*" + DATE_MONTH + r"\s*" + DATE_YEAR,
        EFFECTIVE_DATE_LABEL,
    ),
    # Day, month, then "," or "." before the year, e.g. "31 December, 2019".
    (
        DATE_PREFIX + DATE_DAY + r"\s*" + DATE_MONTH + DATE_SEP + DATE_YEAR,
        EFFECTIVE_DATE_LABEL,
    ),
    # "<day> day of <month>, <year>", e.g. "31st day of December, 2019".
    (
        DATE_PREFIX + DATE_DAY + r"\s*(day)\s*(of)\s*" + DATE_MONTH + DATE_SEP + DATE_YEAR,
        EFFECTIVE_DATE_LABEL,
    ),
    # "<month> <day>, <year>", e.g. "December 31, 2019".
    (
        DATE_PREFIX + DATE_MONTH + r"\s*" + DATE_DAY + DATE_SEP + DATE_YEAR,
        EFFECTIVE_DATE_LABEL,
    ),
    # "<day> of <month>, <year>", e.g. "31st of December, 2019".
    # NOTE(review): this rule and the next require one MORE literal space after
    # the prefix than the rules above (two spaces in the original pattern).
    # Kept byte-identical here; this looks accidental — confirm before changing.
    (
        DATE_PREFIX + " " + DATE_DAY + r"\s*of\s*" + DATE_MONTH + DATE_SEP + DATE_YEAR,
        EFFECTIVE_DATE_LABEL,
    ),
    # "<month>, <year>" with no day, e.g. "December, 2019" (extra space, see above).
    (
        DATE_PREFIX + " " + DATE_MONTH + DATE_SEP + DATE_YEAR,
        EFFECTIVE_DATE_LABEL,
    ),
]

# Build the regex-driven extractor for effective dates, with a DateNorm
# instance as its normalizer, and register it as "effective_date".
# NOTE(review): the same rule list is handed to the constructor and then to
# load_raw_rules() — confirm whether both steps are required.
eff_date_ner = RegexNER(
    rules=effective_date_rules,
    normalizer=DateNorm(),
)
eff_date_ner.load_raw_rules(effective_date_rules)
contract_pipeline.add_pipe(
    name="effective_date",
    component=eff_date_ner,
)

In [4]:
# Show the registered components in order; as the cell's last expression the
# value is rendered as the cell output.
contract_pipeline.pipeline

[{'component': <utils.clean_text.TextCleaner at 0x1c2e1471910>,
  'name': 'clean_text',
  'params': {'lower': False,
   'remove_num': False,
   'add_stop_words': None,
   'remove_stop_words': None}},
 {'component': <tokenization.tokenizer.Tokenizer at 0x1c2decce050>,
  'name': 'tokenizer'},
 {'component': <tokenization.sentence.SentenceTokenizer at 0x1c2ca012c50>,
  'name': 'sentence_tokenizer'},
 {'component': <tokenization.segments.SectionSegmenter at 0x1c2c9d04490>,
  'name': 'section_segmenter'},
 {'component': <definitions.definitions.DefinitionFinder at 0x1c2e13af2d0>,
  'name': 'definition_finder'},
 {'component': <ner.clf_ner.CLF_NER at 0x1c2e1362450>,
  'name': 'governing_law'},
 {'component': <ner.regex_ner.RegexNER at 0x1c2e137ec90>,
  'name': 'effective_date'}]

In [5]:
# Read the whole sample contract into memory as UTF-8 text.
with open("./tests/test20.txt", encoding="utf-8") as contract_file:
    text = contract_file.read()


In [6]:
# Run the pipeline on the loaded contract text. The cells below read
# doc.segments, doc.glossary and doc.ents from the result.
doc = contract_pipeline(text)

In [7]:
# One row per document segment: (section, subsection, title, text).
segment_rows = [
    (seg.section, seg.subsection, seg.title, seg.text)
    for seg in doc.segments
]

# Drop rows in which every column is missing, then write the table to Excel.
segments_table = pd.DataFrame(
    segment_rows,
    columns=["Section", "Subsection", "Title", "Text"],
).dropna(how="all")
segments_table.to_excel('./tests/test.xlsx')

In [8]:
# One row per glossary entry: the defined term, its definition, and the phrase
# it was found in.
glossary_rows = [
    (entry.term, entry.definition, entry.phrase)
    for entry in doc.glossary
]
# Last expression of the cell, so the table is rendered as its output.
pd.DataFrame(glossary_rows, columns=["Term", "Definition", "Phrase"])

Unnamed: 0,Term,Definition,Phrase
0,Agreement,ascribed to such term in the Recitals.,The following words and phrases as used herein...
1,beneficial owner,"any person who, directly or indirectly, throug...","( b) A"" beneficial owner"" of a security includ..."
2,Code,the United States Internal Revenue Code of 198...,"( c)"" Code"" shall mean the United States Inter..."
3,Common Stock,ascribed to such term in the Recitals.,"( d)"" Common Stock"" shall have the meaning asc..."
4,Company,"GS Inc., together with its Subsidiaries.","( e)"" Company"" shall mean GS Inc., together wi..."
5,Continuing Provisions,ascribed to such term in Section 7.1(b).,"( f)"" Continuing Provisions"" shall have the me..."
6,Covered Persons,"the Participating Managing Directors, whose na...","( g)"" Covered Persons"" shall mean the Particip..."
7,Covered Shares,"the aggregate of any shares of Common Stock, i...","( h)"" Covered Shares"" shall mean the aggregate..."
8,Effective Date,"the close of business on December 31, 2019.( j...","( i)"" Effective Date"" shall mean the close of ..."
9,Exchange Act,the United States Securities Exchange Act of 1...,"( i)"" Effective Date"" shall mean the close of ..."


In [9]:
# List each extracted entity: matched text, start offset, normalized value.
for ent in doc.ents:
    fields = (ent.name, ent.start, ent.normalized)
    print(*fields)

LAWS OF THE STATE OF DELAWARE 33740 None
 on December 31, 2019 4042 2019-12-31
 on May 7, 1999 5641 1999-05-07
 on May 7, 1999 6420 1999-05-07
 on January 16, 2003 7703 2003-01-16
 on April 1, 2003 7767 2003-04-01
 on May 7, 1999 27584 1999-05-07
