In [1]:
from contract import ContractPipeline
from ner.clf_ner import CLF_NER
from ner.regex_ner import RegexNER
from normalization.date_normalizer import DateNorm
from normalization.gov_normalizer import GovNorm
from normalization.lang_normalizer import LangNorm
from normalization.entity_normalizer import EntityNormalizer
from classification.transformer_classifier import TransformersClassifier
from classification.sklearn_classifier import SklearnClassifier
from ner.transformer_ner import TransformersNER
import pandas as pd


  hasattr(torch, "has_mps")
  and torch.has_mps  # type: ignore[attr-defined]
  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Pipeline object that the cells below extend through add_pipe() and finally call on a
# document path.
# NOTE(review): this cell was saved without an execution count while later cells have one —
# re-run the notebook top to bottom from a fresh kernel before trusting the saved outputs.
contract_pipeline = ContractPipeline()


In [3]:
# Governing-law extractor: a CLF_NER built from trigger keywords, a Hugging Face model id
# and a GovNorm normalizer, registered on the pipeline under the name "governing_law".
# NOTE(review): presumably the keywords select which passages are handed to the model —
# confirm against CLF_NER.
gov_law_ner = CLF_NER(
    # "jurisdiction" was previously misspelled "jurisdicition", which real contract
    # text never contains, so that keyword could never trigger.
    keywords=["law", "jurisdiction", "governing"],
    model="sguarnaccio/gov_law_clf_ner",
    normalizer=GovNorm(),
)
contract_pipeline.add_pipe(name="governing_law", component=gov_law_ner)


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /sguarnaccio/gov_law_clf_ner/resolve/main/config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /sguarnaccio/gov_law_clf_ner/resolve/main/config.json HTTP/1.1" 200 0
  return self.fget.__get__(instance, owner)()
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /sguarnaccio/gov_law_clf_ner/resolve/main/tokenizer_config.json HTTP/1.1" 200 0


In [4]:
# Shared fragments for the effective-date patterns. Building every rule from the same
# pieces keeps the rules from drifting apart through copy-paste.

# Trigger phrase: "effective"/"dated", a space, a lazily repeated "as of"/"on", a space.
# NOTE(review): because a literal space sits on both sides of the optional part, a bare
# "dated December 16, 2020" (single space, no "as of"/"on") does not match. Left unchanged
# here because loosening it would also widen the very permissive first rule.
_DATE_PREFIX = r"(?:effective|dated) (?:as of|on)*? "

# Day of month 1-31 that is not part of a longer digit run.
_DAY = r"((?<!\d)([1-9]|([12][0-9])|(3[01]))(?!\d))"

# Optional ordinal suffix, or a stray quote / degree sign after the day.
# The `(?<=1[123])th` alternative accepts 11th, 12th and 13th, which the other
# lookbehinds reject (they only allow "th" after 0 or 4-9).
_ORDINAL = r"((?<=1)st|(?<=2)nd|(?<=3)rd|(?<=[0456789])th|(?<=1[123])th|\"|°)?"

# Month names: full, abbreviated, and both again in upper case.
_MONTH = (
    r"(January|February|March|April|May|June|July|August|September|October|November|December"
    r"|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec"
    r"|JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER"
    r"|JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)"
)

# Four-digit year starting with 1 or 2, not embedded in a longer digit run.
_YEAR = r"(?<!\d)([12][0-9]{3})(?!\d)"

# A comma or full stop with optional surrounding whitespace.
_SEP = r"\s*[,\.]\s*"

# (pattern, label) pairs consumed by RegexNER.load_raw_rules; every rule yields the
# label "EFFECTIVE_DATE".
effective_date_rules = [
    # 1. Loose catch-all: optional day, optional month name, filler, then digit groups.
    #    NOTE(review): `[-/th|st|nd|rd\s]` is a character class, not an alternation of
    #    suffixes — kept as-is to avoid changing what this rule currently matches.
    (
        _DATE_PREFIX
        + r"((?:\d{1,2}[-/th|st|nd|rd\s]*)?"
        + r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|January|February|March|April"
        + r"|May|June|July|August|September|October|November|December)?"  # was "Decmebter"
        + r"[a-z\s,.]*(?:\d{1,2}[-/th|st|nd|rd)\s,]*)+(?:\d{2,4})+)",
        "EFFECTIVE_DATE",
    ),
    # 2. "16 December 2020"
    (_DATE_PREFIX + _DAY + _ORDINAL + r"\s*" + _MONTH + r"\s*" + _YEAR, "EFFECTIVE_DATE"),
    # 3. "16 December, 2020"
    (_DATE_PREFIX + _DAY + _ORDINAL + r"\s*" + _MONTH + _SEP + _YEAR, "EFFECTIVE_DATE"),
    # 4. "16th day of December, 2020"
    (
        _DATE_PREFIX + _DAY + _ORDINAL + r"\s*(day)\s*(of)\s*" + _MONTH + _SEP + _YEAR,
        "EFFECTIVE_DATE",
    ),
    # 5. "December 16, 2020"
    (_DATE_PREFIX + _MONTH + r"\s*" + _DAY + _ORDINAL + _SEP + _YEAR, "EFFECTIVE_DATE"),
    # 6. "16th of December, 2020" — previously had a doubled space after the prefix,
    #    so it could not match normally spaced text.
    (
        _DATE_PREFIX + _DAY + _ORDINAL + r"\s*of\s*" + _MONTH + _SEP + _YEAR,
        "EFFECTIVE_DATE",
    ),
    # 7. "December, 2020" (no day) — same doubled-space fix as rule 6.
    (_DATE_PREFIX + _MONTH + _SEP + _YEAR, "EFFECTIVE_DATE"),
]

# Rule-based effective-date extractor: a RegexNER constructed with a DateNorm normalizer,
# loaded with the (pattern, label) rules above and registered as "effective_date".
eff_date_ner = RegexNER(
    normalizer=DateNorm(),
)
eff_date_ner.load_raw_rules(effective_date_rules)
contract_pipeline.add_pipe(
    component=eff_date_ner,
    name="effective_date",
)

In [5]:
# One (pattern, label) rule: a currency marker — a symbol, a three-letter code or a
# spelled-out name — captured as `currency`, optional whitespace, then a number with
# optional thousands groups and decimals captured as `amount`.
# The pattern is written as adjacent raw-string pieces; Python joins them into one string.
currency_rules = [
    (
        r"(?P<currency>[\$£€¥₹]|(?:"
        r"USD|US Dollar|GBP|British Pound|EUR|Euro|JPY|Japanese Yen|INR|Indian Rupee|"
        r"CAD|Canadian Dollar|AUD|Australian Dollar|CHF|Swiss Franc|CNY|Chinese Yuan|"
        r"SGD|Singapore Dollar|NZD|New Zealand Dollar|HKD|Hong Kong Dollar|"
        r"SEK|Swedish Krona|NOK|Norwegian Krone|KRW|South Korean Won|MXN|Mexican Peso|"
        r"BRL|Brazilian Real|TRY|Turkish Lira|ZAR|South African Rand|IDR|Indonesian Rupiah|"
        r"MYR|Malaysian Ringgit|PHP|Philippine Peso|THB|Thai Baht|HUF|Hungarian Forint|"
        r"CZK|Czech Koruna|ILS|Israeli New Shekel|PLN|Polish Złoty|DKK|Danish Krone|"
        r"AED|United Arab Emirates Dirham|SAR|Saudi Riyal|RON|Romanian Leu|RUB|Russian Ruble|"
        r"CLP|Chilean Peso|TWD|New Taiwan Dollar|ARS|Argentine Peso|COP|Colombian Peso|"
        r"VND|Vietnamese Đồng|NGN|Nigerian Naira|UAH|Ukrainian Hryvnia|EGP|Egyptian Pound|"
        r"QAR|Qatari Riyal|BDT|Bangladeshi Taka|PKR|Pakistani Rupee|PEN|Peruvian Sol"
        r"))\s*(?P<amount>[0-9]+(?:[,.][0-9]{3})*(?:[,.][0-9]+)?)",
        "currency",
    )
]
# Currency extractor: a RegexNER created without a normalizer, loaded with the single
# currency rule and registered on the pipeline as "currency".
currency_ner = RegexNER()
currency_ner.load_raw_rules(currency_rules)
contract_pipeline.add_pipe(
    component=currency_ner,
    name="currency",
)

In [6]:
# document_type_classifier = TransformersClassifier(
#     model="sguarnaccio/document_type_identification",
#     attribute="document_type",
#     method = "lines",
#     positive_class="LABEL_1"
# )
# contract_pipeline.add_pipe(
#   name="document_type_classifier",
#    component=document_type_classifier,
#     params={"text_range":(0,15)})

In [7]:
# Document-type classifier built from a pickled scikit-learn bundle on disk.
document_type_classifier = SklearnClassifier(
    attribute="document_type",
    method="lines",
    positive_class=1,
    model="./classification/pretrained/document_type_model.pkl",
)
# The loaded object is indexed by model name; keep only the "Linear SVM" entry's
# estimator as the classifier's model.
document_type_classifier.model = document_type_classifier.model["Linear SVM"]["model"]
# Register it with a text_range of (0, 15).
# NOTE(review): presumably this limits classification to the first lines of the
# document — confirm against SklearnClassifier.
contract_pipeline.add_pipe(
    name="document_type_classifier",
    component=document_type_classifier,
    params={"text_range": (0, 15)},
)

In [8]:
# Language classifier built from a pickled scikit-learn bundle, with LangNorm as the
# normalizer for its output.
language_classifier = SklearnClassifier(
    attribute="language",
    method="lines",
    positive_class="multi",
    normalizer=LangNorm(),
    model="./classification/pretrained/document_language_model.pkl",
)

# The bundle is indexed by model name; the "Linear SVM" entry supplies both the
# estimator and its label encoder.
model = language_classifier.model["Linear SVM"]
language_classifier.label_encoder = model["label_encoder"]
language_classifier.model = model["model"]

# Insert ahead of the "tokenizer" stage rather than appending at the end.
contract_pipeline.add_pipe(
    component=language_classifier,
    name="language_classifier",
    params={"text_range": (0, 50)},
    before="tokenizer",
)


In [9]:
# language_classifier = TransformersClassifier(
#     model="papluca/xlm-roberta-base-language-detection",
#     attribute="language",
#     method = "lines",
#     positive_class="multi",
#     normalizer=LangNorm()
# )
# contract_pipeline.add_pipe(
#   name="language_classifier",
#    component=language_classifier,
#     params={"text_range":(0,15)})

In [10]:
# Legal-entity extractor: a TransformersNER given the keyword "signature", a Hugging Face
# model id and an EntityNormalizer, registered on the pipeline as "legal_entities".
le_ner = TransformersNER(
    model="sguarnaccio/le_signatory",
    keywords=["signature"],
    normalizer=EntityNormalizer(),
)
contract_pipeline.add_pipe(
    component=le_ner,
    name="legal_entities",
)

DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /sguarnaccio/le_signatory/resolve/main/config.json HTTP/1.1" 200 0


In [11]:
# Display the ordered list of registered stages (each entry holds a component, a name and,
# for some stages, params) to check where the components added above ended up.
contract_pipeline.pipeline

[{'component': <utils.ocr.FileProcessor at 0x1bc31f9b110>,
  'name': 'file_loader'},
 {'component': <utils.clean_text.TextCleaner at 0x1bc185eb150>,
  'name': 'clean_text',
  'params': {'lower': False,
   'remove_num': False,
   'add_stop_words': None,
   'remove_stop_words': None}},
 {'component': <classification.sklearn_classifier.SklearnClassifier at 0x1bc4d4f6dd0>,
  'name': 'language_classifier',
  'params': {'text_range': (0, 50)}},
 {'component': <tokenization.tokenizer.Tokenizer at 0x1bc18512590>,
  'name': 'tokenizer'},
 {'component': <tokenization.sentence.SentenceTokenizer at 0x1bc318f8610>,
  'name': 'sentence_tokenizer'},
 {'component': <tokenization.segments.SectionSegmenter at 0x1bc31f9b090>,
  'name': 'section_segmenter'},
 {'component': <definitions.definitions.DefinitionFinder at 0x1bc3226f050>,
  'name': 'definition_finder'},
 {'component': <ner.clf_ner.CLF_NER at 0x1bc322b9290>,
  'name': 'governing_law'},
 {'component': <ner.regex_ner.RegexNER at 0x1bc4d580b50>,
  

In [None]:
import os

# List the sample documents available under ./tests.
# The original cell could not run: `os.list_dir` does not exist (the function is
# `os.listdir`) and the `for` header had neither a colon nor a body.
# os.listdir returns names in arbitrary order, so sort for a stable listing.
for file in sorted(os.listdir("tests")):
    print(file)

In [12]:
# Run the whole pipeline on one sample contract; the returned `doc` is what the
# remaining cells inspect (segments, glossary, ents, language, document_type).
doc = contract_pipeline("./tests/test7.txt")

DEBUG:root:Preprocessed input file: C:\Users\seang\AppData\Local\Temp\tmp5hqkw3uc\input.txt, format: txt


C:\Users\seang\AppData\Local\Temp\tmp5hqkw3uc\input.txt


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.gleif.org:443
DEBUG:urllib3.connectionpool:https://api.gleif.org:443 "GET /api/v1/lei-records?filter%5Bentity.legalName%5D=Owl%20Rock%20Technology%20Advisors%20Llc HTTP/1.1" 200 None


In [13]:
# Export one row per segment of `doc` (section, subsection, title, text) to an Excel
# workbook, dropping rows in which every column is empty.
(
    pd.DataFrame(
        [(seg.section, seg.subsection, seg.title, seg.text) for seg in doc.segments],
        columns=["Section", "Subsection", "Title", "Text"],
    )
    .dropna(how="all")
    .to_excel('./tests/test.xlsx')
)

In [14]:
# Tabulate the glossary extracted from `doc`: one row per entry with its term,
# its definition and the phrase recorded alongside it.
pd.DataFrame(
    [(entry.term, entry.definition, entry.phrase) for entry in doc.glossary],
    columns=["Term", "Definition", "Phrase"],
)

Unnamed: 0,Term,Definition,Phrase
0,Agreement,"this Agreement, as amended from time to time.",""" Agreement"" shall mean this Agreement, as ame..."
1,Cause,"set forth in Section 14."" Collateral Manager I...",""" Cause"" shall have the meaning set forth in S..."
2,Collateral Manager Information,ascribed to such term in the Offering Circular.,""" Cause"" shall have the meaning set forth in S..."
3,Collateral Manager Securities,any Securities owned by the Collateral Manager...,""" Collateral Manager Securities"" shall mean an..."
4,Governing Instruments,"the memorandum of association, articles of ass...",""" Governing Instruments"" shall mean the memora..."
5,Notice of Removal,"set forth in Section 14."" Offering Circular"" s...",""" Notice of Removal"" shall have the meaning se..."
6,Offering Circular,the final Offering Circular with respect to th...,""" Notice of Removal"" shall have the meaning se..."
7,Related Person,"with respect to any Person, the owners of the ...",""" Related Person"" shall mean with respect to a..."
8,Responsible Officer,", with respect to any Person, any duly authori...",""" Responsible Officer"" shall mean, with respec..."
9,Termination Notice,set forth in Section 14. 2.General Duties and ...,""" Termination Notice"" shall have the meaning s..."


In [15]:
# Print every extracted entity as: name, normalized value, label.
# NOTE(review): the saved output below carries an extra list column per row that this
# code does not print, so it appears to come from an earlier version of the cell —
# re-run to refresh it.
for ent in doc.ents:
    print(ent.name,ent.normalized,ent.label)

LAW OF THE STATE OF NEW YORK United States, New York gov_law [None, None, None, None, None]
dated as of December 16, 2020 2020-12-16 EFFECTIVE_DATE [None, None, None, None, None, None, None]
dated as of December 16, 2020 2020-12-16 EFFECTIVE_DATE [None, None, None, None, None, None, None]
owl rock technology advisors llc Owl Rock Technology Advisors Llc legal_entity [None, None, None, None, None]


In [16]:
# Language recorded on the document — presumably set by the language_classifier pipe
# (registered with attribute="language"); confirm against the pipeline code.
doc.language

'English'

In [17]:
# Document type recorded on the document — presumably set by the document_type_classifier
# pipe (registered with attribute="document_type"); confirm against the pipeline code.
doc.document_type

'COLLATERAL MANAGEMENT AGREEMENT'

In [18]:
# Show the bbox_span stored on each entity labelled "legal_entity"; all other
# entities are skipped.
for ent in doc.ents:
    if ent.label != "legal_entity":
        continue
    print(ent.name, ent.bbox_span)

owl rock technology advisors llc [None, None, None, None, None]
