In [1]:
from contract import ContractPipeline
from ner.clf_ner import CLF_NER
from ner.regex_ner import RegexNER
from normalization.date_normalizer import DateNorm
from normalization.gov_normalizer import GovNorm
from normalization.lang_normalizer import LangNorm
from normalization.entity_normalizer import EntityNormalizer
from classification.transformer_classifier import TransformersClassifier
from classification.sklearn_classifier import SklearnClassifier
from ner.transformer_ner import TransformersNER
import pandas as pd


  hasattr(torch, "has_mps")
  and torch.has_mps  # type: ignore[attr-defined]
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Create the pipeline object that the following cells extend via add_pipe().
contract_pipeline = ContractPipeline()


In [3]:
# Governing-law extraction: a CLF_NER component using the
# "sguarnaccio/gov_law_clf_ner" model, with GovNorm as its normalizer.
# NOTE(review): `keywords` is presumably used to pre-select candidate text
# before the model runs -- confirm in ner.clf_ner.
gov_law_ner = CLF_NER(
    # Fixed spelling: the original list contained "jurisdicition", which can
    # never equal the word "jurisdiction" as it appears in a contract.
    keywords=["law", "jurisdiction", "governing"],
    model="sguarnaccio/gov_law_clf_ner",
    normalizer=GovNorm(),
)
contract_pipeline.add_pipe(name="governing_law", component=gov_law_ner)




In [4]:
# Regex rules for spotting an agreement's effective date.
#
# Every rule is a (pattern, label) pair. All patterns start with the same
# trigger: "effective" or "dated", one space, an optional "as of"/"on", and one
# more space. The remaining part of each rule describes one way of writing a
# date. The fragments below are shared so the six structured rules stay in sync.
#
# Changes compared with the previous hand-written literals:
#   * rules 6 and 7 had TWO spaces after the trigger prefix, so they could not
#     match normally spaced text such as "dated as of 30th of May, 2019";
#     they now use the same single-space prefix as every other rule;
#   * "Decmebter" in rule 1's month list is corrected to "December".
#
# Known limitation (unchanged): the ordinal suffix uses look-behinds on the
# preceding digit, and "th" is only accepted after 0 or 4-9, so "11th", "12th"
# and "13th" are not recognised by rules 2-6.
_EFF_PREFIX = r"(?:effective|dated) (?:as of|on)*? "

# Month names: full, abbreviated, and the upper-case variants of both.
_EFF_MONTH = (
    r"(January|February|March|April|May|June|July|August|September|October|November|December"
    r"|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec"
    r"|JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER"
    r"|JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)"
)

# Day of month 1-31 (not embedded in a longer number) plus an optional ordinal
# suffix or a stray quote/degree sign.
_EFF_DAY = (
    r"((?<!\d)([1-9]|([12][0-9])|(3[01]))(?!\d))"
    r"((?<=1)st|(?<=2)nd|(?<=3)rd|(?<=[0456789])th|\"|°)?"
)

# Four-digit year starting with 1 or 2, not embedded in a longer number.
_EFF_YEAR = r"(?<!\d)([12][0-9]{3})(?!\d)"

# A comma or full stop with optional surrounding whitespace.
_EFF_SEP = r"\s*[,\.]\s*"

effective_date_rules = [
    # 1. Loose catch-all: optional day, optional month, then number groups
    #    ending in a 2-4 digit year. Note that [-/th|st|nd|rd\s] is a character
    #    class (a set of single characters), not an alternation of suffixes.
    (
        _EFF_PREFIX
        + r"((?:\d{1,2}[-/th|st|nd|rd\s]*)?"
        + r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec"
        + r"|January|February|March|April|May|June|July|August"
        + r"|September|October|November|December)?"
        + r"[a-z\s,.]*(?:\d{1,2}[-/th|st|nd|rd)\s,]*)+(?:\d{2,4})+)",
        "EFFECTIVE_DATE",
    ),
    # 2. "30 May 2019"
    (_EFF_PREFIX + _EFF_DAY + r"\s*" + _EFF_MONTH + r"\s*" + _EFF_YEAR, "EFFECTIVE_DATE"),
    # 3. "30 May, 2019"
    (_EFF_PREFIX + _EFF_DAY + r"\s*" + _EFF_MONTH + _EFF_SEP + _EFF_YEAR, "EFFECTIVE_DATE"),
    # 4. "30th day of May, 2019"
    (
        _EFF_PREFIX + _EFF_DAY + r"\s*(day)\s*(of)\s*" + _EFF_MONTH + _EFF_SEP + _EFF_YEAR,
        "EFFECTIVE_DATE",
    ),
    # 5. "May 30, 2019"
    (_EFF_PREFIX + _EFF_MONTH + r"\s*" + _EFF_DAY + _EFF_SEP + _EFF_YEAR, "EFFECTIVE_DATE"),
    # 6. "30th of May, 2019"
    (_EFF_PREFIX + _EFF_DAY + r"\s*of\s*" + _EFF_MONTH + _EFF_SEP + _EFF_YEAR, "EFFECTIVE_DATE"),
    # 7. "May, 2019" (month and year only)
    (_EFF_PREFIX + _EFF_MONTH + _EFF_SEP + _EFF_YEAR, "EFFECTIVE_DATE"),
]

# Regex-driven NER component for effective dates, with DateNorm as normalizer
# (the entity dump further down shows normalized values such as 2019-05-30).
eff_date_ner = RegexNER(normalizer=DateNorm())
# Hand the (pattern, label) pairs to the component, then register it.
eff_date_ner.load_raw_rules(effective_date_rules)
contract_pipeline.add_pipe(name="effective_date",component=eff_date_ner)

In [5]:
# Single (pattern, label) rule for monetary amounts.
#   currency: one of the symbols $ £ € ¥ ₹, an ISO code, or the English name
#   amount:   digits with optional ,/. thousands groups and an optional
#             ,/. decimal part
# Optional whitespace is allowed between the two named groups.
currency_rules = [
    (
        r"(?P<currency>[\$£€¥₹]|(?:"
        r"USD|US Dollar|GBP|British Pound|EUR|Euro|JPY|Japanese Yen|"
        r"INR|Indian Rupee|CAD|Canadian Dollar|AUD|Australian Dollar|"
        r"CHF|Swiss Franc|CNY|Chinese Yuan|SGD|Singapore Dollar|"
        r"NZD|New Zealand Dollar|HKD|Hong Kong Dollar|SEK|Swedish Krona|"
        r"NOK|Norwegian Krone|KRW|South Korean Won|MXN|Mexican Peso|"
        r"BRL|Brazilian Real|TRY|Turkish Lira|ZAR|South African Rand|"
        r"IDR|Indonesian Rupiah|MYR|Malaysian Ringgit|PHP|Philippine Peso|"
        r"THB|Thai Baht|HUF|Hungarian Forint|CZK|Czech Koruna|"
        r"ILS|Israeli New Shekel|PLN|Polish Złoty|DKK|Danish Krone|"
        r"AED|United Arab Emirates Dirham|SAR|Saudi Riyal|RON|Romanian Leu|"
        r"RUB|Russian Ruble|CLP|Chilean Peso|TWD|New Taiwan Dollar|"
        r"ARS|Argentine Peso|COP|Colombian Peso|VND|Vietnamese Đồng|"
        r"NGN|Nigerian Naira|UAH|Ukrainian Hryvnia|EGP|Egyptian Pound|"
        r"QAR|Qatari Riyal|BDT|Bangladeshi Taka|PKR|Pakistani Rupee|"
        r"PEN|Peruvian Sol"
        r"))\s*(?P<amount>[0-9]+(?:[,.][0-9]{3})*(?:[,.][0-9]+)?)",
        "currency",
    )
]
# Regex NER for monetary amounts; unlike the effective-date component, no
# normalizer is passed here.
currency_ner = RegexNER()
# Load the single currency rule and register the component on the pipeline.
currency_ner.load_raw_rules(currency_rules)
contract_pipeline.add_pipe(name="currency",component=currency_ner)

In [6]:
# document_type_classifier = TransformersClassifier(
#     model="sguarnaccio/document_type_identification",
#     attribute="document_type",
#     method = "lines",
#     positive_class="LABEL_1"
# )
# contract_pipeline.add_pipe(
#   name="document_type_classifier",
#    component=document_type_classifier,
#     params={"text_range":(0,15)})

In [7]:
# Document-type detection served from a .pkl model file.
document_type_classifier = SklearnClassifier(
    attribute="document_type",
    method="lines",
    positive_class=1,
    model="./classification/pretrained/document_type_model.pkl",
)

# What gets loaded is indexed by model name; keep only the "model" entry stored
# under "Linear SVM".
document_type_classifier.model = document_type_classifier.model["Linear SVM"]["model"]

# NOTE(review): text_range=(0, 15) presumably limits classification to the
# first 15 lines of the document -- confirm against SklearnClassifier.
contract_pipeline.add_pipe(
    component=document_type_classifier,
    name="document_type_classifier",
    params={"text_range": (0, 15)},
)

In [8]:
# Language detection served from a .pkl model file; LangNorm is passed as the
# normalizer for the predicted value.
language_classifier = SklearnClassifier(
    attribute="language",
    model="./classification/pretrained/document_language_model.pkl",
    normalizer=LangNorm(),
    positive_class="multi",
    method="lines",
)

# The "Linear SVM" entry carries both the estimator and its label encoder;
# attach each to the classifier.
model = language_classifier.model["Linear SVM"]
language_classifier.model = model["model"]
language_classifier.label_encoder = model["label_encoder"]

# Registered with before="tokenizer": the pipeline listing below shows it
# sitting between clean_text and tokenizer.
contract_pipeline.add_pipe(
    component=language_classifier,
    name="language_classifier",
    params={"text_range": (0, 50)},
    before="tokenizer",
)


In [9]:
# language_classifier = TransformersClassifier(
#     model="papluca/xlm-roberta-base-language-detection",
#     attribute="language",
#     method = "lines",
#     positive_class="multi",
#     normalizer=LangNorm()
# )
# contract_pipeline.add_pipe(
#   name="language_classifier",
#    component=language_classifier,
#     params={"text_range":(0,15)})

In [10]:
# Legal-entity extraction: a TransformersNER component using the
# "sguarnaccio/le_signatory" model, with EntityNormalizer as its normalizer.
le_ner = TransformersNER(
    model="sguarnaccio/le_signatory",
    keywords=["signature"],
    normalizer=EntityNormalizer(),
)
contract_pipeline.add_pipe(component=le_ner, name="legal_entities")

In [11]:
# Inspect the ordered list of registered components (name, component, params).
contract_pipeline.pipeline

[{'component': <utils.ocr.OCRProcessor at 0x2af448fcad0>,
  'name': 'file_loader'},
 {'component': <utils.clean_text.TextCleaner at 0x2af43018590>,
  'name': 'clean_text',
  'params': {'lower': False,
   'remove_num': False,
   'add_stop_words': None,
   'remove_stop_words': None}},
 {'component': <classification.sklearn_classifier.SklearnClassifier at 0x2af44c152d0>,
  'name': 'language_classifier',
  'params': {'text_range': (0, 50)}},
 {'component': <tokenization.tokenizer.Tokenizer at 0x2af77fada50>,
  'name': 'tokenizer'},
 {'component': <tokenization.sentence.SentenceTokenizer at 0x2af77f38ed0>,
  'name': 'sentence_tokenizer'},
 {'component': <tokenization.segments.SectionSegmenter at 0x2af77fd2290>,
  'name': 'section_segmenter'},
 {'component': <definitions.definitions.DefinitionFinder at 0x2af445b4590>,
  'name': 'definition_finder'},
 {'component': <ner.clf_ner.CLF_NER at 0x2af44ae1410>,
  'name': 'governing_law'},
 {'component': <ner.regex_ner.RegexNER at 0x2af44bce3d0>,
  '

In [12]:
# Run the configured pipeline on a sample PDF; the cells below read the result's
# segments, glossary, ents, language, document_type, aligned_tokens and text.
doc = contract_pipeline("./tests/test7.pdf")

In [13]:
# Write one spreadsheet row per document segment, dropping rows in which every
# column is empty.
(
    pd.DataFrame(
        [(seg.section, seg.subsection, seg.title, seg.text) for seg in doc.segments],
        columns=["Section", "Subsection", "Title", "Text"],
    )
    .dropna(how="all")
    .to_excel("./tests/test.xlsx")
)

In [14]:
# Show the glossary as a table: defined term, its definition, and the phrase
# recorded with it.
pd.DataFrame(
    [(entry.term, entry.definition, entry.phrase) for entry in doc.glossary],
    columns=["Term", "Definition", "Phrase"],
)

Unnamed: 0,Term,Definition,Phrase
0,Loan Document,include this Amendment No.,"3 is a Loan Document and allreferences to a"" L..."
1,this Agreement,and be a reference to the Credit Agreement or ...,"3, each reference in the Credit Agreement or a..."
2,hereunder,and be a reference to the Credit Agreement or ...,"3, each reference in the Credit Agreement or a..."
3,hereof,and be a reference to the Credit Agreement or ...,"3, each reference in the Credit Agreement or a..."
4,herein,and be a reference to the Credit Agreement or ...,"3, each reference in the Credit Agreement or a..."


In [15]:
# Print every extracted entity on one line: matched text, normalized value,
# label, and the list of bounding boxes attached to it.
for ent in doc.ents:
    print(ent.name,ent.normalized,ent.label,ent.bbox_span)

dated as of May 30, 2019 2019-05-30 EFFECTIVE_DATE [{'page': 1, 'left': 1324, 'top': 642, 'width': 60, 'height': 20}, {'page': 5, 'left': 792, 'top': 692, 'width': 22, 'height': 13}, {'page': 4, 'left': 1395, 'top': 117, 'width': 21, 'height': 19}, {'page': 1, 'left': 577, 'top': 1390, 'width': 50, 'height': 25}, {'page': 1, 'left': 1389, 'top': 343, 'width': 33, 'height': 23}, {'page': 1, 'left': 0, 'top': 0, 'width': 1700, 'height': 2200}, {'page': 1, 'left': 677, 'top': 1390, 'width': 60, 'height': 19}]
dated as of May 22, 2018 2018-05-22 EFFECTIVE_DATE [{'page': 1, 'left': 1324, 'top': 642, 'width': 60, 'height': 20}, {'page': 5, 'left': 792, 'top': 692, 'width': 22, 'height': 13}, {'page': 4, 'left': 1395, 'top': 117, 'width': 21, 'height': 19}, {'page': 1, 'left': 577, 'top': 1390, 'width': 50, 'height': 25}, {'page': 1, 'left': 1327, 'top': 610, 'width': 33, 'height': 23}, {'page': 1, 'left': 0, 'top': 0, 'width': 1700, 'height': 2200}, {'page': 1, 'left': 212, 'top': 676, 'widt

In [16]:
# Value of the "language" attribute, which the language_classifier was configured to fill.
doc.language

'English'

In [17]:
# Value of the "document_type" attribute, which the document_type_classifier was configured to fill.
# NOTE(review): the value shown is a line of document text rather than a
# category name -- confirm this is the intended output.
doc.document_type

'AMENDMENT NO. 3 TO'

In [18]:
# List only the legal-entity hits together with their bounding boxes.
for ent in doc.ents:
    if ent.label != "legal_entity":
        continue
    print(ent.name, ent.bbox_span)

orcc financing ii llc [{'page': 4, 'left': 794, 'top': 218, 'width': 73, 'height': 19}, {'page': 4, 'left': 876, 'top': 218, 'width': 152, 'height': 19}, {'page': 4, 'left': 1036, 'top': 218, 'width': 17, 'height': 18}, {'page': 4, 'left': 1074, 'top': 1692, 'width': 56, 'height': 22}]
natixis, new york branch [{'page': 4, 'left': 792, 'top': 485, 'width': 114, 'height': 23}, {'page': 1, 'left': 0, 'top': 0, 'width': 1700, 'height': 2200}, {'page': 4, 'left': 915, 'top': 485, 'width': 63, 'height': 19}, {'page': 4, 'left': 984, 'top': 485, 'width': 78, 'height': 19}, {'page': 4, 'left': 1070, 'top': 484, 'width': 121, 'height': 24}]
state street bank and trust company [{'page': 4, 'left': 795, 'top': 852, 'width': 78, 'height': 19}, {'page': 4, 'left': 883, 'top': 852, 'width': 99, 'height': 19}, {'page': 4, 'left': 990, 'top': 852, 'width': 77, 'height': 18}, {'page': 3, 'left': 275, 'top': 1002, 'width': 58, 'height': 19}, {'page': 4, 'left': 1141, 'top': 852, 'width': 87, 'height': 

In [19]:
# Display the document's aligned_tokens attribute (no output was captured for this cell).
doc.aligned_tokens

In [20]:
# Print the document text produced by the pipeline, for eyeballing against the extracted entities.
print(doc.text)

EX-10.44 3 owl-ex1044_91.htm EX-10.44
Exhibit 10.44
AMENDMENT NO. 3 TO
CREDIT AGREEMENT
This AMENDMENT NO. 3 TO CREDIT AGREEMENT (this "Amendment No. 3'), dated as of May 30, 2019, is enteredinto by and among ORCC Financing II LLC, a Delaware limited liability company (the "Borrower"), NATIXIS, NEW YORK
BRANCH, as administrative agent (in such capacity, the "Administrative Agent'), STATE STREET BANK AND TRUST
COMPANY, as collateral agent, collateral administrator and custodian (in such capacities, respectively, the "Collateral Agent,"
"Collateral Administrator" and "Custodian"), CORTLAND CAPITAL MARKET SERVICES LLC, as document custodian (the
"Document Custodian"), and the lenders identified on the signature pages hereto (the "Lenders'').
A. The Borrower, the Administrative Agent, the Collateral Agent, the Collateral Administrator, the Custodian, the
Document Custodian and the Lenders are parties to that certain Credit Agreement, dated as of May 22, 2018 (as amended bythe Amendments to