In [1]:
from contract import ContractPipeline
from ner.clf_ner import CLF_NER
from ner.regex_ner import RegexNER
from ner.normalization.date_normalizer import DateNorm
from ner.normalization.gov_normalizer import GovNorm, gov_lookup
from classification.transformer_classifier import TransformersClassifier
from classification.sklearn_classifier import SklearnClassifier
import pandas as pd

  hasattr(torch, "has_mps")
  and torch.has_mps  # type: ignore[attr-defined]
  from .autonotebook import tqdm as notebook_tqdm


Aruba
Afghanistan
Angola
Anguilla
Åland Islands
Albania
Andorra
United Arab Emirates
Argentina
Armenia
American Samoa
Antarctica
French Southern Territories
Antigua and Barbuda
Australia
Austria
Azerbaijan
Burundi
Belgium
Benin
Bonaire, Sint Eustatius and Saba
Burkina Faso
Bangladesh
Bulgaria
Bahrain
Bahamas
Bosnia and Herzegovina
Saint Barthélemy
Belarus
Belize
Bermuda
Bolivia, Plurinational State of
Brazil
Barbados
Brunei Darussalam
Bhutan
Bouvet Island
Botswana
Central African Republic
Canada
Cocos (Keeling) Islands
Switzerland
Chile
China
Côte d'Ivoire
Cameroon
Congo, The Democratic Republic of the
Congo
Cook Islands
Colombia
Comoros
Cabo Verde
Costa Rica
Cuba
Curaçao
Christmas Island
Cayman Islands
Cyprus
Czechia
Germany
Djibouti
Dominica
Denmark
Dominican Republic
Algeria
Ecuador
Egypt
Eritrea
Western Sahara
Spain
Estonia
Ethiopia
Finland
Fiji
Falkland Islands (Malvinas)
France
Faroe Islands
Micronesia, Federated States of
Gabon
United Kingdom
Georgia
Guernsey
Ghana
Gibraltar
Gui

In [2]:
# Create the contract pipeline and register the governing-law entity extractor.
contract_pipeline = ContractPipeline()
gov_law_ner = CLF_NER(
    # Fixed typo: the second keyword used to read "jurisdicition", which can never
    # appear in correctly spelled contract text.
    # NOTE(review): presumably these keywords pre-select candidate text for the
    # classifier -- confirm against CLF_NER.
    keywords=["law", "jurisdiction", "governing"],
    model="sguarnaccio/gov_law_clf_ner",
    # Matches are normalized through GovNorm using the imported gov_lookup table.
    normalizer=GovNorm(lookups=gov_lookup),
)
contract_pipeline.add_pipe(name="governing_law", component=gov_law_ner)




In [3]:
def _build_effective_date_rules():
    """Assemble the (pattern, label) regex rules used to find effective dates.

    Every pattern starts with the same lead-in ("effective" or "dated",
    optionally followed by "as of" / "on") and is labelled "EFFECTIVE_DATE".
    The rules are returned in the same order as before:

    0. loose catch-all (optional day, optional month word, digits, year)
    1. day [ordinal] month year
    2. day [ordinal] month , year
    3. day [ordinal] "day of" month , year
    4. month day [ordinal] , year
    5. day [ordinal] "of" month , year
    6. month , year

    Fixes relative to the previous hand-written literals:
    * rule 0 spelled December as "Decmebter";
    * rules 5 and 6 had two spaces after the lead-in, so they could not match
      normally spaced text such as "dated as of March, 2020".
    All other patterns are character-for-character what they were.

    Returns:
        list[tuple[str, str]]: regex source strings paired with their label.
    """
    full_months = [
        "January", "February", "March", "April", "May", "June", "July",
        "August", "September", "October", "November", "December",
    ]
    short_months = [name[:3] for name in full_months]

    # Capturing month group: full names, abbreviations, then upper-case variants.
    month = "(" + "|".join(
        full_months
        + short_months
        + [name.upper() for name in full_months]
        + [name.upper() for name in short_months]
    ) + ")"

    # NOTE(review): because of the trailing space, the lead-in only matches when
    # "as of"/"on" is present (or two spaces follow the verb) -- confirm intended.
    prefix = r"(?:effective|dated) (?:as of|on)*? "
    day = r"((?<!\d)([1-9]|([12][0-9])|(3[01]))(?!\d))"
    ordinal = r"((?<=1)st|(?<=2)nd|(?<=3)rd|(?<=[0456789])th|\"|°)?"
    sep = r"\s*[,\.]\s*"
    year = r"(?<!\d)([12][0-9]{3})(?!\d)"

    # Loose rule: abbreviations are listed before full names, as in the original.
    loose = (
        r"((?:\d{1,2}[-/th|st|nd|rd\s]*)?(?:"
        + "|".join(short_months + full_months)
        + r")?[a-z\s,.]*(?:\d{1,2}[-/th|st|nd|rd)\s,]*)+(?:\d{2,4})+)"
    )

    bodies = [
        loose,
        day + ordinal + r"\s*" + month + r"\s*" + year,
        day + ordinal + r"\s*" + month + sep + year,
        day + ordinal + r"\s*(day)\s*(of)\s*" + month + sep + year,
        month + r"\s*" + day + ordinal + sep + year,
        day + ordinal + r"\s*of\s*" + month + sep + year,
        month + sep + year,
    ]
    return [(prefix + body, "EFFECTIVE_DATE") for body in bodies]


effective_date_rules = _build_effective_date_rules()

# Regex-driven extractor for effective dates; matches are normalized by DateNorm.
eff_date_ner = RegexNER(
    normalizer=DateNorm(),
    rules=effective_date_rules,
)
# The same rule list is also handed to load_raw_rules after construction.
eff_date_ner.load_raw_rules(effective_date_rules)
contract_pipeline.add_pipe(
    component=eff_date_ner,
    name="effective_date",
)

In [4]:
# Disabled alternative: a TransformersClassifier-based document-type pipe.
# It is kept commented out; the following cell registers a SklearnClassifier
# under the same pipe name ("document_type_classifier") instead.
# document_type_classifier = TransformersClassifier(
#     model="sguarnaccio/document_type_identification",
#     attribute="document_type",
#     method = "lines",
#     positive_class="LABEL_1"
# )
# contract_pipeline.add_pipe(
#     name="document_type_classifier",
#     component=document_type_classifier,
#     params={"text_range":(0,15)})

In [5]:
# Document-type classifier built from a model file on disk (a .pkl path).
document_type_classifier = SklearnClassifier(
    attribute="document_type",
    positive_class=1,
    method="lines",
    model="./classification/pretrained/document_type_model.pkl",
)
# The loaded object is keyed by estimator name; keep only the "model" stored
# under the "Linear SVM" entry.
document_type_classifier.model = (
    document_type_classifier.model["Linear SVM"]["model"]
)
# Register the classifier, passing text_range=(0, 15) as its pipe parameter.
contract_pipeline.add_pipe(
    params={"text_range": (0, 15)},
    component=document_type_classifier,
    name="document_type_classifier",
)

In [6]:
# Language detector that writes to the "language" attribute, using the
# document-level method.
language_classifier = TransformersClassifier(
    positive_class="multi",
    method="document",
    attribute="language",
    model="papluca/xlm-roberta-base-language-detection",
)
# Insert the detector ahead of the "tokenizer" pipe, with text_range=(0, 1000).
contract_pipeline.add_pipe(
    params={"text_range": (0, 1000)},
    before="tokenizer",
    component=language_classifier,
    name="language_classifier",
)


In [7]:
# Display the registered pipes: a list of dicts with 'component', 'name' and,
# where given, 'params'.
contract_pipeline.pipeline

[{'component': <utils.clean_text.TextCleaner at 0x1d14c463c10>,
  'name': 'clean_text',
  'params': {'lower': False,
   'remove_num': False,
   'add_stop_words': None,
   'remove_stop_words': None}},
 {'component': <classification.transformer_classifier.TransformersClassifier at 0x1d1677c3f50>,
  'name': 'language_classifier',
  'params': {'text_range': (0, 1000)}},
 {'component': <tokenization.tokenizer.Tokenizer at 0x1d14c053210>,
  'name': 'tokenizer'},
 {'component': <tokenization.sentence.SentenceTokenizer at 0x1d14c43a890>,
  'name': 'sentence_tokenizer'},
 {'component': <tokenization.segments.SectionSegmenter at 0x1d14c382f90>,
  'name': 'section_segmenter'},
 {'component': <definitions.definitions.DefinitionFinder at 0x1d14be0ed90>,
  'name': 'definition_finder'},
 {'component': <ner.clf_ner.CLF_NER at 0x1d14a96c510>,
  'name': 'governing_law'},
 {'component': <ner.regex_ner.RegexNER at 0x1d14c0b8e10>,
  'name': 'effective_date'},
 {'component': <classification.sklearn_classifi

In [8]:
# Read the sample contract into memory as UTF-8 text.
with open("./tests/test13.txt", mode="r", encoding="utf-8") as contract_file:
    text = contract_file.read()


In [9]:
# Run the configured pipeline on the loaded contract text; later cells read
# doc.segments, doc.glossary, doc.ents and doc.language from the result.
doc = contract_pipeline(text)

In [10]:
# Tabulate every segment, drop rows that are entirely empty, and write the
# result to an Excel workbook.
(
    pd.DataFrame(
        [
            (seg.section, seg.subsection, seg.title, seg.text)
            for seg in doc.segments
        ],
        columns=["Section", "Subsection", "Title", "Text"],
    )
    .dropna(how="all")
    .to_excel('./tests/test.xlsx')
)

In [11]:
# Show the extracted glossary: one row per entry with its term, definition and
# phrase.
pd.DataFrame(
    [(entry.term, entry.definition, entry.phrase) for entry in doc.glossary],
    columns=["Term", "Definition", "Phrase"],
)

Unnamed: 0,Term,Definition,Phrase
0,Amendment Effective Date,"of electronic transmission( e.g.,"" pdf"")) that...",The effectiveness of this Agreement is subject...
1,hereunder,the Credit Agreement as amended by this Agreem...,( a)From and after the Amendment Effective Dat...
2,hereof,the Credit Agreement as amended by this Agreem...,( a)From and after the Amendment Effective Dat...
3,this Agreement,the Credit Agreement as amended by this Agreem...,( a)From and after the Amendment Effective Dat...
4,Credit Agreement,the Credit Agreement as amended by this Agreem...,( a)From and after the Amendment Effective Dat...
...,...,...,...
352,IBA,of the IBA setting the London interbank offere...,"In July 2017, the U.K. Financial Conduct Autho..."
353,Communications,", collectively, any notice, demand, communicat...",""" Communications"" means, collectively, any not..."
354,return receipt requested,have been sent at the opening of business on t...,Unless the Administrative Agent otherwise pres...
355,Trade Date,of determining whether such minimum amount has...,( A) in the case of an assignment of the entir...


In [12]:
# List each extracted entity as: matched text, label, normalized value.
for entity in doc.ents:
    print(entity.name, entity.label, entity.normalized)

law of the State of New York gov_law United States, New York
LAW OF THE STATE OF NEW YORK gov_law United States, New York
COURT OF THE SOUTHERN DISTRICT SITTING IN NEW YORK gov_law United States, New York
COURT OF THE STATE OF NEW YORK gov_law United States, New York
NEW YORK STATE COURT gov_law United States, New York
COURT IN THE CITY OF NEW YORK gov_law United States, New York
dated as of January 7, 2022 EFFECTIVE_DATE 2022-01-07
dated as of September 11, 2019 EFFECTIVE_DATE 2019-09-11
dated as of March 23, 2020 EFFECTIVE_DATE 2020-03-23
dated as of January 7, 2022 EFFECTIVE_DATE 2022-01-07
dated as of July 17, 2015 EFFECTIVE_DATE 2015-07-17


In [14]:
# Display the document's "language" attribute (the attribute name given to
# language_classifier).
doc.language

'en'