In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import json

from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


# Datasets

In [None]:
# Column-oriented accumulator for the dataset comparison table: every key is a
# column, and each "Adding dataset info" cell appends exactly one value per key.
# Only keys that those cells actually populate are declared here, because
# pd.DataFrame() requires all columns to have the same length; a key that is
# declared but never appended to (such as an empty "labels" column) would make
# the final summary cell raise ValueError.
datasets = {"name": [],
            "summary_file_format": [],
            "raw_files_format": [],
            "aggregated_files_format": [],
            "nb_records": [],
            "split_train_test": [],
            "labeled": []}

In [3]:
# Root folder holding all candidate datasets (relative path, resolved against the working directory).
candidate_datasets_path = "data/candidate-datasets"

## CLAUDETTE - Terms of Service

## CUAD

FORMAT

The files in CUAD v1 include 1 CSV file, 1 SQuAD-style JSON file, 28 Excel files, 510 PDF files, and 510 TXT files.

- 1 master clauses CSV: an 83-column, 511-row file. The first column contains the names of the contracts corresponding to the PDF and TXT files in the `full_contract_pdf` and `full_contract_txt` folders. The remaining columns contain (1) text context (sometimes referred to as clause), and (2) human-input answers that correspond to each of the 41 categories in these contracts. See the list of the categories in “Category List” below. The first row represents the file name and a list of the categories. The remaining 510 rows each represent a contract in the dataset and include the text context and human-input answers corresponding to the categories. The human-input answers are derived from the text context and are formatted to a unified form.

- 1 SQuAD-style JSON: this file is derived from the master clauses CSV to follow the same format as SQuAD 2.0 (https://rajpurkar.github.io/SQuAD-explorer/explore/v2.0/dev/), a question answering dataset whose answers are similarly spans of the input text. The JSON format exactly mimics that of SQuAD 2.0 for compatibility with prior work. We also provide Python scripts for processing this data for further ease of use.

- 28 Excels: a collection of Excel files containing clauses responsive to each of the categories identified in the “Category List” below. The first column contains the names of the contracts corresponding to the PDF and TXT files in the `full_contract_pdf` and `full_contract_txt` folders. The remaining columns contain (1) text context (clause) corresponding to one or more categories that belong to the same group as identified in the “Category List” below, and (2) in some cases, human-input answers that correspond to such text context. Each file is named “Label Report - [label/group name] (Group [number]).xlsx”.

- 510 full contract PDFs: a collection of the underlying contracts that we used to extract the labels. Each file is named as “[document name].pdf”. These contracts are in a PDF format and are not labeled. The full contract PDFs contain raw data and are provided for context and reference.

- 510 full contract TXTs: a collection of TXT files of the underlying contracts. Each file is named as “[document name].txt”. These contracts are in a plaintext format and are not labeled. The full contract TXTs contain raw data and are provided for context and reference.


### EDA

In [4]:
# Folder of the CUAD v1 release.
# NOTE(review): no later visible cell references this variable (they rebuild the
# path from candidate_datasets_path or hard-code it) — confirm it is still needed.
cuad_filepath = "data/candidate-datasets/CUAD_v1/"

In [45]:
# Load the CUAD master clauses table and display its (rows, columns) shape.
master_clauses_csv = f"{candidate_datasets_path}/CUAD_v1/master_clauses.csv"
cuad_master_clauses = pd.read_csv(master_clauses_csv)
cuad_master_clauses.shape

(510, 83)

In [5]:
! ls data/candidate-datasets/CUAD_v1/full_contract_pdf | wc -l
! ls data/candidate-datasets/CUAD_v1/full_contract_txt | wc -l


       3
     510


In [7]:
# Preview the first rows: each category has a text-context column and a matching "-Answer" column.
cuad_master_clauses.head()

Unnamed: 0,Filename,Document Name,Document Name-Answer,Parties,Parties-Answer,Agreement Date,Agreement Date-Answer,Effective Date,Effective Date-Answer,Expiration Date,...,Liquidated Damages,Liquidated Damages-Answer,Warranty Duration,Warranty Duration-Answer,Insurance,Insurance-Answer,Covenant Not To Sue,Covenant Not To Sue-Answer,Third Party Beneficiary,Third Party Beneficiary-Answer
0,CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605...,['MARKETING AFFILIATE AGREEMENT'],MARKETING AFFILIATE AGREEMENT,"['BIRCH FIRST GLOBAL INVESTMENTS INC.', 'MA', ...","Birch First Global Investments Inc. (""Company""...","['8th day of May 2014', 'May 8, 2014']",5/8/14,['This agreement shall begin upon the date of ...,,['This agreement shall begin upon the date of ...,...,[],No,"[""COMPANY'S SOLE AND EXCLUSIVE LIABILITY FOR T...",Yes,[],No,[],No,[],No
1,EuromediaHoldingsCorp_20070215_10SB12G_EX-10.B...,['VIDEO-ON-DEMAND CONTENT LICENSE AGREEMENT'],VIDEO-ON-DEMAND CONTENT LICENSE AGREEMENT,"['EuroMedia Holdings Corp.', 'Rogers', 'Rogers...","Rogers Cable Communications Inc. (""Rogers""); E...","['July 11 , 2006']",7/11/06,"['July 11 , 2006']",7/11/06,"['The term of this Agreement (the ""Initial Ter...",...,[],No,[],No,[],No,[],No,[],No
2,FulucaiProductionsLtd_20131223_10-Q_EX-10.9_83...,['CONTENT DISTRIBUTION AND LICENSE AGREEMENT'],CONTENT DISTRIBUTION AND LICENSE AGREEMENT,"['Producer', 'Fulucai Productions Ltd.', 'Conv...","CONVERGTV, INC. (“ConvergTV”); Fulucai Product...","['November 15, 2012']",11/15/12,"['November 15, 2012']",11/15/12,[],...,[],No,[],No,[],No,[],No,[],No
3,GopageCorp_20140221_10-K_EX-10.1_8432966_EX-10...,['WEBSITE CONTENT LICENSE AGREEMENT'],WEBSITE CONTENT LICENSE AGREEMENT,"['PSiTech Corporation', 'Licensor', 'Licensee'...","PSiTech Corporation (""Licensor""); Empirical Ve...","['Feb 10, 2014']",2/10/14,"['Feb 10, 2014']",2/10/14,['The initial term of this Agreement commences...,...,[],No,[],No,[],No,[],No,[],No
4,IdeanomicsInc_20160330_10-K_EX-10.26_9512211_E...,['CONTENT LICENSE AGREEMENT'],CONTENT LICENSE AGREEMENT,"['YOU ON DEMAND HOLDINGS, INC.', 'Licensor', '...",Beijing Sun Seven Stars Culture Development Li...,"['December 21, 2015']",12/21/15,"['December 21, 2015']",12/21/15,"['The Term of this Agreement (the ""Term"") shal...",...,[],No,[],No,[],No,[],No,[],No


### Adding dataset info

In [None]:
# Register CUAD in the summary table: one value appended per column of `datasets`.
# NOTE(review): "labeled" is False although the master clauses CSV carries
# human-input answers for 41 categories — confirm this is intended.
cuad_info = {
    "name": "CUAD",
    "summary_file_format": "csv",
    "raw_files_format": "pdf/txt",
    "aggregated_files_format": "NA",
    "nb_records": 510,
    "split_train_test": False,
    "labeled": False,
}
for column, value in cuad_info.items():
    datasets[column].append(value)

## ContractNLI

We have 17 hypotheses annotated on 607 non-disclosure agreements (NDAs).
The hypotheses are fixed throughout all the contracts including the test dataset.

Our dataset is provided as JSON files.

---

The core information in our dataset is:
* `text`: The full document text
* `spans`: List of spans as pairs of the start and end character indices.
* `annotation_sets`: It is provided as a list to accommodate multiple annotations per document. Since we only have a single annotation for each document, you may safely access the appropriate annotation by `document['annotation_sets'][0]['annotations']`.
* `annotations`: Each key represents a hypothesis key. `choice` is either `Entailment`, `Contradiction` or `NotMentioned`. `spans` is given as indices of `spans` above. `spans` is empty when `choice` is `NotMentioned`.
* `labels`: Each key represents a hypothesis key. `hypothesis` is the hypothesis text that should be used in NLI.

The JSON file comes with supplemental information. Users may simply ignore the information if you are only interested in developing machine learning systems.
* `id`: A unique ID throughout train, development and test datasets.
* `file_name`: The filename of the original document in the dataset zip file.
* `document_type`: One of `search-pdf` (a PDF from a search engine), `sec-text` (a text file from SEC filing) or `sec-html` (an HTML file from SEC filing).
* `url`: The URL that we obtained the document from.

### EDA

In [9]:
# Load the three ContractNLI splits (train / test / dev) from their JSON files.
# The files are decoded explicitly as UTF-8: the documents contain non-ASCII
# characters (e.g. curly quotes), and the platform default encoding used by
# open() is not UTF-8 everywhere.
with open(f"{candidate_datasets_path}/contract-nli/train.json", 'r', encoding='utf-8') as file:
    contract_nli_train = json.load(file)

with open(f"{candidate_datasets_path}/contract-nli/test.json", 'r', encoding='utf-8') as file:
    contract_nli_test = json.load(file)

with open(f"{candidate_datasets_path}/contract-nli/dev.json", 'r', encoding='utf-8') as file:
    contract_nli_dev = json.load(file)

In [10]:
# Inspect the top-level structure of a split.
contract_nli_train.keys()

dict_keys(['documents', 'labels'])

In [11]:
# Total number of documents across the train, test and dev splits.
sum(
    len(split['documents'])
    for split in (contract_nli_train, contract_nli_test, contract_nli_dev)
)

607

In [12]:
# Count the entries in the raw documents folder, to compare with the number of JSON documents.
! ls data/candidate-datasets/contract-nli/raw | wc -l

     607


In [13]:
# Example record: the first document of the training split.
contract_nli_train['documents'][0]

{'id': 34,
 'file_name': 'Annex E_Non-Disclosure and Confidentiality Agreement.pdf',
 'text': "NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\nThis NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT (“Agreement”) is made by and between:\n(i) the Office of the United Nations High Commissioner for Refugees, having its headquarters located at 94 rue de Montbrillant, 1202 Geneva, Switzerland (hereinafter “UNHCR” or the “Discloser”); and\n(ii) ________________________ , a company established in accordance with the laws of ________________________ and having its principal offices located at ________________________________________________ (hereinafter the “Bidder” or the “Recipient”).\nThe Discloser and Recipient are also referred to collectively as the “Parties” and individually as a “Party”.\nRECITALS\nWHEREAS in connection with RFP/2014/620, Request for Proposal for the provision Off-the-shelf Soft-skill, IT Online and HR specific E-learning Courses (the “RFP”), it is advantageous to share certai

### Adding dataset info

In [None]:
# Register ContractNLI in the summary table: one value appended per column of `datasets`.
contract_nli_info = {
    "name": "ContractNLI",
    "summary_file_format": "NA",
    "raw_files_format": "pdf/htm/txt",
    "aggregated_files_format": "json",
    "nb_records": 607,
    "split_train_test": True,
    "labeled": True,
}
for column, value in contract_nli_info.items():
    datasets[column].append(value)

## Legal Clauses

The data was scraped from a contracts website, from which I collected over 21k legal clauses covering 16 types of clauses related to ‘finance’. I used Python with several scraping libraries; because the website kept blocking automated queries, I used the Selenium library to extract the clause text and clause type.

### EDA

In [15]:
# Load the legal clauses CSV, using its first column as the row index.
legal_clauses = pd.read_csv(f"{candidate_datasets_path}/legal-clauses/legal_docs.csv", index_col=0)

In [16]:
# (rows, columns) of the clauses table.
legal_clauses.shape

(21187, 4)

In [17]:
# Preview the first clauses with their type and length statistics.
legal_clauses.head()

Unnamed: 0,clause_text,clause_type,totalwords,totalletters
0,"Make any Investments, except:",investments,4.0,30.0
1,No more than 45% of the “value” (as defined i...,investments,76.0,460.0
2,"Make or hold any Investments, except:",investments,6.0,38.0
3,The SubAdviser is hereby authorized and direc...,investments,228.0,1474.0
4,"Make any advance, loan, extension of credit (...",investments,52.0,329.0


### Adding dataset info

In [None]:
# Register Legal-Clauses in the summary table: one value appended per column of `datasets`.
legal_clauses_info = {
    "name": "Legal-Clauses",
    "summary_file_format": "NA",
    "raw_files_format": "NA",
    "aggregated_files_format": "csv",
    "nb_records": 21187,
    "split_train_test": False,
    "labeled": True,
}
for column, value in legal_clauses_info.items():
    datasets[column].append(value)

## LEDGAR

LEDGAR is a dataset of annotated clauses from contracts. The corpus was crawled and scraped from the public domain (SEC filings).

### EDA

In [19]:
# Load the LEDGAR dataset via the Hugging Face `datasets` library (may download on first use).
ledgar = load_dataset("MAdAiLab/lex_glue_ledgar")

In [20]:
# Show the available splits with their features and row counts.
ledgar

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 60000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 10000
    })
})

In [21]:
# Total number of LEDGAR records, computed from the loaded splits instead of
# hand-copied row counts so the figure cannot drift from the data.
sum(split.num_rows for split in ledgar.values())

80000

In [22]:
# Example record: the first row of the training split (clause text and integer label).
ledgar['train'][0]

{'text': 'Except as otherwise set forth in this Debenture, the Company, for itself and its legal representatives, successors and assigns, expressly waives presentment, protest, demand, notice of dishonor, notice of nonpayment, notice of maturity, notice of protest, presentment for the purpose of accelerating maturity, and diligence in collection.',
 'label': 97}

### Adding dataset info

In [None]:
# Register LEDGAR in the summary table: one value appended per column of `datasets`.
ledgar_info = {
    "name": "LEDGAR",
    "summary_file_format": "NA",
    "raw_files_format": "NA",
    "aggregated_files_format": "huggingface-dataset",
    "nb_records": 80000,
    "split_train_test": True,
    "labeled": True,
}
for column, value in ledgar_info.items():
    datasets[column].append(value)

## ToSDR Corpus

This repository contains a corpus of 12,215 terms of service (TOS) documents, which were scraped from the [TOSDR website](https://edit.tosdr.org/documents) using the Beautiful Soup and Requests libraries in Python. The dataset includes both HTML and text versions of the documents.

The data collection process involved retrieving the document text from TOSDR, saving it in an HTML file, removing non-English files detected through the LangDetect library, removing files less than 2B in size and less than 6 words, and converting the remaining files to a text format.

### EDA

In [24]:
# Count the entries in the HTML and plain-text folders of the ToSDR corpus.
! ls data/candidate-datasets/tosdr-terms-of-service-corpus-main/corpus/html | wc -l
! ls data/candidate-datasets/tosdr-terms-of-service-corpus-main/corpus/text | wc -l

   12215
    9496


### Example record

### Adding dataset info

In [None]:
# Register the ToSDR corpus in the summary table: one value appended per column of `datasets`.
tosdr_corpus_info = {
    "name": "ToSDR-corpus",
    "summary_file_format": "NA",
    "raw_files_format": "txt/html",
    "aggregated_files_format": "NA",
    "nb_records": 12215,
    "split_train_test": False,
    "labeled": False,
}
for column, value in tosdr_corpus_info.items():
    datasets[column].append(value)

## ToSDR Service API

**TODO:** decide whether the ToSDR Service API is needed.

## ToS Summaries

In this dataset, I have compiled Terms of Service Agreements, each accompanied by human-annotated summaries.

The data were scraped from [tosdr.org](https://tosdr.org/en).

The primary aim of this dataset is to assist in the simplification/summarization of complex and lengthy Terms of Service Agreements, thereby making them more accessible and understandable for users.

### Loading the data

In [None]:
# Read the ToS summaries file line by line, parsing each line as one JSON record.
with open(f"{candidate_datasets_path}/tos-summaries/dataset.json", 'r', encoding='utf-8') as f:
    tos_summaries = [json.loads(line) for line in f]

### EDA

In [27]:
# Number of records loaded.
len(tos_summaries)

901

### Example record

In [None]:
# Example record: the first entry of the file.
tos_summaries[0]

### Adding dataset info

In [None]:
# Register ToS-summaries in the summary table: one value appended per column of `datasets`.
tos_summaries_info = {
    "name": "ToS-summaries",
    "summary_file_format": "NA",
    "raw_files_format": "NA",
    "aggregated_files_format": "json",
    "nb_records": 901,
    "split_train_test": False,
    "labeled": False,
}
for column, value in tos_summaries_info.items():
    datasets[column].append(value)

## Privacy Policies Dataset

Research question: Does the wording of a privacy policy enable the user to know:
(i)	what data precisely will be collected;
(ii)	in what ways, precisely, the data will be used;
(iii)	what data will be used in what way OR is the wording such that the company enjoys significant flexibility in (i) what they collect; (ii) in what ways they can use the data, and (iii) what data they can use in what way or for what purpose?

TAGGING CATEGORIES

| Codename      | Description                                                                                                                                                                                                                             | Values   |
|---------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------|
| GenData       | Is there a clause describing what categories of data are collected that deploys such a general term, potentially followed by an open catalogue of examples, that it is not clear to the consumer what kinds of information will be gathered? | Yes: 1<br>No: 0 |
| GenUse        | Is there a clause describing the ways in which data will be used that deploys such a general term, potentially followed by an open catalogue of examples, that it is not clear to the consumer how exactly her data will be used?        | Yes: 1<br>No: 0 |
| NoDistinction | Does the privacy policy feature clauses describing how data will be used that do not explain what data exactly will be used in what way or for what purpose?                                                                            | Yes: 1<br>No: 0 |


For each category, separate cells for identified clause examples are included in the table. They contain up to 3 clauses that were tagged “1” during the privacy policy analysis.


### Loading the data

In [None]:
# Load the annotation tables produced by the two taggers (one Excel file each).
privacy_policies_dir = f"{candidate_datasets_path}/annotated-privacy-policies-of-100-online-platforms"
privacy_policies1 = pd.read_excel(f"{privacy_policies_dir}/PP_table Tagger1.xlsx")
privacy_policies2 = pd.read_excel(f"{privacy_policies_dir}/PP_table Tagger2.xlsx")

### EDA

In [30]:
# Print the (rows, columns) shape of each tagger's table, one per line.
for annotations in (privacy_policies1, privacy_policies2):
    print(annotations.shape)

(100, 20)
(100, 20)


In [31]:
# Count the entries in each tagger's folder, to compare with the 100 rows of the tables.
! ls "data/candidate-datasets/annotated-privacy-policies-of-100-online-platforms/Tagger 1/" | wc -l
! ls "data/candidate-datasets/annotated-privacy-policies-of-100-online-platforms/Tagger 2/" | wc -l

      97
      98


### Example record

In [32]:
# Preview the first rows of tagger 1's annotations.
privacy_policies1.head()

Unnamed: 0,ID,name,url,date,secto,hq,hq_cat,publ,GenData,GenUse,NoDist,DataExamp1,DataExamp2,DataExamp3,GenUse1,GenUse2,GenUse3,NoDist1,NoDist2,NoDist3
0,,Baidu AI Cloud,https://intl.cloud.baidu.com/doc/Agreements/in...,2020-11-02 00:00:00,Cloud storage,China,Other,Public,1.0,1.0,1.0,Operation and support information. Baidu AI Cl...,"To improve services, Baidu AI Cloud records yo...",8. Please note: In the following circumstances...,We use the collected information to provide an...,We will use the data for research for public i...,We share personal data with suppliers or partn...,In order to improve the security of your use o...,We use the collected information to provide an...,We will use the data for research for public i...
1,,Dropbox,https://www.dropbox.com/privacy,2022-01-14 00:00:00,Cloud storage,US,US,Public,1.0,1.0,1.0,We collect information related to how you use ...,Your devices (depending on their settings) may...,,We collect and use the following information t...,We collect and use the personal data described...,We also collect anduse personal data for our l...,We collect and use the following information t...,We collect and use the personal data described...,We also collect anduse personal data for our l...
2,,iCloud,https://www.apple.com/uk/legal/internet-servic...,,Cloud storage,US,US,Public,,,,,,,,,,,,
3,,Oktawave,https://oktawave.com/en/company/legal/privacy-...,2019-07-25 00:00:00,Cloud storage,Poland,Poland,Private,1.0,1.0,1.0,"In case of events organization, whether physic...",Data that has been\nsubjected to the\nbreach a...,,The above-mentioned data may be processed for ...,These data are collected to organise the\neven...,"Anonymisation for\nresearch and\ndevelopment,\...",Potentially any data\nsubject to this policy,,
4,,OVH,https://www.ovh.ie/personal-data-protection/pr...,,Cloud storage,France,EU,Public,1.0,1.0,1.0,Other information you provide directly to us. ...,Other information that we collect automaticall...,We may receive information from cookies (small...,"Certain features, like contact syncing, may re...",Functional Cookies: These help us provide enha...,To protect our services. We use information to...,To provide you with the services. We use your ...,"To do so, we\nuse your information to monitor ...",To personalize the product. We use your inform...


In [33]:
# Preview the first rows of tagger 2's annotations.
privacy_policies2.head()

Unnamed: 0,ID,name,url,date,secto,hq,hq_cat,publ,GenData,GenUse,NoDist,DataExamp1,DataExamp2,DataExamp3,GenUse1,GenUse2,GenUse3,NoDist1,NoDist2,NoDist3
0,,Baidu AI Cloud,https://intl.cloud.baidu.com/doc/Agreements/in...,2020-11-02 00:00:00,Cloud storage,China,Other,Public,1.0,1.0,1.0,"After you register, you can continue to comple...",,,We use the collected information to provide an...,"Please note: In the following circumstances, i...",,Baidu AI Cloud will use your personal informat...,To facilitate us to provide you with services ...,
1,,Dropbox,https://www.dropbox.com/privacy,2022-01-14 00:00:00,Cloud storage,US,US,Public,1.0,1.0,1.0,"We collect, and associate with your account, t...","To make that possible, we store,\nprocess, and...",We also collect information from and about the...,We collect and use the following information t...,"We use this information to provide, improve, a...","For example, we use device information to dete...",We collect and use the personal data described...,,
2,,iCloud,https://www.apple.com/uk/legal/internet-servic...,,Cloud storage,US,US,Public,,,,,,,,,,,,
3,,Oktawave,https://oktawave.com/en/company/legal/privacy-...,2019-07-25 00:00:00,Cloud storage,Poland,Poland,Private,1.0,1.0,1.0,The scope of personal data that we process dep...,"If you use the telephone customer service, ele...","Once you use our services, apart of the part I...",We\nprocess the abovementioned information in ...,"Additionally, with regard to the processing of...","Nevertheless, we may analyse information about...","Additionally, with regard to the processing of...",If you have been added by our client as a user...,"Nevertheless, we may analyse information about..."
4,,OVH,https://www.ovh.ie/personal-data-protection/pr...,,Cloud storage,France,EU,Public,1.0,1.0,1.0,"In case of events organization, whether physic...",Data that has been\nsubjected to the\nbreach a...,,The above-mentioned data may be processed for ...,These data are collected to organise the\neven...,"Anonymisation for\nresearch and\ndevelopment,\...",Potentially any data\nsubject to this policy,,


### Adding dataset info

In [None]:
# Register the privacy policies dataset in the summary table: one value appended per column of `datasets`.
privacy_policies_info = {
    "name": "privacy-policies",
    "summary_file_format": "xlsx",
    "raw_files_format": "pdf",
    "aggregated_files_format": "NA",
    "nb_records": 100,
    "split_train_test": False,
    "labeled": True,
}
for column, value in privacy_policies_info.items():
    datasets[column].append(value)

## Casehold

### Loading the data

In [46]:
# Load the "all" configuration of CaseHOLD via the Hugging Face `datasets` library (may download on first use).
casehold = load_dataset("casehold/casehold", "all")

### EDA

In [36]:
# Show the available splits with their features and row counts.
casehold

DatasetDict({
    train: Dataset({
        features: ['example_id', 'citing_prompt', 'holding_0', 'holding_1', 'holding_2', 'holding_3', 'holding_4', 'label'],
        num_rows: 42509
    })
    validation: Dataset({
        features: ['example_id', 'citing_prompt', 'holding_0', 'holding_1', 'holding_2', 'holding_3', 'holding_4', 'label'],
        num_rows: 5314
    })
    test: Dataset({
        features: ['example_id', 'citing_prompt', 'holding_0', 'holding_1', 'holding_2', 'holding_3', 'holding_4', 'label'],
        num_rows: 5314
    })
})

In [37]:
# Total number of CaseHOLD records, computed from the loaded splits instead of
# hand-copied row counts so the figure cannot drift from the data.
sum(split.num_rows for split in casehold.values())

53137

### Example record

In [38]:
# Example record: the first row of the training split (citing prompt, candidate holdings, label).
casehold['train'][0]

{'example_id': 0,
 'citing_prompt': "Drapeau’s cohorts, the cohort would be a “victim” of making the bomb. Further, firebombs are inherently dangerous. There is no peaceful purpose for making a bomb. Felony offenses that involve explosives qualify as “violent crimes” for purposes of enhancing the sentences of career offenders. See 18 U.S.C. § 924(e)(2)(B)(ii) (defining a “violent felony” as: “any crime punishable by imprisonment for a term exceeding one year ... that ... involves use of explosives”). Courts have found possession of a'bomb to be a crime of violence based on the lack of a nonviolent purpose for a bomb and the fact that, by its very nature, there is a substantial risk that the bomb would be used against the person or property of another. See United States v. Newman, 125 F.3d 863 (10th Cir.1997) (unpublished) (<HOLDING>); United States v. Dodge, 846 F.Supp. 181,",
 'holding_0': 'holding that possession of a pipe bomb is a crime of violence for purposes of 18 usc  3142f1',


### Adding dataset info

In [None]:
# Register CaseHOLD in the summary table: one value appended per column of `datasets`.
casehold_info = {
    "name": "casehold",
    "summary_file_format": "NA",
    "raw_files_format": "NA",
    "aggregated_files_format": "huggingface-dataset",
    "nb_records": 53137,
    "split_train_test": True,
    "labeled": True,
}
for column, value in casehold_info.items():
    datasets[column].append(value)

# Saving summary table

In [42]:
# Build the comparison table from the accumulated columns (one row per dataset) and display it.
# Every list in `datasets` must have the same length, otherwise the constructor raises ValueError.
datasets_df = pd.DataFrame(datasets)
datasets_df

Unnamed: 0,name,summary_file_format,raw_files_format,aggregated_files_format,nb_records,labeled,split_train_test
0,CUAD,csv,pdf/txt,,510,False,False
1,ContractNLI,,pdf/htm/txt,json,607,True,True
2,Legal-Clauses,,,csv,21187,True,False
3,LEDGAR,,,huggingface-dataset,80000,True,True
4,ToSDR-corpus,,txt/html,,12215,False,False
5,ToS-summaries,,,json,901,False,False
6,privacy-policies,xlsx,pdf,,100,True,False
7,casehold,,,huggingface-dataset,53137,True,True


In [44]:
# Persist the comparison table as CSV in the working directory, without the row index.
datasets_df.to_csv("candidate-datasets-summary.csv", index=False)