In [70]:
import pandas as pd
import numpy as np
import os

from tqdm import tqdm
import json

from langdetect import detect

import ast

import re

In [71]:
try:
    wd = os.path.dirname(os.path.abspath(__file__))
except:
    wd = os.getcwd()

wd = wd.split("/")
repo_path_idx = wd.index("fineprint")
repo_path = "/".join(wd[:repo_path_idx + 1])

# 1. Summarizer dataset

In [98]:
with open(f"{repo_path}/data/candidate-datasets/tos-summaries/dataset.json", 'r') as file:
    summary_dataset = [json.loads(line) for line in file]

output_file = f"{repo_path}/data/stage2-analyzing/summaries.json"

with open(output_file, 'w') as json_file:
    json.dump(summary_dataset, json_file, indent=4)

Same as it came originally

# 2. ToS dataset

In [72]:
categories = False
try:
    cuad_categories = pd.read_csv(f"{repo_path}/curating-datasets/CUAD_categories.csv")
    categories = True
except:
    print("Categories file not computed. Computing now...")

In [73]:
if not categories:
    with open(f"{repo_path}/data/candidate-datasets/CUAD_v1/CUAD_v1_README.txt") as f:
        lines = f.readlines()

    category_list_i = lines.index("CATEGORY LIST\n")
    next_section_i = lines.index("SOURCE OF CONTRACTS\n")
    cuad_categories = lines[category_list_i:next_section_i]
    cuad_categories = [x.replace("\t", "").replace("\n", "").strip() for x in cuad_categories]
    cuad_categories = [x for x in cuad_categories if len(x) > 0]

    cuad_categories_dict = {'category': [], 'description': [], 'answer_format': []}

    i = 0
    while i < len(cuad_categories):
        if cuad_categories[i].isdigit():
            category = cuad_categories[i+1].split(":")[1].strip()
            description = cuad_categories[i+2].split(":")[1].strip()
            answer_format = cuad_categories[i+3].split(":")[1].strip()

            cuad_categories_dict['category'].append(category)
            cuad_categories_dict['description'].append(description)
            cuad_categories_dict['answer_format'].append(answer_format)
            i += 4
        i += 1

    cuad_categories = pd.DataFrame(cuad_categories_dict)
    cuad_categories.to_csv(f"{repo_path}/curating-datasets/CUAD_categories.csv", index=False)

In [74]:
cuad_categories

Unnamed: 0,category,description,answer_format
0,Document Name,The name of the contract,Contract Name
1,Parties,The two or more parties who signed the contract,Entity or individual names
2,Agreement Date,The date of the contract,Date (mm/dd/yyyy)
3,Effective Date,The date when the contract is effective,Date (mm/dd/yyyy)
4,Expiration Date,On what date will the contract's initial term ...,Date (mm/dd/yyyy) / Perpetual
5,Renewal Term,What is the renewal term after the initial ter...,[Successive] number of years/months / Perpetual
6,Notice to Terminate Renewal,What is the notice period required to terminat...,Number of days/months/year(s)
7,Governing Law,Which state/country's law governs the interpre...,"Name of a US State / non-US Province, Country"
8,Most Favored Nation,Is there a clause that if a third party gets b...,Yes/No
9,Non-Compete,Is there a restriction on the ability of a par...,Yes/No


We're only interested in the Yes/No categories, since they indicate wether the clause belongs to that category.

In [75]:
cuad = pd.read_csv(f"{repo_path}/data/candidate-datasets/CUAD_v1/master_clauses.csv")

In [76]:
cuad.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 510 entries, 0 to 509
Data columns (total 83 columns):
 #   Column                                      Non-Null Count  Dtype 
---  ------                                      --------------  ----- 
 0   Filename                                    510 non-null    object
 1   Document Name                               510 non-null    object
 2   Document Name-Answer                        510 non-null    object
 3   Parties                                     510 non-null    object
 4   Parties-Answer                              509 non-null    object
 5   Agreement Date                              510 non-null    object
 6   Agreement Date-Answer                       465 non-null    object
 7   Effective Date                              510 non-null    object
 8   Effective Date-Answer                       359 non-null    object
 9   Expiration Date                             510 non-null    object
 10  Expiration Date-Answer    

In [77]:
cuad_filtered = cuad.iloc[:, [0] + list(range(17,75)) + list(range(77, 83))]

In [78]:
cuad_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 510 entries, 0 to 509
Data columns (total 65 columns):
 #   Column                                    Non-Null Count  Dtype 
---  ------                                    --------------  ----- 
 0   Filename                                  510 non-null    object
 1   Most Favored Nation                       510 non-null    object
 2   Most Favored Nation-Answer                510 non-null    object
 3   Competitive Restriction Exception         510 non-null    object
 4   Competitive Restriction Exception-Answer  510 non-null    object
 5   Non-Compete                               510 non-null    object
 6   Non-Compete-Answer                        510 non-null    object
 7   Exclusivity                               510 non-null    object
 8   Exclusivity-Answer                        510 non-null    object
 9   No-Solicit Of Customers                   510 non-null    object
 10  No-Solicit Of Customers-Answer            510 non-

In [79]:
cuad_columns = list(cuad_filtered.columns)

In [80]:
tos_dataset = []

nb_clauses = 0
nb_neutral_clauses = 0

rows, columns = cuad_filtered.shape
for i in range(rows):
    row = cuad_filtered.iloc[i]
    filename = row[0]
    filename = filename.replace(".pdf", ".txt").replace(".PDF", ".txt")
    try:
        with open(f"{repo_path}/data/candidate-datasets/CUAD_v1/full_contract_txt/{filename}", "r") as f:
            contract = f.read()
        have_contract = True
    except:
        have_contract = False
    clause_cols = [num for num in range(2, columns, 2)]
    for j in clause_cols:
        if cuad_filtered.iloc[i, j] == "Yes":
            clause = cuad_filtered.iloc[i, j-1]
            clause_list = ast.literal_eval(clause)
            for clause in clause_list:
                if have_contract:
                    neutral_clauses = True
                    if contract.find(clause) != -1:
                        contract = contract.replace(clause, "")
                    else:
                        neutral_clauses = False
                clause = clause.replace("\n", " ").replace("\t", " ").replace('\u00ad', "").strip()
                clause_dict = {'clause': clause,
                            'label': cuad_columns[j-1]}
                tos_dataset.append(clause_dict)
                nb_clauses += 1
    if neutral_clauses:
        contract_clauses = contract.split('\n')
        contract_clauses = [clause.strip() for clause in contract_clauses]
        contract_clauses = [re.sub(r'\s+', ' ', clause) for clause in contract_clauses]
        contract_clauses = [clause for clause in contract_clauses if len(clause) > 50]
        for clause in contract_clauses:
            if nb_neutral_clauses < nb_clauses:
                clause = clause.replace("\n", " ").replace("\t", " ").replace('\u00ad', "").strip()
                clause_dict = {'clause': clause, 'label': 'Non_Relevant'}
                tos_dataset.append(clause_dict)
                nb_neutral_clauses += 1


In [81]:
print(nb_clauses)
print(nb_neutral_clauses)

7746
7733


In [82]:
tos_dataset

[{'clause': 'Company shall not specify the business practices of MA, nor regulate the manner in which MA shall operate its business, provided that MA (a) conducts business in a manner that reflects favorably at all times on the Technology sold and the good name, goodwill and reputation of Company and its affiliates<omitted>',
  'label': 'Non-Disparagement'},
 {'clause': 'MA may not assign, sell, lease or otherwise transfer in whole or in party any of the rights granted pursuant to this Agreement without prior written approval of Company.',
  'label': 'Anti-Assignment'},
 {'clause': 'INITIAL ORDER COMMITMENT - MA commits to purchase a minimum of 100 Units in aggregate within the Territory within the first six months of term of this Agreement.',
  'label': 'Minimum Commitment'},
 {'clause': 'Company hereby grants MA, during the term of this Agreement, the right to use Company and/or Company trade names, trademarks or service marks on Technology or in advertising or promotion relating dir

In [83]:
output_file = f"{repo_path}/data/stage2-analyzing/tos_dataset.json"

with open(output_file, 'w') as json_file:
    json.dump(tos_dataset, json_file, indent=4)

# 3. PP dataset

## TAGGING CATEGORIES

| Codename      | Description | Values      |
|---------------|-------------|-------------|
| **GenData**   | Is there a clause describing what categories of data are collected that deploys such a general term, potentially followed by an open catalogue of examples, that it is not clear to the consumer what kinds of information will be gathered? | Yes: 1<br>No: 0 |
| **GenUse**    | Is there a clause describing the ways in which data will be used that deploys such a general term, potentially followed by an open catalogue of example, that is not clear to the consumer how exactly her data will be used? | Yes: 1<br>No: 0 |
| **NoDistinction** | Does the privacy policy feature clauses describing how data will be used that do not explain what data exactly will be used in what way or for what purpose | Yes: 1<br>No: 0 |


In [84]:
annotated_pp1 = pd.read_excel(f"{repo_path}/data/candidate-datasets/annotated-privacy-policies-of-100-online-platforms/PP_table Tagger1.xlsx")
annotated_pp2 = pd.read_excel(f"{repo_path}/data/candidate-datasets/annotated-privacy-policies-of-100-online-platforms/PP_table Tagger2.xlsx")

In [85]:
annotated_pp1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 20 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ID          0 non-null      float64
 1   name        100 non-null    object 
 2   url         99 non-null     object 
 3   date        84 non-null     object 
 4   secto       100 non-null    object 
 5   hq          100 non-null    object 
 6   hq_cat      100 non-null    object 
 7   publ        100 non-null    object 
 8   GenData     98 non-null     float64
 9   GenUse      98 non-null     float64
 10  NoDist      98 non-null     float64
 11  DataExamp1  98 non-null     object 
 12  DataExamp2  89 non-null     object 
 13  DataExamp3  71 non-null     object 
 14  GenUse1     96 non-null     object 
 15  GenUse2     92 non-null     object 
 16  GenUse3     87 non-null     object 
 17  NoDist1     89 non-null     object 
 18  NoDist2     52 non-null     object 
 19  NoDist3     26 non-null     ob

In [86]:
annotated_pp2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 20 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ID          0 non-null      float64
 1   name        100 non-null    object 
 2   url         99 non-null     object 
 3   date        84 non-null     object 
 4   secto       100 non-null    object 
 5   hq          100 non-null    object 
 6   hq_cat      100 non-null    object 
 7   publ        100 non-null    object 
 8   GenData     98 non-null     float64
 9   GenUse      98 non-null     float64
 10  NoDist      98 non-null     float64
 11  DataExamp1  98 non-null     object 
 12  DataExamp2  87 non-null     object 
 13  DataExamp3  74 non-null     object 
 14  GenUse1     95 non-null     object 
 15  GenUse2     94 non-null     object 
 16  GenUse3     89 non-null     object 
 17  NoDist1     89 non-null     object 
 18  NoDist2     53 non-null     object 
 19  NoDist3     26 non-null     ob

In [87]:
annotated_pp1_filtered = annotated_pp1.iloc[:, 8:]
annotated_pp2_filtered = annotated_pp2.iloc[:, 8:]

In [88]:
annotated_pp_columns = list(annotated_pp1_filtered.columns)

In [89]:
annotated_pp1_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   GenData     98 non-null     float64
 1   GenUse      98 non-null     float64
 2   NoDist      98 non-null     float64
 3   DataExamp1  98 non-null     object 
 4   DataExamp2  89 non-null     object 
 5   DataExamp3  71 non-null     object 
 6   GenUse1     96 non-null     object 
 7   GenUse2     92 non-null     object 
 8   GenUse3     87 non-null     object 
 9   NoDist1     89 non-null     object 
 10  NoDist2     52 non-null     object 
 11  NoDist3     26 non-null     object 
dtypes: float64(3), object(9)
memory usage: 9.5+ KB


In [90]:
annotated_pp1_filtered = annotated_pp1_filtered.fillna("")
annotated_pp2_filtered = annotated_pp2_filtered.fillna("")

In [91]:
pp_dataset = []

def process_doc_pp(annotated_pp, annotated_pp_columns, pp_dataset):
    rows, columns = annotated_pp.shape
    for i in range(rows):
        row = annotated_pp.iloc[i]
        clause_cols = [num for num in range(2, columns, 2)]
        for j in range(3):
            if annotated_pp.iloc[i, j] == 1:
                for k in range((j+1)*3, (j+1)*3+3):
                    clause = annotated_pp.iloc[i, k]
                    if len(clause) > 0:
                        lang = detect(clause)
                        if lang == "en":
                            clause = clause.replace("\n", " ").replace("\t", " ").replace('\u00ad', "").strip()
                            clause_dict = {'clause': clause,
                                            'label': annotated_pp_columns[j]}
                            pp_dataset.append(clause_dict)
    return pp_dataset

pp_dataset = process_doc_pp(annotated_pp1_filtered, annotated_pp_columns, pp_dataset)
pp_dataset = process_doc_pp(annotated_pp2_filtered, annotated_pp_columns, pp_dataset)

In [92]:
pp_dataset

[{'clause': 'Operation and support information. Baidu AI Cloud initiates operating activities for newly launched services and functions from time to time. If you participate in relevant operating activities, we may collect yourcontact information (such as phone number), contact address and transaction account information through the interface of operating activities so that you can successfully participate in the activities.',
  'label': 'GenData'},
 {'clause': 'To improve services, Baidu AI Cloud records your service usage, resource consumption, consultation records, work order information and the communication process between Baidu AI Cloud and you.',
  'label': 'GenData'},
 {'clause': '8. Please note: In the following circumstances, it is not necessary to obtain your prior authorization and consent to collect and use your personal information: (…) g) It is necessary for providing you with products or services according to your requirements; h) Necessary to maintain the safe and stab

In [93]:
output_file = f"{repo_path}/data/stage2-analyzing/pp_dataset.json"

with open(output_file, 'w') as json_file:
    json.dump(pp_dataset, json_file, indent=4)