## Experiments

Let's start playing around with our data

In [16]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import tqdm
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from collections import Counter
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Read in training data
This dataset contains 100 annotated terms of service contracts, each row represents a sentence, which carries on it a label. The label corresponds to a different type of potential unfairness, as defined by the authors of CLAUDETTE, the previous paper from which this dataset came from. 

In [3]:
df = pd.read_csv('../data/dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,A,CH,CR,J,LAW,LTD,PINC,TER,USE,document,document_ID,label,text,TER_targets,LTD_targets,A_targets,CH_targets,CR_targets
0,0,0,0,0,0,0,0,0,0,0,Mozilla,0,0,websites & communications terms of use,,,,,
1,1,0,0,0,0,0,0,0,0,0,Mozilla,0,0,please read the terms of this entire document ...,,,,,
2,2,0,0,0,0,0,0,0,0,1,Mozilla,0,1,by accessing or signing up to receive communic...,,,,,
3,3,0,0,0,0,0,0,0,0,0,Mozilla,0,0,our websites include multiple domains such as ...,,,,,
4,4,0,0,0,0,0,0,0,0,0,Mozilla,0,0,you may also recognize our websites by nicknam...,,,,,


get the vocabulary for the dataset

Now let's load in a pretrained huggingface BERT model  (https://huggingface.co/pile-of-law/legalbert-large-1.7M-2) to get the word embeddings of each sentence

In [4]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('pile-of-law/legalbert-large-1.7M-2')
model = BertModel.from_pretrained('pile-of-law/legalbert-large-1.7M-2')
model.to(device)
text = "This is a test"
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input.to(device))

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
df

Unnamed: 0.1,Unnamed: 0,A,CH,CR,J,LAW,LTD,PINC,TER,USE,document,document_ID,label,text,TER_targets,LTD_targets,A_targets,CH_targets,CR_targets,text_tokenized
0,0,0,0,0,0,0,0,0,0,0,Mozilla,0,0,websites & communications terms of use,,,,,,"[websites, &, communications, terms, of, use]"
1,1,0,0,0,0,0,0,0,0,0,Mozilla,0,0,please read the terms of this entire document ...,,,,,,"[please, read, the, terms, of, this, entire, d..."
2,2,0,0,0,0,0,0,0,0,1,Mozilla,0,1,by accessing or signing up to receive communic...,,,,,,"[by, accessing, or, signing, up, to, receive, ..."
3,3,0,0,0,0,0,0,0,0,0,Mozilla,0,0,our websites include multiple domains such as ...,,,,,,"[our, websites, include, multiple, domains, su..."
4,4,0,0,0,0,0,0,0,0,0,Mozilla,0,0,you may also recognize our websites by nicknam...,,,,,,"[you, may, also, recognize, our, websites, by,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20412,20412,0,0,0,0,0,0,0,0,0,Syncme,99,0,you may terminate your account at our service ...,,,,,,"[you, may, terminate, your, account, at, our, ..."
20413,20413,0,0,0,0,0,0,0,0,0,Syncme,99,0,the following provisions shall survive termina...,,,,,,"[the, following, provisions, shall, survive, t..."
20414,20414,0,1,0,0,0,0,0,0,0,Syncme,99,1,"the company reserves the right , at its sole d...",,,,"[0, 2]",,"[the, company, reserves, the, right, ,, at, it..."
20415,20415,0,0,0,0,0,0,0,0,1,Syncme,99,1,"your continued use of the service , following ...",,,,,,"[your, continued, use, of, the, service, ,, fo..."


In [10]:
x_train, x_test, y_train, y_test = train_test_split(x_tokenized, y, test_size=0.2, random_state=42)

In [13]:
x_train[0]

['these',
 'terms',
 'supersede',
 'any',
 'prior',
 'agreements',
 'or',
 'earlier',
 'versions',
 'of',
 'these',
 'terms',
 'between',
 'you',
 'and',
 'ever',
 '##note',
 'for',
 'the',
 'use',
 'of',
 'the',
 'service',
 'as',
 'of',
 'the',
 'effective',
 'date',
 'indicated',
 'at',
 'the',
 'top',
 'of',
 'these',
 'terms',
 '.']

In [11]:
lr = LogisticRegression(max_iter=1000)
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)
accuracy = np.mean(y_pred == y_test)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (16333,) + inhomogeneous part.

The pre trained model gives encodings of shape n x 1024 for a sentence, where n is the number of words + 2 (for start and end tokens) this will be useful to know when we use it as part of our model.

In [79]:
text = "This is a another test"
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input.to(device))

In [86]:
encoded_input

{'input_ids': tensor([[   2,  838,  790,   43, 2273, 2826,    3]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [82]:
df["encodings"] = df["text"].apply(lambda x: tokenizer(x, return_tensors='pt'))