In [None]:
import bisect
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
sns.set_theme()

import sys
sys.path.append('..')

An early notebook used to look at points for brainstorming, and to help decide whether sentence classification or sentence spans is the right paradigm. It finds that a lot of points span multiple sentences, so sentence spans look like the better fit, but points covering more than 5 sentences are rare.

In [2]:
VERSION = '181021'


def _load_clean_pickle(name):
    """Return the object stored in ``../data/{name}_{VERSION}_clean.pkl``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the object has been loaded.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files
    with open(f'../data/{name}_{VERSION}_clean.pkl', 'rb') as pkl_file:
        return pickle.load(pkl_file)


cases = _load_clean_pickle('cases')
documents = _load_clean_pickle('documents')
points = _load_clean_pickle('points')
services = _load_clean_pickle('services')
topics = _load_clean_pickle('topics')

In [None]:
# Restrict both tables to rows whose `lang` is 'en' (English only for now)
documents = documents.loc[documents['lang'] == 'en']
points = points.loc[points['lang'] == 'en']

In [4]:
# Attach each Point's Document so the quote contexts can be checked.
# how='left' keeps Points even when no Document row matches document_id.
points = points.merge(
    documents,
    how='left',
    left_on='document_id',
    right_index=True,
    suffixes=['_point', '_doc'],
)
# Then attach the Case info, matching case_id against the index of `cases`
points = points.merge(
    cases,
    left_on='case_id',
    right_index=True,
    suffixes=['_point', '_case'],
)

In [5]:
# Distribution of the number of Points attached to each Case
points_per_case = points.case_id.value_counts()
print(f"Points per Case stats:\n{points_per_case.describe()}\n")

# Rank (case_id, title) pairs by Point count once, then show both ends of the ranking
case_sizes = points.groupby(['case_id', 'title_case']).size().sort_values(ascending=False)
print(case_sizes.head(15))
case_sizes.tail(10)

Points per Case stats:
count    241.000000
mean      66.987552
std       72.775310
min        1.000000
25%       18.000000
50%       41.000000
75%       89.000000
max      551.000000
Name: case_id, dtype: float64

case_id  title_case                                                                                                         
331      There is a date of the last update of the agreements                                                                   551
152      This service is only available to users over a certain age                                                             358
286      The service is provided 'as is' and to be used at your sole risk                                                       339
323      You are tracked via web beacons, tracking pixels, browser fingerprinting, and/or device fingerprinting                 278
146      You agree to defend, indemnify, and hold the service harmless in case of a claim related to your use of the service    256
3

case_id  title_case                                                                                            
317      You aren’t allowed to publicly post private messages                                                      3
497      Prices and fees may be changed at any time, without notice to you                                         2
168      The service is not transparent regarding government requests or inquiries that may involve your data.     1
309      You have the right to request lower Charges from Third Party Providers                                    1
141      Inconvenient process for obtaining personal data                                                          1
330      The service disables software that you are not licensed to use.                                           1
378      Service fines users for Terms of Service violations                                                       1
194      The service does not index or open files that you upload    

In [6]:
# Quote length as the difference between the quoteEnd and quoteStart offsets
points['quote_len'] = points['quoteEnd'].sub(points['quoteStart'])

In [7]:
# Keep only Points whose status is 'approved'
is_approved = points['status'] == 'approved'
approved_points = points.loc[is_approved]

approved_per_case = approved_points['case_id'].value_counts()
print(f"Approved points per Case stats:\n{approved_per_case.describe()}\n")

# Largest Cases by number of approved Points
approved_case_sizes = approved_points.groupby(['case_id', 'title_case']).size()
print(approved_case_sizes.sort_values(ascending=False).head(15))

Approved points per Case stats:
count    241.000000
mean      60.502075
std       66.537028
min        1.000000
25%       15.000000
50%       36.000000
75%       82.000000
max      499.000000
Name: case_id, dtype: float64

case_id  title_case                                                                                                   
331      There is a date of the last update of the agreements                                                             499
286      The service is provided 'as is' and to be used at your sole risk                                                 314
152      This service is only available to users over a certain age                                                       312
323      You are tracked via web beacons, tracking pixels, browser fingerprinting, and/or device fingerprinting           256
325      Third-party cookies are used for statistics                                                                      229
399      Your IP address is 

In [8]:
# Inspecting the largest dataset
# "There is a date of the last update of the agreements"
case_quotes = points.loc[points.case_id == 331, 'quoteText']
# Fixed seed so the printed sample is reproducible across runs; the sample size is
# capped at the number of available quotes so a small Case cannot raise.
quote_sample = case_quotes.sample(min(20, len(case_quotes)), random_state=0)
print('\n-------------------\n'.join(quote_sample.values))

Policy published 11 September 2020.</p>
-------------------
We always indicate the date the last changes were published
-------------------
Last Updated
-------------------
<strong>EFFECTIVE DATE:</strong> FEBRUARY 18, 2021</p>
-------------------
Posted on: September 4th, 2008
-------------------
Effective Date: 16/10/2020
-------------------
<small>Last Updated: September 2019</small>
-------------------
Last Updated January 1st, 2021</p>
-------------------
The last update to our Terms of Service was posted on September 5, 2014.</p>
-------------------
These Individual Terms will come into effect on July 18, 2019.
</li>
<li>The Individual Terms will also apply to acts committed by Users before said Individual Terms came into effect.
</li>
</ul> Revision <ul>
<li>March 30, 2020: Terms revised in entirety as individual terms associated with the pixiv Inc.
Service Master Terms of Use.
</li>
<li>July 18, 2019 
-------------------
<strong>Last Updated:&nbsp.
June 9, 2020</strong>
-------

In [9]:
# "This service is only available to users over a certain age"
case_quotes = points.loc[points.case_id == 152, 'quoteText']
# Fixed seed so the printed sample is reproducible across runs; the sample size is
# capped at the number of available quotes so a small Case cannot raise.
quote_sample = case_quotes.sample(min(20, len(case_quotes)), random_state=0)
print('\n-------------------\n'.join(quote_sample.values))

In any case, you confirm that you are 16 or older (applicable to users located in&nbsp;California and in the European Economic Area (“EEA”) excluding the U.K.) or 13 or older (applicable to users located in the rest of the world including the U.K.), and acknowledge that the Services are not intended for children under such ages, as applicable.
-------------------
iStudiez Pro has 4+ age rating on the Stores, where it is present.
It means, that there are no age limitations if you don’t use iStudiez Pro Cloud Sync service or contact us (where your contact details are required).</p>
<p>To create iStudiez Pro Cloud Sync account you must be at least 13 years old.
-------------------
The Site is intended for users who are at least 13 years of age.
All users who are minors in the jurisdiction in which they reside (generally under the age of 18) must have the permission of, and be directly supervised by, their parent or guardian to use the Site.
If you are a minor, you must have your parent or

In [10]:
"""
See sent_splitting_benchmarks.py for a way to try out a few different sentence splitting approaches with spacy.
For now, going with the parser-based medium model. It does still tend to not add enough sentence boundaries.
It especially struggles when a period is not followed by whitespace, like "asdf.</p>"
"""

# Only Documents referenced by at least one Point need to be analyzed
has_points = documents.index.isin(points.document_id)
docs_with_points = documents[has_points]

# Load the medium English model with the attribute_ruler, lemmatizer and ner components disabled
nlp_parser = spacy.load(
    'en_core_web_md',
    disable=['attribute_ruler', 'lemmatizer', 'ner'],
)

In [11]:
"""
In order to know whether single sentence classification would work, we should find out how many quotes cover more than one sentence
"""

def _count_spanned_sents(sent_starts, quote_start, quote_end):
    """Count the sentences overlapped by the span [quote_start, quote_end).

    Args:
        sent_starts: sorted list of sentence start offsets for one document.
        quote_start: start offset of the quote.
        quote_end: end offset of the quote (a sentence starting exactly here is not counted).

    Returns:
        The number of sentence starts that lie before ``quote_end``, counted from the
        last sentence that starts at or before ``quote_start``.
    """
    # Index of the last sentence starting at or before the quote start
    first_sent = bisect.bisect_right(sent_starts, quote_start) - 1
    # Number of sentences starting strictly before the quote end
    return bisect.bisect_left(sent_starts, quote_end) - first_sent


# Group the Point labels by document once, instead of scanning every Point for each document
point_ids_by_doc = points.groupby('document_id').groups

num_sents_by_point = {}
parsed_docs = nlp_parser.pipe(docs_with_points.text, n_process=4, batch_size=10)
for doc_id, doc in zip(docs_with_points.id, parsed_docs):
    # Get sentences with actual content (skip empty / whitespace-only ones)
    sent_starts = sorted(sent.start_char for sent in doc.sents if sent.text.strip())
    doc_points = points.loc[point_ids_by_doc.get(doc_id, []), ['quoteStart', 'quoteEnd']]
    for point_id, quote_start, quote_end in doc_points.itertuples(name=None):
        num_sents_by_point[point_id] = _count_spanned_sents(sent_starts, quote_start, quote_end)

# Single aligned assignment; Points whose document was not parsed are left as NaN
points['num_sents'] = pd.Series(num_sents_by_point, dtype='float64')

points.num_sents.describe()

count    16144.000000
mean         1.590374
std          1.543484
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max         72.000000
Name: num_sents, dtype: float64

In [12]:
# Store the per-Point sentence counts as integers
points['num_sents'] = points['num_sents'].astype(int)
# Number of Points whose quote covers more than 1 sentence
points['num_sents'].gt(1).sum()

5377

In [13]:
# Compute mean sentences per Point for each Case, as well as the proportion of Points
# that are multisentence. Also the proportions over 3 and over 5 sentences.
approved_points = points[points.status == 'approved']


def _prop_points_over(min_sents):
    """Return, per case_id, the proportion of approved Points with more than ``min_sents`` sentences."""
    exceeds = approved_points.num_sents > min_sents
    # The mean of a boolean Series is the share of True values; the name is dropped so the
    # result is an unnamed Series indexed by case_id.
    return exceeds.groupby(approved_points.case_id).mean().rename(None)


mean_sents_per_point = approved_points.groupby('case_id').num_sents.mean()
print(mean_sents_per_point.head())

multisent_point_prop = _prop_points_over(1)
overthree_point_prop = _prop_points_over(3)
overfive_point_prop = _prop_points_over(5)
print(multisent_point_prop.head())

# .at only addresses a single scalar cell; assigning to many row labels at once needs .loc,
# which aligns each Series with `cases` on the case id.
cases.loc[mean_sents_per_point.index, 'mean_sents_per_point'] = mean_sents_per_point
cases.loc[multisent_point_prop.index, 'multisent_point_prop'] = multisent_point_prop
cases.loc[multisent_point_prop.index, 'overthree_point_prop'] = overthree_point_prop
cases.loc[multisent_point_prop.index, 'overfive_point_prop'] = overfive_point_prop

case_id
117    1.756522
118    1.270833
119    1.500000
120    1.777778
121    2.087912
Name: num_sents, dtype: float64
case_id
117    0.408696
118    0.208333
119    0.187500
120    0.555556
121    0.554945
dtype: float64


In [14]:
# We're only interested in Cases with enough approved Points to learn from
point_counts = approved_points.case_id.value_counts()
# .at only addresses a single scalar cell; .loc accepts the whole index of case ids
# and aligns the counts with `cases` on it.
cases.loc[point_counts.index, 'num_points'] = point_counts
cases[cases.num_points >= 50].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98 entries, 175 to 287
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   id                    98 non-null     int64         
 1   classification        98 non-null     object        
 2   score                 98 non-null     int64         
 3   title                 98 non-null     object        
 4   description           98 non-null     object        
 5   topic_id              98 non-null     int64         
 6   created_at            98 non-null     datetime64[ns]
 7   updated_at            98 non-null     datetime64[ns]
 8   privacy_related       33 non-null     object        
 9   docbot_regex          43 non-null     object        
 10  mean_sents_per_point  98 non-null     float64       
 11  multisent_point_prop  98 non-null     float64       
 12  overthree_point_prop  98 non-null     float64       
 13  overfive_point_prop

In [15]:
# Which Cases with at least 50 counted Points have the highest mean sentences per point?
cols = [
    'title',
    'num_points',
    'mean_sents_per_point',
    'multisent_point_prop',
    'overthree_point_prop',
    'overfive_point_prop',
]
has_enough_points = cases['num_points'] >= 50
cases.loc[has_enough_points, cols].sort_values('mean_sents_per_point', ascending=False).head(15)

Unnamed: 0_level_0,title,num_points,mean_sents_per_point,multisent_point_prop,overthree_point_prop,overfive_point_prop
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
482,The service claims to be CCPA compliant for Ca...,51.0,3.078431,0.294118,0.098039,0.098039
226,Information is provided about security practices,99.0,2.686869,0.747475,0.232323,0.080808
278,The service is not responsible for linked or (...,70.0,2.5,0.757143,0.2,0.057143
376,Many different types of personal data are coll...,96.0,2.458333,0.458333,0.15625,0.072917
329,"You should revisit the terms periodically, alt...",104.0,2.375,0.711538,0.192308,0.0
138,Instructions are provided on how to submit a c...,90.0,2.222222,0.388889,0.122222,0.077778
164,This service reserves the right to disclose yo...,106.0,2.188679,0.490566,0.188679,0.056604
146,"You agree to defend, indemnify, and hold the s...",174.0,2.097701,0.436782,0.189655,0.057471
307,You are being tracked via social media cookie...,77.0,2.090909,0.649351,0.103896,0.012987
121,Terms may be changed any time at their discre...,182.0,2.087912,0.554945,0.131868,0.021978


In [16]:
# What cases have the highest proportion of multisentence points?
cases.loc[cases['num_points'].ge(50), cols].sort_values('multisent_point_prop', ascending=False).head(15)

Unnamed: 0_level_0,title,num_points,mean_sents_per_point,multisent_point_prop,overthree_point_prop,overfive_point_prop
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
278,The service is not responsible for linked or (...,70.0,2.5,0.757143,0.2,0.057143
226,Information is provided about security practices,99.0,2.686869,0.747475,0.232323,0.080808
329,"You should revisit the terms periodically, alt...",104.0,2.375,0.711538,0.192308,0.0
307,You are being tracked via social media cookie...,77.0,2.090909,0.649351,0.103896,0.012987
279,This service assumes no responsibility and lia...,80.0,2.075,0.6125,0.125,0.025
122,"The terms may be changed at any time, but you ...",101.0,1.990099,0.60396,0.039604,0.019802
220,Your personal data is used to employ targeted ...,102.0,1.852941,0.568627,0.058824,0.0
121,Terms may be changed any time at their discre...,182.0,2.087912,0.554945,0.131868,0.021978
187,Your data may be processed and stored anywhere...,122.0,1.909836,0.52459,0.090164,0.016393
293,This service assumes no liability for any loss...,190.0,2.073684,0.510526,0.131579,0.052632


In [17]:
# Which cases have the largest share of points longer than 3 sentences?
cases.loc[cases['num_points'].ge(50), cols].sort_values('overthree_point_prop', ascending=False).head(15)

Unnamed: 0_level_0,title,num_points,mean_sents_per_point,multisent_point_prop,overthree_point_prop,overfive_point_prop
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
226,Information is provided about security practices,99.0,2.686869,0.747475,0.232323,0.080808
278,The service is not responsible for linked or (...,70.0,2.5,0.757143,0.2,0.057143
329,"You should revisit the terms periodically, alt...",104.0,2.375,0.711538,0.192308,0.0
146,"You agree to defend, indemnify, and hold the s...",174.0,2.097701,0.436782,0.189655,0.057471
164,This service reserves the right to disclose yo...,106.0,2.188679,0.490566,0.188679,0.056604
376,Many different types of personal data are coll...,96.0,2.458333,0.458333,0.15625,0.072917
121,Terms may be changed any time at their discre...,182.0,2.087912,0.554945,0.131868,0.021978
293,This service assumes no liability for any loss...,190.0,2.073684,0.510526,0.131579,0.052632
300,A complaint mechanism is provided for the hand...,85.0,1.788235,0.376471,0.129412,0.058824
279,This service assumes no responsibility and lia...,80.0,2.075,0.6125,0.125,0.025


In [18]:
# Revisit the highest priority cases (those with the biggest datasets) to get a sense for typical cases
cases.loc[cases['num_points'].ge(50), cols].sort_values('num_points', ascending=False).head(15)

Unnamed: 0_level_0,title,num_points,mean_sents_per_point,multisent_point_prop,overthree_point_prop,overfive_point_prop
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
331,There is a date of the last update of the agre...,499.0,1.088176,0.07014,0.006012,0.0
286,The service is provided 'as is' and to be used...,314.0,1.464968,0.363057,0.015924,0.003185
152,This service is only available to users over a...,312.0,1.602564,0.355769,0.067308,0.00641
323,"You are tracked via web beacons, tracking pixe...",256.0,1.40625,0.300781,0.023438,0.0
325,Third-party cookies are used for statistics,229.0,1.563319,0.379913,0.039301,0.0
399,"Your IP address is collected, which can be use...",221.0,1.199095,0.149321,0.00905,0.004525
163,The court of law governing the terms is in loc...,216.0,1.342593,0.25463,0.013889,0.00463
195,"You can request access, correction and/or dele...",211.0,1.914692,0.43128,0.123223,0.033175
148,You are responsible for maintaining the securi...,197.0,1.598985,0.42132,0.035533,0.0
287,The service provider makes no warranty regardi...,197.0,1.401015,0.284264,0.030457,0.005076


In [19]:
# Summary statistics for Cases with at least 50 counted Points.
# The median overthree_point_prop is 3%, which is pretty good, and 0% for >5 which is great
cases.loc[cases['num_points'] >= 50].describe()

Unnamed: 0,id,score,topic_id,mean_sents_per_point,multisent_point_prop,overthree_point_prop,overfive_point_prop,num_points
count,98.0,98.0,98.0,98.0,98.0,98.0,98.0,98.0
mean,248.989796,35.612245,39.265306,1.562164,0.332903,0.045861,0.011926,118.622449
std,90.347382,21.070149,9.877133,0.3737,0.148564,0.051576,0.020702,70.17629
min,117.0,0.0,25.0,1.063492,0.063492,0.0,0.0,50.0
25%,171.25,16.25,31.0,1.311468,0.239763,0.013676,0.0,67.75
50%,229.0,40.0,41.0,1.479723,0.322253,0.030613,0.0,101.5
75%,302.25,50.0,47.0,1.75,0.415625,0.064967,0.01511,155.25
max,484.0,100.0,57.0,3.078431,0.757143,0.232323,0.098039,499.0


In [28]:
# Points whose quote spans more than 10 sentences, with the title of their Case
points.loc[points['num_sents'] > 10, ['num_sents', 'title_case']]

Unnamed: 0_level_0,num_sents,title_case
id,Unnamed: 1_level_1,Unnamed: 2_level_1
751,21,This service gives your personal data to third...
18928,51,This service tracks you on other websites
18930,21,This service tracks you on other websites
18934,72,This service tracks you on other websites
18937,45,This service tracks you on other websites
18938,33,This service tracks you on other websites
18931,38,This service tracks you on other websites
18932,16,This service tracks you on other websites
7306,25,You are forced into binding arbitration in cas...
7837,14,You are forced into binding arbitration in cas...


In [20]:
# Looking at over-5-sent points ("Your account can be deleted without prior notice and without a reason")
is_long_201 = (approved_points.case_id == 201) & (approved_points.num_sents > 5)
divider = '-' * 30
for i, point in approved_points[is_long_201].iterrows():
    # Header line identifying the document and point, followed by the quoted text
    header = f' doc id {int(point.document_id)} point id {int(point.id_point)} '
    print('\n' + divider + header + divider)
    print(point.quoteText)



------------------------------ doc id 2200 point id 8574 ------------------------------
at any time, we may, in our sole discretion, terminate our legal agreement with you and deny you use of our Services if:</p>
<p>you have breached any provision of these terms (or have acted in manner which clearly shows that you do not intend to, or are unable to comply with the provisions of these terms).
or</p>
<p>we are required to do so by law (for example, where the provision of our Services to you is, or becomes, unlawful).
or</p>
<p>any partner with whom we offered the Services to you has terminated its relationship with us or ceased to offer their services to you.
or</p>
<p>we are transitioning to no longer providing the Services to users in the country in which you are resident or from which you use the Services.
or</p>
<p>the provision of the Services to you is, in our opinion, no longer commercially viable.</p>
<p>at any reason, whatsoever.
or if you have broken any of the terms in the A

In [21]:
# Looking at over-5-sent points ("You can request access, correction and/or dele...")
is_long_195 = (approved_points.case_id == 195) & (approved_points.num_sents > 5)
divider = '-' * 30
for i, point in approved_points[is_long_195].iterrows():
    # Header line identifying the document and point, followed by the quoted text
    header = f' doc id {int(point.document_id)} point id {int(point.id_point)} '
    print('\n' + divider + header + divider)
    print(point.quoteText)


------------------------------ doc id 2274 point id 13466 ------------------------------
You may demand that the controller deletes your personal information without delay, and the controller is required to delete that information immediately if one of the following reasons is true:</p>
<ol>
<li>
<p>Your personal data is no longer necessary for the purposes for which it was collected or otherwise processed.</p>
</li>
<li>
<p>You revoke your consent to the processing, which was based on Art.
6 para.
1 lit.
a or Art.
9 para.
2 lit.
a of the GDPR, and there is no other legal basis for processing.</p>
</li>

------------------------------ doc id 991 point id 5746 ------------------------------
Right to access
<p>UpCloud offers access for the Users to the personal data processed by UpCloud.
This means that Users may contact us and we will inform what personal data we have collected and processed regarding the said User and the purposes such data are used for.</p>
Right to withdraw consent.