### Imports and Data Load

In [None]:
import numpy as np
import pandas as pd
import regex as re
import time
from google.colab import files
from google.colab import drive
from sklearn.model_selection import train_test_split

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt_tab')

!pip install altair_data_server --quiet
import altair as alt
from altair_data_server import data_server

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Input locations on Google Drive [update path based on user]
tos_path = '/content/drive/MyDrive/266 Final Project/Input Data/tos.xlsx'
highlights_path = '/content/drive/MyDrive/266 Final Project/Input Data/highlights.csv'

# Drive must be mounted before either file can be read
drive.mount('/content/drive')

tos_df = pd.read_excel(tos_path)
highlights_df = pd.read_csv(highlights_path)

Mounted at /content/drive


### EDA and Preprocessing

In [None]:
# Per-column count of missing values in each input frame
tos_null_counts = tos_df.isna().sum()
print(tos_null_counts)
print('---------------------')
highlight_null_counts = highlights_df.isna().sum()
print(highlight_null_counts)

doc_id           0
doc_type         0
doc_text        35
service_id       0
segment_link     0
dtype: int64
---------------------
service_id          0
service_name        0
paraphrase          0
highlight_id        0
highlight           4
highlight_link      0
doc_type          274
dtype: int64


In [None]:
# Remove documents with no text and highlight rows with no highlight (in place)
for frame, required_column in ((tos_df, 'doc_text'), (highlights_df, 'highlight')):
  frame.dropna(subset=required_column, inplace=True)

In [None]:
# For highlights and then paraphrases: store the number of sentences per row
# and print the distinct sentence counts that occur.
sentence_count_specs = (
  ('highlight', 'highlight_sent_count', 'Unique values for number of sentences in highlights:'),
  ('paraphrase', 'paraphrase_sent_count', 'Unique values for number of sentences in paraphrases:'),
)
for text_column, count_column, message in sentence_count_specs:
  highlights_df[count_column] = highlights_df[text_column].apply(lambda text: len(sent_tokenize(text)))
  print(message,highlights_df[count_column].unique())

Unique values for number of sentences in highlights: [ 1  2  5  3  7  6 19  4 51  9 14 12 11  8 29 15 10 24]
Unique values for number of sentences in paraphrases: [1 2]


In [None]:
# Bar chart: number of highlights (y) for each sentence count (x)
alt.data_transformers.enable('data_server')
sent_hist = highlights_df[['highlight_sent_count','paraphrase_sent_count']]

sentence_axis = alt.X('highlight_sent_count:Q', title='Sentences in Highlight')
count_axis = alt.Y('count()', title='Count')

highlight_chart = (
    alt.Chart(sent_hist)
    .mark_bar(tooltip=True)
    .encode(x=sentence_axis, y=count_axis)
    .properties(width=600, height=300, title='Distribution of Sentences in each Highlight')
)

highlight_chart

In [None]:
# 99% of highlights have 6 or fewer sentences.
highlight_q = highlights_df['highlight_sent_count'].quantile(.99)
print('Number of sentences 99th percentile:',highlight_q)
print('======\n')

# Here is an example with 9 sentences. Does not seem like a great highlight, can probably remove everything above the 99th percentile
# 557 is an index label (not a position); .loc makes that lookup explicit.
print(highlights_df.loc[557, 'highlight'])

# Remove highlights that are above the 99th percentile.
# `<=` keeps highlights whose sentence count equals the percentile value; a strict
# `<` would also discard them, contradicting the stated intent above.
# .copy() detaches the result from the original frame so that later column
# assignments do not trigger SettingWithCopyWarning.
highlights_df = highlights_df[highlights_df['highlight_sent_count'] <= highlight_q].copy()

Number of sentences 99th percentile: 6.0

We disclose account records solely in accordance with our terms of service and applicable law, including the federal Stored Communications Act (“SCA”), 18 U.S.C.
Sections 2701-2712.
Under the SCA:

a valid subpoena issued in connection with an official criminal investigation is required to compel the disclosure of basic subscriber records (defined in 18 U.S.C.
Section 2703(c)(2)), which may include: name, length of service, credit card information, email address(es), and any recent login/logout IP address(es), if available.
a court order issued under 18 U.S.C.
Section 2703(d) is required to compel the disclosure of certain records or other information pertaining to the account, not including contents of communications, which may include message headers and IP addresses, in addition to the basic subscriber records identified above.
a search warrant issued under the procedures described in the Federal Rules of Criminal Procedure or equivalent sta

In [None]:
def clean_special_characters(text):
  '''
  Removes html related tags, non-english chars, and non-essential punctuation

  Args:
    text: raw string to clean.

  Returns:
    The input with complete HTML tags removed, the legacy tag fragments
    ('</p', '<p', '<b', 'blockquote>') removed, and every character other than
    ASCII letters, digits, whitespace and . , ? ! stripped.
  '''
  # Remove complete tags first (e.g. <br>, <blockquote>, <p class="x">). Stripping
  # only literal prefixes turned '<blockquote>' into 'lockquote' and '<br>' into 'r',
  # and left attribute text such as ' classx' behind.
  text = re.sub(r'</?[a-zA-Z][^<>]*>', '', text)

  # Tag fragments that lack a closing '>' are still removed by literal match.
  for fragment in ('</p', '<p', '<b', 'blockquote>'):
    text = text.replace(fragment, '')

  return re.sub(r'[^a-zA-Z0-9\s.,?!]', '', text)

# Run the same cleaning function over the document text and the highlights
tos_df['doc_text'] = tos_df['doc_text'].map(clean_special_characters)
highlights_df['highlight'] = highlights_df['highlight'].map(clean_special_characters)

In [None]:
# Split every highlight into sentences, then give each sentence its own row
highlights_df['highlight'] = highlights_df['highlight'].apply(sent_tokenize)
highlights_df = highlights_df.explode('highlight').reset_index(drop=True)

# Post-tokenization filters: drop rows left without a sentence, then keep only
# sentences longer than 20 characters
highlights_df.dropna(subset='highlight', inplace=True)
highlights_df['highlight_length'] = highlights_df['highlight'].apply(len)
is_long_enough = highlights_df['highlight_length'] > 20
highlights_df = highlights_df[is_long_enough]

TODO: Experiment with the minimum character-length threshold (currently 20 characters)?

In [None]:
# Row/column counts of both frames right before the matching step
for frame in (tos_df, highlights_df):
  print(frame.shape)

(470, 5)
(8055, 10)


### Perform Sentence Level Matching

In [None]:
def identify_highlights(tos_df, highlights_df, tokenizer=None, min_sentence_length=20):
  '''
  Iterate through terms of service documents and highlights,
  and identify matches at the individual sentence level.

  A document sentence is matched when it occurs verbatim inside one of the
  sentences of a highlight that has the same service_id as the document.

  Args:
    tos_df: DataFrame with 'service_id' and 'doc_text' columns, one document per row.
    highlights_df: DataFrame with 'service_id' and 'highlight' columns.
    tokenizer: callable turning a string into a list of sentences.
      Defaults to nltk's sent_tokenize.
    min_sentence_length: document sentences are kept only when they are longer
      than this many characters.

  Returns:
    DataFrame with columns ['service_id','doc_text','highlight']: one row per
    distinct kept sentence of each document whose service has at least one
    highlight row. 'highlight' holds the matching highlight sentence, or ''
    when nothing matched. When several highlight rows match the same sentence,
    the last one (in highlights_df row order) is reported.
  '''
  if tokenizer is None:
    tokenizer = sent_tokenize

  columns = ['service_id','doc_text','highlight']

  # Tokenize each highlight once instead of once per (document, highlight) pair.
  highlight_sentences = [tokenizer(str(text)) for text in highlights_df['highlight']]

  # Collect plain tuples and build the frame once at the end; concatenating
  # inside the loop copies the accumulated frame on every iteration.
  records = []
  for service_id, doc_text in zip(tos_df['service_id'], tos_df['doc_text']):
    same_service = (highlights_df['service_id'] == service_id).to_numpy()
    service_highlights = [sents for sents, keep in zip(highlight_sentences, same_service) if keep]
    if not service_highlights:
      # No highlight rows for this service: the document contributes no rows.
      continue

    doc_sentences = [sent for sent in tokenizer(str(doc_text)) if len(sent) > min_sentence_length]

    # dict keys de-duplicate repeated sentences while keeping first-seen order.
    results = dict.fromkeys(doc_sentences, '')
    for sents in service_highlights:
      for sentence in results:
        # First highlight sentence of this row that contains the document sentence.
        match = next((highlight for highlight in sents if sentence in highlight), None)
        if match is not None:
          results[sentence] = match

    records.extend((service_id, sentence, highlight) for sentence, highlight in results.items())

  return pd.DataFrame(records, columns=columns)

# Match every document against its service's highlights and summarise the result
sentence_highlights = identify_highlights(tos_df, highlights_df)
distinct_highlight_values = sentence_highlights['highlight'].unique()
print('matched highlights:',len(distinct_highlight_values))
print(sentence_highlights.shape)

matched highlights: 3716
(50807, 3)


In [None]:
# Persist the sentence-level matches to the Drive folder [update path based on user]
sentence_highlights_path = '/content/drive/MyDrive/266 Final Project/Full Orchestration: Stage 1+2/final_data/Old Data/sentence_highlights.csv'
sentence_highlights.to_csv(sentence_highlights_path, index=False)

In [None]:
# Reload the sentence-level matches from Drive. With read_csv's default NA
# handling, empty highlight cells come back as NaN rather than ''.
sentence_highlights = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/266 Final Project/Full Orchestration: Stage 1+2/final_data/Old Data/sentence_highlights.csv'
)

In [None]:
# Remove exact duplicate rows. .copy() yields an independent frame so the column
# assignment below does not emit SettingWithCopyWarning.
sentence_highlights = sentence_highlights.drop_duplicates().copy()

# label is 1.0 when the sentence has a matched highlight and 0.0 otherwise.
# Unmatched sentences are NaN after a CSV reload and '' when the frame comes
# straight from identify_highlights; both are treated as "no highlight".
has_highlight = sentence_highlights['highlight'].notna() & sentence_highlights['highlight'].ne('')
sentence_highlights['label'] = has_highlight.astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_highlights.loc[sentence_highlights['highlight'].isna() == True, 'label'] = 0


In [None]:
# Stratified split of the sentence-level data (20% held out for testing)
sentence_split_columns = sentence_highlights[['service_id','doc_text','highlight','label']]
sentence_labels = sentence_highlights['label'].values

(sentence_training_data,
 sentence_testing_data,
 sentence_y_train,
 sentence_y_test) = train_test_split(
    sentence_split_columns,
    sentence_labels,
    stratify=sentence_labels,
    test_size=0.2,
    random_state=1,
)

for split_frame in (sentence_training_data, sentence_testing_data):
  print(split_frame.shape)

(38478, 4)
(9620, 4)


In [None]:
# Balance the training set: keep every class-1 row and down-sample class 0 to
# the same number of rows.
sentence_class1_df = sentence_training_data[sentence_training_data['label'] == 1]
# random_state pins the down-sample so a re-run reproduces the same training set
# (same seed as the train/test split).
sentence_class0_df = sentence_training_data[sentence_training_data['label'] == 0].sample(n = len(sentence_class1_df), random_state = 1)
sentence_training_data = pd.concat([sentence_class0_df, sentence_class1_df])
sentence_training_data['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0.0,2993
1.0,2993


In [None]:
# Save the sentence-level training and testing sets to Drive
sentence_split_outputs = (
  (sentence_training_data, '/content/drive/MyDrive/266 Final Project/Full Orchestration: Stage 1+2/final_data/Old Data/sentence_training_data.csv'),
  (sentence_testing_data, '/content/drive/MyDrive/266 Final Project/Full Orchestration: Stage 1+2/final_data/Old Data/sentence_testing_data.csv'),
)
for split_frame, csv_path in sentence_split_outputs:
  split_frame.to_csv(csv_path, index=False)

### Group sentence level matches into chunks

In [None]:
def create_chunks(df, n_sentences=3, n_overlap=1):
  '''
  Given sentence level data, create groups of sentences with specified overlap

  Args:
    df: DataFrame with 'doc_text' and 'highlight' columns, one sentence per row
      (assumes rows are already in document order - TODO confirm with caller).
    n_sentences: maximum number of consecutive rows combined into one chunk.
    n_overlap: number of rows shared by consecutive chunks; must be
      non-negative and smaller than n_sentences.

  Returns:
    DataFrame with one row per chunk: 'doc_text' is the chunk's sentences joined
    by a single space, 'highlight' is the list of non-empty highlight strings
    found among the chunk's rows (possibly empty).

  Raises:
    ValueError: if n_overlap is negative or not smaller than n_sentences.
  '''
  if not 0 <= n_overlap < n_sentences:
    raise ValueError('n_overlap must satisfy 0 <= n_overlap < n_sentences')

  results = {'doc_text': [], 'highlight': []}
  if len(df) == 0:
    return pd.DataFrame(results)

  sentences = df['doc_text'].tolist()
  highlights = df['highlight'].tolist()

  step = n_sentences - n_overlap
  # A chunk starting within the last n_overlap rows would only repeat rows that
  # the previous chunk already contains, so stop before them. max(..., 1) keeps
  # one chunk for single-row input, which would otherwise be dropped.
  stop = max(len(sentences) - n_overlap, 1)

  for start in range(0, stop, step):
    end = start + n_sentences
    results['doc_text'].append(' '.join(sentences[start:end]))
    # Keep real highlight strings only: NaN (e.g. after a CSV reload) is truthy
    # and would slip through filter(None, ...), so test the type explicitly.
    results['highlight'].append([h for h in highlights[start:end] if isinstance(h, str) and h])
  return pd.DataFrame(results)

# Create chunks with overlap
# Each (doc_type, service_id) group is chunked separately; reset_index(level=0)
# moves doc_type out of the index into a column.
# NOTE(review): identify_highlights as defined in this notebook returns only
# ['service_id','doc_text','highlight'] - confirm that sentence_highlights
# carries a 'doc_type' column at this point, otherwise this groupby raises KeyError.
chunk_highlights = sentence_highlights.groupby(['doc_type','service_id']).apply(lambda x: create_chunks(x)).reset_index(level=0)

# Separate highlights such that there is only 1 highlight sentence per row. This will have duplicate doc_text for any highlights that are > 1 sentence
# reset_index(drop=False) turns the remaining index levels into columns.
chunk_highlights = chunk_highlights.explode('highlight').reset_index(drop=False)
# Chunks with an empty highlight list explode to NaN; store those as '' instead.
chunk_highlights['highlight'] = chunk_highlights['highlight'].fillna('')
# 'level_1' is the unnamed inner index level produced by groupby().apply(); it is not needed.
chunk_highlights = chunk_highlights.drop(columns=['level_1'])
chunk_highlights

  chunk_highlights = sentence_highlights.groupby(['doc_type','service_id']).apply(lambda x: create_chunks(x)).reset_index(level=0)


Unnamed: 0,service_id,doc_type,doc_text,highlight
0,687,2FA,Fastmail Email on your side Why Fastmail? Powe...,
1,687,2FA,We support twostep verification with either an...,
2,687,2FA,It is not required to be set up on your accoun...,
3,687,2FA,"In an ideal world, all passwords would be a se...",
4,687,2FA,They might try to steal it through phishing or...,
...,...,...,...,...
27000,158,iTunes Terms of Service,Specifically excluded from application to this...,
27001,158,iTunes Terms of Service,"For example, Apple Books Content is acquired f...",
27002,158,iTunes Terms of Service,"However, if you are a customer of Apple Distri...",
27003,158,iTunes Terms of Service,The Content provider is solely responsible for...,


In [None]:
# Sanity check: chunking should leave the number of distinct highlight values unchanged
unique_highlight_checks = (
  ('sentence level unique highlights:', sentence_highlights),
  ('chunk level unique highlights:', chunk_highlights),
)
for message, frame in unique_highlight_checks:
  print(message,len(frame['highlight'].unique()))

sentence level unique highlights: 3716
chunk level unique highlights: 3716


In [None]:
# Persist the chunk-level highlights to the Drive folder [update path based on user]
chunk_highlights_path = '/content/drive/MyDrive/266 Final Project/Full Orchestration: Stage 1+2/final_data/chunk_highlights.csv'
chunk_highlights.to_csv(chunk_highlights_path, index=False)

In [None]:
# Reload the chunk-level highlights from Drive. With read_csv's default NA
# handling, empty highlight cells come back as NaN rather than ''.
chunk_highlights = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/266 Final Project/Full Orchestration: Stage 1+2/final_data/chunk_highlights.csv'
)

In [None]:
# Join paraphrases
final_data_folder = '/content/drive/MyDrive/266 Final Project/Full Orchestration: Stage 1+2/final_data/'
paraphrases_df = pd.read_csv(final_data_folder + 'highlights_data.csv')

# One row per (service_id, service_name) pair for the name lookup
service_names = highlights_df[['service_id', 'service_name']].drop_duplicates()

# Left joins keep every chunk row: first attach the paraphrase columns by
# (service_id, highlight), then the service name by service_id, and finally drop
# the 'Unnamed: 0' column.
chunk_highlights = (
    chunk_highlights
    .merge(paraphrases_df, how='left', on=['service_id','highlight'])
    .merge(service_names, on=['service_id'], how='left')
    .drop(columns=['Unnamed: 0'])
)

chunk_highlights.head()

Unnamed: 0,service_id,doc_type,doc_text,highlight,paraphrase,highlight_length,service_name
0,687,2FA,Fastmail Email on your side Why Fastmail? Powe...,,,,FastMail
1,687,2FA,We support twostep verification with either an...,,,,FastMail
2,687,2FA,It is not required to be set up on your accoun...,,,,FastMail
3,687,2FA,"In an ideal world, all passwords would be a se...",,,,FastMail
4,687,2FA,They might try to steal it through phishing or...,,,,FastMail


In [None]:
# Create Train Test Split
# Keep only the modelling columns. .copy() yields an independent frame so adding
# 'label' below does not write onto a column slice (SettingWithCopyWarning).
chunk_highlights = chunk_highlights[['service_id','service_name','doc_type','doc_text','highlight','paraphrase']].copy()

# label is 1 when the chunk has a highlight and 0 otherwise. A missing highlight
# is NaN after the CSV reload and '' before it; both count as "no highlight".
has_highlight = chunk_highlights['highlight'].notna() & chunk_highlights['highlight'].ne('')
chunk_highlights['label'] = has_highlight.astype(int)

In [None]:
# Share of rows whose highlight is not null (class 1) among all chunk rows
n_with_highlight = int(chunk_highlights['highlight'].notna().sum())
n_with_highlight / len(chunk_highlights)

0.22108026174719952

In [None]:
# Stratified split of the chunk-level data (20% held out for testing)
chunk_split_columns = chunk_highlights[['service_id','service_name','doc_type','doc_text','highlight','paraphrase','label']]
chunk_labels = chunk_highlights['label'].values

training_data, testing_data, y_train, y_test = train_test_split(
    chunk_split_columns,
    chunk_labels,
    stratify=chunk_labels,
    test_size=0.2,
    random_state=1,
)

for split_frame in (training_data, testing_data):
  print(split_frame.shape)

(21639, 7)
(5410, 7)


In [None]:
# Balance the training set: keep every class-1 row and down-sample class 0 to
# the same number of rows.
class1_df = training_data[training_data['label'] == 1]
# random_state pins the down-sample so a re-run reproduces the same training set
# (same seed as the train/test split).
class0_df = training_data[training_data['label'] == 0].sample(n = len(class1_df), random_state = 1)
training_data = pd.concat([class1_df, class0_df])
training_data['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,4784
0,4784


In [None]:
# Shapes of the balanced training set and the test set
for split_frame in (training_data, testing_data):
  print(split_frame.shape)

(9568, 7)
(5410, 7)


In [None]:
# Display the test split returned by train_test_split for a visual check
testing_data

Unnamed: 0,service_id,service_name,doc_type,doc_text,highlight,paraphrase,label
13928,186,Flickr,Privacy Policy - April 30th 2020,If you do not receive timely acknowledgment of...,,,0
10072,1597,Huawei,Privacy Policy,Huawei will release the methods for withdrawin...,,,0
13070,3144,OneSignal,Privacy Policy,"Introduction and Background \nOneSignal, a U.S...",,,0
8910,846,Crunchyroll,Privacy Policy,To comply with tax or accounting rules or othe...,,,0
2701,2315,Fandango,Cookies and Tracking,We and third parties may associate Measurement...,,,0
...,...,...,...,...,...,...,...
19677,698,Sync,Terms of Service,C ANY INFORMATION OBTAINED BY YOU AS A RESULT ...,,,0
42,687,FastMail,2FA,"We do recommend YubiKey, as in our experience ...",,,0
4290,707,Nextcloud,Privacy,Only a header with a subject is sent via Googl...,How do we use your information?,The service provides information about how the...,1
5760,264,VKontakte,Privacy Policy,Information containing the Site activity histo...,,,0


In [None]:
# Write training and test datasets
chunk_split_outputs = (
  (training_data, '/content/drive/MyDrive/266 Final Project/Full Orchestration: Stage 1+2/final_data/chunk_training_data.csv'),
  (testing_data, '/content/drive/MyDrive/266 Final Project/Full Orchestration: Stage 1+2/final_data/chunk_testing_data.csv'),
)
for split_frame, csv_path in chunk_split_outputs:
  split_frame.to_csv(csv_path, index=False)