In [13]:
import sys
sys.path.append("../src")

In [14]:
from extractor import load_policy, clean_text
from summarizer import summarize_policy
from risk_detector import detect_risk

In [15]:
raw = load_policy("../data/raw/marriot_privacy.txt")
cleaned = clean_text(raw)

summary = summarize_policy(cleaned)

print(summary)

Here’s a summary of the privacy policy in 5 simple bullet points:

1. **Data Collection**: Marriott collects various types of personal data about you, such as your name, contact details, travel preferences, and payment information when you book stays, use their apps, or interact with their websites.

2. **Data Usage**: Your data is used to improve services, personalize experiences, process bookings, handle transactions, and communicate with you about your reservations or special offers.

3. **Data Sharing**: Marriott may share your information with affiliated companies, franchisees, and third-party service providers for business operations, loyalty programs, and marketing purposes.

4. **Your Choices**: You have control over your data, including options to opt-out of marketing communications, manage cookie preferences, and access, correct, or delete your personal data.

5. **Security Measures**: Marriott takes reasonable steps to protect your data, but no system is completely secure. T

In [16]:
raw = load_policy("../data/raw/marriot_privacy.txt")
cleaned = clean_text(raw)

risks = detect_risk(cleaned)
risks

{'data_sharing': ['How and When We Share Your Data',
  'HOW AND WHEN WE SHARE YOUR DATA',
  'When you visit our Websites from either a desktop or mobile device, we may collect and use cookies or other identifiers to serve you with personal advertisements for Marriott and third-party products, via email, on our Websites, or on other websites on the Internet; to measure how you interact with our Websites; and to maintain your preferences. We do this primarily through cookies, which are pieces of data stored directly on the computer or mobile device that you are using. We do not currently respond to browser do-not-track signals. Marriott recognizes the Global Privacy Control.',
  '3. Third-Party Analytics: We collect data through third-party services and products, including Google Analytics, Adobe Analytics, and Epsilon Data Management, which use cookies and technologies to collect and analyze data about use of the Online Services. These services also collect Data regarding the use of oth

In [17]:
from extractor import load_policy, clean_text
from clause_splitter import split_into_clauses

In [18]:
raw = load_policy("../data/raw/snorkel_ai_privacy.txt")
cleaned = clean_text(raw)

clauses = split_into_clauses(cleaned)

In [19]:
print(f"Number of clauses: {len(clauses)}")

for c in clauses[:10]:
    print("—", c)


Number of clauses: 184
— Privacy Policy Effective as of 06/10/2025 This Privacy Policy describes the privacy practices of Snorkel AI, Inc.
— ("Snorkel AI", "we", "us" or "our") and how we handle personal information that we collect through our website (the "Service"), as well as through social media, our marketing activities, and other activities described in this Privacy Policy.
— Snorkel AI may provide additional or supplemental privacy policies to individuals for specific products or services that we offer at the time we collect personal information.
— Our websites, products and services are designed for businesses and other academic and government entities and their representatives.
— We do not offer products or services for use by individuals for their personal, family or household purposes.
— Accordingly, we treat all personal information we collect as pertaining to individuals in their capacities as business representatives and not their individual capacities.
— You can download

# testing auto labler

In [20]:
from extractor import load_policy, clean_text
from auto_labeler import auto_label_clauses

raw = load_policy("../data/raw/snorkel_ai_privacy.txt")
cleaned = clean_text(raw)

labeled_clauses = auto_label_clauses(cleaned)

In [21]:
for item in labeled_clauses[:10]:
    print(item)

{'clause': 'Privacy Policy Effective as of 06/10/2025 This Privacy Policy describes the privacy practices of Snorkel AI, Inc.', 'label': 'none'}
{'clause': '("Snorkel AI", "we", "us" or "our") and how we handle personal information that we collect through our website (the "Service"), as well as through social media, our marketing activities, and other activities described in this Privacy Policy.', 'label': 'none'}
{'clause': 'Snorkel AI may provide additional or supplemental privacy policies to individuals for specific products or services that we offer at the time we collect personal information.', 'label': 'none'}
{'clause': 'Our websites, products and services are designed for businesses and other academic and government entities and their representatives.', 'label': 'none'}
{'clause': 'We do not offer products or services for use by individuals for their personal, family or household purposes.', 'label': 'none'}
{'clause': 'Accordingly, we treat all personal information we collect 

# testing build dataset

In [22]:
from dataset_builder import build_dataset

build_dataset(
    "../data/raw/snorkel_ai_privacy.txt",
    "../data/processed/snorkel_clauses_labeled.csv",
    source_name="snorkel_ai"
)

Dataset saved to ../data/processed/snorkel_clauses_labeled.csv with 184 rows. 


In [25]:
import pandas as pd

df = pd.read_csv("../data/processed/snorkel_clauses_labeled.csv")
df.head()

Unnamed: 0,clause,label,source_policy
0,Privacy Policy Effective as of 06/10/2025 This...,none,snorkel_ai
1,"(""Snorkel AI"", ""we"", ""us"" or ""our"") and how we...",none,snorkel_ai
2,Snorkel AI may provide additional or supplemen...,none,snorkel_ai
3,"Our websites, products and services are design...",none,snorkel_ai
4,We do not offer products or services for use b...,none,snorkel_ai


In [26]:
df['label'].value_counts()

label
none            165
tracking          8
data_sharing      5
location          3
ai_decisions      3
Name: count, dtype: int64

### remvoing useless clauses

In [27]:
df = df[df['clause'].str.len() > 40]

In [29]:
df = df[df['clause'].str.split().apply(len) > 4]

### inspecting label noise

In [30]:
for label in df['label'].unique():
    print("\n======", label, "======\n")
    sample = df[df['label'] == label].head(5)
    for c in sample['clause']:
        print("-", c)



- Privacy Policy Effective as of 06/10/2025 This Privacy Policy describes the privacy practices of Snorkel AI, Inc.
- ("Snorkel AI", "we", "us" or "our") and how we handle personal information that we collect through our website (the "Service"), as well as through social media, our marketing activities, and other activities described in this Privacy Policy.
- Snorkel AI may provide additional or supplemental privacy policies to individuals for specific products or services that we offer at the time we collect personal information.
- Our websites, products and services are designed for businesses and other academic and government entities and their representatives.
- We do not offer products or services for use by individuals for their personal, family or household purposes.


- Location data when you authorize (our mobile application/the Service) to access your device's location.
- Code § 1798.140) PI we collect in this category (See Personal information we collect above for descript

Gonna add cleaning filters

In [31]:
df = df[df['clause'].str.len() < 800]

In [32]:
# removing claifornia law appendix

df = df[~df['clause'].str.contains("Code §|1798|Cal\.|California Consumer", case=False)]

  df = df[~df['clause'].str.contains("Code §|1798|Cal\.|California Consumer", case=False)]


In [33]:
# removing clauses with too many commas - table like structure
df = df[df['clause'].str.count(",") < 10]

In [34]:
df['label'].value_counts()

label
none            153
tracking          8
data_sharing      5
ai_decisions      2
location          1
Name: count, dtype: int64

## Running pipelines on new data

In [35]:
build_dataset(
    "../data/raw/google_privacy.txt",
    "../data/processed/google_clauses_labeled.csv",
    source_name="google"
)

Dataset saved to ../data/processed/google_clauses_labeled.csv with 229 rows. 


In [36]:
build_dataset(
    "../data/raw/tiktok_privacy.txt",
    "../data/processed/tiktok_clauses_labeled.csv",
    source_name="tiktok"
)

Dataset saved to ../data/processed/tiktok_clauses_labeled.csv with 175 rows. 


In [37]:
build_dataset(
    "../data/raw/delta_privacy.txt",
    "../data/processed/delta_clauses_labeled.csv",
    source_name="delta"
)

Dataset saved to ../data/processed/delta_clauses_labeled.csv with 314 rows. 


### Combining datasets

In [38]:
import pandas as pd
import glob

files = glob.glob("../data/processed/*_clauses_labeled.csv")

dfs = [pd.read_csv(f) for f in files]

df = pd.concat(dfs, ignore_index=True)
df.shape

(902, 3)

In [39]:
df = df[df['clause'].str.len() < 800]

In [40]:
df = df[~df['clause'].str.contains("Code §|1798|Cal\.|California", case=False)]

  df = df[~df['clause'].str.contains("Code §|1798|Cal\.|California", case=False)]


In [41]:
df = df[df['clause'].str.count(",") < 10]

In [42]:
df = df[df['clause'].str.split().apply(len) > 5]

In [43]:
df.to_csv("../data/processed/all_clauses_clean.csv", index=False)

In [44]:
df['label'].value_counts()

label
none            742
tracking         47
data_sharing     44
location          4
ai_decisions      4
refunds           1
Name: count, dtype: int64