In [64]:
import sys
sys.path.append("../src")

In [65]:
from extractor import load_policy, clean_text
from summarizer import summarize_policy
from risk_detector import detect_risk

In [66]:
raw = load_policy("../data/raw/marriot_privacy.txt")
cleaned = clean_text(raw)

summary = summarize_policy(cleaned)

print(summary)

- **Data Collection**: Marriott collects personal data like names, contact information, travel preferences, and payment details when you interact with their services, both online and during your stays.

- **Data Usage**: Your data is used to provide, personalize, and improve services, such as handling reservations, providing customer service, and sending marketing communications.

- **Data Sharing**: Marriott may share your data with affiliated properties, franchisees, service providers, and advertising partners for business purposes and service optimization.

- **Privacy Choices**: You have options to manage your data and communication preferences, including opting out of marketing emails and adjusting cookie settings.

- **Security and Retention**: Marriott takes steps to protect your data but cannot guarantee complete security. They retain personal data only as long as necessary to provide services or comply with legal requirements.


In [67]:
raw = load_policy("../data/raw/marriot_privacy.txt")
cleaned = clean_text(raw)

risks = detect_risk(cleaned)
risks

{'data_sharing': ['How and When We Share Your Data',
  'HOW AND WHEN WE SHARE YOUR DATA',
  'When you visit our Websites from either a desktop or mobile device, we may collect and use cookies or other identifiers to serve you with personal advertisements for Marriott and third-party products, via email, on our Websites, or on other websites on the Internet; to measure how you interact with our Websites; and to maintain your preferences. We do this primarily through cookies, which are pieces of data stored directly on the computer or mobile device that you are using. We do not currently respond to browser do-not-track signals. Marriott recognizes the Global Privacy Control.',
  '3. Third-Party Analytics: We collect data through third-party services and products, including Google Analytics, Adobe Analytics, and Epsilon Data Management, which use cookies and technologies to collect and analyze data about use of the Online Services. These services also collect Data regarding the use of oth

In [68]:
from extractor import load_policy, clean_text
from clause_splitter import split_into_clauses

In [69]:
raw = load_policy("../data/raw/snorkel_ai_privacy.txt")
cleaned = clean_text(raw)

clauses = split_into_clauses(cleaned)

In [19]:
print(f"Number of clauses: {len(clauses)}")

for c in clauses[:10]:
    print("—", c)


Number of clauses: 184
— Privacy Policy Effective as of 06/10/2025 This Privacy Policy describes the privacy practices of Snorkel AI, Inc.
— ("Snorkel AI", "we", "us" or "our") and how we handle personal information that we collect through our website (the "Service"), as well as through social media, our marketing activities, and other activities described in this Privacy Policy.
— Snorkel AI may provide additional or supplemental privacy policies to individuals for specific products or services that we offer at the time we collect personal information.
— Our websites, products and services are designed for businesses and other academic and government entities and their representatives.
— We do not offer products or services for use by individuals for their personal, family or household purposes.
— Accordingly, we treat all personal information we collect as pertaining to individuals in their capacities as business representatives and not their individual capacities.
— You can download

# testing auto labler

In [70]:
from extractor import load_policy, clean_text
from auto_labeler import auto_label_clauses

raw = load_policy("../data/raw/snorkel_ai_privacy.txt")
cleaned = clean_text(raw)

labeled_clauses = auto_label_clauses(cleaned)

In [71]:
for item in labeled_clauses[:10]:
    print(item)

{'clause': 'Privacy Policy Effective as of 06/10/2025 This Privacy Policy describes the privacy practices of Snorkel AI, Inc.', 'label': 'none'}
{'clause': '("Snorkel AI", "we", "us" or "our") and how we handle personal information that we collect through our website (the "Service"), as well as through social media, our marketing activities, and other activities described in this Privacy Policy.', 'label': 'none'}
{'clause': 'Snorkel AI may provide additional or supplemental privacy policies to individuals for specific products or services that we offer at the time we collect personal information.', 'label': 'none'}
{'clause': 'Our websites, products and services are designed for businesses and other academic and government entities and their representatives.', 'label': 'none'}
{'clause': 'We do not offer products or services for use by individuals for their personal, family or household purposes.', 'label': 'none'}
{'clause': 'Accordingly, we treat all personal information we collect 

# testing build dataset

In [72]:
from dataset_builder import build_dataset

build_dataset(
    "../data/raw/snorkel_ai_privacy.txt",
    "../data/processed/snorkel_clauses_labeled.csv",
    source_name="snorkel_ai"
)

Dataset saved to ../data/processed/snorkel_clauses_labeled.csv with 184 rows. 


In [73]:
import pandas as pd

df = pd.read_csv("../data/processed/snorkel_clauses_labeled.csv")
df.head()

Unnamed: 0,clause,label,source_policy
0,Privacy Policy Effective as of 06/10/2025 This...,none,snorkel_ai
1,"(""Snorkel AI"", ""we"", ""us"" or ""our"") and how we...",none,snorkel_ai
2,Snorkel AI may provide additional or supplemen...,none,snorkel_ai
3,"Our websites, products and services are design...",none,snorkel_ai
4,We do not offer products or services for use b...,none,snorkel_ai


In [74]:
df['label'].value_counts()

label
none            165
tracking          8
data_sharing      5
location          3
ai_decisions      3
Name: count, dtype: int64

### remvoing useless clauses

In [27]:
df = df[df['clause'].str.len() > 40]

In [29]:
df = df[df['clause'].str.split().apply(len) > 4]

### inspecting label noise

In [30]:
for label in df['label'].unique():
    print("\n======", label, "======\n")
    sample = df[df['label'] == label].head(5)
    for c in sample['clause']:
        print("-", c)



- Privacy Policy Effective as of 06/10/2025 This Privacy Policy describes the privacy practices of Snorkel AI, Inc.
- ("Snorkel AI", "we", "us" or "our") and how we handle personal information that we collect through our website (the "Service"), as well as through social media, our marketing activities, and other activities described in this Privacy Policy.
- Snorkel AI may provide additional or supplemental privacy policies to individuals for specific products or services that we offer at the time we collect personal information.
- Our websites, products and services are designed for businesses and other academic and government entities and their representatives.
- We do not offer products or services for use by individuals for their personal, family or household purposes.


- Location data when you authorize (our mobile application/the Service) to access your device's location.
- Code § 1798.140) PI we collect in this category (See Personal information we collect above for descript

Gonna add cleaning filters

In [31]:
df = df[df['clause'].str.len() < 800]

In [32]:
# removing claifornia law appendix

df = df[~df['clause'].str.contains("Code §|1798|Cal\.|California Consumer", case=False)]

  df = df[~df['clause'].str.contains("Code §|1798|Cal\.|California Consumer", case=False)]


In [33]:
# removing clauses with too many commas - table like structure
df = df[df['clause'].str.count(",") < 10]

In [34]:
df['label'].value_counts()

label
none            153
tracking          8
data_sharing      5
ai_decisions      2
location          1
Name: count, dtype: int64

## Running pipelines on new data

In [35]:
build_dataset(
    "../data/raw/google_privacy.txt",
    "../data/processed/google_clauses_labeled.csv",
    source_name="google"
)

Dataset saved to ../data/processed/google_clauses_labeled.csv with 229 rows. 


In [36]:
build_dataset(
    "../data/raw/tiktok_privacy.txt",
    "../data/processed/tiktok_clauses_labeled.csv",
    source_name="tiktok"
)

Dataset saved to ../data/processed/tiktok_clauses_labeled.csv with 175 rows. 


In [37]:
build_dataset(
    "../data/raw/delta_privacy.txt",
    "../data/processed/delta_clauses_labeled.csv",
    source_name="delta"
)

Dataset saved to ../data/processed/delta_clauses_labeled.csv with 314 rows. 


In [45]:
build_dataset(
    "../data/raw/airbnb_privacy.txt",
    "../data/processed/airbnb_clauses_labeled.csv",
    source_name="airbnb"
)

Dataset saved to ../data/processed/airbnb_clauses_labeled.csv with 240 rows. 


In [46]:
build_dataset(
    "../data/raw/apple_privacy.txt",
    "../data/processed/apple_clauses_labeled.csv",
    source_name="apple"
)

Dataset saved to ../data/processed/apple_clauses_labeled.csv with 177 rows. 


In [47]:
build_dataset(
    "../data/raw/lyft_privacy.txt",
    "../data/processed/lyft_clauses_labeled.csv",
    source_name="lyft"
)

Dataset saved to ../data/processed/lyft_clauses_labeled.csv with 154 rows. 


In [48]:
build_dataset(
    "../data/raw/meta_privacy.txt",
    "../data/processed/meta_clauses_labeled.csv",
    source_name="meta"
)

Dataset saved to ../data/processed/meta_clauses_labeled.csv with 142 rows. 


In [49]:
build_dataset(
    "../data/raw/microsoft_privacy.txt",
    "../data/processed/microsoft_clauses_labeled.csv",
    source_name="microsoft"
)

Dataset saved to ../data/processed/microsoft_clauses_labeled.csv with 179 rows. 


In [50]:
build_dataset(
    "../data/raw/openai_privacy.txt",
    "../data/processed/openai_clauses_labeled.csv",
    source_name="openai"
)

Dataset saved to ../data/processed/openai_clauses_labeled.csv with 90 rows. 


In [51]:
build_dataset(
    "../data/raw/reddit_privacy.txt",
    "../data/processed/reddit_clauses_labeled.csv",
    source_name="reddit"
)

Dataset saved to ../data/processed/reddit_clauses_labeled.csv with 235 rows. 


In [52]:
build_dataset(
    "../data/raw/shopify_privacy.txt",
    "../data/processed/shopify_clauses_labeled.csv",
    source_name="shopify"
)

Dataset saved to ../data/processed/shopify_clauses_labeled.csv with 111 rows. 


In [53]:
build_dataset(
    "../data/raw/southwest_privacy.txt",
    "../data/processed/southwest_clauses_labeled.csv",
    source_name="southwest"
)

Dataset saved to ../data/processed/southwest_clauses_labeled.csv with 242 rows. 


In [54]:
build_dataset(
    "../data/raw/spotify_privacy.txt",
    "../data/processed/spotify_clauses_labeled.csv",
    source_name="spotify"
)

Dataset saved to ../data/processed/spotify_clauses_labeled.csv with 250 rows. 


In [55]:
build_dataset(
    "../data/raw/united_privacy.txt",
    "../data/processed/united_clauses_labeled.csv",
    source_name="united"
)

Dataset saved to ../data/processed/united_clauses_labeled.csv with 970 rows. 


In [56]:
build_dataset(
    "../data/raw/waze_privacy.txt",
    "../data/processed/waze_clauses_labeled.csv",
    source_name="waze"
)

Dataset saved to ../data/processed/waze_clauses_labeled.csv with 265 rows. 


### Combining datasets

In [57]:
import pandas as pd
import glob

files = glob.glob("../data/processed/*_clauses_labeled.csv")

dfs = [pd.read_csv(f) for f in files]

df = pd.concat(dfs, ignore_index=True)
df.shape

(3957, 3)

In [58]:
df = df[df['clause'].str.len() < 800]

In [59]:
df = df[~df['clause'].str.contains("Code §|1798|Cal\.|California", case=False)]

  df = df[~df['clause'].str.contains("Code §|1798|Cal\.|California", case=False)]


In [60]:
df = df[df['clause'].str.count(",") < 10]

In [61]:
df = df[df['clause'].str.split().apply(len) > 5]

In [62]:
df.to_csv("../data/processed/all_clauses_clean.csv", index=False)

In [63]:
df['label'].value_counts()

label
none                 3200
data_sharing          310
tracking              214
location               24
ai_decisions           11
employment_checks       5
arbitration             2
refunds                 1
Name: count, dtype: int64

# running the pipeline with new data and new script

In [75]:
from run_pipeline import run_pipeline

df = run_pipeline()
df.head()

Found 49 raw policy files. 

processing: youtube_privacy...
Dataset saved to ../data/processed/youtube_privacy_clauses_labeled.csv with 229 rows. 
processing: lyft_privacy...
Dataset saved to ../data/processed/lyft_privacy_clauses_labeled.csv with 154 rows. 
processing: united_privacy...
Dataset saved to ../data/processed/united_privacy_clauses_labeled.csv with 970 rows. 
processing: Amazon_COU...
Dataset saved to ../data/processed/Amazon_COU_clauses_labeled.csv with 120 rows. 
processing: marriot_privacy...
Dataset saved to ../data/processed/marriot_privacy_clauses_labeled.csv with 246 rows. 
processing: hulu_privacy...
Dataset saved to ../data/processed/hulu_privacy_clauses_labeled.csv with 425 rows. 
processing: stripe_privacy...
Dataset saved to ../data/processed/stripe_privacy_clauses_labeled.csv with 388 rows. 
processing: priceline_TOS...
Dataset saved to ../data/processed/priceline_TOS_clauses_labeled.csv with 379 rows. 
processing: google_privacy...
Dataset saved to ../data/pr

Unnamed: 0,clause,label,source_policy
0,Overview Privacy Policy Terms of Service Techn...,none,youtube_privacy
1,We understand this is a big responsibility and...,none,youtube_privacy
2,This Privacy Policy is meant to help you under...,none,youtube_privacy
3,Privacy Checkup Looking to change your privacy...,none,youtube_privacy
4,"Take the Privacy Checkup Effective July 1, 202...",none,youtube_privacy


In [76]:
df.shape

(13798, 3)

In [77]:
df['label'].value_counts()

label
none                 12030
data_sharing           843
tracking               562
refunds                136
location                92
arbitration             61
ai_decisions            56
employment_checks       16
auto_renewal             2
Name: count, dtype: int64

## need to clean the merged dataset

In [78]:
import pandas as pd

df = pd.read_csv("../data/processed/all_clauses_raw.csv")

# remove long junk
df = df[df['clause'].str.len() < 800]

# remove law appendix garbage
df = df[~df['clause'].str.contains("Code §|1798|Cal\.|California", case=False)]

# remove table-like clauses
df = df[df['clause'].str.count(",") < 12]

# remove very short clauses
df = df[df['clause'].str.split().apply(len) > 5]

df.to_csv("../data/processed/all_clauses_clean.csv", index=False)

df['label'].value_counts()

  df = df[~df['clause'].str.contains("Code §|1798|Cal\.|California", case=False)]


label
none                 11627
data_sharing           768
tracking               526
refunds                134
location                81
arbitration             59
ai_decisions            51
employment_checks       14
auto_renewal             2
Name: count, dtype: int64