# Masking and Tokenization Experiments


## Dependencies

In [1]:
# run in colab
# !pip install -q gcsfs pyarrow pandas torch transformers polars langid tqdm

In [2]:
import os
from pathlib import Path

# credential file
sa_path = Path(r"eastern-bridge-credentials.json").resolve()

# clean up any conflicting vars from earlier cells
os.environ.pop("GOOGLE_SERVICE_ACCOUNT", None)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = str(sa_path)  # <-- path, not JSON


In [3]:
# imports
import gcsfs
from dataclasses import dataclass
from typing import Dict, List, Iterable, Tuple, Optional, Set, Any
import polars as pl

## GCS Access

In [4]:
# GCS Bucket Access
BUCKET = "parquet_v2_openwebtext-with-pos-ner"

# Connect to GCS using service account key
fs = gcsfs.GCSFileSystem(token="eastern-bridge-credentials.json")


# Get all parquet files from the bucket
paths = sorted(fs.glob(f"{BUCKET}/**/*.parquet"))
print(f"Found {len(paths)} parquet files")

# Show first few files
for i, path in enumerate(paths[:5]):
    print(f"{i+1}. {path}")
if len(paths) > 5:
    print(f"... and {len(paths) - 5} more files")


Found 225 parquet files
1. parquet_v2_openwebtext-with-pos-ner/batch_000001_1759435499.parquet
2. parquet_v2_openwebtext-with-pos-ner/batch_000002_1759435590.parquet
3. parquet_v2_openwebtext-with-pos-ner/batch_000003_1759435685.parquet
4. parquet_v2_openwebtext-with-pos-ner/batch_000004_1759435780.parquet
5. parquet_v2_openwebtext-with-pos-ner/batch_000005_1759435870.parquet
... and 220 more files


## Polars Manipulation

### Polats tutorial

In [5]:
# eager - reads everything into memory
# lf = pl.read_parquet("gs://parquet_v2_openwebtext-with-pos-ner/batch_*.parquet")
# df = lf.head(5).collect()
# df = df.select(["id", "text"])  
# df

In [None]:
# lazy - reads only needed columns/row-groups

# create lazy frame object
lf = pl.scan_parquet("gs://parquet_v2_openwebtext-with-pos-ner/batch_*.parquet")

# select columns
lf = lf.select(["id", "text"])

# filter rows 
lf = lf.head(5)

# collect data - actually read the data
lf.collect()


id,text
str,str
"""sent_bd339fa5-d5b2-430e-b8a2-4…","""Port-au-Prince, Haiti (CNN) --…"
"""sent_1a1e1636-86d0-4f0a-86ad-8…","""Former secretary of state Hill…"
"""sent_3972b602-bb67-4969-8c4f-6…","""The opinions expressed by colu…"
"""sent_44c35ec7-67f3-4d13-85bb-d…","""BIGBANG is one of those musica…"
"""sent_8b8bcf88-a267-4db3-8f9d-e…","""WHAT?!??! I know. That’s what …"


In [None]:
# filter by word
lf_filtered = lf.filter(pl.col("text").str.contains("Haiti"))
lf_filtered.collect()

id,text
str,str
"""sent_bd339fa5-d5b2-430e-b8a2-4adc18263474""","""Port-au-Prince, Haiti (CNN) -- Earthquake victims, writhing in pain and grasping at life, watched doctors and nurses walk away from a field hospital Friday night after a Belgian medical team evacuated…"


In [None]:
# retrieve the row with the specific id
lf.filter(pl.col("id") == "sent_bd339fa5-d5b2-430e-b8a2-4adc18263474").collect()

id,text
str,str
"""sent_bd339fa5-d5b2-430e-b8a2-4adc18263474""","""Port-au-Prince, Haiti (CNN) -- Earthquake victims, writhing in pain and grasping at life, watched doctors and nurses walk away from a field hospital Friday night after a Belgian medical team evacuated…"


polars.config.Config

### Replacing Special tags

In [19]:
# create lazy frame object
lf = pl.scan_parquet("gs://parquet_v2_openwebtext-with-pos-ner/batch_*.parquet")

# select columns
lf.schema

OrderedDict([('id', String),
             ('text', String),
             ('sent_spans', List(Struct({'end': Int64, 'start': Int64}))),
             ('punct_spans',
              List(Struct({'end': Int64, 'start': Int64, 'value': String}))),
             ('special_tags',
              List(Struct({'end': Int64, 'start': Int64, 'type': String, 'value': String}))),
             ('ner_spans',
              List(Struct({'end': Int64, 'entity_id': String, 'label': String, 'start': Int64}))),
             ('pos_tokens', List(String)),
             ('pos_tags', List(String)),
             ('ner_iob', List(String))])

In [22]:
# Configuration of replacement tokens
TAG_REPLACE_CFG = {
    "URL": "<URL>",
    "EMAIL": "<EMAIL>",
    "PHONE": "<PHONE>",
    "ADDRESS": "<ADDRESS>",
    # ... any other tags
}

In [23]:


def replace_spans(text: Optional[str], special_tags: Optional[List[Dict[str, Any]]]) -> Optional[str]:
    """
    Core function: Replaces spans in a single text string based on a list of tags.
    It performs replacements in reverse order to maintain correct indices.
    """
    if text is None or special_tags is None or not text:
        return text

    # Filter and validate tags
    valid_tags = [
        tag for tag in special_tags
        if tag is not None and 
           all(key in tag for key in ['start', 'end', 'label']) and 
           isinstance(tag.get('start'), int) and isinstance(tag.get('end'), int)
    ]
    
    if not valid_tags:
        return text

    # CRITICAL: Sort in reverse order by start index
    sorted_tags = sorted(valid_tags, key=lambda x: x['start'], reverse=True)
    
    modified_text = text
    
    for tag in sorted_tags:
        start = tag['start']
        end = tag['end']
        tag_type = tag.get('label', '').upper()
        
        # Sanity check
        if start < 0 or end > len(modified_text) or start >= end:
            continue
            
        replacement = TAG_REPLACE_CFG.get(tag_type, modified_text[start:end])
        
        # String slicing replacement
        modified_text = modified_text[:start] + replacement + modified_text[end:]
        
    return modified_text

In [26]:
# Assuming you have a DataFrame named `df` with a 'special_tags' column

# Filter rows where 'special_tags' is not None and not empty
non_empty_special_tags = lf.filter(
    (pl.col("special_tags").is_not_null()) & (pl.col("special_tags").list.lengths() > 0)
)

# Get the first 100 entries
first_100 = non_empty_special_tags.head(5)

first_100.collect()


  (pl.col("special_tags").is_not_null()) & (pl.col("special_tags").list.lengths() > 0)


id,text,sent_spans,punct_spans,special_tags,ner_spans,pos_tokens,pos_tags,ner_iob
str,str,list[struct[2]],list[struct[3]],list[struct[4]],list[struct[4]],list[str],list[str],list[str]
"""sent_3972b602-bb67-4969-8c4f-6c20599dcf4e""","""The opinions expressed by columnists are their own and do not represent the views of Townhall.com. You have to give President Barack Obama credit for one thing: consistency. Nothing is ever his fault…","[{98,0}, {100,98}, … {6021,5991}]","[{94,93,"".""}, {98,97,"".""}, … {6021,6020,"".""}]","[{97,85,""URL"",""Townhall.com""}]","[{139,""PERSON-BARACK_OBAMA"",""PERSON"",127}, {154,""CARDINAL-ONE"",""CARDINAL"",151}, … {6020,""GPE-AMERICA"",""GPE"",6013}]","[""The"", ""opinions"", … "".""]","[""DET"", ""NOUN"", … ""PUNCT""]","[""O"", ""O"", … ""O""]"
"""sent_30f31821-cf62-46f9-8c23-66ac5970cb13""","""Introduction On Feb. 1, 2017, the United States led Coalition was accused by local activists and journalists that it had bombed the headquarters of the Syrian Arab Red Crescent (SARC) located in the …","[{12,0}, {14,12}, … {8585,8464}]","[{21,20,"".""}, {24,23,"",""}, … {8585,8584,"".""}]","[{3646,3628,""URL"",""hotelscombined.com""}, {3866,3856,""URL"",""esyria.net""}, {3893,3882,""URL"",""dp-news.com""}]","[{29,""DATE-FEB._1,_2017"",""DATE"",17}, {48,""GPE-UNITED_STATES"",""GPE"",35}, … {8556,""GPE-IDLIB"",""GPE"",8551}]","[""Introduction"", ""On"", … "".""]","[""NOUN"", ""ADP"", … ""PUNCT""]","[""O"", ""O"", … ""O""]"
"""sent_4b311934-e54a-44e7-9597-ad7ffa6af9ea""","""Whenever something momentous happens in superhero comics history, mysterious figures always seem to appear and observe proceedings. In the Marvel Universe, it's the Watcher (we still have no idea why …","[{131,0}, {242,132}, … {22954,22865}]","[{65,64,"",""}, {131,130,"".""}, … {22954,22953,"".""}]","[{15782,15770,""ADDRESS"",""1 take place""}]","[{172,""PERSON-WATCHER"",""PERSON"",165}, {227,""PERSON-BLACK_PANTHER"",""PERSON"",214}, … {22898,""ORG-DCU"",""ORG"",22895}]","[""Whenever"", ""something"", … "".""]","[""SCONJ"", ""PRON"", … ""PUNCT""]","[""O"", ""O"", … ""O""]"
"""sent_f9b63eb4-46ec-433e-adac-55dfdf574e47""","""Executive summary This paper reviews the empirical literature on the employment effects of increases in the minimum wage. It organizes the most prominent studies in this literature by their use of tw…","[{17,0}, {19,17}, … {45697,45657}]","[{122,121,"".""}, {233,232,"":""}, … {45697,45696,"".""}]","[{41183,41130,""URL"",""http://www.irle.berkeley.edu/workingpapers/148-13.pdf""}, {42787,42694,""URL"",""http://www.irle.berkeley.edu/events/spring14/zipperer/dubezipperer_pooledsyntheticcontrol.pdf""}, {45696,45657,""URL"",""http://www.ukcpr.org/AvailableData.aspx""}]","[{201,""CARDINAL-TWO"",""CARDINAL"",198}, {1670,""PERSON-HARRY_TRUMAN"",""PERSON"",1658}, … {45566,""DATE-2012"",""DATE"",45562}]","[""Executive"", ""summary"", … "".""]","[""ADJ"", ""NOUN"", … ""PUNCT""]","[""O"", ""O"", … ""O""]"
"""sent_ced5a3dc-d7c6-4d7d-979f-43c1cc0622ac""","""If you're waiting for someone who has to drive in San Francisco, especially around the Moscone Center area, take a deep breath and get comfortable. A giant tech conference is in town and it's causing …","[{147,0}, {232,148}, … {3574,3439}]","[{7,6,""'""}, {64,63,"",""}, … {3574,3573,"".""}]","[{3531,3507,""ADDRESS"",""150 parties taking place""}]","[{63,""GPE-SAN_FRANCISCO"",""GPE"",50}, {101,""FAC-MOSCONE_CENTER"",""FAC"",87}, … {3573,""DATE-FROM_NOV._6_TO_NOV._9"",""DATE"",3552}]","[""If"", ""you"", … "".""]","[""SCONJ"", ""PRON"", … ""PUNCT""]","[""O"", ""O"", … ""O""]"


In [27]:
import re

TYPE_REGEX = re.compile(r'\{\s*\d+,\s*\d+,\s*"([^"]+)"\s*,\s*"[^"]*"\s*\}')
def extract_entity_types(span_blob: str) -> list[str]:
    return sorted(set(TYPE_REGEX.findall(span_blob)))

In [30]:
import polars as pl

# --- parsing and masking helpers (no regex on text) ---

def _parse_special_tags(blob: str):
    # Parse strings like:
    # [{41183,41130,"URL","http://..."}, {3531,3507,"EMAIL","john@doe.com"}]
    if not blob:
        return []
    out = []
    i, n = 0, len(blob)

    def skip_ws(j):
        while j < n and blob[j].isspace():
            j += 1
        return j

    def parse_int(j):
        j = skip_ws(j)
        k = j
        while k < n and blob[k].isdigit():
            k += 1
        return int(blob[j:k]), k

    def parse_quoted(j):
        j = skip_ws(j)
        assert blob[j] == '"'
        j += 1
        s = []
        while j < n and blob[j] != '"':
            s.append(blob[j])
            j += 1
        assert j < n and blob[j] == '"'
        return "".join(s), j + 1

    while i < n:
        i = skip_ws(i)
        if i >= n:
            break
        if blob[i] == '{':
            i += 1
            start, i = parse_int(i)
            i = skip_ws(i + 1)  # skip comma
            end, i = parse_int(i)
            i = skip_ws(i + 1)
            typ, i = parse_quoted(i)
            i = skip_ws(i + 1)
            val, i = parse_quoted(i)
            # move past closing brace if present
            while i < n and blob[i] != '}':
                i += 1
            if i < n and blob[i] == '}':
                i += 1
            out.append({"start": start, "end": end, "type": typ, "value": val})
        else:
            i += 1
    return out

def _mask_text_with_spans(text: str, spans: list[dict]) -> str:
    if not text or not spans:
        return text

    # Keep only EMAIL/URL, normalize bounds, and choose replacement label
    norm = []
    for s in spans:
        t = s.get("type")
        if t not in ("EMAIL", "URL"):
            continue
        a, b = sorted((int(s["start"]), int(s["end"])))
        if a == b:
            continue
        repl = "<EMAIL>" if t == "EMAIL" else "<URL>"
        # clamp
        a = max(0, min(len(text), a))
        b = max(0, min(len(text), b))
        if a < b:
            norm.append((a, b, repl))

    if not norm:
        return text

    # Sort and merge overlaps; prefer longer span on overlap
    norm.sort(key=lambda x: (x[0], -x[1]))
    merged = []
    for a, b, repl in norm:
        if not merged or a > merged[-1][1]:
            merged.append([a, b, repl])
        else:
            # overlap: extend if longer; keep earlier repl if equal
            if b > merged[-1][1]:
                merged[-1][1] = b
                merged[-1][2] = repl

    # Build output
    out, cur = [], 0
    for a, b, repl in merged:
        out.append(text[cur:a])
        out.append(repl)
        cur = b
    out.append(text[cur:])
    return "".join(out)

def mask_from_blob(text: str, blob: str) -> str:
    spans = _parse_special_tags(blob)
    return _mask_text_with_spans(text, spans)

# --- lazy Polars flow: sample 100 rows and mask ---

def sample_mask_urls_emails_without_regex(
    parquet_path: str,
    text_col: str = "text",
    tags_col: str = "special_tags",
    sample_rows: int = 100,
) -> pl.DataFrame:
    ldf = (
        pl.scan_parquet(parquet_path)
        .filter(
            pl.col(text_col).is_not_null() & (pl.col(text_col).str.len_chars() > 0)
            & pl.col(tags_col).is_not_null() & (pl.col(tags_col).str.len_chars() > 2)
        )
        .with_columns(
            pl.struct([text_col, tags_col])
            .map_elements(lambda s: mask_from_blob(s[text_col], s[tags_col]), return_dtype=pl.Utf8)
            .alias("text_masked")
        )
        .select(pl.all(), "text_masked")
        .limit(sample_rows)
    )
    return ldf.collect()

# Example:
df_sample = sample_mask_urls_emails_without_regex("data.parquet", "text", "special_tags", 10)
df_sample.write_parquet("masked_sample.parquet")

FileNotFoundError: The system cannot find the file specified. (os error 2): data.parquet

This error occurred with the following context stack:
	[1] 'parquet scan' failed
	[2] 'filter' input failed to resolve
	[3] 'with_columns' input failed to resolve
	[4] 'select' input failed to resolve
	[5] 'slice' input failed to resolve
