<a href="https://colab.research.google.com/github/yowainwright/google-colab-notebooks/blob/main/pyspark_tokenizer_udf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install pyspark
!pip install faker

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Using cached pyspark-3.4.0.tar.gz (310.8 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317145 sha256=6369509dfa2693c443881ab346787e169bfd9fe525de214b206586d7b323c7ea
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faker
  Downloading Faker-18.6.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
Installing coll

In [3]:
#{ 1. generate a dataframe with random data for tokenizer udf testing }

from faker import Faker
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Initialize (or reuse) the Spark session for this notebook
spark = (
    SparkSession.builder
    .appName("FakeTokenizerUDF")
    .getOrCreate()
)

# Define the schema for the DataFrame; every column is nullable
schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("email", StringType(), True),
    StructField("address", StringType(), True),
])

# Seed Faker so a fresh "Restart & Run All" regenerates the same rows
FAKER_SEED = 42
NUM_FAKE_USERS = 25
Faker.seed(FAKER_SEED)

# Generate fake data; addresses come back multi-line, so flatten them to one line
fake = Faker()
data = [
    (user_id, fake.name(), fake.email(), fake.address().replace('\n', ', '))
    for user_id in range(1, NUM_FAKE_USERS + 1)
]

# Create a DataFrame using the fake data and schema
fake_users_df = spark.createDataFrame(data, schema)

# Show the DataFrame
fake_users_df.show()

+-------+--------------------+--------------------+--------------------+
|user_id|                name|               email|             address|
+-------+--------------------+--------------------+--------------------+
|      1|  Theresa Richardson| vguzman@example.net|Unit 0961 Box 276...|
|      2|       Steven Porter| laura08@example.org|593 Tracy Station...|
|      3|        Lisa Johnson|humphreywilliam@e...|3746 Jessica Stre...|
|      4|       Heather Welch|hortonarthur@exam...|55924 Frazier Str...|
|      5|      Ashley Sampson|cantuwillie@examp...|96930 Martin Knol...|
|      6|      Susan Thornton| hwilson@example.org|7176 James Mill, ...|
|      7|    Elizabeth Patton|malonetravis@exam...|25995 Michael Pla...|
|      8|       Willie Nelson|emilythompson@exa...|3673 Debra Coves,...|
|      9|   Anthony Frederick|darrellcole@examp...|271 Logan Course,...|
|     10|       Michael Giles|barbaracuevas@exa...|25482 Myers Pine,...|
|     11|     Cory Valenzuela|cooperbecky@examp...|

In [4]:
#{ 2. create a udf to reverse the order of a string }

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType

# Define the UDF to reverse a string
@udf(returnType=StringType())
def reverse_string(s):
    """Return ``s`` with its characters in reverse order.

    The columns this UDF is applied to are declared nullable, so a None
    value is passed through as None instead of raising a TypeError on the
    slice.
    """
    if s is None:
        return None
    return s[::-1]


# Register the UDF on the Spark session under the name "reverse_string"
spark.udf.register("reverse_string", reverse_string)

# Build the reversed-name column, then swap it in for the existing "name" column
reversed_name_col = reverse_string(col("name"))
reversed_df = fake_users_df.withColumn("name", reversed_name_col)

# Show the DataFrame; its "name" column now holds the reversed names
reversed_df.show()

+-------+--------------------+--------------------+--------------------+
|user_id|                name|               email|             address|
+-------+--------------------+--------------------+--------------------+
|      1|  nosdrahciR aserehT| vguzman@example.net|Unit 0961 Box 276...|
|      2|       retroP nevetS| laura08@example.org|593 Tracy Station...|
|      3|        nosnhoJ asiL|humphreywilliam@e...|3746 Jessica Stre...|
|      4|       hcleW rehtaeH|hortonarthur@exam...|55924 Frazier Str...|
|      5|      nospmaS yelhsA|cantuwillie@examp...|96930 Martin Knol...|
|      6|      notnrohT nasuS| hwilson@example.org|7176 James Mill, ...|
|      7|    nottaP htebazilE|malonetravis@exam...|25995 Michael Pla...|
|      8|       nosleN eilliW|emilythompson@exa...|3673 Debra Coves,...|
|      9|   kcirederF ynohtnA|darrellcole@examp...|271 Logan Course,...|
|     10|       seliG leahciM|barbaracuevas@exa...|25482 Myers Pine,...|
|     11|     aleuznelaV yroC|cooperbecky@examp...|

In [5]:
import base64

#{ 3. a mock tokenize/detokenize sdk }

# This class doesn't have to be an SDK, but that makes it more controllable

class Tokenizer:
    """Mock tokenize/detokenize SDK based on a repeating-key XOR of characters.

    ``tokenize`` XORs each character of the plaintext with the bearer token
    (repeated to cover the text) and returns the result as URL-safe base64.
    ``detokenize`` reverses those steps with the same bearer token.
    """

    # XOR-ing two code points can yield a lone surrogate (U+D800-U+DFFF), which
    # strict UTF-8 refuses to encode; "surrogatepass" lets such values round-trip.
    _UTF8_ERRORS = "surrogatepass"

    def __init__(self, bearer_token):
        """Store the bearer token used as the XOR key.

        Raises:
            ValueError: if ``bearer_token`` is empty or None; without this
                check the failure would only surface later as a
                ZeroDivisionError (or TypeError) inside ``_xor_cipher``.
        """
        if not bearer_token:
            raise ValueError("bearer_token must be a non-empty string")
        self.bearer_token = bearer_token

    def _xor_cipher(self, text, key):
        """XOR every character of ``text`` with the matching character of ``key``.

        ``key`` is repeated so that it is at least as long as ``text``;
        ``zip`` then stops at the end of ``text``.
        """
        repeated_key = key * (len(text) // len(key) + 1)
        return ''.join(chr(ord(c) ^ ord(k)) for c, k in zip(text, repeated_key))

    def tokenize(self, plaintext):
        """Return the URL-safe base64 token for ``plaintext``."""
        encrypted_text = self._xor_cipher(plaintext, self.bearer_token)
        raw = encrypted_text.encode("utf-8", self._UTF8_ERRORS)
        return base64.urlsafe_b64encode(raw).decode()

    def detokenize(self, encrypted_text):
        """Return the plaintext for a token produced by ``tokenize``."""
        raw = base64.urlsafe_b64decode(encrypted_text.encode())
        decoded_text = raw.decode("utf-8", self._UTF8_ERRORS)
        decrypted_text = self._xor_cipher(decoded_text, self.bearer_token)
        return decrypted_text

# Example usage: round-trip one sample string through the mock tokenizer.
# The bearer token below is a placeholder value.
bearer_token = "your_bearer_token_here"
tokenizer = Tokenizer(bearer_token)

text_to_tokenize = "This is a sample text."

# Plaintext -> token
tokenized_text = tokenizer.tokenize(text_to_tokenize)
print(f"Tokenized text: {tokenized_text}")

# Token -> plaintext
detokenized_text = tokenizer.detokenize(tokenized_text)
print(f"Detokenized text: {detokenized_text}")

Tokenized text: LQccAX8LFkETRQE-GR8HAE4rDR0GSw==
Detokenized text: This is a sample text.


In [6]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

#{ 4. create function wrappers of the tokenizer class }

def tokenize(s):
    """Tokenize one value with the notebook-level ``tokenizer``.

    None is returned unchanged so that a NULL cell in a nullable column does
    not make the UDF fail inside ``Tokenizer.tokenize``.
    """
    if s is None:
        return None
    return tokenizer.tokenize(s)

def detokenize(s):
    """Detokenize one value with the notebook-level ``tokenizer``.

    None is returned unchanged so that a NULL cell in a nullable column does
    not make the UDF fail inside ``Tokenizer.detokenize``.
    """
    if s is None:
        return None
    return tokenizer.detokenize(s)
    
# Wrap the plain Python helpers as Spark UDFs that return string columns
tokenize_udf = udf(tokenize, returnType=StringType())
detokenize_udf = udf(detokenize, returnType=StringType())

In [7]:
from pyspark.sql.functions import col

#{ 5. tokenize emails }

# Replace the plaintext "email" column with its tokenized form
tokenized_email_col = tokenize_udf(col("email"))
tokenized_email_df = fake_users_df.withColumn("email", tokenized_email_col)
tokenized_email_df.show()

+-------+--------------------+--------------------+--------------------+
|user_id|                name|               email|             address|
+-------+--------------------+--------------------+--------------------+
|      1|  Theresa Richardson|DwgACDIDCyEXHRMyB...|Unit 0961 Box 276...|
|      2|       Steven Porter|FQ4AAD5SXSEXHRMyB...|593 Tracy Station...|
|      3|        Lisa Johnson|ERoYAjcQABgFDB4zH...|3746 Jessica Stre...|
|      4|       Heather Welch|EQAHBjAMBBMGDQctN...|55924 Frazier Str...|
|      5|      Ashley Sampson|Gg4bBioVDA0eDBcfE...|96930 Martin Knol...|
|      6|      Susan Thornton|ERgcHiwNCyEXHRMyB...|7176 James Mill, ...|
|      7|    Elizabeth Patton|FA4ZHTEHERMTExssN...|25995 Michael Pla...|
|      8|       Willie Nelson|HAIcHiYWDQ4fFQEwG...|3673 Debra Coves,...|
|      9|   Anthony Frederick|HQ4HADoOCQIdCRcfE...|271 Logan Course,...|
|     10|       Michael Giles|Gw4HED4QBAIHAAQ-B...|25482 Myers Pine,...|
|     11|     Cory Valenzuela|GgAaAjoQBwQRDgsfE...|

In [8]:
#{ 6. detokenize emails }

# Turn the tokenized "email" column back into plaintext
detokenized_email_col = detokenize_udf(col("email"))
detokenized_email_df = tokenized_email_df.withColumn("email", detokenized_email_col)
detokenized_email_df.show()

+-------+--------------------+--------------------+--------------------+
|user_id|                name|               email|             address|
+-------+--------------------+--------------------+--------------------+
|      1|  Theresa Richardson| vguzman@example.net|Unit 0961 Box 276...|
|      2|       Steven Porter| laura08@example.org|593 Tracy Station...|
|      3|        Lisa Johnson|humphreywilliam@e...|3746 Jessica Stre...|
|      4|       Heather Welch|hortonarthur@exam...|55924 Frazier Str...|
|      5|      Ashley Sampson|cantuwillie@examp...|96930 Martin Knol...|
|      6|      Susan Thornton| hwilson@example.org|7176 James Mill, ...|
|      7|    Elizabeth Patton|malonetravis@exam...|25995 Michael Pla...|
|      8|       Willie Nelson|emilythompson@exa...|3673 Debra Coves,...|
|      9|   Anthony Frederick|darrellcole@examp...|271 Logan Course,...|
|     10|       Michael Giles|barbaracuevas@exa...|25482 Myers Pine,...|
|     11|     Cory Valenzuela|cooperbecky@examp...|