In [1]:
# Download Java Virtual Machine (JVM)
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Download Spark
!wget -q https://dlcdn.apache.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz
# Unzip the file
!tar xf spark-3.3.1-bin-hadoop3.tgz

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = '/content/spark-3.3.1-bin-hadoop3'

# Install library for finding Spark
!pip install -q findspark
# Import the libary
import findspark
# Initiate findspark
findspark.init()
# Check the location for Spark
findspark.find()

'/content/spark-3.3.1-bin-hadoop3'

In [2]:
from pyspark.sql import SparkSession

# Obtain a Spark session for this notebook, registered under the app name "Tokens"
spark = (
    SparkSession.builder
    .appName("Tokens")
    .getOrCreate()
)

In [3]:
from pyspark.ml.feature import Tokenizer

In [4]:
# Build a three-row sample DataFrame with an integer id and a free-text sentence
dataframe = spark.createDataFrame(
    data=[
        (0, "Spark is great"),
        (1, "We are learning Spark"),
        (2, "Spark is better than hadoop no doubt"),
    ],
    schema=["id", "sentence"],
)
dataframe.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|      Spark is great|
|  1|We are learning S...|
|  2|Spark is better t...|
+---+--------------------+



In [5]:
# Configure a Tokenizer that reads the "sentence" column and writes a "words" column
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)
tokenizer

Tokenizer_97e5ccb38ebc

In [6]:
# Run the tokenizer over the sample data and print the result without truncating cells
tokenized_df = tokenizer.transform(dataset=dataframe)
tokenized_df.show(truncate=False)

+---+------------------------------------+--------------------------------------------+
|id |sentence                            |words                                       |
+---+------------------------------------+--------------------------------------------+
|0  |Spark is great                      |[spark, is, great]                          |
|1  |We are learning Spark               |[we, are, learning, spark]                  |
|2  |Spark is better than hadoop no doubt|[spark, is, better, than, hadoop, no, doubt]|
+---+------------------------------------+--------------------------------------------+



In [7]:
# Create a function to return the length of a list
def word_list_length(word_list):
    """Return the number of items in ``word_list``.

    Args:
        word_list: A sized collection (e.g. a list of tokens), or ``None``.

    Returns:
        ``len(word_list)``, or ``None`` when ``word_list`` is ``None``.
    """
    # A Python UDF receives None for a null column value; len(None) would raise
    # TypeError, so pass the null through instead.
    if word_list is None:
        return None
    return len(word_list)

In [8]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [9]:
# Wrap word_list_length as a Spark user-defined function that yields integers
count_tokens = udf(
    f=word_list_length,
    returnType=IntegerType(),
)

In [10]:
# Set up the tokenizer: "sentence" in, "words" out
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)

# Tokenize the sample DataFrame
tokenized_df = tokenizer.transform(dataframe)

# Append a "tokens" column holding the word count of each row, then print
# every column without truncation
(
    tokenized_df
    .withColumn("tokens", count_tokens(col("words")))
    .show(truncate=False)
)

+---+------------------------------------+--------------------------------------------+------+
|id |sentence                            |words                                       |tokens|
+---+------------------------------------+--------------------------------------------+------+
|0  |Spark is great                      |[spark, is, great]                          |3     |
|1  |We are learning Spark               |[we, are, learning, spark]                  |4     |
|2  |Spark is better than hadoop no doubt|[spark, is, better, than, hadoop, no, doubt]|7     |
+---+------------------------------------+--------------------------------------------+------+

