In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

# Cluster and resource settings for this application. Every SparkConf setter
# returns the conf object itself, so the calls are chained.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("BigqueryExample")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine; getOrCreate()
# reuses an active session if there is one, otherwise it builds a new one.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Read the source table through the "bigquery" data source.
# The table id has the form project_id.dataset.table_name -- use your own project id.
df = spark.read.format("bigquery").load("de2025.labdataset.retaildata")

# Print the column schema, then display the first 4 rows.
df.printSchema()
df.show(4)

root
 |-- InvoiceNo: long (nullable = true)
 |-- StockCode: long (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: long (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: long (nullable = true)
 |-- Country: string (nullable = true)

+---------+---------+--------------------+--------+--------------+---------+----------+-------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|Country|
+---------+---------+--------------------+--------+--------------+---------+----------+-------+
|   536370|    21791|VINTAGE HEADS AND...|      24|12/1/2010 8:45|     1.25|     12583| France|
|   536370|    22326|ROUND SNACK BOXES...|      24|12/1/2010 8:45|     2.95|     12583| France|
|   536370|    22631|CIRCUS PARADE LUN...|      24|12/1/2010 8:45|     1.95|     12583| France|
|   536370|    21035|SET/2 RED RETROSP...|      18|12/1/2010 8:45|     2.95|     12583| France|


In [2]:
from pyspark.sql.functions import explode, split, concat, col, lit, desc

# Tokenise the product descriptions of all French orders into one word per row.
# The split pattern is a regular expression: " +" treats a run of consecutive
# spaces as a single separator, and the filter drops the empty tokens that
# leading or trailing spaces would otherwise produce, so "" is never counted
# as a word.
words = (
    df.where(df.Country == "France")
    .select(explode(split(col("Description"), " +")).alias("word"))
    .where(col("word") != "")
)
words.show(10)

# Count how often each word occurs, most frequent first.
ordered_word_count = words.groupby(words.word).count().orderBy(col("count").desc())
ordered_word_count.show(100)

+-------+
|   word|
+-------+
|VINTAGE|
|  HEADS|
|    AND|
|  TAILS|
|   CARD|
|   GAME|
|  ROUND|
|  SNACK|
|  BOXES|
|    SET|
+-------+
only showing top 10 rows

+---------+-----+
|     word|count|
+---------+-----+
|      BOX|    1|
|      OF4|    1|
|      RED|    1|
|      TEA|    1|
|    SNACK|    1|
| WOODLAND|    1|
|    TAILS|    1|
|     CARD|    1|
|    ROUND|    1|
|    SET/2|    1|
|    LUNCH|    1|
|      SET|    1|
|   CIRCUS|    1|
|      AND|    1|
|     GAME|    1|
|   PARADE|    1|
|    BOXES|    1|
|  VINTAGE|    1|
|    HEADS|    1|
|RETROSPOT|    1|
|   TOWELS|    1|
+---------+-----+



In [4]:
# Set up the Hadoop file-system configuration so the gs:// scheme resolves to
# the Google Cloud Storage connector classes.
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

# Cloud Storage bucket the BigQuery connector uses for its temporary data.
bucket = "temp_de2025"  # use your bucket
spark.conf.set('temporaryGcsBucket', bucket)

# Save the word counts to BigQuery.
# NOTE: "append" mode adds the rows on every execution, so re-running this
# cell writes the same word counts to the table again.
(
    ordered_word_count.write.format('bigquery')
    .option('table', 'de2025.labdataset.wordcounts')
    .mode("append")
    .save()
)

In [6]:
# Shut down the SparkSession created at the top of the notebook (this also
# stops its underlying SparkContext). Cells that use `spark` or DataFrames
# derived from it cannot be run after this until the session is created again.
spark.stop()