In [8]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql.types import StructField, StructType, StringType, FloatType, IntegerType
from pyspark.ml.feature import CountVectorizer
from pyspark.ml import Pipeline
import pyspark.sql.functions as F

# %pip install spark-nlp
from sparknlp.annotator import LemmatizerModel, Tokenizer, Normalizer, StopWordsCleaner, NGramGenerator
from sparknlp.base import Finisher, DocumentAssembler

pyspark.__version__

'3.0.3'

In [2]:
credentials_path = "/home/ztmj96/.google/credentials/de-r-stocks.json"

# Spark configuration
conf = SparkConf() \
    .setMaster('local[*]') \
    .setAppName('test') \
    .set("spark.jars", "gcs-connector-hadoop3-2.2.5.jar, spark-bigquery-latest_2.12.jar") \
    .set("spark.hadoop.google.cloud.auth.service.account.enable", "true") \
    .set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", credentials_path) \
    .set('spark.jars.packages', 'com.johnsnowlabs.nlp:spark-nlp_2.12:3.4.3')

# Spark context
sc = SparkContext(conf=conf)

hadoop_conf = sc._jsc.hadoopConfiguration()

hadoop_conf.set("fs.AbstractFileSystem.gs.impl",  "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
hadoop_conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
hadoop_conf.set("fs.gs.auth.service.account.json.keyfile", credentials_path)
hadoop_conf.set("fs.gs.auth.service.account.enable", "true")

# Start Spark session
# In production setting, master will be specified during spark-submit
spark = SparkSession.builder \
    .config(conf=sc.getConf()) \
    .getOrCreate()

22/04/25 20:03:37 WARN Utils: Your hostname, SilverLining resolves to a loopback address: 127.0.1.1; using 172.28.169.0 instead (on interface eth0)
22/04/25 20:03:37 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /home/ztmj96/.ivy2/cache
The jars for the packages stored in: /home/ztmj96/.ivy2/jars
:: loading settings :: url = jar:file:/home/ztmj96/spark/spark-3.0.3-bin-hadoop3.2/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-bb1b3a99-0df5-4ebe-a249-110efe109a79;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;3.4.3 in central
	found com.typesafe#config;1.4.1 in central
	found org.rocksdb#rocksdbjni;6.5.3 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.603 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.code.findbugs#annotati

In [3]:
# Access data from GCS
df = spark.read.parquet('gs://datalake_de-r-stocks/stocks/submission/stocks_submission_2022-02-06.parquet')

# 1. Remove posts by AutoModerator
# 2. Remove duplicate titles
# 3. Convert unix timestamp to date
# 4. Keep title and date columns
df_filter = df.filter(~F.col('author').contains('AutoModerator')) \
    .dropDuplicates(['title']) \
        .withColumn('date', F.from_unixtime(F.col('created_utc'), 'yyyy-MM-dd')) \
            .select('title', 'date')
            
documentAssembler = DocumentAssembler() \
     .setInputCol('title') \
     .setOutputCol('title_document')

tokenizer = Tokenizer() \
     .setInputCols(['title_document']) \
     .setOutputCol('title_token')

normalizer = Normalizer() \
     .setInputCols(['title_token']) \
     .setOutputCol('title_normalized') \
     .setLowercase(True)

lemmatizer = LemmatizerModel.pretrained() \
            .setInputCols(['title_normalized']) \
            .setOutputCol('title_lemma')

stopwords_cleaner = StopWordsCleaner() \
     .setInputCols(['title_lemma']) \
     .setOutputCol('title_cleaned') \
     .setCaseSensitive(False)

ngrams_cum = NGramGenerator() \
            .setInputCols(["title_cleaned"]) \
            .setOutputCol("title_ngrams") \
            .setN(2) \
            .setEnableCumulative(True)\
            .setDelimiter("_") # Default is space

finisher = Finisher() \
     .setInputCols(['title_ngrams']) \
     .setOutputCols(['title_finished']) \
     .setCleanAnnotations(False)

nlpPipeline = Pipeline(stages=[
              documentAssembler, 
              tokenizer,
              normalizer,
              lemmatizer,
              stopwords_cleaner,
              ngrams_cum,
              finisher
 ])

df_result = nlpPipeline.fit(df_filter).transform(df_filter).select('title_finished', 'date')

# CountVectorizer model
cv = CountVectorizer(inputCol='title_finished', outputCol='features', minDF=3.0)

# Train on all submissions
model = cv.fit(df_result)

df_tokensbydate = df_result.groupBy('date').agg(F.flatten(F.collect_list('title_finished')).alias('title_finished'))

# Get counts for each date
counts = model.transform(df_tokensbydate).select('date','features').collect()

# Create empty dataframe
df_wordcountbydate = spark.createDataFrame(spark.sparkContext.emptyRDD(), 
                        schema=StructType(fields=[
                            StructField("word", StringType()), 
                            StructField("count", FloatType()),
                            StructField("date", StringType())]))

# Append count for each day to dataframe
for row in range(len(counts)):
    test_dict = dict(zip(model.vocabulary, (float(x) for x in counts[row]['features'].values)))
    df_temp = spark.createDataFrame(test_dict.items(), 
                        schema=StructType(fields=[
                            StructField("word", StringType()), 
                            StructField("count", FloatType())]))
    df_temp = df_temp.withColumn('date', F.lit(counts[row]['date']))
    df_wordcountbydate = df_wordcountbydate.unionAll(df_temp)

                                                                                

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ / ]lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[ — ]Download done! Loading the resource.


                                                                                

[OK!]


                                                                                

In [23]:
df_wordcountbydate = df_wordcountbydate.withColumn('count', F.col('count').cast(IntegerType())) \
                        .withColumn('submission_date', F.to_date(F.col('date'), 'yyyy-MM-dd')) \
                        .withColumnRenamed('count', 'wordcount') \
                        .drop('date')

In [9]:
df_sub = df_wordcountbydate.limit(10)        
df_sub.show()
df_sub.printSchema()

AttributeError: 'DataFrame' object has no attribute 'to_date'

In [6]:

# upload dataframe to BigQuery
df_sub.write.format('bigquery') \
  .option('temporaryGcsBucket', 'dataproc-temp-asia-southeast1-199926105384-mjv5yqqm') \
  .save('de-r-stocks.stocks_data.submission_wordcount_test')

#   .option('table', 'de-r-stocks.stocks_data.submission_wordcount_test') \

22/04/25 18:42:21 WARN DefaultCredentialsProvider: Your application has authenticated using end user credentials from Google Cloud SDK. We recommend that most server applications use service accounts instead. If your application continues to use end user credentials from Cloud SDK, you might receive a "quota exceeded" or "API not enabled" error. For more information about service accounts, see https://cloud.google.com/docs/authentication/.
                                                                                

Py4JJavaError: An error occurred while calling o809.save.
: java.lang.RuntimeException: Failed to write to BigQuery
	at com.google.cloud.spark.bigquery.BigQueryWriteHelper.writeDataFrameToBigQuery(BigQueryWriteHelper.scala:69)
	at com.google.cloud.spark.bigquery.BigQueryInsertableRelation.insert(BigQueryInsertableRelation.scala:43)
	at com.google.cloud.spark.bigquery.BigQueryRelationProvider.createRelation(BigQueryRelationProvider.scala:115)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:46)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:90)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:127)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:126)
	at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:962)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:767)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:962)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:414)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:398)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:287)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.lang.NullPointerException
	at com.google.cloud.bigquery.connector.common.BigQueryClient.loadDataIntoTable(BigQueryClient.java:532)
	at com.google.cloud.spark.bigquery.BigQueryWriteHelper.loadDataToBigQuery(BigQueryWriteHelper.scala:87)
	at com.google.cloud.spark.bigquery.BigQueryWriteHelper.writeDataFrameToBigQuery(BigQueryWriteHelper.scala:66)
	... 34 more


To run the script on the cluster

```sh
spark-submit \
    --master="${URL}" \
    06_spark_sql.py \
        --input_green=data/pq/green/2021/*/ \
        --input_yellow=data/pq/yellow/2021/*/ \
        --output=data/report-2021
```

```
gcloud dataproc jobs submit spark --properties spark.jars.packages=com.google.cloud.spark:spark-bigquery_2.11:0.9.1-beta

gcloud dataproc jobs submit pyspark --cluster=${CLUSTER} \
    /path/to/your/script.py \
    --jars=gs://hadoop-lib/bigquery/bigquery-connector-hadoop2-latest.jar

spark-submit --jars=gs://hadoop-lib/bigquery/bigquery-connector-hadoop2-latest.jar \
    /path/to/your/script.py
```