In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf, explode, count, avg
from pyspark.sql.types import FloatType, ArrayType, StringType
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from textblob import TextBlob
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import seaborn as sns
import re
from collections import Counter
import pandas as pd

# Reuse the already-running Spark session if there is one; otherwise start a
# new one named "HydrationWordCloud".
spark = (
    SparkSession.builder
    .appName("HydrationWordCloud")
    .getOrCreate()
)

# Load every column of the clinical-study view for the hydration activity.
# NOTE(review): the LIKE pattern has no wildcard, so it only matches the
# activity name as written.
data = spark.sql(
    "select * from rnd_gssi_gold_vw.vw_clinicalstudy where ActivityName like 'Meaning of Hydration'"
)

# Keep only the three question descriptions analysed below. The spelling
# "HydrationDefination" is reproduced as-is; presumably it matches the value
# stored in the view -- verify before "correcting" it.
filtered_data = data.where(
    col("QuestionDesc").isin("BadHydration", "GoodHydration", "HydrationDefination")
)

def _sentiment(text: str) -> float:
    """Return the TextBlob polarity of *text*, or 0.0 when it cannot be scored.

    Args:
        text: The answer text to score. ``None``, non-string values and
            blank/whitespace-only strings are treated as "nothing to score".

    Returns:
        ``TextBlob(text).sentiment.polarity`` as a plain Python ``float``, or
        ``0.0`` if there is nothing to score or TextBlob raises an error.
    """
    # Null or blank answers have no sentiment to measure; answer directly
    # instead of relying on TextBlob raising and the handler catching it.
    if not isinstance(text, str) or not text.strip():
        return 0.0
    try:
        return float(TextBlob(text).sentiment.polarity)
    except Exception:
        # Deliberate best-effort: one unscorable value must not abort the
        # whole computation. ``Exception`` (rather than a bare ``except``)
        # lets KeyboardInterrupt / SystemExit propagate normally.
        return 0.0
# Expose the Python scorer to Spark as a UDF producing a FloatType column.
sentiment_udf = udf(_sentiment, returnType=FloatType())

# Attach a per-row sentiment score computed from the answer text.
df_sent = filtered_data.withColumn(
    "sentiment_score",
    sentiment_udf(col("SelectedAnswer")),
)

# Break each answer into a "tokens" array column.
tokenizer = Tokenizer(
    inputCol="SelectedAnswer",
    outputCol="tokens",
)
df_tokenized = tokenizer.transform(df_sent)

# Words dropped in addition to Spark's default English stop-word list.
extra_stop = ["please", "thank", "also"]
remover = StopWordsRemover(
    inputCol="tokens",
    outputCol="filtered",
    stopWords=StopWordsRemover.loadDefaultStopWords("english") + extra_stop,
)

# "filtered" holds the tokens that survive stop-word removal.
df_filtered = remover.transform(df_tokenized)

# One output row per (question, answer text, keyword): how many times the
# keyword occurs in that group and the mean sentiment score of the group.
df_keywords = (
    df_filtered
    .select(
        "QuestionId",
        "QuestionDesc",
        "SelectedAnswer",
        # explode() turns the array of remaining tokens into one row per token.
        explode("filtered").alias("Keywords"),
        "sentiment_score",
    )
    # Spark's Tokenizer splits on single whitespace characters, so leading or
    # repeated whitespace yields empty-string tokens that StopWordsRemover
    # does not remove. Drop them so "" is never counted as a keyword.
    .filter(col("Keywords") != "")
    .groupBy(
        "QuestionId",
        "QuestionDesc",
        "SelectedAnswer",
        "Keywords",
    )
    .agg(
        count("*").alias("frequency"),
        avg("sentiment_score").alias("sentiment_score"),
    )
)

# Publish the result as a global temporary view (replacing any previous one
# with the same name); it is queried as
# global_temp.keyword_counts_with_sentiment.
df_keywords.createOrReplaceGlobalTempView("keyword_counts_with_sentiment")



[0;31m---------------------------------------------------------------------------[0m
[0;31mModuleNotFoundError[0m                       Traceback (most recent call last)
File [0;32m<command-5892716120266130>, line 5[0m
[1;32m      3[0m [38;5;28;01mfrom[39;00m [38;5;21;01mpyspark[39;00m[38;5;21;01m.[39;00m[38;5;21;01msql[39;00m[38;5;21;01m.[39;00m[38;5;21;01mtypes[39;00m [38;5;28;01mimport[39;00m FloatType, ArrayType, StringType
[1;32m      4[0m [38;5;28;01mfrom[39;00m [38;5;21;01mpyspark[39;00m[38;5;21;01m.[39;00m[38;5;21;01mml[39;00m[38;5;21;01m.[39;00m[38;5;21;01mfeature[39;00m [38;5;28;01mimport[39;00m Tokenizer, StopWordsRemover
[0;32m----> 5[0m [38;5;28;01mfrom[39;00m [38;5;21;01mtextblob[39;00m [38;5;28;01mimport[39;00m TextBlob
[1;32m      6[0m [38;5;28;01mimport[39;00m [38;5;21;01mmatplotlib[39;00m[38;5;21;01m.[39;00m[38;5;21;01mpyplot[39;00m [38;5;28;01mas[39;00m [38;5;21;01mplt[39;00m
[1;32m      7[0m [38;5;28;01m