In interactive notebook, the `spark` object is already created.
Instructors tested with 1 driver, 6 executors of small e4 (24 cores, 192GB memory)

### Launch spark environment

In [32]:
spark

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 17, 37, Finished, Available)

In [1]:
%%configure -f \
{"conf": {"spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.2"}}

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 20, -1, Finished, Available)

Unrecognized options: 

### Set up data configuration

In [2]:
blob_account_name = "marckvnonprodblob"
blob_container_name = "bigdata"
# read only
blob_sas_token = "?sv=2021-10-04&st=2023-10-04T01%3A42%3A59Z&se=2024-01-02T02%3A42%3A00Z&sr=c&sp=rlf&sig=w3CH9MbCOpwO7DtHlrahc7AlRPxSZZb8MOgS6TaXLzI%3D"

wasbs_base_url = (
    f"wasbs://{blob_container_name}@{blob_account_name}.blob.core.windows.net/"
)
spark.conf.set(
    f"fs.azure.sas.{blob_container_name}.{blob_account_name}.blob.core.windows.net",
    blob_sas_token,
)

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 20, 6, Finished, Available)

#### Reading in single parquet file

In [3]:
comments_path = "reddit-parquet/comments/"
submissions_path = "reddit-parquet/submissions/"

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 20, 7, Finished, Available)

In [4]:
topic = ["Tetris","pokemon","SuperMario","GTA","CallOfDuty","FIFA","legostarwars",
"assassinscreed","thesims","FinalFantasy"] 

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 20, 8, Finished, Available)

### Reeading in all of the Reddit data

In [5]:
comments_df = spark.read.parquet(f"{wasbs_base_url}{comments_path}")
submissions_df = spark.read.parquet(f"{wasbs_base_url}{submissions_path}")

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 20, 9, Finished, Available)

In [6]:
from pyspark.sql.functions import length, col,split
sub_filtered = submissions_df.filter((length(col("selftext")) > 0)& (col("selftext") != "[deleted]")&(col('selftext')!= "[removed]"))\
.filter(col("subreddit").isin(topic))

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 20, 10, Finished, Available)

In [7]:
df_save = sub_filtered.select("subreddit", "title", "selftext","year","month").cache()
df_save.show()

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 20, 11, Finished, Available)

+--------------+--------------------+--------------------+----+-----+
|     subreddit|               title|            selftext|year|month|
+--------------+--------------------+--------------------+----+-----+
|       pokemon|the PokemonTogeth...|So several days a...|2023|    2|
|       pokemon|Who's a non-villa...|For me, Tyme insp...|2023|    2|
|       pokemon|i have a realization|&amp;#x200B;\n\n[...|2023|    2|
|          FIFA|Is there any reas...|For the past 10 d...|2023|    2|
|           GTA|What should I buy...|I have around 5 m...|2023|    2|
|           GTA|what is the name ...|I know the Nero i...|2023|    2|
|       pokemon|Name any Bug type...|Ok now we’re doin...|2023|    2|
|       pokemon|My starters for e...|Gen 1: Charizard ...|2023|    2|
|       thesims|The Victoria Chal...|\n\nI made my own...|2023|    2|
|       pokemon|I really fucking ...|I feel like it's ...|2023|    2|
|       thesims|The sim 4 build m...|So whenever I pla...|2023|    2|
|          FIFA|  fl

## Using TFIDF to identify the key points for each game 

In [8]:
!pip install spark-nlp

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 20, 12, Finished, Available)



In [28]:
import pandas as pd
import numpy as np
import json
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from pyspark.ml.feature import HashingTF, IDF, Tokenizer as tot, StopWordsRemover

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 20, 32, Finished, Available)

In [93]:
# Define the Spark ML components
tokenizer_nlp = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("tokens_nlp")
)
stop_words = (
    StopWordsCleaner().pretrained("stopwords_iso","en")
    .setInputCols("tokens_nlp")
    .setOutputCol("cleanTokens")
)

documentAssembler = DocumentAssembler()\
    .setInputCol("selftext")\
    .setOutputCol("document")
    
use = UniversalSentenceEncoder.pretrained(name="tfhub_use", lang="en")\
 .setInputCols(["document"])\
 .setOutputCol("sentence_embeddings")


sentimental = SentimentDLModel.pretrained(lang="en")\
    .setInputCols(["sentence_embeddings"])\
    .setOutputCol("sentiment")
# Create a pipeline
pipeline1 = Pipeline(stages=[documentAssembler, use,sentimental])

# Fit the pipeline on the data
model = pipeline1.fit(df_save)

# Transform the data to get TF-IDF features
result = model.transform(df_save)
result.show()

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 20, 97, Finished, Available)

stopwords_iso download started this may take some time.
Approximate size to download 2.1 KB
[OK!]
tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]
sentimentdl_use_imdb download started this may take some time.
Approximate size to download 12 MB
[OK!]
+--------------+--------------------+--------------------+----+-----+--------------------+--------------------+--------------------+
|     subreddit|               title|            selftext|year|month|            document| sentence_embeddings|           sentiment|
+--------------+--------------------+--------------------+----+-----+--------------------+--------------------+--------------------+
|       pokemon|the PokemonTogeth...|So several days a...|2023|    2|[{document, 0, 13...|[{sentence_embedd...|[{category, 0, 13...|
|       pokemon|Who's a non-villa...|For me, Tyme insp...|2023|    2|[{document, 0, 66...|[{sentence_embedd...|[{category, 0, 66...|
|       pokemon|i have a realization|

In [94]:
# Define the HashingTF stage
tokenizer = tot(inputCol="selftext", outputCol="tokens")
stopwords_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")
exploder = stopwords_remover.transform(tokenizer.transform(result)).select("selftext", F.explode("filtered_tokens").alias("filtered_tokens"))
hashing_tf = HashingTF(inputCol="filtered_tokens", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")
pipeline2 = Pipeline(stages=[tokenizer, stopwords_remover])
model = pipeline2.fit(result)

# Transform the data to get TF-IDF features
result2 = model.transform(result)
result2 = result2.withColumn("filtered_tokens",f.explode("filtered_tokens"))
pipeline3 = Pipeline(stages=[hashing_tf,idf])
model2 = pipeline3.fit(result2)
result3 = model2.transform(result2)
result3.show()

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 20, 98, Finished, Available)

IllegalArgumentException: requirement failed: The input column must be array, but got string.

In [33]:
result.cache().show()

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 20, 37, Finished, Available)

+--------------+--------------------+--------------------+----+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|     subreddit|               title|            selftext|year|month|            document| sentence_embeddings|           sentiment|              tokens|     filtered_tokens|         rawFeatures|            features|
+--------------+--------------------+--------------------+----+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|       pokemon|the PokemonTogeth...|So several days a...|2023|    2|[{document, 0, 13...|[{sentence_embedd...|[{category, 0, 13...|[so, several, day...|[several, days, a...|(262144,[3888,840...|(262144,[3888,840...|
|       pokemon|Who's a non-villa...|For me, Tyme insp...|2023|    2|[{document, 0, 66...|[{sentence_embedd...|[{category, 0, 66...|

In [35]:
result = result.withColumn("sentiment",F.explode('sentiment.result'))
result.show()

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 20, 39, Finished, Available)

+--------------+--------------------+--------------------+----+-----+--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+
|     subreddit|               title|            selftext|year|month|            document| sentence_embeddings|sentiment|              tokens|     filtered_tokens|         rawFeatures|            features|
+--------------+--------------------+--------------------+----+-----+--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+
|       pokemon|the PokemonTogeth...|So several days a...|2023|    2|[{document, 0, 13...|[{sentence_embedd...|      pos|[so, several, day...|[several, days, a...|(262144,[3888,840...|(262144,[3888,840...|
|       pokemon|Who's a non-villa...|For me, Tyme insp...|2023|    2|[{document, 0, 66...|[{sentence_embedd...|      neg|[for, me,, tyme, ...|[me,, tyme, inspi...|(262144,[3421

In [60]:
result.select("filtered_tokens").take(4)

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 20, 64, Finished, Available)

[Row(filtered_tokens=['several', 'days', 'ago,', 'pokemon', 'company', 'put', 'campaign', 'celebration', 'pokemon', 'day', 'approaching', 'fans', 'submit', 'short', 'story', '', 'favourite', 'memories', 'related', 'pokemon', 'chance', 'submission', 'photo', 'appear', 'pokemon', 'day', 'mosaic', 'tag', 'tpci', 'use', 'hashtag', 'social', 'media.', '', 'yet,', 'please', 'soon!', 'have,', 'fond', 'memory', 'pokemon', 'series', 'experienced?', '', '', '', 'mine', 'pokemon', 'series', 'whole', 'inspired', 'pursue', 'biology', 'wayy', 'back', 'elementary', '/', 'middle', 'school.', 'loved', 'outdoors', 'activities', 'well', 'studying', 'different', 'kinds', 'animals', '(specifically', 'birds),', 'addition,', 'family', 'lived', 'right', 'next', 'untouched', 'woods', 'always', 'many', 'kinds', 'birds', 'younger', 'watch', 'learn', 'about!', '', 'currently', 'writing', 'this,', '2nd', 'yr', 'college', 'biotech', 'major', 'nothing', 'beats', 'opening', 'pokemon', 'game', 'exploring', "region's",

In [51]:
from pyspark.sql import functions as f
from pyspark.sql.types import MapType, StringType
ndf = result.select("subreddit",f.explode('filtered_tokens').name('expwords'),"rawFeatures","year","month","sentiment").withColumn('filtered_tokens',f.array('expwords'))
hashudf = f.udf(lambda vector : vector.indices.tolist()[0],StringType())
wordtf = ndf.withColumn('wordhash', hashudf(col('rawFeatures')))

# Show the resulting DataFrame
wordtf = wordtf.drop('rawFeatures')
wordtf.show()

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 20, 55, Finished, Available)

+---------+-----------+----+-----+---------+---------------+--------+
|subreddit|   expwords|year|month|sentiment|filtered_tokens|wordhash|
+---------+-----------+----+-----+---------+---------------+--------+
|  pokemon|    several|2023|    2|      pos|      [several]|    3888|
|  pokemon|       days|2023|    2|      pos|         [days]|    3888|
|  pokemon|       ago,|2023|    2|      pos|         [ago,]|    3888|
|  pokemon|    pokemon|2023|    2|      pos|      [pokemon]|    3888|
|  pokemon|    company|2023|    2|      pos|      [company]|    3888|
|  pokemon|        put|2023|    2|      pos|          [put]|    3888|
|  pokemon|   campaign|2023|    2|      pos|     [campaign]|    3888|
|  pokemon|celebration|2023|    2|      pos|  [celebration]|    3888|
|  pokemon|    pokemon|2023|    2|      pos|      [pokemon]|    3888|
|  pokemon|        day|2023|    2|      pos|          [day]|    3888|
|  pokemon|approaching|2023|    2|      pos|  [approaching]|    3888|
|  pokemon|       fa

In [77]:
from pyspark.ml.linalg import SparseVector
from pyspark.sql.types import ArrayType, DoubleType
#df_exploded = result.select("features", "filtered_tokens",F.explode("filtered_tokens").alias("token"))
# Define a UDF to extract values from SparseVector
extract_values_udf = f.udf(lambda v: v.toArray().tolist(), ArrayType(DoubleType()))

# Apply the UDF to the DataFrame
df_exploded = result.withColumn("feature_values", extract_values_udf("features"))
#df_exploded = df_exploded.select("token", F.explode("feature_dict").alias("feature_key", "feature_value"))
# Show the exploded DataFrame
df_exploded.show()

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 20, 81, Finished, Available)

+--------------+--------------------+--------------------+----+-----+--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+
|     subreddit|               title|            selftext|year|month|            document| sentence_embeddings|sentiment|              tokens|     filtered_tokens|         rawFeatures|            features|      feature_values|
+--------------+--------------------+--------------------+----+-----+--------------------+--------------------+---------+--------------------+--------------------+--------------------+--------------------+--------------------+
|       pokemon|the PokemonTogeth...|So several days a...|2023|    2|[{document, 0, 13...|[{sentence_embedd...|      pos|[so, several, day...|[several, days, a...|(262144,[3888,840...|(262144,[3888,840...|[0.0, 0.0, 0.0, 0...|
|       pokemon|Who's a non-villa...|For me, Tyme insp...|2023|    2|[{document, 0, 66...|[{

In [79]:

udf1 = f.udf(lambda vec : dict(zip(vec.indices.tolist(),vec.values.tolist())),MapType(StringType(),StringType()))
valuedf = result.select('subreddit','filtered_tokens',f.explode(udf1(f.col('features'))).name('wordhash','value'))
#valuedf = valuedf.withColumn("filtered_tokens",f.explode("filtered_tokens").alias("tokens"))
valuedf.show()

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 20, 83, Finished, Available)

+---------+--------------------+--------+------------------+
|subreddit|     filtered_tokens|wordhash|             value|
+---------+--------------------+--------+------------------+
|  pokemon|[several, days, a...|  196098| 5.764472278598879|
|  pokemon|[several, days, a...|  116996|3.4225891399086903|
|  pokemon|[several, days, a...|  111370|3.7956553068448686|
|  pokemon|[several, days, a...|   73740| 7.329341871553103|
|  pokemon|[several, days, a...|  186381| 6.062033192055356|
|  pokemon|[several, days, a...|  151058|4.6907425222526955|
|  pokemon|[several, days, a...|  241691| 6.263305753319606|
|  pokemon|[several, days, a...|   13340| 13.01980193133773|
|  pokemon|[several, days, a...|   93729|  7.63426575043939|
|  pokemon|[several, days, a...|  201511| 4.043526564009793|
|  pokemon|[several, days, a...|  205861|  6.79194922040239|
|  pokemon|[several, days, a...|  229166| 3.008696354702516|
|  pokemon|[several, days, a...|  186925| 2.998777236454856|
|  pokemon|[several, day

In [82]:
from pyspark.sql.window import Window
window_spec = Window.partitionBy("filtered_tokens")
window_spec.show()

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 20, 86, Finished, Available)

AnalysisException: cannot resolve 'explode(word_info)' due to data type mismatch: input to function explode should be array or map type, not struct<filtered_tokens:array<string>,wordhash:string,value:string>;
'Project [subreddit#1457, explode(word_info#14152) AS word_info#14158]
+- Project [subreddit#1457, filtered_tokens#1988, wordhash#13610, value#13611, struct(filtered_tokens, filtered_tokens#1988, wordhash, wordhash#13610, value, value#13611) AS word_info#14152]
   +- Project [subreddit#1457, filtered_tokens#1988, wordhash#13610, value#13611]
      +- Generate explode(<lambda>(features#2019)), false, [wordhash#13610, value#13611]
         +- Project [subreddit#1457, title#1466, selftext#1475, year#1484, month#1493, document#1502, sentence_embeddings#1520, sentiment#2924, tokens#1972, filtered_tokens#1988, rawFeatures#2004, features#2019]
            +- Generate explode(sentiment#1531.result), false, [sentiment#2924]
               +- Project [subreddit#1457, title#1466, selftext#1475, year#1484, month#1493, document#1502, sentence_embeddings#1520, sentiment#1531, tokens#1972, filtered_tokens#1988, rawFeatures#2004, UDF(rawFeatures#2004) AS features#2019]
                  +- Project [subreddit#1457, title#1466, selftext#1475, year#1484, month#1493, document#1502, sentence_embeddings#1520, sentiment#1531, tokens#1972, filtered_tokens#1988, UDF(filtered_tokens#1988) AS rawFeatures#2004]
                     +- Project [subreddit#1457, title#1466, selftext#1475, year#1484, month#1493, document#1502, sentence_embeddings#1520, sentiment#1531, tokens#1972, UDF(tokens#1972) AS filtered_tokens#1988]
                        +- Project [subreddit#1457, title#1466, selftext#1475, year#1484, month#1493, document#1502, sentence_embeddings#1520, sentiment#1531, UDF(selftext#1475) AS tokens#1972]
                           +- Project [subreddit#1457, title#1466, selftext#1475, year#1484, month#1493, document#1502, sentence_embeddings#1520, UDF(array(sentence_embeddings#1520)) AS sentiment#1531]
                              +- Project [subreddit#1457, title#1466, selftext#1475, year#1484, month#1493, document#1502, sentence_embeddings#1511 AS sentence_embeddings#1520]
                                 +- Project [subreddit#1457, title#1466, selftext#1475, year#1484, month#1493, document#1502, sentence_embeddings#1455 AS sentence_embeddings#1511]
                                    +- Project [subreddit#1457, title#1466, selftext#1475, year#1484, month#1493, document#1454 AS document#1502, sentence_embeddings#1455]
                                       +- Project [subreddit#1457, title#1466, selftext#1475, year#1484, month#1453 AS month#1493, document#1454, sentence_embeddings#1455]
                                          +- Project [subreddit#1457, title#1466, selftext#1475, year#1452 AS year#1484, month#1453, document#1454, sentence_embeddings#1455]
                                             +- Project [subreddit#1457, title#1466, selftext#1451 AS selftext#1475, year#1452, month#1453, document#1454, sentence_embeddings#1455]
                                                +- Project [subreddit#1457, title#1450 AS title#1466, selftext#1451, year#1452, month#1453, document#1454, sentence_embeddings#1455]
                                                   +- Project [subreddit#1449 AS subreddit#1457, title#1450, selftext#1451, year#1452, month#1453, document#1454, sentence_embeddings#1455]
                                                      +- SerializeFromObject [if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 0, subreddit), StringType), true, false, true) AS subreddit#1449, if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 1, title), StringType), true, false, true) AS title#1450, if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 2, selftext), StringType), true, false, true) AS selftext#1451, if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 3, year), IntegerType) AS year#1452, if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 4, month), IntegerType) AS month#1453, if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else mapobjects(lambdavariable(MapObject, ObjectType(class java.lang.Object), true, 1283), if (isnull(validateexternaltype(lambdavariable(MapObject, ObjectType(class java.lang.Object), true, 1283), StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true)))) null else named_struct(annotatorType, if (validateexternaltype(lambdavariable(MapObject, ObjectType(class java.lang.Object), true, 1283), StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true)).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(validateexternaltype(lambdavariable(MapObject, ObjectType(class java.lang.Object), true, 1283), StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true)), 0, annotatorType), StringType), true, false, true), begin, validateexternaltype(getexternalrowfield(validateexternaltype(lambdavariable(MapObject, ObjectType(class java.lang.Object), true, 1283), StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true)), 1, begin), IntegerType), end, validateexternaltype(getexternalrowfield(validateexternaltype(lambdavariable(MapObject, ObjectType(class java.lang.Object), true, 1283), StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true)), 2, end), IntegerType), result, if (validateexternaltype(lambdavariable(MapObject, ObjectType(class java.lang.Object), true, 1283), StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true)).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(validateexternaltype(lambdavariable(MapObject, ObjectType(class java.lang.Object), true, 1283), StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true)), 3, result), StringType), true, false, true), metadata, if (validateexternaltype(lambdavariable(MapObject, ObjectType(class java.lang.Object), true, 1283), StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true)).isNullAt) null else newInstance(class org.apache.spark.sql.catalyst.util.ArrayBasedMapData), embeddings, if (validateexternaltype(lambdavariable(MapObject, ObjectType(class java.lang.Object), true, 1283), StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true)).isNullAt) null else staticinvoke(class org.apache.spark.sql.catalyst.util.ArrayData, ArrayType(FloatType,false), toArrayData, validateexternaltype(getexternalrowfield(validateexternaltype(lambdavariable(MapObject, ObjectType(class java.lang.Object), true, 1283), StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true)), 5, embeddings), ArrayType(FloatType,false)), true, false, true)), validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 5, document), ArrayType(StructType(StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true)),true)), None) AS document#1454, if (assertnotnull(input[0, org.apache.spark.sql.Row, true]).isNullAt) null else mapobjects(lambdavariable(MapObject, ObjectType(class java.lang.Object), true, 1286), if (isnull(validateexternaltype(lambdavariable(MapObject, ObjectType(class java.lang.Object), true, 1286), StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true)))) null else named_struct(annotatorType, if (validateexternaltype(lambdavariable(MapObject, ObjectType(class java.lang.Object), true, 1286), StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true)).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(validateexternaltype(lambdavariable(MapObject, ObjectType(class java.lang.Object), true, 1286), StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true)), 0, annotatorType), StringType), true, false, true), begin, validateexternaltype(getexternalrowfield(validateexternaltype(lambdavariable(MapObject, ObjectType(class java.lang.Object), true, 1286), StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true)), 1, begin), IntegerType), end, validateexternaltype(getexternalrowfield(validateexternaltype(lambdavariable(MapObject, ObjectType(class java.lang.Object), true, 1286), StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true)), 2, end), IntegerType), result, if (validateexternaltype(lambdavariable(MapObject, ObjectType(class java.lang.Object), true, 1286), StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true)).isNullAt) null else staticinvoke(class org.apache.spark.unsafe.types.UTF8String, StringType, fromString, validateexternaltype(getexternalrowfield(validateexternaltype(lambdavariable(MapObject, ObjectType(class java.lang.Object), true, 1286), StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true)), 3, result), StringType), true, false, true), metadata, if (validateexternaltype(lambdavariable(MapObject, ObjectType(class java.lang.Object), true, 1286), StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true)).isNullAt) null else newInstance(class org.apache.spark.sql.catalyst.util.ArrayBasedMapData), embeddings, if (validateexternaltype(lambdavariable(MapObject, ObjectType(class java.lang.Object), true, 1286), StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true)).isNullAt) null else staticinvoke(class org.apache.spark.sql.catalyst.util.ArrayData, ArrayType(FloatType,false), toArrayData, validateexternaltype(getexternalrowfield(validateexternaltype(lambdavariable(MapObject, ObjectType(class java.lang.Object), true, 1286), StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true)), 5, embeddings), ArrayType(FloatType,false)), true, false, true)), validateexternaltype(getexternalrowfield(assertnotnull(input[0, org.apache.spark.sql.Row, true]), 6, sentence_embeddings), ArrayType(StructType(StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true)),true)), None) AS sentence_embeddings#1455]
                                                         +- MapPartitions com.johnsnowlabs.nlp.AnnotatorModel$$Lambda$5818/525231699@3f8efc78, obj#1448: org.apache.spark.sql.Row
                                                            +- DeserializeToObject createexternalrow(subreddit#102.toString, title#111.toString, selftext#99.toString, year#114, month#115, staticinvoke(class scala.collection.mutable.WrappedArray$, ObjectType(interface scala.collection.Seq), make, mapobjects(lambdavariable(MapObject, StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true), true, 1279), if (isnull(lambdavariable(MapObject, StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true), true, 1279))) null else createexternalrow(if (lambdavariable(MapObject, StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true), true, 1279).isNullAt) null else lambdavariable(MapObject, StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true), true, 1279).annotatorType.toString, if (lambdavariable(MapObject, StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true), true, 1279).isNullAt) null else lambdavariable(MapObject, StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true), true, 1279).begin, if (lambdavariable(MapObject, StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true), true, 1279).isNullAt) null else lambdavariable(MapObject, StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true), true, 1279).end, if (lambdavariable(MapObject, StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true), true, 1279).isNullAt) null else lambdavariable(MapObject, StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true), true, 1279).result.toString, if (lambdavariable(MapObject, StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true), true, 1279).isNullAt) null else staticinvoke(class org.apache.spark.sql.catalyst.util.ArrayBasedMapData$, ObjectType(interface scala.collection.Map), toScalaMap, staticinvoke(class scala.collection.mutable.WrappedArray$, ObjectType(interface scala.collection.Seq), make, mapobjects(lambdavariable(MapObject, StringType, true, 1280), lambdavariable(MapObject, StringType, true, 1280).toString, lambdavariable(MapObject, StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true), true, 1279).metadata.keyArray, None).array, true, false, true), staticinvoke(class scala.collection.mutable.WrappedArray$, ObjectType(interface scala.collection.Seq), make, mapobjects(lambdavariable(MapObject, StringType, true, 1281), lambdavariable(MapObject, StringType, true, 1281).toString, lambdavariable(MapObject, StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true), true, 1279).metadata.valueArray, None).array, true, false, true), true, false, true), if (lambdavariable(MapObject, StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true), true, 1279).isNullAt) null else staticinvoke(class scala.collection.mutable.WrappedArray$, ObjectType(interface scala.collection.Seq), make, mapobjects(lambdavariable(MapObject, FloatType, true, 1282), lambdavariable(MapObject, FloatType, true, 1282), lambdavariable(MapObject, StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true), true, 1279).embeddings, None).array, true, false, true), StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true)), document#1430, None).array, true, false, true), StructField(subreddit,StringType,true), StructField(title,StringType,true), StructField(selftext,StringType,true), StructField(year,IntegerType,true), StructField(month,IntegerType,true), StructField(document,ArrayType(StructType(StructField(annotatorType,StringType,true), StructField(begin,IntegerType,false), StructField(end,IntegerType,false), StructField(result,StringType,true), StructField(metadata,MapType(StringType,StringType,true),true), StructField(embeddings,ArrayType(FloatType,false),true)),true),true)), obj#1447: org.apache.spark.sql.Row
                                                               +- Project [subreddit#102, title#111, selftext#99, year#114, month#115, UDF(selftext#99) AS document#1430]
                                                                  +- Project [subreddit#102, title#111, selftext#99, year#114, month#115]
                                                                     +- Filter subreddit#102 IN (Tetris,pokemon,SuperMario,GTA,CallOfDuty,FIFA,legostarwars,assassinscreed,thesims,FinalFantasy)
                                                                        +- Filter (((length(selftext#99) > 0) AND NOT (selftext#99 = [deleted])) AND NOT (selftext#99 = [removed]))
                                                                           +- Relation [adserver_click_url#46,adserver_imp_pixel#47,archived#48,author#49,author_cakeday#50,author_flair_css_class#51,author_flair_text#52,author_id#53,brand_safe#54,contest_mode#55,created_utc#56,crosspost_parent#57,crosspost_parent_list#58,disable_comments#59,distinguished#60,domain#61,domain_override#62,edited#63,embed_type#64,embed_url#65,gilded#66L,hidden#67,hide_score#68,href_url#69,... 46 more fields] parquet


In [54]:
from pyspark.sql.functions import col, desc, collect_list, sort_array
from pyspark.sql import functions as F
res = valuedf.withColumn(
    "value",
    F.first("value").over(F.window("subreddit", "filtered_tokens"))
).withColumn(
    "wordhash",
    F.first("wordhash").over(F.window("subreddit", "filtered_tokens"))
)

res.show(truncate=False)


StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 20, 58, Finished, Available)

AnalysisException: Unable to parse 'filtered_tokens'

In [44]:
result_without_duplicates = joined_df.dropDuplicates()

# Show the resulting DataFrame without duplicates
result_without_duplicates.cache().show()

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 20, 48, Cancelled, Waiting)

### Saving intermediate data

The intermediate outputs go into the azureml workspace attached storage using the URI `azureml://datastores/workspaceblobstore/paths/<PATH-TO_STORE>` this is the same for all workspaces. Then to re-load you use the same URI

In [8]:
import os
CSV_DIR = os.path.join("Users/yc1063/fall-2023-reddit-project-team-11/data", "csv")
joined_df.write.parquet(f"{CSV_DIR}/sentiment_tfidf.csv")

StatementMeta(cf47b043-0c98-4514-b15a-1e3237e3aed3, 23, 13, Finished, Available)