In interactive notebook, the `spark` object is already created.
Instructors tested with 1 driver, 6 executors of small e4 (24 cores, 192GB memory)

### Launch spark environment

In [1]:
spark

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 51, 6, Finished, Available)

In [2]:
%%configure -f \
{"conf": {"spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.2"}}

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 52, -1, Finished, Available)

Unrecognized options: 

### Set up data configuration

In [3]:
blob_account_name = "marckvnonprodblob"
blob_container_name = "bigdata"
# read only
blob_sas_token = "?sv=2021-10-04&st=2023-10-04T01%3A42%3A59Z&se=2024-01-02T02%3A42%3A00Z&sr=c&sp=rlf&sig=w3CH9MbCOpwO7DtHlrahc7AlRPxSZZb8MOgS6TaXLzI%3D"

wasbs_base_url = (
    f"wasbs://{blob_container_name}@{blob_account_name}.blob.core.windows.net/"
)
spark.conf.set(
    f"fs.azure.sas.{blob_container_name}.{blob_account_name}.blob.core.windows.net",
    blob_sas_token,
)

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 52, 6, Finished, Available)

#### Reading in single parquet file

In [4]:
comments_path = "reddit-parquet/comments/"
submissions_path = "reddit-parquet/submissions/"

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 52, 7, Finished, Available)

In [5]:
topic = ["Tetris","pokemon","SuperMario","GTA","CallOfDuty","FIFA","legostarwars",
"assassinscreed","thesims","FinalFantasy"] 

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 52, 8, Finished, Available)

### Reeading in all of the Reddit data

In [6]:
comments_df = spark.read.parquet(f"{wasbs_base_url}{comments_path}")
submissions_df = spark.read.parquet(f"{wasbs_base_url}{submissions_path}")

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 52, 9, Finished, Available)

In [7]:
from pyspark.sql.functions import length, col,split
sub_filtered = submissions_df.filter((length(col("title")) > 0)& (col("title") != "[deleted]")&(col('title')!= "[removed]"))\
.filter(col("subreddit").isin(topic))

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 52, 10, Finished, Available)

In [8]:
df_save = sub_filtered.select("subreddit", "title","year","month").sample(fraction= 0.2,seed =22).cache()
df_save.show()

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 52, 11, Finished, Available)

+--------------+--------------------+----+-----+
|     subreddit|               title|year|month|
+--------------+--------------------+----+-----+
|       pokemon|the PokemonTogeth...|2023|    2|
|       pokemon|Who's a non-villa...|2023|    2|
|       thesims|(If we know) when...|2023|    2|
|    CallOfDuty|SBMM was very nic...|2023|    2|
|           GTA|Is GTA china town...|2023|    2|
|assassinscreed|Network issues wi...|2023|    2|
|       pokemon|Opinion on Pokemo...|2023|    2|
|       pokemon|     Team Gyarados🥶|2023|    2|
|       pokemon|I made Tinkaton o...|2023|    2|
|       pokemon|        Vaporeon...?|2023|    2|
|  legostarwars|how much is this ...|2023|    2|
|       pokemon|One of the follow...|2023|    2|
|          FIFA|89 Ben Yedder or ...|2023|    2|
|       pokemon|  Pawmo not evolving|2023|    2|
|       pokemon|Factually the cut...|2023|    2|
|       pokemon|Main Series games...|2023|    2|
|          FIFA|Mbappe &amp; Rttf...|2023|    2|
|           GTA|I use

## Using TFIDF to identify the key points for each game 

In [9]:
!pip install spark-nlp

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 52, 12, Finished, Available)

Collecting spark-nlp
  Downloading spark_nlp-5.1.4-py2.py3-none-any.whl (540 kB)
[K     |████████████████████████████████| 540 kB 9.6 MB/s eta 0:00:01
[?25hInstalling collected packages: spark-nlp
Successfully installed spark-nlp-5.1.4


In [10]:
import pandas as pd
import numpy as np
import json
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
from pyspark.ml.feature import HashingTF, IDF, Tokenizer as tot, StopWordsRemover
from pyspark.sql.functions import length, col,split

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 52, 13, Finished, Available)

In [11]:
documentAssembler = DocumentAssembler() \
    .setInputCol('title') \
    .setOutputCol('document')

cleanUpPatterns = ["<[^>]*>"]
# normalizer referred from https://github.com/JohnSnowLabs/spark-nlp/blob/master/examples/python/annotation/text/english/document-normalizer/document_normalizer_notebook.ipynb
documentNormalizer = DocumentNormalizer() \
    .setInputCols("document") \
    .setOutputCol("normalizedDocument") \
    .setAction("clean") \
    .setPatterns(cleanUpPatterns) \
    .setReplacement(" ") \
    .setPolicy("pretty_all") \
    .setLowercase(True)

sentenceDetector = SentenceDetector() \
      .setInputCols(["normalizedDocument"]) \
      .setOutputCol("sentence")
'''
regexTokenizer = Tokenizer() \
      .setInputCols(["sentence"]) \
      .setOutputCol("token") \
      .fit(dataeg)
'''
# tokenization
tokenizer = (
    Tokenizer() \
    .setInputCols(["sentence"]) \
    .setOutputCol("token")
)

# make the words back to root form
stemmer = Stemmer() \
    .setInputCols(["token"]) \
    .setOutputCol("stem")

# remove stop words
stop_words_cleaner = StopWordsCleaner() \
    .setInputCols(["stem"])  \
    .setOutputCol("cleaned_token") \
    .setCaseSensitive(False)

# check spelling
spellModel = ContextSpellCheckerModel\
    .pretrained("spellcheck_dl", "en")\
    .setInputCols("cleaned_token")\
    .setOutputCol("final_token")\

nlpcleanPipeline = \
  Pipeline() \
    .setStages([
        documentAssembler,
        documentNormalizer,
        sentenceDetector,
        tokenizer,
        stemmer,
        stop_words_cleaner,
        spellModel])


StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 52, 14, Finished, Available)

spellcheck_dl download started this may take some time.
Approximate size to download 95.1 MB
[OK!]


In [12]:
cleaned_df = nlpcleanPipeline.fit(df_save).transform(df_save)

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 52, 15, Finished, Available)

In [13]:
from pyspark.sql.functions import size, concat_ws
new_df_post = cleaned_df.select("subreddit", 'final_token.result')
# remove empty array
new_df_post_filter = new_df_post.filter(size(col("result")) > 0)


StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 52, 16, Finished, Available)

In [14]:
final_df = new_df_post_filter.withColumn("text", concat_ws(", ", col("result")))
final_df_post = final_df.select("subreddit", 'text')
final_df_post.show(10)

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 52, 17, Finished, Available)

+--------------+--------------------+
|     subreddit|                text|
+--------------+--------------------+
|       pokemon|pokemontogeth, ca...|
|       pokemon|who, non-villain,...|
|       thesims|thank, !, (, know...|
|    CallOfDuty|same, wa, very, n...|
|           GTA|Uta, china, town,...|
|assassinscreed|network, issue, u...|
|       pokemon|opinion, common, ...|
|       pokemon|    team, gyarados🥶|
|       pokemon|Doc, made, Pinkst...|
|       pokemon|?, ., ., Napoleon, .|
+--------------+--------------------+
only showing top 10 rows



In [48]:
document = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")

token = Tokenizer() \
.setInputCols(["document"]) \
.setOutputCol("token")

normalizer = Normalizer() \
.setInputCols(["token"]) \
.setOutputCol("normal")

vivekn =  ViveknSentimentModel.pretrained() \
.setInputCols(["document", "normal"]) \
.setOutputCol("result_sentiment")

finisher = Finisher() \
.setInputCols(["result_sentiment"]) \
.setOutputCols("final_sentiment")

pipeline = Pipeline().setStages([document, token, normalizer, vivekn, finisher])

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 52, 51, Finished, Available)

sentiment_vivekn download started this may take some time.
Approximate size to download 873.6 KB
[OK!]


In [49]:
result = pipeline.fit(final_df_post).transform(final_df_post).cache()
result.show()

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 52, 52, Finished, Available)

+--------------+--------------------+---------------+
|     subreddit|                text|final_sentiment|
+--------------+--------------------+---------------+
|       pokemon|pokemontogeth, ca...|     [negative]|
|       pokemon|who, non-villain,...|     [negative]|
|       thesims|thank, !, (, know...|     [negative]|
|    CallOfDuty|same, wa, very, n...|     [positive]|
|           GTA|Uta, china, town,...|     [negative]|
|assassinscreed|network, issue, u...|           [na]|
|       pokemon|opinion, common, ...|     [negative]|
|       pokemon|    team, gyarados🥶|     [positive]|
|       pokemon|Doc, made, Pinkst...|     [negative]|
|       pokemon|?, ., ., Napoleon, .|           [na]|
|  legostarwars|much, the, coli, ...|     [positive]|
|       pokemon|follow, common, f...|     [negative]|
|          FIFA|89, men, leader, ...|     [negative]|
|       pokemon|         paw, evolve|           [na]|
|       pokemon|factual, test, co...|     [positive]|
|       pokemon|main, Geri, g

In [64]:
from pyspark.sql.functions import col, regexp_replace
df = result.withColumn("text", regexp_replace(col("text"), ",", ""))
df = df.withColumn("title", regexp_replace(col("text"), ".*?([^.]+).*", "$1"))

# Remove parentheses from the sentiment
df = df.withColumn("final_sentiment", col("final_sentiment")[0])
df = df.drop("text")
df = df.filter(col("final_sentiment") != "na")
# Show the resulting DataFrame
df.show()

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 52, 67, Finished, Available)

+------------+---------------+--------------------+
|   subreddit|final_sentiment|               title|
+------------+---------------+--------------------+
|     pokemon|       negative|pokemontogeth cam...|
|     pokemon|       negative|who non-villain r...|
|     thesims|       negative|thank ! ( know ) ...|
|  CallOfDuty|       positive|same wa very nice...|
|         GTA|       negative|Uta china town wa...|
|     pokemon|       negative|opinion common un...|
|     pokemon|       positive|     team gyarados🥶|
|     pokemon|       negative|Doc made Pinkston...|
|legostarwars|       positive|much the coli Leg...|
|     pokemon|       negative|follow common fac...|
|        FIFA|       negative|89 men leader 93 ...|
|     pokemon|       positive|factual test comm...|
|     pokemon|       positive|main Geri game ne...|
|        FIFA|       negative|sell ? roof Phill...|
|         GTA|       negative|       us face smile|
|FinalFantasy|       positive|advent children i...|
|     pokemon

In [69]:
df.rdd.getNumPartitions()

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 52, 72, Finished, Available)

1

In [72]:
df = df.sample(fraction=0.2,seed= 20).cache()


StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 52, 75, Finished, Available)

In [73]:
import os
CSV_DIR = os.path.join("Users/yc1063/fall-2023-reddit-project-team-11/data", "csv")
df.write.parquet(f"{CSV_DIR}/sentiment_tfidf2.parquet")

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 52, 76, Cancelled, Waiting)

In [42]:
ndf.cache()
ndf.select("subreddit","expwords","year","month","sentiment","wordhash")

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 34, 31, Finished, Available)

DataFrame[subreddit: string, expwords: string, year: int, month: int, sentiment: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, wordhash: int]

In [47]:
udf1 = f.udf(lambda vec : dict(zip(vec.indices.tolist(),vec.values.tolist())),MapType(StringType(),StringType()))
valuedf = result.select('subreddit',"filtered_tokens","year","month","sentiment",f.explode(udf1(f.col('features'))).name('wordhash','value'))
valuedf = valuedf.withColumn("sentiment",f.explode("sentiment.result"))
valuedf = valuedf.drop_duplicates(subset=["wordhash"])
valuedf.show()

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 34, 36, Finished, Available)

+---------+--------------------+----+-----+---------+--------+------------------+
|subreddit|     filtered_tokens|year|month|sentiment|wordhash|             value|
+---------+--------------------+----+-----+---------+--------+------------------+
|  pokemon|[gen, 1:, chariza...|2023|    2|      pos|  139265| 9.740140549187382|
|  pokemon|[gen, 1:, chariza...|2023|    2|      pos|   88005| 5.463474430171326|
|  pokemon|[gen, 1:, chariza...|2023|    2|      pos|  215686| 6.332122839048911|
|  pokemon|[gen, 1:, chariza...|2023|    2|      pos|  114628|10.992903517682748|
|  pokemon|[gen, 1:, chariza...|2023|    2|      pos|   63750| 7.041659799101322|
|  pokemon|[gen, 1:, chariza...|2023|    2|      pos|   80646| 20.59951267424561|
|  pokemon|[gen, 1:, chariza...|2023|    2|      pos|   12999|10.992903517682748|
|  pokemon|[gen, 1:, chariza...|2023|    2|      pos|  186058| 5.152261860309351|
|  pokemon|[gen, 1:, chariza...|2023|    2|      pos|  113673|2.5955078897221076|
|  pokemon|[gen,

In [49]:
valuedf = valuedf.drop_duplicates(subset=["subreddit","wordhash"])

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 34, 38, Finished, Available)

In [50]:
valuedf.cache()
import os
CSV_DIR = os.path.join("Users/yc1063/fall-2023-reddit-project-team-11/data", "csv")
valuedf.toPandas().to_csv(f"{CSV_DIR}/analysis-2.csv")

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 34, 39, Submitted, Running)

In [44]:
result_df = result_df.join(valuedf,['subreddit','wordhash'],"right_outer").cache()
result_df.show()

StatementMeta(3c67b279-1d53-4b7a-b0d9-41cb8b4b6723, 34, 33, Cancelled, Waiting)

In [None]:
result_without_duplicates = joined_df.dropDuplicates()

# Show the resulting DataFrame without duplicates
result_without_duplicates.cache().show()

StatementMeta(, , , Cancelled, )

### Saving intermediate data

The intermediate outputs go into the azureml workspace attached storage using the URI `azureml://datastores/workspaceblobstore/paths/<PATH-TO_STORE>` this is the same for all workspaces. Then to re-load you use the same URI

In [None]:
import os
CSV_DIR = os.path.join("Users/yc1063/fall-2023-reddit-project-team-11/data", "csv")
joined_df.write.parquet(f"{CSV_DIR}/sentiment_tfidf.csv")

StatementMeta(, , , Cancelled, )