## Load Data

In [1]:
// Common path prefix of the Parquet parts; each part lives at s"$basePath$suffix".
val basePath = "/user/yc7093_nyu_edu/imdb-reviews-w-emotion/part"
// Suffixes of the parts to load. Only the first one is active; the others are
// kept in the trailing comment so they can be re-enabled.
val fileSuffixes = List("-01-all") //, "-02-all", "-03-all", "-04-all")

val initialPath = s"$basePath${fileSuffixes.head}"

// Start from the first part and union every remaining part onto it, in list order.
// A fold keeps rawDF immutable instead of reassigning a var inside a loop.
// Note: union matches columns by position, so all parts are assumed to share one schema.
val rawDF = fileSuffixes.tail.foldLeft(spark.read.parquet(initialPath)) { (combined, suffix) =>
  combined.union(spark.read.parquet(s"$basePath$suffix"))
}

// Preview the first five rows.
rawDF.show(5)


## Rating Distribution

In [3]:
// Row count per distinct rating, sorted by rating ascending.
val ratingDistribution = {
  val countsPerRating = rawDF.groupBy("rating").count()
  countsPerRating.sort("rating")
}

ratingDistribution.show()


In [4]:
// Output directory for the rating distribution.
val outputPath = "/user/yc7093_nyu_edu/imdb-emotion-analysis/rating-distribution"

// Collapse to one partition, then write CSV with a header row,
// replacing whatever already exists at the destination.
ratingDistribution
  .coalesce(1)
  .write
  .format("csv")
  .option("header", "true")
  .mode("overwrite")
  .save(outputPath)

## Emotion Distribution

In [6]:

// Row count per distinct emotion, sorted by emotion ascending.
val emotionDistribution = {
  val countsPerEmotion = rawDF.groupBy("emotion").count()
  countsPerEmotion.sort("emotion")
}

emotionDistribution.show()


In [7]:
// Output directory for the emotion distribution.
val outputPath = "/user/yc7093_nyu_edu/imdb-emotion-analysis/emotion-distribution"

// Collapse to one partition, then write CSV with a header row,
// replacing whatever already exists at the destination.
emotionDistribution
  .coalesce(1)
  .write
  .format("csv")
  .option("header", "true")
  .mode("overwrite")
  .save(outputPath)

## Ratings Distribution Within Emotion

In [9]:
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

// Window over all rows sharing the same emotion; used to total counts per emotion.
val emotionWindow = Window.partitionBy("emotion")

// For every (emotion, rating) pair: its count, the total count for that emotion,
// and the pair's share of that total as a percentage.
val ratingDistributionWithinEmotion = {
  val pairCounts = rawDF.groupBy("emotion", "rating").count()
  val emotionTotal = sum("count").over(emotionWindow)
  val share = (col("count") / col("total_count")) * 100
  pairCounts
    .withColumn("total_count", emotionTotal)
    .withColumn("percentage", share)
    .sort("emotion", "rating")
}

// Count the rows first so that show() prints all of them, untruncated.
val rowCount = ratingDistributionWithinEmotion.count()
ratingDistributionWithinEmotion.show(numRows = rowCount.toInt, truncate = false)



In [10]:
// Output directory for the rating-within-emotion distribution.
val outputPath = "/user/yc7093_nyu_edu/imdb-emotion-analysis/rating-distribution-w-emotion"

// Collapse to one partition, then write CSV with a header row,
// replacing whatever already exists at the destination.
ratingDistributionWithinEmotion
  .coalesce(1)
  .write
  .format("csv")
  .option("header", "true")
  .mode("overwrite")
  .save(outputPath)


## Emotion Distribution Within Ratings

In [12]:
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

// Window over all rows sharing the same rating; used to total counts per rating.
val ratingWindow = Window.partitionBy("rating")

// For every (rating, emotion) pair: its count, the total count for that rating,
// and the pair's share of that total as a percentage.
val emotionDistributionWithinRating = {
  val pairCounts = rawDF.groupBy("rating", "emotion").count()
  val ratingTotal = sum("count").over(ratingWindow)
  val share = (col("count") / col("total_count")) * 100
  pairCounts
    .withColumn("total_count", ratingTotal)
    .withColumn("percentage", share)
    .sort("rating", "emotion")
}

// Count the rows first so that show() prints all of them, untruncated.
val rowCount = emotionDistributionWithinRating.count()
emotionDistributionWithinRating.show(numRows = rowCount.toInt, truncate = false)




In [13]:
// Output directory for the emotion-within-rating distribution.
val outputPath = "/user/yc7093_nyu_edu/imdb-emotion-analysis/emotion-distribution-w-rating"

// Collapse to one partition, then write CSV with a header row,
// replacing whatever already exists at the destination.
emotionDistributionWithinRating
  .coalesce(1)
  .write
  .format("csv")
  .option("header", "true")
  .mode("overwrite")
  .save(outputPath)


## Keyword Distribution by Emotions

In [15]:
import org.apache.spark.sql.functions._

// Root of the dataset laid out in emotion=<value> sub-directories.
val basePath = "/user/yc7093_nyu_edu/imdb_partitioned_by_emotion_rating_word"
val specificEmotion = "sadness"

// Read only the sub-directory belonging to the selected emotion.
val emotionDf = {
  val emotionPath = s"$basePath/emotion=$specificEmotion"
  spark.read.parquet(emotionPath)
}

// Occurrences of each word for this emotion, most frequent first.
val keywordDistribution = {
  val wordCounts = emotionDf.groupBy("word").count()
  wordCounts.sort(col("count").desc)
}

// Count the rows first so that show() prints all of them, untruncated.
val rowCount = keywordDistribution.count()
keywordDistribution.show(numRows = rowCount.toInt, truncate = false)




In [16]:
// Output directory, named after the emotion currently held in specificEmotion.
val outputPath = s"/user/yc7093_nyu_edu/imdb-emotion-analysis/$specificEmotion-keyword-distribution"

// Collapse to one partition, then write CSV with a header row,
// replacing whatever already exists at the destination.
keywordDistribution
  .coalesce(1)
  .write
  .format("csv")
  .option("header", "true")
  .mode("overwrite")
  .save(outputPath)

In [17]:
import org.apache.spark.sql.functions._

// Root of the dataset laid out in emotion=<value> sub-directories.
val basePath = "/user/yc7093_nyu_edu/imdb_partitioned_by_emotion_rating_word"
val specificEmotion = "love"

// Read only the sub-directory belonging to the selected emotion.
val emotionDf = {
  val emotionPath = s"$basePath/emotion=$specificEmotion"
  spark.read.parquet(emotionPath)
}

// Occurrences of each word for this emotion, most frequent first.
val keywordDistribution = {
  val wordCounts = emotionDf.groupBy("word").count()
  wordCounts.sort(col("count").desc)
}

// Count the rows first so that show() prints all of them, untruncated.
val rowCount = keywordDistribution.count()
keywordDistribution.show(numRows = rowCount.toInt, truncate = false)

In [18]:
// Output directory, named after the emotion currently held in specificEmotion.
val outputPath = s"/user/yc7093_nyu_edu/imdb-emotion-analysis/$specificEmotion-keyword-distribution"

// Collapse to one partition, then write CSV with a header row,
// replacing whatever already exists at the destination.
keywordDistribution
  .coalesce(1)
  .write
  .format("csv")
  .option("header", "true")
  .mode("overwrite")
  .save(outputPath)

In [19]:
import org.apache.spark.sql.functions._

// Root of the dataset laid out in emotion=<value> sub-directories.
val basePath = "/user/yc7093_nyu_edu/imdb_partitioned_by_emotion_rating_word"
val specificEmotion = "joy"

// Read only the sub-directory belonging to the selected emotion.
val emotionDf = {
  val emotionPath = s"$basePath/emotion=$specificEmotion"
  spark.read.parquet(emotionPath)
}

// Occurrences of each word for this emotion, most frequent first.
val keywordDistribution = {
  val wordCounts = emotionDf.groupBy("word").count()
  wordCounts.sort(col("count").desc)
}

// Count the rows first so that show() prints all of them, untruncated.
val rowCount = keywordDistribution.count()
keywordDistribution.show(numRows = rowCount.toInt, truncate = false)

In [20]:
// Output directory, named after the emotion currently held in specificEmotion.
val outputPath = s"/user/yc7093_nyu_edu/imdb-emotion-analysis/$specificEmotion-keyword-distribution"

// Collapse to one partition, then write CSV with a header row,
// replacing whatever already exists at the destination.
keywordDistribution
  .coalesce(1)
  .write
  .format("csv")
  .option("header", "true")
  .mode("overwrite")
  .save(outputPath)

In [21]:
import org.apache.spark.sql.functions._

// Root of the dataset laid out in emotion=<value> sub-directories.
val basePath = "/user/yc7093_nyu_edu/imdb_partitioned_by_emotion_rating_word"
val specificEmotion = "surprise"

// Read only the sub-directory belonging to the selected emotion.
val emotionDf = {
  val emotionPath = s"$basePath/emotion=$specificEmotion"
  spark.read.parquet(emotionPath)
}

// Occurrences of each word for this emotion, most frequent first.
val keywordDistribution = {
  val wordCounts = emotionDf.groupBy("word").count()
  wordCounts.sort(col("count").desc)
}

// Count the rows first so that show() prints all of them, untruncated.
val rowCount = keywordDistribution.count()
keywordDistribution.show(numRows = rowCount.toInt, truncate = false)

In [22]:
// Output directory, named after the emotion currently held in specificEmotion.
val outputPath = s"/user/yc7093_nyu_edu/imdb-emotion-analysis/$specificEmotion-keyword-distribution"

// Collapse to one partition, then write CSV with a header row,
// replacing whatever already exists at the destination.
keywordDistribution
  .coalesce(1)
  .write
  .format("csv")
  .option("header", "true")
  .mode("overwrite")
  .save(outputPath)

In [23]:
import org.apache.spark.sql.functions._

// Root of the dataset laid out in emotion=<value> sub-directories.
val basePath = "/user/yc7093_nyu_edu/imdb_partitioned_by_emotion_rating_word"
val specificEmotion = "anger"

// Read only the sub-directory belonging to the selected emotion.
val emotionDf = {
  val emotionPath = s"$basePath/emotion=$specificEmotion"
  spark.read.parquet(emotionPath)
}

// Occurrences of each word for this emotion, most frequent first.
val keywordDistribution = {
  val wordCounts = emotionDf.groupBy("word").count()
  wordCounts.sort(col("count").desc)
}

// Count the rows first so that show() prints all of them, untruncated.
val rowCount = keywordDistribution.count()
keywordDistribution.show(numRows = rowCount.toInt, truncate = false)

In [24]:
// Output directory, named after the emotion currently held in specificEmotion.
val outputPath = s"/user/yc7093_nyu_edu/imdb-emotion-analysis/$specificEmotion-keyword-distribution"

// Collapse to one partition, then write CSV with a header row,
// replacing whatever already exists at the destination.
keywordDistribution
  .coalesce(1)
  .write
  .format("csv")
  .option("header", "true")
  .mode("overwrite")
  .save(outputPath)

In [25]:
import org.apache.spark.sql.functions._

// Root of the dataset laid out in emotion=<value> sub-directories.
val basePath = "/user/yc7093_nyu_edu/imdb_partitioned_by_emotion_rating_word"
val specificEmotion = "fear"

// Read only the sub-directory belonging to the selected emotion.
val emotionDf = {
  val emotionPath = s"$basePath/emotion=$specificEmotion"
  spark.read.parquet(emotionPath)
}

// Occurrences of each word for this emotion, most frequent first.
val keywordDistribution = {
  val wordCounts = emotionDf.groupBy("word").count()
  wordCounts.sort(col("count").desc)
}

// Count the rows first so that show() prints all of them, untruncated.
val rowCount = keywordDistribution.count()
keywordDistribution.show(numRows = rowCount.toInt, truncate = false)

In [26]:
// Output directory, named after the emotion currently held in specificEmotion.
val outputPath = s"/user/yc7093_nyu_edu/imdb-emotion-analysis/$specificEmotion-keyword-distribution"

// Collapse to one partition, then write CSV with a header row,
// replacing whatever already exists at the destination.
keywordDistribution
  .coalesce(1)
  .write
  .format("csv")
  .option("header", "true")
  .mode("overwrite")
  .save(outputPath)

## Rating Distribution Within Keywords

In [28]:
import org.apache.spark.sql.functions._

// Root of the partitioned dataset; read in full this time rather than one emotion.
val basePath = "/user/yc7093_nyu_edu/imdb_partitioned_by_emotion_rating_word"

val partitionedDf = spark.read.parquet(basePath)

// Long form: one row per (word, rating) pair with its occurrence count.
val ratingDistributionByKeyword = {
  val pairCounts = partitionedDf.groupBy("word", "rating").count()
  pairCounts.sort("word", "rating")
}

// Wide form: one row per word, one column per distinct rating, cells holding the summed counts.
val pivotedDistribution = {
  val byWord = ratingDistributionByKeyword.groupBy("word")
  byWord.pivot("rating").sum("count")
}

// Preview with show()'s default row limit, without truncating cell contents.
pivotedDistribution.show(truncate = false)


In [29]:
// Output directory for the per-word rating distribution (wide form).
val outputPath = "/user/yc7093_nyu_edu/imdb-emotion-analysis/rating-distribution-w-word"

// Collapse to one partition, then write CSV with a header row,
// replacing whatever already exists at the destination.
pivotedDistribution
  .coalesce(1)
  .write
  .format("csv")
  .option("header", "true")
  .mode("overwrite")
  .save(outputPath)

