In [47]:
! pip install pyspark



In [48]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [49]:
spark = SparkSession.builder.getOrCreate()

# 1 - Leia o arquivo 'videos-preparados.snappy.parquet' no dataframe 'df_video'

In [50]:
df_video = spark.read.parquet('/content/videos-preparados.snappy.parquet')
df_video.show(5)

+--------------------+-----------+------------+-------+------+--------+--------+-----------+----+-----+-------------+--------------------+--------------------+--------------------+
|               Title|   Video ID|Published At|Keyword| Likes|Comments|   Views|Interaction|Year|Month|Keyword Index|        Features PCA|     Features Normal|            Features|
+--------------------+-----------+------------+-------+------+--------+--------+-----------+----+-----+-------------+--------------------+--------------------+--------------------+
|ASMR MUKBANG DOUB...|--ZI0dSbbNU|  2020-04-18|mukbang|378858|   18860|17975269|   18372987|2020|    4|         30.0|[0.6985786560867407]|[0.02303716158264...|[378858.0,1.79752...|
|Deadly car bomb d...|--hxd1CrOqg|  2022-08-22|   news|  6379|    4853|  808787|     820019|2022|    8|         37.0|[0.8936407990235931]|[3.87946679100418...|[6379.0,808787.0,...|
|How Biden&#39;s s...|--ixiTypG8g|  2022-08-24|   news|  1029|    2347|   97434|     100810|202

# 2 - Leia o arquivo ‘video-comments-tratados.snappy.parquet' no dataframe 'df_comments'

In [51]:
df_comments = spark.read.parquet('/content/videos-comments-tratados.snappy.parquet')
df_comments.show(5)

+-----------+--------------------+------------+-------+-----+--------+------+-----------+----+--------------------+---------+-------------+
|   Video ID|               Title|Published At|Keyword|Likes|Comments| Views|Interaction|Year|             Comment|Sentiment|Likes Comment|
+-----------+--------------------+------------+-------+-----+--------+------+-----------+----+--------------------+---------+-------------+
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672|135612|     139691|2022|Let's not forget ...|        1|           95|
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672|135612|     139691|2022|Here in NZ 50% of...|        0|           19|
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672|135612|     139691|2022|I will forever ac...|        2|          161|
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672|135612|     139691|2022|Whenever I go to ...|        0|            8|
|wAZZ-UWGVHI|Apple P

# 3 - Crie tabelas temporárias para ambos os dataframe

In [52]:
df_video.createOrReplaceTempView("videos")
df_comments.createOrReplaceTempView("comments")

# 4 - Faça um join das tabelas criadas anteriormente utilizando o spark.sql no dataframe ‘join_video_comments’

In [53]:
join_video_comments = spark.sql("""
  SELECT v.*, c.comment
  FROM videos v
  JOIN comments c ON v.`Video ID` = c.`Video ID`
  """)

join_video_comments.show(5)

+--------------------+-----------+------------+-------+-----+--------+------+-----------+----+-----+-------------+--------------------+--------------------+--------------------+--------------------+
|               Title|   Video ID|Published At|Keyword|Likes|Comments| Views|Interaction|Year|Month|Keyword Index|        Features PCA|     Features Normal|            Features|             comment|
+--------------------+-----------+------------+-------+-----+--------+------+-----------+----+-----+-------------+--------------------+--------------------+--------------------+--------------------+
|Apple Pay Is Kill...|wAZZ-UWGVHI|  2022-08-23|   tech| 3407|     672|135612|     139691|2022|    8|         23.0|[0.5461641657286636]|[2.07229197864298...|[3407.0,135612.0,...|Let's not forget ...|
|Apple Pay Is Kill...|wAZZ-UWGVHI|  2022-08-23|   tech| 3407|     672|135612|     139691|2022|    8|         23.0|[0.5461641657286636]|[2.07229197864298...|[3407.0,135612.0,...|Here in NZ 50% of...|
|Appl

# 5 - Faça as mesmas etapas anteriores (1,2,3,4) utilizando repartition e coalesce


In [54]:
# (Usando o Repatition) 1 e 2 - Leitura utilizando o repartition
rdd_df_video = spark.read.parquet('/content/videos-preparados.snappy.parquet').repartition(5)
rdd_df_comments = spark.read.parquet('/content/videos-comments-tratados.snappy.parquet').repartition(5)

print('Núm. de partições rdd_df_video: ', rdd_df_video.rdd.getNumPartitions())
print('Núm. de partições rdd_df_comments: ', rdd_df_comments.rdd.getNumPartitions())

# (Usando o Repatition) 3 - Criando as TempView
rdd_df_video.createOrReplaceTempView('rdd_video')
rdd_df_comments.createOrReplaceTempView('rdd_comments')

# (Usando o Repatition) 4 - Realizando o JOIN usando o spark.sql
join_rdd_video_comments = spark.sql("""
  SELECT v.*, c.comment
  FROM rdd_video v
  JOIN rdd_comments c ON v.`Video ID` = c.`Video ID`
  """)

join_rdd_video_comments.show(5)

Núm. de partições rdd_df_video:  5
Núm. de partições rdd_df_comments:  5
+--------------------+-----------+------------+------------+------+--------+-------+-----------+----+-----+-------------+--------------------+--------------------+--------------------+--------------------+
|               Title|   Video ID|Published At|     Keyword| Likes|Comments|  Views|Interaction|Year|Month|Keyword Index|        Features PCA|     Features Normal|            Features|             comment|
+--------------------+-----------+------------+------------+------+--------+-------+-----------+----+-----+-------------+--------------------+--------------------+--------------------+--------------------+
|😱2/2 Cube Solved...|CkhgF-bP2cc|  2022-07-24|       cubes|205854|     338|4543326|    4749518|2022|    7|         11.0|[0.2402758308009996]|[0.01251736106993...|[205854.0,4543326...|            Nice bro|
|John Legend - All...|RjI4c-QNt0s|  2022-08-24|       music|  5174|     238| 187956|     193368|2022|   

In [55]:
# (Usando o coalesce) 1 e 2 - Leitura utilizando o coalesce
reduce_df_video = spark.read.parquet('/content/videos-preparados.snappy.parquet').coalesce(1)
reduce_df_comments = spark.read.parquet('/content/videos-comments-tratados.snappy.parquet').coalesce(1)

print('Núm. de partições reduce_df_video: ', reduce_df_video.rdd.getNumPartitions())
print('Núm. de partições reduce_df_comments: ', reduce_df_comments.rdd.getNumPartitions())

# (Usando o coalesce) 3 - Criando as TempView
reduce_df_video.createOrReplaceTempView('reduce_video')
reduce_df_comments.createOrReplaceTempView('reduce_comments')

# (Usando o coalesce) 4 - Realizando o JOIN usando o spark.sql

join_reduce_video_comments = spark.sql("""
  SELECT v.*, c.comment
  FROM reduce_video v
  JOIN reduce_comments c ON v.`Video ID` = c.`Video ID`
  """)

print('Núm. de partições: ', join_reduce_video_comments.rdd.getNumPartitions())

join_reduce_video_comments.show(5)

Núm. de partições reduce_df_video:  1
Núm. de partições reduce_df_comments:  1
Núm. de partições:  1
+--------------------+-----------+------------+-------+-----+--------+------+-----------+----+-----+-------------+--------------------+--------------------+--------------------+--------------------+
|               Title|   Video ID|Published At|Keyword|Likes|Comments| Views|Interaction|Year|Month|Keyword Index|        Features PCA|     Features Normal|            Features|             comment|
+--------------------+-----------+------------+-------+-----+--------+------+-----------+----+-----+-------------+--------------------+--------------------+--------------------+--------------------+
|Apple Pay Is Kill...|wAZZ-UWGVHI|  2022-08-23|   tech| 3407|     672|135612|     139691|2022|    8|         23.0|[0.5461641657286636]|[2.07229197864298...|[3407.0,135612.0,...|Let's not forget ...|
|Apple Pay Is Kill...|wAZZ-UWGVHI|  2022-08-23|   tech| 3407|     672|135612|     139691|2022|    8|   

# 6 - Utilize o explain para entender melhor as duas formas de realizar as etapas e refaça novamente as etapas anteriores (1,2,3,4), utilizando tudo que você já aprendeu para realizar o join e filter apenas com os dados necessários.

In [56]:
# (Utilizando o Repatition) 1 e 2 - Leitura dos arquivos
rdd_df_video = spark.read.parquet('/content/videos-preparados.snappy.parquet').repartition(5)
rdd_df_comments = spark.read.parquet('/content/videos-comments-tratados.snappy.parquet').repartition(5)

# Utilizando o Explain para exibir o plano de execução físico e lógico da etapa da leitura
print('Plano de execução da leitura dos arquivos com repartition')
rdd_df_video.explain()
rdd_df_comments.explain()

# (Utilizando o Repatition) 3 - Criando as TempView
rdd_df_video.createOrReplaceTempView('rdd_video')
rdd_df_comments.createOrReplaceTempView('rdd_comments')

# Utilizando o Explain para exibir o plano de execução físico e lógico da etapa de criação das TempView
print('Plano de execução da Criação das TempViews')
spark.table("rdd_video").explain()
spark.table("rdd_comments").explain()

# (Utilizando o Repatition) 4 - Realizando o JOIN usando o spark.sql
join_rdd_video_comments = spark.sql("""
  SELECT v.*, c.comment
  FROM rdd_video v
  JOIN rdd_comments c ON v.`Video ID` = c.`Video ID`
  """)

# Utilizando o Explain para exibir o plano de execução físico e lógico da etapa etapa do Join
print('Plano de execução do Join')
join_rdd_video_comments.explain()

Plano de execução da leitura dos arquivos com repartition
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Exchange RoundRobinPartitioning(5), REPARTITION_BY_NUM, [plan_id=2443]
   +- InMemoryTableScan [Title#21682, Video ID#21683, Published At#21684, Keyword#21685, Likes#21686, Comments#21687, Views#21688, Interaction#21689, Year#21690, Month#21691, Keyword Index#21692, Features PCA#21693, Features Normal#21694, Features#21695]
         +- InMemoryRelation [Title#21682, Video ID#21683, Published At#21684, Keyword#21685, Likes#21686, Comments#21687, Views#21688, Interaction#21689, Year#21690, Month#21691, Keyword Index#21692, Features PCA#21693, Features Normal#21694, Features#21695], StorageLevel(disk, memory, deserialized, 1 replicas)
               +- *(1) ColumnarToRow
                  +- FileScan parquet [Title#0,Video ID#1,Published At#2,Keyword#3,Likes#4,Comments#5,Views#6,Interaction#7,Year#8,Month#9,Keyword Index#10,Features PCA#11,Features Normal#12,Features#13] Ba

In [57]:
# (Usando o coalesce) 1 e 2 - Leitura utilizando o coalesce
reduce_df_video = spark.read.parquet('/content/videos-preparados.snappy.parquet').coalesce(1)
reduce_df_comments = spark.read.parquet('/content/videos-comments-tratados.snappy.parquet').coalesce(1)

# Utilizando o Explain para exibir o plano de execução físico e lógico da etapa de leitura
print('Plano de execução da leitura dos arquivos com coalesce')
reduce_df_video.explain()
reduce_df_comments.explain()

# (Usando o coalesce) 3 - Criando as TempView
reduce_df_video.createOrReplaceTempView('reduce_video')
reduce_df_comments.createOrReplaceTempView('reduce_comments')

# Utilizando o Explain para exibir o plano de execução físico e lógico da etapa de criação das TempView
print('Plano de execução da Criação das TempViews')
spark.table("reduce_video").explain()
spark.table("reduce_comments").explain()

# (Usando o coalesce) 4 - Realizando o JOIN usando o spark.sql
join_reduce_video_comments = spark.sql("""
  SELECT v.*, c.comment
  FROM reduce_video v
  JOIN reduce_comments c ON v.`Video ID` = c.`Video ID`
  """)

# Utilizando o Explain para exibir o plano de execução físico e lógico da etapa do Join
print('Plano de execução do Join')
join_reduce_video_comments.explain()

Plano de execução da leitura dos arquivos com coalesce
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Coalesce 1
   +- InMemoryTableScan [Title#22945, Video ID#22946, Published At#22947, Keyword#22948, Likes#22949, Comments#22950, Views#22951, Interaction#22952, Year#22953, Month#22954, Keyword Index#22955, Features PCA#22956, Features Normal#22957, Features#22958]
         +- InMemoryRelation [Title#22945, Video ID#22946, Published At#22947, Keyword#22948, Likes#22949, Comments#22950, Views#22951, Interaction#22952, Year#22953, Month#22954, Keyword Index#22955, Features PCA#22956, Features Normal#22957, Features#22958], StorageLevel(disk, memory, deserialized, 1 replicas)
               +- *(1) ColumnarToRow
                  +- FileScan parquet [Title#0,Video ID#1,Published At#2,Keyword#3,Likes#4,Comments#5,Views#6,Interaction#7,Year#8,Month#9,Keyword Index#10,Features PCA#11,Features Normal#12,Features#13] Batched: true, DataFilters: [], Format: Parquet, Location: InMemo

# 7 - Salve o seu join otimizado como 'join-videos-comments-otimizado' no formato parquet

In [58]:
# Salva o resultado otimizado em formato parquet para uma leitura mais eficiente
join_reduce_video_comments.write.mode('overwrite').parquet('join-videos-comments-otimizado')

# 8 - Comente todas as ações realizadas no código nas etapas 6 e 7