In [None]:
! pip install pyspark

In [14]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count

In [3]:
spark = SparkSession.builder.appName('Atividade Módulo 27').getOrCreate()

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [25]:
# 1. Leia o arquivo ‘videos-stats.csv' no dataframe 'df_video' com cabeçalho e inferindo o esquema

df_video = spark.read.csv('drive/MyDrive/Colab Notebooks/colab_ebac/m27_spark_data/videos-stats.csv', header=True, inferSchema=True)
df_video.show(5)
df_video.printSchema()

+---+--------------------+-----------+------------+-------+-------+--------+---------+
|_c0|               Title|   Video ID|Published At|Keyword|  Likes|Comments|    Views|
+---+--------------------+-----------+------------+-------+-------+--------+---------+
|  0|Apple Pay Is Kill...|wAZZ-UWGVHI|  2022-08-23|   tech| 3407.0|   672.0| 135612.0|
|  1|The most EXPENSIV...|b3x28s61q3c|  2022-08-24|   tech|76779.0|  4306.0|1758063.0|
|  2|My New House Gami...|4mgePWWCAmA|  2022-08-23|   tech|63825.0|  3338.0|1564007.0|
|  3|Petrol Vs Liquid ...|kXiYSI7H2b0|  2022-08-23|   tech|71566.0|  1426.0| 922918.0|
|  4|Best Back to Scho...|ErMwWXQxHp0|  2022-08-08|   tech|96513.0|  5155.0|1855644.0|
+---+--------------------+-----------+------------+-------+-------+--------+---------+
only showing top 5 rows

root
 |-- _c0: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Published At: date (nullable = true)
 |-- Keyword: string (nullable =

In [26]:
# 2. Altere os valores nulos dos campos 'Likes', 'Comments' e 'Views' para o valor 0

# Verificando a quantidade de nulos nas colunas do DataFrame
df_video.select([count(when(col(c).isNull(), c)).alias(c) for c in df_video.columns]).show()

# Substituindo os nulos por 0 nas colunas indicadas
colunas = ['Likes', 'Comments', 'Views']
df_video = df_video.fillna(value=0, subset=colunas)
df_video.show(5)

# verificando se restaram valores nulos
df_video.select([count(when(col(c).isNull(), c)).alias(c) for c in df_video.columns]).show()


+---+-----+--------+------------+-------+-----+--------+-----+
|_c0|Title|Video ID|Published At|Keyword|Likes|Comments|Views|
+---+-----+--------+------------+-------+-----+--------+-----+
|  0|    0|       0|           0|      0|    2|       2|    2|
+---+-----+--------+------------+-------+-----+--------+-----+

+---+--------------------+-----------+------------+-------+-------+--------+---------+
|_c0|               Title|   Video ID|Published At|Keyword|  Likes|Comments|    Views|
+---+--------------------+-----------+------------+-------+-------+--------+---------+
|  0|Apple Pay Is Kill...|wAZZ-UWGVHI|  2022-08-23|   tech| 3407.0|   672.0| 135612.0|
|  1|The most EXPENSIV...|b3x28s61q3c|  2022-08-24|   tech|76779.0|  4306.0|1758063.0|
|  2|My New House Gami...|4mgePWWCAmA|  2022-08-23|   tech|63825.0|  3338.0|1564007.0|
|  3|Petrol Vs Liquid ...|kXiYSI7H2b0|  2022-08-23|   tech|71566.0|  1426.0| 922918.0|
|  4|Best Back to Scho...|ErMwWXQxHp0|  2022-08-08|   tech|96513.0|  5155.0

In [27]:
# 3. Leia o arquivo ‘comments.csv' no dataframe 'df_comentario' com cabeçalho e inferindo o esquema
df_comentario = spark.read.csv('drive/MyDrive/Colab Notebooks/colab_ebac/m27_spark_data/comments.csv', header=True, inferSchema=True)
df_comentario.show(5)

+---+-----------+--------------------+-----+---------+
|_c0|   Video ID|             Comment|Likes|Sentiment|
+---+-----------+--------------------+-----+---------+
|  0|wAZZ-UWGVHI|Let's not forget ...| 95.0|      1.0|
|  1|wAZZ-UWGVHI|Here in NZ 50% of...| 19.0|      0.0|
|  2|wAZZ-UWGVHI|I will forever ac...|161.0|      2.0|
|  3|wAZZ-UWGVHI|Whenever I go to ...|  8.0|      0.0|
|  4|wAZZ-UWGVHI|Apple Pay is so c...| 34.0|      2.0|
+---+-----------+--------------------+-----+---------+
only showing top 5 rows



In [36]:
# 4. Calcule a quantidade de registros do df_video e df_comentario
print('Qtd registros df_video: ', df_video.count())
print('Qtd registros df_comentario: ', df_comentario.count())

Qtd registros df_video:  1881
Qtd registros df_comentario:  30036


In [37]:
# 5. Remova os registros do df_video e df_comentario quem possuem o campo 'Video ID' nulos e calcule novamente a quantidade de registros
df_video = df_video.dropna(subset='Video ID')
df_comentario = df_comentario.dropna(subset='Video ID')

print('Qtd registros df_video: ', df_video.count())
print('Qtd registros df_comentario: ', df_comentario.count())

Qtd registros df_video:  1881
Qtd registros df_comentario:  22555


In [38]:
# 6. Remova os registros apenas do df_video quem possuem o campo 'Video ID' duplicados
df_video = df_video.dropDuplicates(['Video ID'])
print('Qtd registros df_video: ', df_video.count())

Qtd registros df_video:  1869


In [39]:
# 7. Converta os campos Likes, Comments e Views para 'int' no dataframe df_video
colunas = ['Likes', 'Comments', 'Views']
for coluna in colunas:
  df_video = df_video.withColumn(coluna, col(coluna).cast('int'))

df_video.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Published At: date (nullable = true)
 |-- Keyword: string (nullable = true)
 |-- Likes: integer (nullable = true)
 |-- Comments: integer (nullable = true)
 |-- Views: integer (nullable = true)



In [44]:
# 8. Converta os campos Likes e Sentiment para 'int' no dataframe df_comentario, além disso, altere o nome do campo Likes para 'Likes Comment'
df_comentario.printSchema()
df_comentario.show(5)

df_comentario = df_comentario.withColumn('Likes', col('Likes').cast('int')).withColumn('Sentiment', col('Likes').cast('int'))
df_comentario = df_comentario.withColumnRenamed('Likes', 'Likes Comment')
df_comentario.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Comment: string (nullable = true)
 |-- Likes: string (nullable = true)
 |-- Sentiment: string (nullable = true)

+---+-----------+--------------------+-----+---------+
|_c0|   Video ID|             Comment|Likes|Sentiment|
+---+-----------+--------------------+-----+---------+
|  0|wAZZ-UWGVHI|Let's not forget ...| 95.0|      1.0|
|  1|wAZZ-UWGVHI|Here in NZ 50% of...| 19.0|      0.0|
|  2|wAZZ-UWGVHI|I will forever ac...|161.0|      2.0|
|  3|wAZZ-UWGVHI|Whenever I go to ...|  8.0|      0.0|
|  4|wAZZ-UWGVHI|Apple Pay is so c...| 34.0|      2.0|
+---+-----------+--------------------+-----+---------+
only showing top 5 rows

root
 |-- _c0: string (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Comment: string (nullable = true)
 |-- Likes Comment: integer (nullable = true)
 |-- Sentiment: integer (nullable = true)



In [45]:
# 9. Crie o campo 'Interaction' no dataframe df_video, com a soma dos campos Likes, Comments e Views
df_video = df_video.withColumn('Interaction', col('Likes') + col('Comments') + col('Views'))
df_video.show(5)

+----+--------------------+-----------+------------+-------+------+--------+--------+-----------+
| _c0|               Title|   Video ID|Published At|Keyword| Likes|Comments|   Views|Interaction|
+----+--------------------+-----------+------------+-------+------+--------+--------+-----------+
| 986|ASMR MUKBANG DOUB...|--ZI0dSbbNU|  2020-04-18|mukbang|378858|   18860|17975269|   18372987|
|  71|Deadly car bomb d...|--hxd1CrOqg|  2022-08-22|   news|  6379|    4853|  808787|     820019|
|  48|How Biden&#39;s s...|--ixiTypG8g|  2022-08-24|   news|  1029|    2347|   97434|     100810|
| 993|Celebrating My 40...|-64r1hcxtV4|  2022-05-30|mukbang| 45628|   17264| 5283664|    5346556|
|1456|Physics Review - ...|-6IgkG5yZfo|  2017-01-02|physics| 10959|     525|  844015|     855499|
+----+--------------------+-----------+------------+-------+------+--------+--------+-----------+
only showing top 5 rows



In [46]:
# 10. Converta os campos 'Published At' para 'date' no dataframe df_video
from pyspark.sql.functions import to_date, date_format
df_video = df_video.withColumn('Published At', to_date(col('Published At')))

In [47]:
# 11. Crie o campo 'Year' no dataframe df_video, extraindo apenas o ano do campo 'Published At'
df_video = df_video.withColumn('Year', date_format(col('Published At'), 'yyyy'))
df_video.show(5)

+----+--------------------+-----------+------------+-------+------+--------+--------+-----------+----+
| _c0|               Title|   Video ID|Published At|Keyword| Likes|Comments|   Views|Interaction|Year|
+----+--------------------+-----------+------------+-------+------+--------+--------+-----------+----+
| 986|ASMR MUKBANG DOUB...|--ZI0dSbbNU|  2020-04-18|mukbang|378858|   18860|17975269|   18372987|2020|
|  71|Deadly car bomb d...|--hxd1CrOqg|  2022-08-22|   news|  6379|    4853|  808787|     820019|2022|
|  48|How Biden&#39;s s...|--ixiTypG8g|  2022-08-24|   news|  1029|    2347|   97434|     100810|2022|
| 993|Celebrating My 40...|-64r1hcxtV4|  2022-05-30|mukbang| 45628|   17264| 5283664|    5346556|2022|
|1456|Physics Review - ...|-6IgkG5yZfo|  2017-01-02|physics| 10959|     525|  844015|     855499|2017|
+----+--------------------+-----------+------------+-------+------+--------+--------+-----------+----+
only showing top 5 rows



In [48]:
# 12. Mescle os dados df_comentario no dataframe df_video em relação ao campo Video ID e crie o dataframe df_join_video_comments
df_join_video_comments = df_video.join(df_comentario, 'Video ID')
df_join_video_comments.show(5)

+-----------+---+--------------------+------------+-------+-----+--------+------+-----------+----+---+--------------------+-------------+---------+
|   Video ID|_c0|               Title|Published At|Keyword|Likes|Comments| Views|Interaction|Year|_c0|             Comment|Likes Comment|Sentiment|
+-----------+---+--------------------+------------+-------+-----+--------+------+-----------+----+---+--------------------+-------------+---------+
|wAZZ-UWGVHI|  0|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672|135612|     139691|2022|  0|Let's not forget ...|           95|       95|
|wAZZ-UWGVHI|  0|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672|135612|     139691|2022|  1|Here in NZ 50% of...|           19|       19|
|wAZZ-UWGVHI|  0|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672|135612|     139691|2022|  2|I will forever ac...|          161|      161|
|wAZZ-UWGVHI|  0|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672|135612|     139691|2022|  3|Whenever I

In [49]:
# 13. Leia o arquivo ‘USvideos.csv' no dataframe 'df_us_videos' com cabeçalho e inferindo o esquema
df_us_videos = spark.read.csv('drive/MyDrive/Colab Notebooks/colab_ebac/m27_spark_data/USvideos.csv', header=True, inferSchema=True)
df_us_videos.show(5)

+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|   video_id|trending_date|               title|       channel_title|category_id|        publish_time|                tags|  views| likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|video_error_or_removed|         description|
+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|2kyS6SvSYSE|     17.14.11|WE WANT TO TALK A...|        CaseyNeistat|         22|2017-11-13T17:13:...|     SHANtell martin| 748374| 57527|    2966|        15954|https://i.ytimg.c...|            False|           Fal

In [50]:
# 14. Mescle os dados df_us_videos no dataframe df_video em relação ao campo Title e crie e visualize o dataframe df_join_video_usvideos
df_join_video_usvideos = df_video.join(df_us_videos, 'Title')
df_join_video_usvideos.show(5)

+--------------------+---+-----------+------------+-------+------+--------+---------+-----------+----+-----------+-------------+-------------+-----------+--------------------+--------------------+--------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|               Title|_c0|   Video ID|Published At|Keyword| Likes|Comments|    Views|Interaction|Year|   video_id|trending_date|channel_title|category_id|        publish_time|                tags|   views| likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|video_error_or_removed|         description|
+--------------------+---+-----------+------------+-------+------+--------+---------+-----------+----+-----------+-------------+-------------+-----------+--------------------+--------------------+--------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------

In [52]:
# 15. Verifique a quantidade de campos nulos em todos os campos do dataframe df_video
df_video.select([count(when(col(c).isNull(), c)).alias(c) for c in df_video.columns]).show()

+---+-----+--------+------------+-------+-----+--------+-----+-----------+----+
|_c0|Title|Video ID|Published At|Keyword|Likes|Comments|Views|Interaction|Year|
+---+-----+--------+------------+-------+-----+--------+-----+-----------+----+
|  0|    0|       0|           0|      0|    0|       0|    0|          0|   0|
+---+-----+--------+------------+-------+-----+--------+-----+-----------+----+



In [53]:
# 16. Remova a coluna '_c0' e salve o dataframe df_video como 'videos-tratados-parquet' no formato parquet e adicione o cabeçalho nos dados
df_video = df_video.drop('_c0')
df_video.write.mode('overwrite').option('header', 'true').parquet('output/videos-tratados-parquet')
spark.read.option('header', 'true').parquet('output/videos-tratados-parquet').show(5)

+--------------------+-----------+------------+-------+------+--------+--------+-----------+----+
|               Title|   Video ID|Published At|Keyword| Likes|Comments|   Views|Interaction|Year|
+--------------------+-----------+------------+-------+------+--------+--------+-----------+----+
|ASMR MUKBANG DOUB...|--ZI0dSbbNU|  2020-04-18|mukbang|378858|   18860|17975269|   18372987|2020|
|Deadly car bomb d...|--hxd1CrOqg|  2022-08-22|   news|  6379|    4853|  808787|     820019|2022|
|How Biden&#39;s s...|--ixiTypG8g|  2022-08-24|   news|  1029|    2347|   97434|     100810|2022|
|Celebrating My 40...|-64r1hcxtV4|  2022-05-30|mukbang| 45628|   17264| 5283664|    5346556|2022|
|Physics Review - ...|-6IgkG5yZfo|  2017-01-02|physics| 10959|     525|  844015|     855499|2017|
+--------------------+-----------+------------+-------+------+--------+--------+-----------+----+
only showing top 5 rows



In [54]:
# 17. Remova a coluna '_c0' e salve o dataframe df_join_video_comments como 'videos-comments-tratados-parquet' no formato parquet e adicione o cabeçalho nos dados
df_join_video_comments = df_join_video_comments.drop('_c0')
df_join_video_comments.write.mode('overwrite').option('header', 'true').parquet('output/videos-comments-tratados-parquet')
spark.read.option('header', 'true').parquet('output/videos-comments-tratados-parquet').show(5)

+-----------+--------------------+------------+-------+-----+--------+------+-----------+----+--------------------+-------------+---------+
|   Video ID|               Title|Published At|Keyword|Likes|Comments| Views|Interaction|Year|             Comment|Likes Comment|Sentiment|
+-----------+--------------------+------------+-------+-----+--------+------+-----------+----+--------------------+-------------+---------+
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672|135612|     139691|2022|Let's not forget ...|           95|       95|
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672|135612|     139691|2022|Here in NZ 50% of...|           19|       19|
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672|135612|     139691|2022|I will forever ac...|          161|      161|
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672|135612|     139691|2022|Whenever I go to ...|            8|        8|
|wAZZ-UWGVHI|Apple P