In [1]:
! pip install pyspark



In [28]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format
from pyspark.ml.feature import StringIndexer, VectorAssembler, MinMaxScaler, PCA
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [29]:
# Obtain the SparkSession used by every cell below. `getOrCreate()` returns
# the session that already exists in this kernel, if any, so re-running this
# cell does not start a second one.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [None]:
# Mount Google Drive at /content/drive so the parquet files referenced by the
# cells below (all under /content/drive/MyDrive/...) can be read and written.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime - this cell is expected to fail elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [40]:
# Read the file 'videos-comments-tratados.snappy.parquet' into the DataFrame
# 'df_video'.
# Parquet stores its schema inside the file and has no header row, so the
# CSV-only reader options 'header' and 'inferSchema' do not apply and are
# intentionally not set.
caminho_entrada = (
    '/content/drive/MyDrive/Colab Notebooks/colab_ebac/m28_spark_data/'
    'videos-comments-tratados.snappy.parquet'
)
df_video = spark.read.parquet(caminho_entrada)

# Preview the first 5 rows without truncating long text columns.
df_video.show(n=5, truncate=False)

+-----------+--------------------------------------------------------------------------------------------------+------------+-------+-----+--------+------+-----------+----+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+-------------+
|Video ID   |Title                                                                                             |Published At|Keyword|Likes|Comments|Views |Interaction|Year|Comment                                                                                                                                                                    

In [41]:
# Add the 'Month' column with the month taken from the "Published At" column.
# The 'MM' pattern produces a zero-padded two-character string (e.g. '08'),
# not a number; it is cast to int later, before the feature vector is built.
mes_publicacao = date_format(col('Published At'), 'MM')
df_video = df_video.withColumn('Month', mes_publicacao)
df_video.show()

+-----------+--------------------+------------+-------+-----+--------+-------+-----------+----+--------------------+---------+-------------+-----+
|   Video ID|               Title|Published At|Keyword|Likes|Comments|  Views|Interaction|Year|             Comment|Sentiment|Likes Comment|Month|
+-----------+--------------------+------------+-------+-----+--------+-------+-----------+----+--------------------+---------+-------------+-----+
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672| 135612|     139691|2022|Let's not forget ...|        1|           95|   08|
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672| 135612|     139691|2022|Here in NZ 50% of...|        0|           19|   08|
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672| 135612|     139691|2022|I will forever ac...|        2|          161|   08|
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672| 135612|     139691|2022|Whenever I go to ...|  

In [42]:
# Add the "Keyword Index" column: the text column 'Keyword' converted to
# numeric values with a StringIndexer fitted on df_video itself.
indexador = (
    StringIndexer()
    .setInputCol('Keyword')
    .setOutputCol('Keyword Index')
)
modelo_indexer = indexador.fit(df_video)
df_video = modelo_indexer.transform(df_video)
df_video.show()

+-----------+--------------------+------------+-------+-----+--------+-------+-----------+----+--------------------+---------+-------------+-----+-------------+
|   Video ID|               Title|Published At|Keyword|Likes|Comments|  Views|Interaction|Year|             Comment|Sentiment|Likes Comment|Month|Keyword Index|
+-----------+--------------------+------------+-------+-----+--------+-------+-----------+----+--------------------+---------+-------------+-----+-------------+
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672| 135612|     139691|2022|Let's not forget ...|        1|           95|   08|         17.0|
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672| 135612|     139691|2022|Here in NZ 50% of...|        0|           19|   08|         17.0|
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672| 135612|     139691|2022|I will forever ac...|        2|          161|   08|         17.0|
|wAZZ-UWGVHI|Apple Pay Is Kill...|

In [43]:
# Create a vector column called "Features" from the fields "Likes", "Views",
# "Year", "Month" and "Keyword Index", and transform df_video with the
# VectorAssembler. The assembler only accepts numeric fields, so the
# non-numeric ones must be converted first.

# Convert the 'Year' and 'Month' columns to integers.
for coluna in ('Year', 'Month'):
    df_video = df_video.withColumn(coluna, col(coluna).cast('int'))

# Assemble the vector. handleInvalid='skip' makes the assembler drop rows
# with invalid (null/NaN) inputs instead of raising an error.
colunas_features = ['Likes', 'Views', 'Year', 'Month', 'Keyword Index']
montar_vetor = VectorAssembler(
    inputCols=colunas_features,
    outputCol='Features',
    handleInvalid='skip',
)

# Apply the transformation and display the result with full column widths.
df_video = montar_vetor.transform(df_video)
df_video.show(truncate=False)

+-----------+--------------------------------------------------------------------------------------------------+------------+-------+-----+--------+-------+-----------+----+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+-------------+-----+-------------+-----------------------------------+
|Video ID   |Title                                                                                             |Published At|Keyword|Likes|Comments|Views  |Interaction|Year|Comment                                                                                                          

In [44]:
# Add the "Features Normal" column with the min-max normalized values of the
# "Features" column. The column being normalized must not contain null values.
scaler = MinMaxScaler(
    inputCol='Features',
    outputCol='Features Normal',
)
modelo_scaler = scaler.fit(df_video)
df_video = modelo_scaler.transform(df_video)
df_video.show(truncate=False)

+-----------+--------------------------------------------------------------------------------------------------+------------+-------+-----+--------+-------+-----------+----+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+-------------+-----+-------------+-----------------------------------+-------------------------------------------------------------------------------------+
|Video ID   |Title                                                                                             |Published At|Keyword|Likes|Comments|Views  |Interaction|Year|Comment                    

In [45]:
# Add the "Features PCA" column: the 5 normalized features reduced to a
# single component (k=1) with a PCA model fitted on df_video.
pca = (
    PCA()
    .setK(1)
    .setInputCol('Features Normal')
    .setOutputCol('Features PCA')
)
modelo_pca = pca.fit(df_video)
df_video = modelo_pca.transform(df_video)

# Show the input vector next to its one-dimensional projection.
colunas_comparacao = ['Features Normal', 'Features PCA']
df_video.select(*colunas_comparacao).show(truncate=False)

+-------------------------------------------------------------------------------------+--------------------+
|Features Normal                                                                      |Features PCA        |
+-------------------------------------------------------------------------------------+--------------------+
|[2.072291978642988E-4,6.31492585237833E-5,1.0,0.6363636363636364,0.42500000000000004]|[0.4548135019714761]|
|[2.072291978642988E-4,6.31492585237833E-5,1.0,0.6363636363636364,0.42500000000000004]|[0.4548135019714761]|
|[2.072291978642988E-4,6.31492585237833E-5,1.0,0.6363636363636364,0.42500000000000004]|[0.4548135019714761]|
|[2.072291978642988E-4,6.31492585237833E-5,1.0,0.6363636363636364,0.42500000000000004]|[0.4548135019714761]|
|[2.072291978642988E-4,6.31492585237833E-5,1.0,0.6363636363636364,0.42500000000000004]|[0.4548135019714761]|
|[2.072291978642988E-4,6.31492585237833E-5,1.0,0.6363636363636364,0.42500000000000004]|[0.4548135019714761]|
|[2.072291978642988

In [46]:
# Split df_video into 2 sets: 80% for training and 20% for testing.
# A fixed seed is passed so the split uses the same random sequence each run.
# NOTE(review): the previews above show the same video repeated on several
# rows (one per comment) with identical Likes/Views/Comments, so a row-level
# split presumably places the same video in both sets - confirm whether the
# split should be done per 'Video ID' instead.
proporcoes = [0.8, 0.2]
df_train, df_test = df_video.randomSplit(proporcoes, seed=42)

In [47]:
# Create a linear regression model that estimates the "Comments" field from
# the "Features Normal" vector, then evaluate it.
regressao_linear = LinearRegression(
    featuresCol='Features Normal',
    labelCol='Comments',
)
modelo_lr = regressao_linear.fit(df_train)

# Evaluate on the held-out test split and report R2 and RMSE.
avaliar_test = modelo_lr.evaluate(df_test)
print(f'R2: {avaliar_test.r2}')
print(f'RMSE: {avaliar_test.rootMeanSquaredError}')

R2: 0.6602413154888491
RMSE: 25370.3336201662


In [48]:
# Save the DataFrame df_video as 'videos-preparados-parquet' in parquet
# format, replacing any previous output at that location, then read it back
# and display it to confirm the write succeeded.
# Parquet has no header row, so the CSV-only 'header' option does not apply
# and is intentionally not set on either the writer or the reader.
caminho_saida = (
    '/content/drive/MyDrive/Colab Notebooks/colab_ebac/m28_spark_data/'
    'output/videos-preparados-parquet'
)
df_video.write.mode('overwrite').parquet(caminho_saida)
spark.read.parquet(caminho_saida).show()

+-----------+--------------------+------------+-------+-----+--------+-------+-----------+----+--------------------+---------+-------------+-----+-------------+--------------------+--------------------+--------------------+
|   Video ID|               Title|Published At|Keyword|Likes|Comments|  Views|Interaction|Year|             Comment|Sentiment|Likes Comment|Month|Keyword Index|            Features|     Features Normal|        Features PCA|
+-----------+--------------------+------------+-------+-----+--------+-------+-----------+----+--------------------+---------+-------------+-----+-------------+--------------------+--------------------+--------------------+
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672| 135612|     139691|2022|Let's not forget ...|        1|           95|    8|         17.0|[3407.0,135612.0,...|[2.07229197864298...|[0.4548135019714761]|
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672| 135612|     139691|2022|Here in N

In [49]:
# Stop the Spark session now that the notebook is finished. Any cell above
# that uses `spark` or `df_video` needs the session to be created again
# before it can be re-run.
spark.stop()