In [4]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("CSV para Parquet Otimizado") \
    .getOrCreate()

#print(spark.sparkContext.master)
#print(spark.sparkContext.uiWebUrl)

In [5]:
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType, IntegerType

# Esquema com os tipos numéricos corretos para colunas "limpas"
schema_com_tipos = StructType([
    # Colunas de texto
    StructField("Artist(s)", StringType(), True),
    StructField("song", StringType(), True),
    StructField("text", StringType(), True),
    StructField("emotion", StringType(), True),
    StructField("Genre", StringType(), True),
    StructField("Album", StringType(), True),
    StructField("Key", StringType(), True),
    StructField("Explicit", StringType(), True),

    # Colunas que precisam de limpeza (lidas como texto primeiro)
    StructField("Length", StringType(), True),
    StructField("Release Date", StringType(), True),
    StructField("Loudness (db)", StringType(), True),
    StructField("Time signature", StringType(), True),

    # Colunas de Inteiros (LongType é um inteiro grande, seguro de usar)
    StructField("Popularity", LongType(), True),
    StructField("Energy", LongType(), True),
    StructField("Danceability", LongType(), True),
    StructField("Positiveness", LongType(), True),
    StructField("Speechiness", LongType(), True),
    StructField("Liveness", LongType(), True),
    StructField("Tempo", LongType(), True), # Tempo (BPM) é geralmente um inteiro

    # Colunas de Ponto Flutuante (DoubleType é mais preciso)
    StructField("Acousticness", DoubleType(), True),
    StructField("Instrumentalness", DoubleType(), True),
    StructField("Similarity Score 1", DoubleType(), True),
    StructField("Similarity Score 2", DoubleType(), True),
    StructField("Similarity Score 3", DoubleType(), True),

    # O resto pode ser lido como string ou o tipo que você preferir
    StructField("Good for Party", StringType(), True),
    StructField("Good for Work/Study", StringType(), True),
    # ... e assim por diante para as outras colunas
    StructField("Good for Relaxation/Meditation", StringType(), True),
    StructField("Good for Exercise", StringType(), True),
    StructField("Good for Running", StringType(), True),
    StructField("Good for Yoga/Stretching", StringType(), True),
    StructField("Good for Driving", StringType(), True),
    StructField("Good for Social Gatherings", StringType(), True),
    StructField("Good for Morning Routine", StringType(), True),
    StructField("Similar Artist 1", StringType(), True),
    StructField("Similar Song 1", StringType(), True),
    StructField("Similar Artist 2", StringType(), True),
    StructField("Similar Song 2", StringType(), True),
    StructField("Similar Artist 3", StringType(), True),
    StructField("Similar Song 3", StringType(), True)
])


# 2. Lemos o CSV com o esquema manual e REMOVEMOS inferSchema
caminho_csv_novo = "/spark-data/spotify_dataset.csv"

df = spark.read.csv(
    caminho_csv_novo,
    header=True,
    inferSchema=False,
    quote='"',             # Define que valores entre aspas são um único campo
    escape='"',            # Para lidar com aspas duplas dentro de strings ("" vira ")
    multiLine=True         # Para permitir que campos entre aspas quebrem linha
)

df.show()

25/07/08 17:29:49 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+--------------+--------------------+--------------------+------+--------+-------+--------------------+-----------------+------+-----+-------------+--------------+--------+----------+------+------------+------------+-----------+--------+------------+----------------+--------------+-------------------+------------------------------+-----------------+----------------+------------------------+----------------+--------------------------+------------------------+--------------------+--------------------+------------------+--------------------+--------------------+------------------+--------------------+--------------------+------------------+
|     Artist(s)|                song|                text|Length| emotion|  Genre|               Album|     Release Date|   Key|Tempo|Loudness (db)|Time signature|Explicit|Popularity|Energy|Danceability|Positiveness|Speechiness|Liveness|Acousticness|Instrumentalness|Good for Party|Good for Work/Study|Good for Relaxation/Meditation|Good for Exercise|Goo

In [6]:
caminho_parquet = "/spark-data/meuarquivo.parquet"

# Sobrescreve o arquivo anterior com os dados corrigidos
df.write.mode("overwrite").parquet(caminho_parquet)

                                                                                

In [7]:
df_parquet = spark.read.parquet(caminho_parquet)
df_parquet.show(5)

[Stage 4:>                                                          (0 + 1) / 1]

+---------+--------------------+--------------------+------+-------+-------+-------------+-----------------+------+-----+-------------+--------------+--------+----------+------+------------+------------+-----------+--------+------------+----------------+--------------+-------------------+------------------------------+-----------------+----------------+------------------------+----------------+--------------------------+------------------------+----------------+--------------------+------------------+----------------+--------------------+------------------+----------------+--------------+------------------+
|Artist(s)|                song|                text|Length|emotion|  Genre|        Album|     Release Date|   Key|Tempo|Loudness (db)|Time signature|Explicit|Popularity|Energy|Danceability|Positiveness|Speechiness|Liveness|Acousticness|Instrumentalness|Good for Party|Good for Work/Study|Good for Relaxation/Meditation|Good for Exercise|Good for Running|Good for Yoga/Stretching|Good 

                                                                                

In [8]:
import time

def medir_tempo_leitura(funcao):
    inicio = time.time()
    resultado = funcao()
    fim = time.time()
    duracao = fim - inicio
    print(f"Tempo de execução: {duracao:.4f} segundos")
    return resultado

In [9]:
def ler_csv():
    df = spark.read.csv("/spark-data/spotify_dataset.csv", header=True, inferSchema=True)
    df.select("song").collect()
    return df

df_csv = medir_tempo_leitura(ler_csv)


                                                                                

Tempo de execução: 100.8020 segundos


In [10]:
def ler_parquet():
    df = spark.read.parquet("/spark-data/meuarquivo.parquet")
    df.select("song").collect()
    return df

df_parquet = medir_tempo_leitura(ler_parquet)


                                                                                

Tempo de execução: 5.7439 segundos


In [11]:
spark.stop()