In [1]:
# Instalar Java e PySpark
!apt-get install openjdk-11-jdk -qq > /dev/null
!pip install pyspark -q

# Configurar variáveis de ambiente (para o PySpark achar o Java)
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["PATH"] += ":/usr/lib/jvm/java-11-openjdk-amd64/bin"

# Agora sim, importar e criar a SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col, count, isnan, regexp_replace, sum
from pyspark.sql.types import StringType, NumericType

spark = SparkSession.builder.master("local[*]").getOrCreate()

print("Spark iniciado com sucesso!")


Spark iniciado com sucesso!


In [2]:
# Criar um DataFrame
df = spark.read.csv("/content/transactions_data.csv", header=True, inferSchema = True)

In [3]:
# Visualizar o esquema do DataFrame incluindo nome das colunas e datatype
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- client_id: integer (nullable = true)
 |-- card_id: integer (nullable = true)
 |-- amount: string (nullable = true)
 |-- use_chip: string (nullable = true)
 |-- merchant_id: integer (nullable = true)
 |-- merchant_city: string (nullable = true)
 |-- merchant_state: string (nullable = true)
 |-- zip: double (nullable = true)
 |-- mcc: integer (nullable = true)
 |-- errors: string (nullable = true)



In [4]:
# Visualizar os dados do DataFrame
df.show()

+-------+-------------------+---------+-------+-------+------------------+-----------+---------------+--------------+-------+----+------+
|     id|               date|client_id|card_id| amount|          use_chip|merchant_id|  merchant_city|merchant_state|    zip| mcc|errors|
+-------+-------------------+---------+-------+-------+------------------+-----------+---------------+--------------+-------+----+------+
|7475327|2010-01-01 00:01:00|     1556|   2972|$-77.00| Swipe Transaction|      59935|         Beulah|            ND|58523.0|5499|  NULL|
|7475328|2010-01-01 00:02:00|      561|   4575| $14.57| Swipe Transaction|      67570|     Bettendorf|            IA|52722.0|5311|  NULL|
|7475329|2010-01-01 00:02:00|     1129|    102| $80.00| Swipe Transaction|      27092|          Vista|            CA|92084.0|4829|  NULL|
|7475331|2010-01-01 00:05:00|      430|   2860|$200.00| Swipe Transaction|      27092|    Crown Point|            IN|46307.0|4829|  NULL|
|7475332|2010-01-01 00:06:00|     

In [5]:
# Contar o número de linhas do DataFrame
df.count()

4999

In [6]:
# Excluir linhas 100% nulas e contagem após essa exclusão
df.dropna(how = "all").count()

4999

In [7]:
# Excluir linhas duplicadas e contagem após essa exclusão
df.dropDuplicates().count()

4999

In [8]:
# Substituir os dados nulos por 'Sem erros' na coluna 'errors'
df = df.withColumn(
    "errors",
    when(col("errors").isNull(), "Sem erro").otherwise(col("errors"))
)

In [9]:
# Visualizar os dados do DataFrame
df.show()

+-------+-------------------+---------+-------+-------+------------------+-----------+---------------+--------------+-------+----+--------+
|     id|               date|client_id|card_id| amount|          use_chip|merchant_id|  merchant_city|merchant_state|    zip| mcc|  errors|
+-------+-------------------+---------+-------+-------+------------------+-----------+---------------+--------------+-------+----+--------+
|7475327|2010-01-01 00:01:00|     1556|   2972|$-77.00| Swipe Transaction|      59935|         Beulah|            ND|58523.0|5499|Sem erro|
|7475328|2010-01-01 00:02:00|      561|   4575| $14.57| Swipe Transaction|      67570|     Bettendorf|            IA|52722.0|5311|Sem erro|
|7475329|2010-01-01 00:02:00|     1129|    102| $80.00| Swipe Transaction|      27092|          Vista|            CA|92084.0|4829|Sem erro|
|7475331|2010-01-01 00:05:00|      430|   2860|$200.00| Swipe Transaction|      27092|    Crown Point|            IN|46307.0|4829|Sem erro|
|7475332|2010-01-01 

In [10]:
# Esse módulo mistura Python e PySpark
counts = []

for c in df.columns:
    dtype = [f.dataType for f in df.schema.fields if f.name == c][0]

    if isinstance(dtype, NumericType):
        # Para colunas numéricas: considerar isNull + NaN
        counts.append(
            count(when(col(c).isNull() | isnan(col(c)), c)).alias(c)
        )
    else:
        # Para colunas string/objeto: considerar isNull + ''
        counts.append(
            count(when(col(c).isNull() | (col(c) == ""), c)).alias(c)
        )

null_counts = df.select(counts)

# Transformar em formato de tabela
null_counts = null_counts.toPandas().T.reset_index()
null_counts.columns = ['coluna', 'qtd_nulos']
null_counts = null_counts.sort_values(by='qtd_nulos', ascending=False)

null_counts


Unnamed: 0,coluna,qtd_nulos
9,zip,644
8,merchant_state,635
0,id,0
1,date,0
3,card_id,0
2,client_id,0
4,amount,0
5,use_chip,0
7,merchant_city,0
6,merchant_id,0


In [11]:
# Substituir valores nulos da coluna "merchant_state" por "ONLINE"
df = df.withColumn(
    "merchant_state",
    when((col("merchant_city") == "ONLINE") & col("merchant_state").isNull(), "ONLINE")
    .otherwise(col("merchant_state"))
)

In [12]:
# Substituir valores nulos (isNull) da coluna "zip" por "0"

df = df.withColumn(
    "zip",
    when(
        (
            ((~col("merchant_city").isNull()) & (col("merchant_city") != "")) |
            ((~col("merchant_state").isNull()) & (col("merchant_state") != ""))
        ) & col("zip").isNull(),
        "0"
    ).otherwise(col("zip"))
)

In [13]:
# Visualizar os dados do DataFrame
df.show()

+-------+-------------------+---------+-------+-------+------------------+-----------+---------------+--------------+-------+----+--------+
|     id|               date|client_id|card_id| amount|          use_chip|merchant_id|  merchant_city|merchant_state|    zip| mcc|  errors|
+-------+-------------------+---------+-------+-------+------------------+-----------+---------------+--------------+-------+----+--------+
|7475327|2010-01-01 00:01:00|     1556|   2972|$-77.00| Swipe Transaction|      59935|         Beulah|            ND|58523.0|5499|Sem erro|
|7475328|2010-01-01 00:02:00|      561|   4575| $14.57| Swipe Transaction|      67570|     Bettendorf|            IA|52722.0|5311|Sem erro|
|7475329|2010-01-01 00:02:00|     1129|    102| $80.00| Swipe Transaction|      27092|          Vista|            CA|92084.0|4829|Sem erro|
|7475331|2010-01-01 00:05:00|      430|   2860|$200.00| Swipe Transaction|      27092|    Crown Point|            IN|46307.0|4829|Sem erro|
|7475332|2010-01-01 

In [14]:
# Contar valores isNull, vazios ou NaN por coluna
null_counts


Unnamed: 0,coluna,qtd_nulos
9,zip,644
8,merchant_state,635
0,id,0
1,date,0
3,card_id,0
2,client_id,0
4,amount,0
5,use_chip,0
7,merchant_city,0
6,merchant_id,0


In [15]:
# Mudar do nome das colunas para língua portuguesa
df = (df
      .withColumnRenamed("id", "id")
      .withColumnRenamed("date", "data")
      .withColumnRenamed("client_id", "id_cliente")
      .withColumnRenamed("card_id", "id_cartao")
      .withColumnRenamed("amount", "valor_dolar")
      .withColumnRenamed("use_chip", "tipo_transacao")
      .withColumnRenamed("merchant_id", "id_comerciante")
      .withColumnRenamed("merchant_city", "cidade_comerciante")
      .withColumnRenamed("merchant_state", "estado_comerciante")
      .withColumnRenamed("zip_code", "cep")
      .withColumnRenamed("merchant_category_code", "codigo_categoria_comerciante")
      .withColumnRenamed("errors", "erros")
)

In [16]:
df.show()

+-------+-------------------+----------+---------+-----------+------------------+--------------+------------------+------------------+-------+----+--------+
|     id|               data|id_cliente|id_cartao|valor_dolar|    tipo_transacao|id_comerciante|cidade_comerciante|estado_comerciante|    zip| mcc|   erros|
+-------+-------------------+----------+---------+-----------+------------------+--------------+------------------+------------------+-------+----+--------+
|7475327|2010-01-01 00:01:00|      1556|     2972|    $-77.00| Swipe Transaction|         59935|            Beulah|                ND|58523.0|5499|Sem erro|
|7475328|2010-01-01 00:02:00|       561|     4575|     $14.57| Swipe Transaction|         67570|        Bettendorf|                IA|52722.0|5311|Sem erro|
|7475329|2010-01-01 00:02:00|      1129|      102|     $80.00| Swipe Transaction|         27092|             Vista|                CA|92084.0|4829|Sem erro|
|7475331|2010-01-01 00:05:00|       430|     2860|    $200

In [17]:
# Limpar a coluna 'valor em dólar', removendo os símbolos '$' e ',' e convertendo os valores para o tipo double
df = df.withColumn("valor_dolar", regexp_replace(col("valor_dolar"), "[$,]", "").cast("double"))

df.select("valor_dolar").show()

+-----------+
|valor_dolar|
+-----------+
|      -77.0|
|      14.57|
|       80.0|
|      200.0|
|      46.41|
|       4.81|
|       77.0|
|      26.46|
|     261.58|
|      10.74|
|       3.51|
|       2.58|
|      39.63|
|      43.33|
|      49.42|
|       1.09|
|      73.79|
|      100.0|
|      26.04|
|      -64.0|
+-----------+
only showing top 20 rows



In [18]:
# Calcular o valor gasto por cliente e mostrar o resultado
total_gasto_cliente = df.groupBy("id_cliente").agg(
    sum("valor_dolar").alias("total_gasto_cliente")
)

total_gasto_cliente.show()

+----------+-------------------+
|id_cliente|total_gasto_cliente|
+----------+-------------------+
|       148|              75.16|
|      1591|              153.0|
|      1238|             533.67|
|      1645| -3.270000000000005|
|      1959|              29.33|
|      1088|             212.59|
|       496|             172.73|
|      1127|             157.29|
|       858|             134.02|
|      1084| 10.629999999999999|
|       737| 216.09000000000003|
|      1507|             116.55|
|      1896|             207.49|
|       623|             124.37|
|      1618|              146.2|
|      1352|             219.29|
|      1903|             155.96|
|      1699|             271.99|
|       137|             329.58|
|       580|             200.65|
+----------+-------------------+
only showing top 20 rows



In [19]:
total_gasto_cliente.write.csv("/content/total_gasto_cliente.csv")

In [20]:
#Total gasto por categoria

total_gasto_categoria = df.groupBy("tipo_transacao").agg(
    sum("valor_dolar").alias("total_gasto_categoria")
)

# Show the result
total_gasto_categoria.show()

+------------------+---------------------+
|    tipo_transacao|total_gasto_categoria|
+------------------+---------------------+
| Swipe Transaction|   163621.75000000006|
|Online Transaction|   31137.030000000013|
+------------------+---------------------+



In [23]:
total_gasto_categoria.write.csv("/content/total_gasto_categoria.csv")

In [24]:
#Salvar dados do DataFrame no formato parquet
df.write.parquet("/content/transações_dados_tratados.parquet")