In [31]:
file_path = '/home/jovyan/work/data/car_prices-v2.csv'

In [39]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when

spark = SparkSession.builder \
    .appName("projeto-pratico-1") \
    .getOrCreate()

spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [56]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
schema = StructType([
    StructField("year", IntegerType(), True),
    StructField("make", StringType(), True),
    StructField("model", StringType(), True),
    StructField("trim", StringType(), True),
    StructField("body", StringType(), True),
    StructField("transmission", StringType(), True),
    StructField("vin", StringType(), True),
    StructField("state", StringType(), True),
    StructField("condition", IntegerType(), True),
    StructField("odometer", IntegerType(), True),
    StructField("color", StringType(), True),
    StructField("interior", StringType(), True),
    StructField("seller", StringType(), True),
    StructField("mmr", IntegerType(), True),
    StructField("sellingprice", IntegerType(), True),
    StructField("salemonth", IntegerType(), True),
    StructField("saleyear", IntegerType(), True) 
])
df = spark.read.option("header", "true").csv(file_path, schema=schema)

## Quantos veículos foram vendidos e a receita total no ano de 2014?

In [57]:
df.printSchema()

root
 |-- year: integer (nullable = true)
 |-- make: string (nullable = true)
 |-- model: string (nullable = true)
 |-- trim: string (nullable = true)
 |-- body: string (nullable = true)
 |-- transmission: string (nullable = true)
 |-- vin: string (nullable = true)
 |-- state: string (nullable = true)
 |-- condition: integer (nullable = true)
 |-- odometer: integer (nullable = true)
 |-- color: string (nullable = true)
 |-- interior: string (nullable = true)
 |-- seller: string (nullable = true)
 |-- mmr: integer (nullable = true)
 |-- sellingprice: integer (nullable = true)
 |-- salemonth: integer (nullable = true)
 |-- saleyear: integer (nullable = true)



In [66]:
from pyspark.sql.functions import sum, count

df.filter(df.saleyear == 2014).groupBy("saleyear").agg(sum("sellingprice").alias("sales_amount"), count("vin").alias("sales_total"))

saleyear,sales_amount,sales_total
2014,236490972,21476


## Quais são os 5 carros mais vendidos até então? e os 5 menos vendidos?

In [87]:
from pyspark.sql.functions import desc
car_sales_ordered = df.groupBy("model").count().orderBy(desc("count"))
car_sales_ordered.limit(5)

model,count
F-150,642
3 Series,574
Accord,403
Altima,397
Escape,379


In [83]:
top_5_cars = car_sales_ordered.tail(5)
top_5_cars

[Row(model='Cooper Roadster', count=1),
 Row(model='SC 300', count=1),
 Row(model='Millenia', count=1),
 Row(model='Tahoe Limited/Z71', count=1),
 Row(model='NV200', count=1)]

## Qual o veículo com maior quilometragem marcada no odômetro?

In [91]:
from pyspark.sql.functions import max
df_cars_max_odometer = df.agg(max("odometer").alias("max_odometer"))
df_cars_max_odometer

max_odometer
999999


## Quais os 5 estados com o menor número de vendas?

In [96]:
from pyspark.sql.functions import col

df_cars_max_state_sales = df.groupBy("state").count()
df_state_min = df_cars_max_state_sales.orderBy(col("count")).limit(5)
df_state_min

state,count
nm,1
ms,1
qc,12
or,13
ok,16
