# Entregable e2e

## Prerrequisites

Install Java and Spark in VM

In [1]:
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark 3.5.1
!wget -q https://apache.osuosl.org/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz

In [2]:
# unzip it
!tar xf spark-3.5.1-bin-hadoop3.tgz

In [3]:
!pip install -q findspark

In [4]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.1-bin-hadoop3"
os.environ["PYSPARK_SUBMIT_ARGS"] = "--master local[*] pyspark-shell"

Start Spark Session

---

In [5]:
import findspark
findspark.init("spark-3.5.1-bin-hadoop3")# SPARK_HOME

from pyspark.sql import SparkSession

# create the session
spark = SparkSession \
        .builder \
        .appName("Joins") \
        .master("local[*]") \
        .config("spark.ui.port", "4500") \
        .getOrCreate()

spark.version

'3.5.1'

In [6]:
spark

In [7]:
# Import sql functions
from pyspark.sql.functions import *

Load the datasets

In [15]:
carsDF = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .csv("/content/car_data.csv")

carsDF.printSchema()

root
 |-- Car_id: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Customer Name: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Annual Income: integer (nullable = true)
 |-- Dealer_Name: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Engine: string (nullable = true)
 |-- Transmission: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- Price ($): integer (nullable = true)
 |-- Dealer_No : string (nullable = true)
 |-- Body Style: string (nullable = true)
 |-- Phone: integer (nullable = true)
 |-- Dealer_Region: string (nullable = true)



In [16]:
carsDF.show(2)

+------------+--------+-------------+------+-------------+--------------------+-------+----------+--------------------+------------+-----+---------+----------+----------+-------+-------------+
|      Car_id|    Date|Customer Name|Gender|Annual Income|         Dealer_Name|Company|     Model|              Engine|Transmission|Color|Price ($)|Dealer_No |Body Style|  Phone|Dealer_Region|
+------------+--------+-------------+------+-------------+--------------------+-------+----------+--------------------+------------+-----+---------+----------+----------+-------+-------------+
|C_CND_000001|1/2/2022|    Geraldine|  Male|        13500|Buddy Storbeck's ...|   Ford|Expedition|DoubleÂ Overhead ...|        Auto|Black|    26000|06457-3834|       SUV|8264678|   Middletown|
|C_CND_000002|1/2/2022|          Gia|  Male|      1480000|    C & M Motors Inc|  Dodge|   Durango|DoubleÂ Overhead ...|        Auto|Black|    19000|60504-7114|       SUV|6848189|       Aurora|
+------------+--------+------------

## Llamadas

In [17]:
#Ventas por género

carsDF.groupBy("Gender").count().show()

+------+-----+
|Gender|count|
+------+-----+
|Female| 5108|
|  Male|18798|
+------+-----+



In [18]:
#Renta media anual por cliente

carsDF.groupBy("Annual Income").avg().show()

+-------------+------------------+------------------+-----------------+
|Annual Income|avg(Annual Income)|    avg(Price ($))|       avg(Phone)|
+-------------+------------------+------------------+-----------------+
|      1078000|         1078000.0|           31800.0|        7125989.0|
|       499000|          499000.0|          34000.25|        7781931.5|
|      1263500|         1263500.0|           36350.0|        7945770.0|
|       702000|          702000.0|          24250.25|7953040.416666667|
|       586500|          586500.0|           26500.0|        7877487.0|
|       745500|          745500.0|30833.333333333332|6999182.333333333|
|       419000|          419000.0|           24000.0|        7936461.5|
|      1662500|         1662500.0|           28250.0|        8363704.0|
|       488000|          488000.0|21167.166666666668|        7012831.5|
|      1265500|         1265500.0|           23551.0|        7761314.0|
|       441000|          441000.0|           33600.6|        688

In [20]:
#Top 10 modelos más vendidos

carsDF.groupBy("Model").count().orderBy("count", ascending=False).show(10)

+----------+-----+
|     Model|count|
+----------+-----+
|  Diamante|  418|
|Silhouette|  411|
|     Prizm|  411|
|    Passat|  391|
|Ram Pickup|  383|
|     Jetta|  382|
|        RL|  372|
|     LS400|  354|
|       LHS|  330|
|        A6|  329|
+----------+-----+
only showing top 10 rows



In [21]:
#Precio medio por fabricante

carsDF.groupBy("Company").avg("Price ($)").orderBy("avg(Price ($))", ascending=False).show()

+----------+------------------+
|   Company|    avg(Price ($))|
+----------+------------------+
|  Cadillac|40972.093558282206|
|      Saab|  36516.3380952381|
|     Lexus| 34024.56733167082|
|     Buick| 33634.36218678815|
|Oldsmobile|  31894.2502250225|
|   Lincoln|31407.036585365855|
|    Saturn|31092.609215017066|
|    Toyota| 29513.12072072072|
|  Plymouth|29404.980551053486|
|   Pontiac| 29358.30025125628|
|  Infiniti|29318.153846153848|
|      Ford| 29263.68215613383|
|   Mercury| 28535.16361556064|
|     Honda|28082.959039548023|
|    Subaru| 27931.34074074074|
|     Volvo|27788.593155893537|
|    Nissan|27047.511286681714|
|Mercedes-B| 26944.84280155642|
|Mitsubishi| 26673.81832419734|
|     Dodge|26406.341113105926|
+----------+------------------+
only showing top 20 rows



In [22]:
#Ventas por región

carsDF.groupBy("Dealer_Region").count().orderBy("count", ascending=False).show()

+-------------+-----+
|Dealer_Region|count|
+-------------+-----+
|       Austin| 4135|
|   Janesville| 3821|
|   Scottsdale| 3433|
|        Pasco| 3131|
|       Aurora| 3130|
|   Middletown| 3128|
|   Greenville| 3128|
+-------------+-----+



In [33]:
# Ventas mensuales

spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

from pyspark.sql.functions import to_date, month

df_with_month = carsDF.withColumn("Sale_Month", month(to_date("Date", "M/d/yyyy")))
df_with_month.groupBy("Sale_Month").count().orderBy("Sale_Month").show()


+----------+-----+
|Sale_Month|count|
+----------+-----+
|         1|  790|
|         2|  735|
|         3| 1535|
|         4| 1655|
|         5| 1895|
|         6| 1715|
|         7| 1725|
|         8| 1705|
|         9| 3305|
|        10| 1830|
|        11| 3470|
|        12| 3546|
+----------+-----+

