In [1]:
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

In [2]:
import getpass

username = getpass.getuser()

spark = (
    SparkSession.builder.appName(f"{username} | CCA-175")
    .master("local")
    .config("spark.sql.debug.maxToStringFields", 1000)
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## Exercise 01 - Get monthly crime count by type

In [37]:
crime_path = "/home/nghiaht7/data-engineer/data-engineering-essentials/data/crime.csv"
crime = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(crime_path)
)

In [38]:
crime.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- CASE_NUMBER: string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- BLOCK: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- PRIMARY_TYPE: string (nullable = true)
 |-- DESCRIPTION: string (nullable = true)
 |-- LOCATION_DESCRIPTION: string (nullable = true)
 |-- ARREST: boolean (nullable = true)
 |-- DOMESTIC: boolean (nullable = true)
 |-- BEAT: integer (nullable = true)
 |-- DISTRICT: integer (nullable = true)
 |-- WARD: integer (nullable = true)
 |-- COMMUNITY_AREA_NUMBER: integer (nullable = true)
 |-- FBICODE: string (nullable = true)
 |-- X_COORDINATE: integer (nullable = true)
 |-- Y_COORDINATE: integer (nullable = true)
 |-- YEAR: integer (nullable = true)
 |-- UPDATEDON: string (nullable = true)
 |-- LATITUDE: double (nullable = true)
 |-- LONGITUDE: double (nullable = true)
 |-- LOCATION: string (nullable = true)



In [39]:
crime.count()

533

In [40]:
crime.toPandas().head(5)

Unnamed: 0,ID,CASE_NUMBER,DATE,BLOCK,IUCR,PRIMARY_TYPE,DESCRIPTION,LOCATION_DESCRIPTION,ARREST,DOMESTIC,...,WARD,COMMUNITY_AREA_NUMBER,FBICODE,X_COORDINATE,Y_COORDINATE,YEAR,UPDATEDON,LATITUDE,LONGITUDE,LOCATION
0,3512276,HK587712,08/28/2004 05:50:56 PM,047XX S KEDZIE AVE,890,THEFT,FROM BUILDING,SMALL RETAIL STORE,False,False,...,14.0,58.0,6,1155838.0,1873050.0,2004,02/10/2018 03:50:01 PM,41.80744,-87.703956,"(41.8074405, -87.703955849)"
1,3406613,HK456306,06/26/2004 12:40:00 PM,009XX N CENTRAL PARK AVE,820,THEFT,$500 AND UNDER,OTHER,False,False,...,27.0,23.0,6,1152206.0,1906127.0,2004,02/28/2018 03:56:25 PM,41.89828,-87.716406,"(41.898279962, -87.716405505)"
2,8002131,HT233595,04/04/2011 05:45:00 AM,043XX S WABASH AVE,820,THEFT,$500 AND UNDER,NURSING HOME/RETIREMENT HOME,False,False,...,3.0,38.0,6,1177436.0,1876313.0,2011,02/10/2018 03:50:01 PM,41.815933,-87.624642,"(41.815933131, -87.624642127)"
3,7903289,HT133522,12/30/2010 04:30:00 PM,083XX S KINGSTON AVE,840,THEFT,FINANCIAL ID THEFT: OVER $300,RESIDENCE,False,False,...,7.0,46.0,6,1194622.0,1850125.0,2010,02/10/2018 03:50:01 PM,41.743665,-87.562463,"(41.743665322, -87.562462756)"
4,10402076,HZ138551,02/02/2016 07:30:00 PM,033XX W 66TH ST,820,THEFT,$500 AND UNDER,ALLEY,False,False,...,15.0,66.0,6,1155240.0,1860661.0,2016,02/10/2018 03:50:01 PM,41.773455,-87.70648,"(41.773455295, -87.706480471)"


In [41]:
import pyspark.sql.types as T
from pyspark.sql.functions import (
    col,
    concat,
    date_format,
    from_unixtime,
    lit,
    lpad,
    month,
    to_date,
    unix_timestamp,
    year,
)

In [42]:
crime = crime.withColumn(
    "new_date",
    from_unixtime(unix_timestamp("DATE", "MM/dd/yyyy hh:mm:ss a"), "yyyy-MM-dd").cast(
        "date"
    ),
).withColumn("final_date", date_format("new_date", "yyyy-MM"))

In [43]:
crime.select("DATE", "new_date", "final_date").printSchema()

root
 |-- DATE: string (nullable = true)
 |-- new_date: date (nullable = true)
 |-- final_date: string (nullable = true)



In [44]:
crime.select("ID", "PRIMARY_TYPE", "final_date").show(2)

+-------+------------+----------+
|     ID|PRIMARY_TYPE|final_date|
+-------+------------+----------+
|3512276|       THEFT|   2004-08|
|3406613|       THEFT|   2004-06|
+-------+------------+----------+
only showing top 2 rows



In [45]:
crime.groupby("final_date", "PRIMARY_TYPE").agg(
    F.count("*").alias("Number of case")
).orderBy("final_date", col("Number of case").desc()).show(15)

+----------+-------------------+--------------+
|final_date|       PRIMARY_TYPE|Number of case|
+----------+-------------------+--------------+
|   2001-01|              THEFT|             1|
|   2001-01|  DOMESTIC VIOLENCE|             1|
|   2001-01|       PROSTITUTION|             1|
|   2001-01|            BATTERY|             1|
|   2001-02|            BATTERY|             1|
|   2001-03|    CRIMINAL DAMAGE|             1|
|   2001-03|            BATTERY|             1|
|   2001-03|       PROSTITUTION|             1|
|   2001-03| DECEPTIVE PRACTICE|             1|
|   2001-03|CRIM SEXUAL ASSAULT|             1|
|   2001-03|          NARCOTICS|             1|
|   2001-04|            BATTERY|             1|
|   2001-04|              THEFT|             1|
|   2001-05|              THEFT|             1|
|   2001-05|       PROSTITUTION|             1|
+----------+-------------------+--------------+
only showing top 15 rows



In [46]:
crime.groupby("final_date", "PRIMARY_TYPE").agg(
    F.count("*").alias("Number of case")
).orderBy(["final_date", "Number of case"], descending=[0, 1]).show(5)

+----------+-----------------+--------------+
|final_date|     PRIMARY_TYPE|Number of case|
+----------+-----------------+--------------+
|   2001-01|DOMESTIC VIOLENCE|             1|
|   2001-01|     PROSTITUTION|             1|
|   2001-01|            THEFT|             1|
|   2001-01|          BATTERY|             1|
|   2001-02|          BATTERY|             1|
+----------+-----------------+--------------+
only showing top 5 rows



## Exercise 03 - Get top 3 crime types based on number of incidents in RESIDENCE area

In [52]:
from pyspark.sql.functions import count

crime.filter(col("LOCATION_DESCRIPTION") == "RESIDENCE").groupBy("PRIMARY_TYPE").agg(count(
    "*").alias("number crime per type")
).orderBy(col("number crime per type").desc()).show(5)

+---------------+---------------------+
|   PRIMARY_TYPE|number crime per type|
+---------------+---------------------+
|        BATTERY|                   19|
|CRIMINAL DAMAGE|                   16|
|  OTHER OFFENSE|                   10|
|       BURGLARY|                   10|
|          THEFT|                    6|
+---------------+---------------------+
only showing top 5 rows



## Exercise 02 - Get details of inactive customers

In [4]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

orders_path = (
    "/home/nghiaht7/data-engineer/data-engineering-essentials/data/retail_db/orders"
)
orders_schema_str = """
            order_id INT, 
            order_date STRING, 
            order_customer_id INT, 
            order_status STRING
            """

orders_schema = StructType(
    [
        StructField("order_id", IntegerType(), False),
        StructField("order_date", StringType(), False),
        StructField("order_customer_id", IntegerType(), False),
        StructField("order_status", StringType(), False),
    ]
)

customers_path = (
    "/home/nghiaht7/data-engineer/data-engineering-essentials/data/retail_db/customers"
)

customers_schema_str = """
            customer_id INT ,
            customer_fname STRING,
            customer_lname STRING,
            customer_email STRING,
            customer_password STRING,
            customer_street STRING,
            customer_city STRING,
            customer_state STRING,
            customer_zipcode STRING
            s"""

customers_schema = StructType(
    [
        StructField("customer_id", IntegerType(), False),
        StructField("customer_fname", StringType(), False),
        StructField("customer_lname", StringType(), False),
        StructField("customer_email", StringType(), False),
        StructField("customer_password", StringType(), False),
        StructField("customer_street", StringType(), False),
        StructField("customer_city", StringType(), False),
        StructField("customer_zipcode", StringType(), False),
    ]
)

In [5]:
orders = (
    spark.read.format("csv")
    .option("header", "false")
    .schema(orders_schema)
    .load(orders_path)
)
orders.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [6]:
orders.show(10)

[Stage 0:>                                                          (0 + 1) / 1]

+--------+--------------------+-----------------+---------------+
|order_id|          order_date|order_customer_id|   order_status|
+--------+--------------------+-----------------+---------------+
|       1|2013-07-25 00:00:...|            11599|         CLOSED|
|       2|2013-07-25 00:00:...|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|            12111|       COMPLETE|
|       4|2013-07-25 00:00:...|             8827|         CLOSED|
|       5|2013-07-25 00:00:...|            11318|       COMPLETE|
|       6|2013-07-25 00:00:...|             7130|       COMPLETE|
|       7|2013-07-25 00:00:...|             4530|       COMPLETE|
|       8|2013-07-25 00:00:...|             2911|     PROCESSING|
|       9|2013-07-25 00:00:...|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|             5648|PENDING_PAYMENT|
+--------+--------------------+-----------------+---------------+
only showing top 10 rows



                                                                                

In [7]:
customers = (
    spark.read.format("csv")
    .option("header", "false")
    .schema(customers_schema)
    .load(customers_path)
)

customers.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- customer_fname: string (nullable = true)
 |-- customer_lname: string (nullable = true)
 |-- customer_email: string (nullable = true)
 |-- customer_password: string (nullable = true)
 |-- customer_street: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_zipcode: string (nullable = true)



In [8]:
customers.toPandas().head(10)

Unnamed: 0,customer_id,customer_fname,customer_lname,customer_email,customer_password,customer_street,customer_city,customer_zipcode
0,1,Richard,Hernandez,XXXXXXXXX,XXXXXXXXX,6303 Heather Plaza,Brownsville,TX
1,2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,9526 Noble Embers Ridge,Littleton,CO
2,3,Ann,Smith,XXXXXXXXX,XXXXXXXXX,3422 Blue Pioneer Bend,Caguas,PR
3,4,Mary,Jones,XXXXXXXXX,XXXXXXXXX,8324 Little Common,San Marcos,CA
4,5,Robert,Hudson,XXXXXXXXX,XXXXXXXXX,10 Crystal River Mall,Caguas,PR
5,6,Mary,Smith,XXXXXXXXX,XXXXXXXXX,3151 Sleepy Quail Promenade,Passaic,NJ
6,7,Melissa,Wilcox,XXXXXXXXX,XXXXXXXXX,9453 High Concession,Caguas,PR
7,8,Megan,Smith,XXXXXXXXX,XXXXXXXXX,3047 Foggy Forest Plaza,Lawrence,MA
8,9,Mary,Perez,XXXXXXXXX,XXXXXXXXX,3616 Quaking Street,Caguas,PR
9,10,Melissa,Smith,XXXXXXXXX,XXXXXXXXX,8598 Harvest Beacon Plaza,Stafford,VA


In [30]:
result = customers.join(
    orders, customers.customer_id == orders.order_customer_id, "left"
)

In [31]:
# when start to .show()
result.toPandas().head()

Unnamed: 0,customer_id,customer_fname,customer_lname,customer_email,customer_password,customer_street,customer_city,customer_zipcode,order_id,order_date,order_customer_id,order_status
0,1,Richard,Hernandez,XXXXXXXXX,XXXXXXXXX,6303 Heather Plaza,Brownsville,TX,22945.0,2013-12-13 00:00:00.0,1.0,COMPLETE
1,2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,9526 Noble Embers Ridge,Littleton,CO,67863.0,2013-11-30 00:00:00.0,2.0,COMPLETE
2,2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,9526 Noble Embers Ridge,Littleton,CO,57963.0,2013-08-02 00:00:00.0,2.0,ON_HOLD
3,2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,9526 Noble Embers Ridge,Littleton,CO,33865.0,2014-02-18 00:00:00.0,2.0,COMPLETE
4,2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,9526 Noble Embers Ridge,Littleton,CO,15192.0,2013-10-29 00:00:00.0,2.0,PENDING_PAYMENT


In [34]:
result.filter(col("order_id").isNull()).select(
    "customer_lname", "customer_fname", "order_id"
).orderBy("customer_lname", "customer_fname").show(500)

+--------------+--------------+--------+
|customer_lname|customer_fname|order_id|
+--------------+--------------+--------+
|        Bolton|          Mary|    null|
|       Ellison|        Albert|    null|
|         Green|       Carolyn|    null|
|        Greene|          Mary|    null|
|       Harrell|          Mary|    null|
|         Lewis|          Mary|    null|
|       Mueller|          Mary|    null|
|         Patel|       Matthew|    null|
|          Shaw|          Mary|    null|
|         Smith|        Amanda|    null|
|         Smith|        Ashley|    null|
|         Smith|          Carl|    null|
|         Smith|          Emma|    null|
|         Smith|         Grace|    null|
|         Smith|         James|    null|
|         Smith|          Joan|    null|
|         Smith|       Kenneth|    null|
|         Smith|         Kevin|    null|
|         Smith|          Mary|    null|
|         Smith|          Mary|    null|
|         Smith|          Mary|    null|
|         Smith|

In [35]:
result.filter(col("order_id").isNull()).select(
    "customer_lname", "customer_fname", "order_id"
).orderBy("customer_lname", "customer_fname").count()

30