In [78]:
import os
import json
import sys
from urllib.parse import urljoin, urlsplit
from pathlib import Path
from zipfile import ZipFile
from typing import Dict, Optional, Iterable, Any

import requests
from pyspark.sql.dataframe import DataFrame
from pyspark.files import SparkFiles
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

## Creating a new session

In [79]:
spark = SparkSession.builder \
        .appName("SparkMagic") \
        .enableHiveSupport() \
        .getOrCreate()
sc = spark.sparkContext

## Using local files

In [80]:
samples = Path("./samples/landing/2019/Hearing_files")

## Reading files

In [81]:
parquet_files = [
    p for p in samples.rglob("*arb*real*")
]
frames = {
    f.stem: spark.read.parquet(f.as_posix())
    for f in parquet_files
}
print(*frames)

arb_hearings_real arb_protest_real


In [82]:
frames["arb_protest_real"].show()

+-------------+------------+------------+
|         acct|protested_by|protested_dt|
+-------------+------------+------------+
|1143210310026|        5082|  05/11/2019|
|1183480020014|         463|  04/23/2019|
|1146730000146|       Owner|  03/29/2019|
|0611380690023|       Owner|  03/23/2019|
|1058680000020|        7092|  05/15/2019|
|1397150020013|        3088|  05/15/2019|
|0511010030018|        4634|  05/14/2019|
|1141250040032|       46327|  05/10/2019|
|0720270040037|         463|  05/10/2019|
|1158470000003|       Owner|  04/04/2019|
|1188950020019|       Owner|  06/22/2019|
|1360540020018|           6|  05/15/2019|
|1374240010020|       Owner|  05/18/2019|
|0660460390024|        5094|  05/15/2019|
|1298500010008|        3030|  05/14/2019|
|0750860580001|         650|  05/09/2019|
|1035090000013|         463|  04/23/2019|
|1332450000004|       Owner|  03/28/2019|
|1304150010006|       Owner|  03/21/2019|
|0861370000007|       Owner|  05/13/2019|
+-------------+------------+------

In [83]:
protests_real_df = frames['arb_protest_real']

In [84]:
protests_real_df.select("protested_dt").show(5)

+------------+
|protested_dt|
+------------+
|  05/11/2019|
|  04/23/2019|
|  03/29/2019|
|  03/23/2019|
|  05/15/2019|
+------------+
only showing top 5 rows



In [85]:
protests_real_df.select("*").where(col("acct") == "1052860000018").show(5)

+-------------+------------+------------+
|         acct|protested_by|protested_dt|
+-------------+------------+------------+
|1052860000018|       Owner|  05/21/2019|
+-------------+------------+------------+



In [86]:
hearings_real_df = frames["arb_hearings_real"]

In [87]:
hearings_real_df.select(
    col("Initial_Appraised_Value"), 
    col("Final_Appraised_Value"), 
    col("Initial_Appraised_Value") - col("Final_Appraised_Value")
).where(
    "acct=1052860000018"
).show()

+-----------------------+---------------------+-------------------------------------------------+
|Initial_Appraised_Value|Final_Appraised_Value|(Initial_Appraised_Value - Final_Appraised_Value)|
+-----------------------+---------------------+-------------------------------------------------+
|                 260150|               240000|                                          20150.0|
+-----------------------+---------------------+-------------------------------------------------+



In [88]:
def read_record(id: str, df: DataFrame) -> DataFrame:
    return (df.select("*").where(df.acct==id))

In [44]:
read_record("1052860000018", df=frames["arb_hearings_real"])

DataFrame[acct: string, Tax_Year: string, Real_Personal_Property: string, Hearing_Type: string, State_Class_Code: string, Owner_Name: string, Scheduled_for_Date: string, Actual_Hearing_Date: string, Release_Date: string, Letter_Type: string, Agent_Code: string, Initial_Appraised_Value: string, Initial_Market_Value: string, Final_Appraised_Value: string, Final_Market_Value: string]

In [50]:
joined_df = protests_real_df.join(hearings_real_df, on="acct")

In [54]:
joined_df.count()

418230

In [56]:
protests_real_df.count()

424219

In [57]:
hearings_real_df.count()

398178

In [58]:
424219-398178

26041

In [60]:
joined_df = hearings_real_df.join(protests_real_df, on="acct")

In [72]:
joined_df.count()

418230

In [75]:
owners_df = spark.read.parquet("samples/landing/2019/Real_acct_owner/owners.parquet/")

In [76]:
owners_df

DataFrame[acct: string, ln_num: string, name: string, aka: string, pct_own: string]

In [90]:
joined_df = joined_df.join(owners_df, on="acct")

In [91]:
joined_df.show()

+-------------+--------+----------------------+------------+----------------+--------------------+------------------+-------------------+------------+-----------+----------+-----------------------+--------------------+---------------------+------------------+------------+------------+------+--------------------+--------------------+-------+
|         acct|Tax_Year|Real_Personal_Property|Hearing_Type|State_Class_Code|          Owner_Name|Scheduled_for_Date|Actual_Hearing_Date|Release_Date|Letter_Type|Agent_Code|Initial_Appraised_Value|Initial_Market_Value|Final_Appraised_Value|Final_Market_Value|protested_by|protested_dt|ln_num|                name|                 aka|pct_own|
+-------------+--------+----------------------+------------+----------------+--------------------+------------------+-------------------+------------+-----------+----------+-----------------------+--------------------+---------------------+------------------+------------+------------+------+--------------------+-

In [94]:
joined_df.select("acct", "name").where("acct=1052860000018").show?

Object `show` not found.


In [97]:
joined_df.show()

+-------------+--------+----------------------+------------+----------------+--------------------+------------------+-------------------+------------+-----------+----------+-----------------------+--------------------+---------------------+------------------+------------+------------+------+--------------------+--------------------+-------+
|         acct|Tax_Year|Real_Personal_Property|Hearing_Type|State_Class_Code|          Owner_Name|Scheduled_for_Date|Actual_Hearing_Date|Release_Date|Letter_Type|Agent_Code|Initial_Appraised_Value|Initial_Market_Value|Final_Appraised_Value|Final_Market_Value|protested_by|protested_dt|ln_num|                name|                 aka|pct_own|
+-------------+--------+----------------------+------------+----------------+--------------------+------------------+-------------------+------------+-----------+----------+-----------------------+--------------------+---------------------+------------------+------------+------------+------+--------------------+-