# Data Engineering Notebook
> Data engineering consists of ***collecting, cleaning, transforming, and processing data, and automating and monitoring those tasks***
* Collection 
* Cleaning
* Transformation
* Processing
* Automating

## Setup

In [None]:
# Load IPython's autoreload extension so edits to imported modules are picked up without restarting the kernel
%load_ext autoreload
# Mode 2 is understood to reload all imported modules before each cell runs (see the IPython autoreload docs)
%autoreload 2

In [None]:
# import packages and modules
import os,arcpy
from arcgis.features import FeatureSet, GeoAccessor, GeoSeriesAccessor
import pandas as pd
from functools import reduce
import pyodbc

# pandas display preferences for notebook output
pd.options.display.width = 1000
pd.options.display.max_rows = 200
pd.options.display.max_columns = 1000
# floats render with thousands separators and two decimals
pd.set_option('display.float_format', '{:,.2f}'.format)

# let arcpy tools overwrite existing outputs
arcpy.env.overwriteOutput = True

# prefix for in-memory outputs
wk_memory = "memory\\"

# working folders and the default arcpy workspace
scratchFolder = r"C:\GIS"
workspace     = "//Trpa-fs01/GIS/PROJECTS/ResearchAnalysis/MailingLists/IPES"
desktop       = r"C:\Users\mbindl\Desktop"
arcpy.env.workspace = r"C:\GIS\Scratch.gdb"

## SDE connection files as saved on the network drive (kept for reference, not used)
# sdeTabular = "F:\\GIS\\GIS_DATA\\Tabular.sde"
# sdeBase    = "F:\\GIS\\GIS_DATA\\Vector.sde"
# sdeCollect = "F:\\GIS\\GIS_DATA\\Collect.sde"

# folder holding the .sde connection files actually used (a local C: path here)
filePath = r"C:\GIS\DB_CONNECT"

# full paths to the enterprise geodatabase connection files
sdeBase, sdeCollect = (
    os.path.join(filePath, sdeName) for sdeName in ("Vector.sde", "Collection.sde")
)

In [6]:
import pandas as pd
import os
from utils import *

# helpers imported from utils: read_file, read_excel, get_fs_data, get_fs_data_query, get_fs_data_spatial, get_fs_data_spatial_query, import_lookup_dictionary, update_field_from_dictionary

***Get Reference Data***
* https://www.laketahoeinfo.org/WebServices/List
* https://maps.trpa.org/server/rest/services/

In [None]:
## LT Info Data
# Every LT Info web service URL ends in /JSON/<token>; the token is the same for all four.
# NOTE(review): this token is hard-coded in the notebook — consider loading it from configuration.
_ltinfoToken = "e17aeb86-85e3-4260-83fd-a2b32501c476"
_ltinfoServices = (
    # Deed Restrictions (this endpoint is addressed without the "www." prefix)
    "https://laketahoeinfo.org/WebServices/GetDeedRestrictedParcels",
    # IPES scores
    "https://www.laketahoeinfo.org/WebServices/GetParcelIPESScores",
    # Development Rights Transacted and Banked
    "https://www.laketahoeinfo.org/WebServices/GetTransactedAndBankedDevelopmentRights",
    # All Parcels
    "https://www.laketahoeinfo.org/WebServices/GetAllParcels",
)
# Read each service into its own DataFrame, in the order listed above
dfDeed, dfIPES, dfDevRights, dfLTParcel = (
    pd.read_json(f"{serviceUrl}/JSON/{_ltinfoToken}") for serviceUrl in _ltinfoServices
)


In [None]:
## TRPA Data
# Each layer is fetched with get_fs_data_spatial, in the order listed below
# (per the variable naming, the results are spatially enabled dataframes).
(sdfParcel, sdfBoundary, sdfPlanArea, sdfDistrict, sdfTownCenter) = map(
    get_fs_data_spatial,
    (
        # Parcel Master
        "https://maps.trpa.org/server/rest/services/Parcels/MapServer/0",
        # TRPA Boundary
        "https://maps.trpa.org/server/rest/services/Boundaries/MapServer/4",
        # Plan Area Boundary
        "https://maps.trpa.org/server/rest/services/Boundaries/MapServer/0",
        # District Boundary
        "https://maps.trpa.org/server/rest/services/Zoning/MapServer/0",
        # Town Center Boundary
        "https://maps.trpa.org/server/rest/services/Boundaries/MapServer/1",
    ),
)

## Permit Data Engineering

#### TRPA Permit Data

***Get Data***

In [None]:
## TRPA permit data is exported from Accela nightly,
## then stored in the Collection.sde enterprise geodatabase and published to the TRPA server as the web service below
# web service URL
permitTable = "https://maps.trpa.org/server/rest/services/Permit_Records/MapServer/1"
# get permit data through the utils helper (presumably a pandas DataFrame — confirm in utils.get_fs_data)
dfTRPAPermit = get_fs_data(permitTable)

In [None]:
## TRPA Permit Data Engineering
# print a summary of the permit frame (assuming a pandas DataFrame, info() lists columns, dtypes and non-null counts)
dfTRPAPermit.info()
# last expression in the cell, so the notebook renders the first rows
dfTRPAPermit.head()

***Transformation***

In [None]:
# Each coded permit field has its lookup in a separate layer of the
# Permit_Records map service; the result goes into "<field>Desc".
permitLookupLayers = {"PermitType": 2, "PermitStatus": 3, "PermitCategory": 4}
permitLookups = {}
for permitField, layerId in permitLookupLayers.items():
    # fetch the lookup dictionary for this field
    permitLookups[permitField] = import_lookup_dictionary(
        f"https://maps.trpa.org/server/rest/services/Permit_Records/MapServer/{layerId}"
    )
    # apply it to the permit frame (exact behaviour is defined in utils.update_field_from_dictionary)
    dfTRPAPermit = update_field_from_dictionary(
        dfTRPAPermit, permitField, permitLookups[permitField], f"{permitField}Desc"
    )

# keep the individual dictionaries available under their original names
permitTypeDict = permitLookups["PermitType"]
permitStatusDict = permitLookups["PermitStatus"]
permitCategoryDict = permitLookups["PermitCategory"]


***Processing***

In [None]:
# Count permits by permit type (rows) and permit status (columns);
# combinations with no permits are filled with 0.
dfPermitType = (
    dfTRPAPermit
    .pivot_table(
        index='PermitTypeDesc',
        columns='PermitStatusDesc',
        values='OBJECTID',
        aggfunc='count',
        fill_value=0,
    )
    .reset_index()
    # drop the leftover "PermitStatusDesc" label on the column axis
    .rename_axis(None, axis=1)
    .rename(columns={"PermitTypeDesc": "Permit Type"})
)
# Row total across the status-count columns only. numeric_only=True keeps the
# text "Permit Type" column out of the sum; with the pandas 2.x default
# (numeric_only=False) the row-wise sum tries to add strings to integers and fails.
dfPermitType["Total"] = dfPermitType.sum(axis=1, numeric_only=True)
# Largest totals first; ties are ordered alphabetically by permit type so the
# table is reproducible from run to run.
dfPermitType = dfPermitType.sort_values(["Total", "Permit Type"], ascending=[False, True])
dfPermitType


#### City of South Lake Tahoe Permit Data

***Get Data***

In [None]:
## City of South Lake Tahoe Permit data was sent over by Ryan Malhoski on 4/9/2021
## NOTE(review): the file name suffix 040924 suggests April 2024 — confirm the date above.
# Raw string: "\P" is not a valid escape sequence, so the plain literal only worked
# by accident and triggers an invalid-escape warning on current Python versions.
dfCSLTPermit = read_file(r"data\PermitData_CSLT_040924.csv")

In [None]:
## CSLT Permit Data Engineering
# list the distinct values in the Status column (displayed as the cell output)
dfCSLTPermit.Status.unique()

***Transformation***

***Processing***

#### El Dorado County Permit Data

***Get Data***

In [None]:
# El Dorado Permit data was exported by Ken Kasman on 4/1/2021 from their Trakit database
# NOTE(review): the file name suffix 040124 suggests April 2024 — confirm the date above.
# Raw string: "\P" is not a valid escape sequence, so the plain literal only worked
# by accident and triggers an invalid-escape warning on current Python versions.
dfElDoPermit = read_file(r"data\PermitData_ElDorado_040124.csv")

***Transformation***

In [None]:
# read the reporting-category lookup table from the resources folder
lookupTable = read_file("resources/lookup_reporting_category.csv")
# list the distinct reporting categories (displayed as the cell output)
lookupTable["Reporting Category"].unique()


***Processing***

#### Placer County Permit Data

***Get Data***

In [None]:
## Placer permit data comes in monthly via email and is saved to the folder below.
## This cell merges every Excel workbook in that folder into a single DataFrame
## and exports the result to a dated CSV in the data folder.

# folder with the monthly workbooks
folder_path = r"F:\Research and Analysis\Local Jurisdiction MOU data collection\Placer MOU Files\Placer"
# extensions pd.read_excel can parse; anything else in the folder is skipped
excel_suffixes = (".xls", ".xlsx", ".xlsm", ".xlsb", ".odf", ".ods", ".odt")
# list to hold one DataFrame per workbook
dfs = []

# sorted() makes the concatenation order reproducible (os.listdir order is arbitrary)
for file_name in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, file_name)
    # skip Office lock files ("~$...") and anything that is not an Excel workbook;
    # pd.read_excel would raise on them and abort the whole merge
    if file_name.startswith("~$") or not file_name.lower().endswith(excel_suffixes):
        continue
    # skip sub-folders that happen to carry an Excel-like name
    if not os.path.isfile(file_path):
        continue
    df = pd.read_excel(file_path)
    dfs.append(df)

# pd.concat raises ValueError on an empty list; raise the same type with a clearer message
if not dfs:
    raise ValueError(f"No Excel files found in {folder_path}")

# concatenate all workbooks into a single DataFrame with a fresh index
final_df = pd.concat(dfs, ignore_index=True)
# today's date as MMDDYY, used as the file name suffix
today = pd.Timestamp.today().strftime("%m%d%y")
# raw f-string: "\P" is not a valid escape sequence in a plain literal
final_df.to_csv(rf"data\PermitData_Placer_{today}.csv", index=False)

In [None]:
## Placer permit data: the merged export of the monthly Placer files, dated 040924 (MMDDYY).
# Raw string: "\P" is not a valid escape sequence, so the plain literal only worked
# by accident and triggers an invalid-escape warning on current Python versions.
dfPlacerPermit = read_file(r"data\PermitData_Placer_040924.csv")

***Transformation***

In [None]:
# read the Placer reporting-category lookup table from the resources folder
lookupTable = read_file("resources/PL_lookup_reporting_category.csv")
# list the distinct reporting categories (displayed as the cell output)
lookupTable["Reporting Category"].unique()


***Processing***

#### Merge

#### Load

## Cumulative Accounting Data Engineering

***Get Data***

In [None]:
## Development units from the Existing_Development service, one query per year
## (2012, then 2018 through 2022)
devhistoryURL = "https://maps.trpa.org/server/rest/services/Existing_Development/MapServer/2"
(
    parcelUnits12,
    parcelUnits18,
    parcelUnits19,
    parcelUnits20,
    parcelUnits21,
    parcelUnits22,
) = (
    # the second argument is the filter passed to the helper, e.g. "Year = 2012"
    get_fs_data_spatial_query(devhistoryURL, f"Year = {unitYear}")
    for unitYear in (2012, 2018, 2019, 2020, 2021, 2022)
)


In [None]:
# total of the Residential_Units column for the 2012 query result (displayed as the cell output)
parcelUnits12.Residential_Units.sum()

***Transformation***

#### Deed Restrictions
> We need to get Ken's housing deed-restricted unit research merged with the LTinfo housing deed restrictions and the unit data from 2022

***Get Data***

In [None]:
# Read the first sheet of the housing deed restriction workbook.
# Raw string: "\H" is not a valid escape sequence, so the plain literal only worked
# by accident and triggers an invalid-escape warning on current Python versions.
# NOTE(review): "Restrcitions" is presumably the spelling of the file on disk — confirm before renaming.
dfDeedUnits = read_excel(r"data\Housing_Deed_Restrcitions.xlsx", sheet=0)


#### ADU Tracking
> I’ve been working on tracking ADU permits from TRPA and other jurisdictions where I’ve located them. This is a compilation of other information, but over time I’d like to establish a system of record for this information (LT Info). This is similar to the Residential Bonus Unit data and there’s crossover on some of these, where a bonus unit was used to create an ADU, but you can have an ADU without requiring a bonus unit, and you can use a bonus unit without it being an ADU… 

***Get Data***

In [None]:
# Read the first sheet of the ADU tracking workbook.
# Raw string: "\A" is not a valid escape sequence, so the plain literal only worked
# by accident and triggers an invalid-escape warning on current Python versions.
dfADU = read_excel(r"data\ADU Tracking.xlsx", sheet=0)

#### Allocations
> This file includes all of the allocations that have been tracked in LT Info, and adds whether the subject parcel has been issued a BMP/SCC certificate and/or whether Air Quality/Mobility Mitigation fees (for added VMT) or Water Quality Mitigation fees (for added coverage) have been paid.

In [None]:
# Read the allocation tracking workbook; the second argument (0) is passed positionally as in the original call.
# Raw string: "\A" is not a valid escape sequence, so the plain literal only worked
# by accident and triggers an invalid-escape warning on current Python versions.
allocations = read_excel(r"data\Allocation_Tracking.xlsx", 0)

#### Transactions with Inactive APNs

In [None]:
# Read the transactions recorded against inactive APNs.
# Raw string: "\T" is not a valid escape sequence (a lowercase "\t" would have been a tab),
# so the plain literal only worked by accident and warns on current Python versions.
inactiveParcels = read_file(r"data\Transactions_InactiveParcels.csv")