<a href="https://colab.research.google.com/github/ugoGS/Py/blob/main/landing_to_raw_v5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# New version of columns within Data Template

# Initial configuration in Google Colab

In [None]:
# Instalar PySpark y findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip install pyspark
!pip install -q findspark




# Java and Spark configuration


In [None]:
import os
import findspark

# Point JAVA_HOME at the OpenJDK 8 installation set up by the install cell
os.environ.update(JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64")

# Initialise findspark before the Spark session is created further down
findspark.init()



# Mount drive

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so the scenarios folders are reachable from the notebook
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# List the scenarios folder on the mounted Drive
!ls /content/drive/MyDrive/scenarios/

landing				raw			     refined
landing_to_raw_v5.ipynb		raw_to_trusted_v3.ipynb      scenarios.ipynb
load_dummy_in_trusted_v1.ipynb	raw_to_trusted_v4_gdf.ipynb  trusted
notebooks			raw_to_trusted_v5.ipynb


# Create Spark Session & Import Libraries

In [None]:

# from pyspark.sql import SparkSession

# # Stop any existing session and create a new one
# try:
#     spark.stop()
# except:
#     pass


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_extract, col, lit, concat_ws, when, length, asc, desc, monotonically_increasing_id
import json
import pandas as pd
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType, DateType
from pyspark.sql import functions as F
from datetime import datetime


# Create the Spark session for this notebook (getOrCreate reuses one if it already exists)
spark = (
    SparkSession.builder
    .appName("proof of concept")
    .getOrCreate()
)
# Raise spark.sql.debug.maxToStringFields to 1000; the schemas defined in this notebook are very wide
spark.conf.set("spark.sql.debug.maxToStringFields", 1000)



---
# LANDING TO RAW
---



# Upload file to Colab (manually)

In [None]:
from google.colab import files
import shutil

# Ask the user to pick the file(s) to upload; the result is keyed by file name
uploaded = files.upload()

# Landing folder on the mounted Drive (direct access to "scenarios" from the shared folders).
# Alternative locations that were used before:
#   "/content/drive/MyDrive/Colab Notebooks/scenarios/landing"
#   "/content/data_lake/landing"
path = "/content/drive/MyDrive/scenarios/landing"

# Create the destination folder if it does not exist yet
os.makedirs(path, exist_ok=True)

# Move each uploaded file (referenced by its relative name) into the landing folder
for filename in uploaded:
    print(filename)
    shutil.move(filename, path + "/" + filename)


Saving GMEP Data Report_Sample_delivery_Poles_20241011_fixed_with_new_columns.xlsx to GMEP Data Report_Sample_delivery_Poles_20241011_fixed_with_new_columns.xlsx
GMEP Data Report_Sample_delivery_Poles_20241011_fixed_with_new_columns.xlsx


# Schema definitions

In [None]:
def define_schemas():
  """Build the four Spark schemas used by the landing-to-raw step.

  Every field in every schema is declared as a nullable ``StringType``; only
  the field names and their order differ between schemas. The order of the
  names below is preserved exactly in the resulting ``StructType`` objects
  (presumably it mirrors the column order of the source sheets; verify
  against the code that applies these schemas).

  Returns:
    tuple: ``(schema_main_template, schema_sap_updates, schema_gis_updates,
    schema_sap_equipment_report)``, each a ``StructType``.
  """

  def _all_string_schema(field_names):
    # One nullable StringType field per name, keeping the given order.
    return StructType([StructField(field_name, StringType(), True) for field_name in field_names])

  main_template_fields = [
    "source_system_to_be_updated",
    "type_of_update",
    "additional_notes_and_observations",
    "couldnt_capture_due_external_reasons",
    "placed_on_exception_report",
    "object_type",
    "description_of_object_type",
    "floc",
    "floc_description",
    "sap_equipment_number",
    "equipment_category",
    "cu_id",
    "mid",
    "sort_field_pole_number",
    "opco",
    "floc_circuit_number",
    "nyseg_line_number",
    "address_number",
    "street",
    "city",
    "state",
    "postal_code",
    "main_work_center",
    "service_center",
    "start_up_date",
    "system_status",
    "status_for_users",
    "changed_on",
    "construction_year",
    "construction_month",
    "technical_id_work_order",
    "survey_date",
    "proxy_vert",
    "phantom_location",
    "long_text",
    "manufacturer",
    "model_number",
    "manufacturer_serial_number",
    "nameplate_image_captured",
    "lfi_device_on_correct_segment_on_every_node",
    "lfi_device_type_on_segment",
    "lfi_field_tag_present",
    "lfi_status",
    "lfi_device_at_circuit_tie_point",
    "scada_capability",
    "tie_point_location",
    "status",
    "ue_condr_circuit_associated_to_segment",
    "ue_condr_location_of_circuit_node_identified",
    "ue_condr_conductor_transitions_from_oh_to_ug",
    "ue_condr_riser_present",
    "ue_condr_segment_associated_with_correct_circuit",
    "ue_pole_equipment_on_pole_lfi_devices_exists_on_pole",
    "ue_pole_equipment_on_pole_non_lfi_devices_exists_on_pole",
    "ue_pole_laterals_branch_off",
    "ue_pole_primary_metered_customer_point_of_delivery",
    "ue_pole_model_coordinates_are_accurate",
    "ue_pole_pole_has_riser",
    "ue_pole_pole_has_primary_conductors",
    "ue_xfmer_abandoned",
    "ue_vaulp_model_coordinates_are_accurate",
  ]

  sap_updates_fields = [
    "object_type",
    "type_of_sap_update",
    "sap_discrepancy",
    "additional_object_type_notes_and_observations",
    "floc",
    "floc_description",
    "description_of_object_type",
    "sap_equipment_number",
    "cu_id",
    "cyme_id",
    "circuit_number",
    "nyseg_line_number",
    "equipment_category",
    "mid",
    "sort_field_pole_number",
    "opco",
    "address_number",
    "street",
    "city",
    "state",
    "postal_code",
    "main_work_center",
    "service_center",
    "start_up_date",
    "system_status",
    "status_for_users",
    "changed_on",
    "construction_year",
    "construction_month",
    "technical_id_work_order",
    "survey_date",
    "proxy_vert",
    "phantom_location",
    "long_text",
    "manufacturer",
    "model_number",
    "manufacturer_serial_number",
    "nameplate_image_captured",
    "ue_capsg_capacitor_type",
    "ue_capsg_circuit_switch_number",
    "ue_capsg_kvar",
    "ue_capsg_nominal_voltage_rating",
    "ue_capsg_number_of_bushings",
    "ue_capsg_number_of_phases",
    "ue_capsg_public_or_private",
    "ue_capsg_scada_controlled",
    "ue_capsg_status",
    "ue_cbank_capacitor_type",
    "ue_cbank_circuit_switch_number",
    "ue_cbank_controlled",
    "ue_cbank_kvar_total",
    "ue_cbank_nominal_voltage_rating",
    "ue_cbank_number_of_capacitors",
    "ue_cbank_number_of_phases",
    "ue_cbank_public_or_private",
    "ue_cbank_scada_controlled",
    "ue_cbank_status",
    "ue_condr_conductor_length",
    "ue_condr_conductor_size",
    "ue_condr_conductor_type",
    "ue_condr_insulation_type",
    "ue_condr_neutral_material",
    "ue_condr_neutral_size",
    "ue_condr_nominal_voltage_rating",
    "ue_condr_location_oh_or_ug",
    "ue_condr_primary_or_secondary",
    "ue_condr_primary_conductor_material",
    "ue_condr_public_length",
    "ue_condr_public_or_private",
    "ue_condr_trailing_span_length",
    "ue_condr_trailing_span_location",
    "ue_condr_trans_or_dist",
    "ue_ctout_character_of_construction",
    "ue_ctout_cutout_rating",
    "ue_ctout_fuse_size",
    "ue_ctout_fuse_type",
    "ue_ctout_material",
    "ue_ctout_nominal_voltage_rating",
    "ue_ctout_phase",
    "ue_ctout_public_or_private",
    "ue_ctout_state",
    "ue_ctout_type",
    "ue_disc_amp_rating",
    "ue_disc_circuit_switch_number",
    "ue_disc_number_of_phases",
    "ue_disc_phase_designation",
    "ue_disc_public_or_private",
    "ue_disc_switch_style",
    "ue_disc_type",
    "ue_pole_circuit_phase_label",
    "ue_pole_class",
    "ue_pole_opco_owner_percent",
    "ue_pole_owner",
    "ue_pole_owner_maintained",
    "ue_pole_pole_length",
    "ue_pole_pole_material",
    "ue_pole_pole_number",
    "ue_pole_between_pole_number_since",
    "ue_pole_between_pole_number_to",
    "ue_pole_pole_type",
    "ue_pole_public_or_private",
    "ue_pole_trans_or_dist",
    "ue_pole_treatment",
    "ue_recl_circuit_switch_number",
    "ue_recl_number_of_phases",
    "ue_recl_public_or_private",
    "ue_recl_recloser_acts_as",
    "ue_recl_scada_controlled",
    "ue_recl_trans_or_dist",
    "ue_reg_circuit_switch_number",
    "ue_reg_kva",
    "ue_reg_phase_designation",
    "ue_reg_public_or_private",
    "ue_reg_scada_controlled",
    "ue_reg_status",
    "ue_sectz_circuit_switch_number",
    "ue_sectz_number_of_phases",
    "ue_sectz_public_or_private",
    "ue_sectz_scada_controlled",
    "ue_ratio_location_oh_or_ug",
    "ue_ratio_number_of_phases",
    "ue_ratio_operating_voltage",
    "ue_ratio_phase_designation",
    "ue_ratio_primary_voltage_text",
    "ue_ratio_public_or_private",
    "ue_ratio_secondary_voltage_text",
    "ue_ratio_size_kva",
    "ue_ratio_status",
    "ue_ratio_subtype_transformer_type",
    "ue_swtch_circuit_switch_number",
    "ue_swtch_load_break",
    "ue_swtch_normal_position",
    "ue_swtch_number_of_phases",
    "ue_swtch_operating_voltage",
    "ue_swtch_phase_designation",
    "ue_swtch_public_or_private",
    "ue_swtch_rated_kv",
    "ue_swtch_state",
    "ue_swtch_scada_controlled",
    "ue_swtch_switch_type",
    "ue_swtgr_amperage",
    "ue_swtgr_fuse_type",
    "ue_swtgr_circuit_switch_number",
    "ue_swtgr_installation_type",
    "ue_swtgr_loadbreak_capability",
    "ue_swtgr_nameplate",
    "ue_swtgr_nominal_voltage_rating",
    "ue_swtgr_public_or_private",
    "ue_swtgr_switchgear_distribution_type",
    "ue_swtgr_trans_or_dist",
    "ue_swtgr_year_installed",
    "ue_xfmer_date_retired_abandoned",
    "ue_xfmer_dual_voltage",
    "ue_xfmer_kva_rating",
    "ue_xfmer_location_oh_or_ug",
    "ue_xfmer_number_of_phases",
    "ue_xfmer_phase_designation",
    "ue_xfmer_primary_voltage_text",
    "ue_xfmer_public_or_private",
    "ue_xfmer_secondary_voltage",
    "ue_xfmer_secondary_voltage_text",
    "ue_xfmer_size_kva",
    "ue_xfmer_transformer_type",
    "ue_xfmer_trans_type",
    "ue_xfmer_year_installed",
    "ue_vaulp_foundation_material",
    "ue_vaulp_opco_owner_percent",
    "ue_vaulp_public_or_private",
    "ue_vaulp_tr_number",
    "ue_vaulp_trans_or_dist",
  ]

  gis_updates_fields = [
    "object_type",
    "type_of_gis_update",
    "gis_discrepancy",
    "additional_object_type_notes_and_observations",
    "floc",
    "floc_description",
    "description_of_object_type",
    "sap_equipment_number",
    "cu_id",
    "cyme_id",
    "circuit_number",
    "nyseg_line_number",
    "mid",
    "sort_field_pole_number",
    "opco",
    "address_number",
    "street",
    "city",
    "state",
    "zip",
    "main_work_center",
    "service_center",
    "changed_on",
    "technical_id_work_order",
    "survey_date",
    "proxy_vert",
    "manufacturer",
    "model_number",
    "manufacturer_serial_number",
    "ue_capsg_circuit_number",
    "ue_capsg_circuit_switch_number",
    "ue_capsg_nominal_voltage_rating",
    "ue_capsg_number_of_phases",
    "ue_capsg_public_or_private",
    "ue_capsg_scada_controlled",
    "ue_capsg_status",
    "ue_cbank_capacitor_type",
    "ue_cbank_circuit_switch_number",
    "ue_cbank_controlled",
    "ue_cbank_kvar_total",
    "ue_cbank_nominal_voltage_rating",
    "ue_cbank_number_of_phases",
    "ue_cbank_public_or_private",
    "ue_cbank_scada_controlled",
    "ue_cbank_status",
    "ue_condr_circuit_associated_to_segment",
    "ue_condr_conductor_configuration",
    "ue_condr_conductor_length",
    "ue_condr_conductor_size",
    "ue_condr_conductor_type",
    "ue_condr_gis_id_from_structure",
    "ue_condr_gis_id_to_structure",
    "ue_condr_linetype",
    "ue_condr_insulation_type",
    "ue_condr_neutral_material",
    "ue_condr_neutral_size",
    "ue_condr_nominal_voltage_rating",
    "ue_condr_number_of_phases",
    "ue_condr_location_oh_or_ug",
    "ue_condr_operating_voltage",
    "ue_condr_phase_designation",
    "ue_condr_phase_orientation",
    "ue_condr_primary_or_secondary",
    "ue_condr_primary_conductor_material",
    "ue_condr_proxy_indicator",
    "ue_condr_proxy_vert_from",
    "ue_condr_proxy_vert_to",
    "ue_condr_public_or_private",
    "ue_condr_segment_associated_with_correct_circuit",
    "ue_condr_trailing_span_location",
    "ue_condr_voltage_includes_circuit_configuration",
    "ue_ctout_fuse_size",
    "ue_ctout_fuse_type",
    "ue_ctout_nominal_voltage_rating",
    "ue_ctout_phase",
    "ue_ctout_public_or_private",
    "ue_ctout_state",
    "ue_disc_amp_rating",
    "ue_disc_circuit_switch_number",
    "ue_disc_disconnect_type",
    "ue_disc_number_of_phases",
    "ue_disc_phase_designation",
    "ue_disc_public_or_private",
    "ue_pole_circuit_phase_label",
    "ue_pole_laterals_branch_off",
    "ue_pole_geocoordinates_latitude",
    "ue_pole_geocoordinates_longitude",
    "ue_pole_model_coordinates_are_accurate",
    "ue_pole_pole_number",
    "ue_pole_between_pole_number_since",
    "ue_pole_between_pole_number_to",
    "ue_pole_pole_type",
    "ue_recl_circuit_switch_number",
    "ue_recl_number_of_phases",
    "ue_recl_public_or_private",
    "ue_recl_recloser_acts_as",
    "ue_recl_scada_controlled",
    "ue_recl_trans_or_dist",
    "ue_reg_circuit_switch_number",
    "ue_reg_kva",
    "ue_reg_phase_designation",
    "ue_reg_public_or_private",
    "ue_reg_scada_controlled",
    "ue_reg_status",
    "ue_sectz_circuit_switch_number",
    "ue_sectz_number_of_phases",
    "ue_sectz_public_or_private",
    "ue_sectz_scada_controlled",
    "ue_ratio_location_oh_or_ug",
    "ue_ratio_number_of_phases",
    "ue_ratio_operating_voltage",
    "ue_ratio_phase_designation",
    "ue_ratio_pole_gis_id_",
    "ue_ratio_pole_gis_id2",
    "ue_ratio_primary_voltage_text",
    "ue_ratio_public_or_private",
    "ue_ratio_secondary_voltage_text",
    "ue_ratio_size_kva",
    "ue_ratio_status",
    "ue_ratio_subtype_transformer_type",
    "ue_ratio_transformer_in",
    "ue_ratio_transformer_in_state",
    "ue_ratio_transformer_out",
    "ue_swtch_circuit_switch_number",
    "ue_swtch_facility_id_child_level",
    "ue_swtch_feeder_id_circuit_number",
    "ue_swtch_feeder_id2_circuit_number",
    "ue_swtch_load_break",
    "ue_swtch_normal_position",
    "ue_swtch_normal_position_a",
    "ue_swtch_normal_position_b",
    "ue_swtch_normal_position_c",
    "ue_swtch_number_of_phases",
    "ue_swtch_operating_voltage",
    "ue_swtch_phase_designation",
    "ue_swtch_present_position_a",
    "ue_swtch_present_position_b",
    "ue_swtch_present_position_c",
    "ue_swtch_public_or_private",
    "ue_swtch_rated_kv",
    "ue_swtch_scada_controlled",
    "ue_swtch_switch_type",
    "ue_swtgr_circuit_switch_number",
    "ue_swtgr_facility_id",
    "ue_swtgr_feeder_id",
    "ue_swtgr_feeder_id2",
    "ue_swtgr_fuse_type",
    "ue_swtgr_nominal_voltage_rating",
    "ue_swtgr_operating_voltage",
    "ue_swtgr_public_or_private",
    "ue_swtgr_switchgear_name",
    "ue_swtgr_switch_number",
    "ue_xfmer_dual_voltage",
    "ue_xfmer_kva_rating",
    "ue_xfmer_location_oh_or_ug",
    "ue_xfmer_number_of_phases",
    "ue_xfmer_phase_designation",
    "ue_xfmer_primary_voltage_text",
    "ue_xfmer_proxy_vert",
    "ue_xfmer_secondary_voltage",
    "ue_xfmer_secondary_voltage_text",
    "ue_xfmer_size_kva",
    "ue_xfmer_subtype",
    "ue_xfmer_transformer_in_state",
    "ue_xfmer_transformer_out_state",
    "ue_vaulp_geocoordinates_latitude",
    "ue_vaulp_tr_number",
    "ue_vaulp_geocoordinates_longitude",
    "ue_vaulp_model_coordinates_are_accurate",
  ]

  sap_equipment_report_fields = [
    "floc",
    "floc_description",
    "super_floc",
    "company",
    "floc_circuit_number",
    "object_type",
    "sap_equipment_number",
    "equipment_description",
    "equipment_category",
    "sort_field_pole_number",
    "main_work_center",
    "service_center",
    "address_number",
    "street",
    "city",
    "district",
    "postal_code",
    "rg",
    "system_status",
    "status_for_users",
    "construction_year",
    "start_up_date",
    "changed_on",
    "material",
    "material_description",
    "manufacturer",
    "model_number",
    "manufacturer_serial_number",
    "technical_id_work_order",
    "object_number",
    "cu_id",
    "cyme_id",
    "capacitor_type",
    "equipment_circuit_number_1",
    "equipment_circuit_number_2",
    "circuit_switch_number",
    "kvar",
    "kvar_total",
    "number_of_bushings",
    "nominal_voltage_rating",
    "normal_state",
    "public_or_private",
    "scada_controlled",
    "conductor_length",
    "conductor_size",
    "conductor_type",
    "insulation_type",
    "number_of_phases",
    "location_oh_or_ug",
    "phase_designation",
    "primary_or_secondary",
    "public_length",
    "trans_or_dist",
    "trailing_span_location",
    "fuse_size",
    "fuse_type",
    "material_2",
    "amp_rating",
    "cutout_distribution_type",
    "disconnect_type",
    "controlled",
    "owner",
    "opco_owner_percent",
    "pole_class",
    "pole_length",
    "pole_material",
    "pole_type",
    "treatment",
    "location_oh_or_ug_2",
    "size_kva",
    "recloser_acts_as",
    "kva",
    "load_break_yes_no",
    "normal_position",
    "rated_kv",
    "switch_style",
    "switch_type",
    "amperage",
    "switchgear_distribution_type",
    "year_installed",
    "foundation_material",
    "dual_voltage",
    "primary_voltage_text",
    "secondary_voltage",
    "secondary_voltage_text",
    "trans_type",
    "transformer_type",
  ]

  # Build the schemas in the same order the callers unpack them.
  return (
    _all_string_schema(main_template_fields),
    _all_string_schema(sap_updates_fields),
    _all_string_schema(gis_updates_fields),
    _all_string_schema(sap_equipment_report_fields),
  )

# Columns to select

In [None]:
def define_columns_to_select():
    """Return the source header names to pick from each input sheet/report.

    Returns:
        A 4-tuple of lists of header strings, in this order:
        (main template, SAP updates, GIS updates, SAP equipment report).

    Every header is reproduced character-for-character, including embedded
    newlines (``\\n``), doubled spaces and trailing spaces.

    NOTE(review): several header names occur more than once inside a single
    list (e.g. "Circuit Switch Number", "Public or Private"). How the
    consumer tells those occurrences apart is not visible from this function;
    verify against the code that reads the sheets.
    """
    main_template_headers = [
        "Source System to be updated",
        "Type of Update",
        "Additional \nNotes & Observations",
        "Couldn't capture due to external reasons",  # header contains an apostrophe
        "Placed on Exception Report",
        "Object Type",
        "Description of Object Type\n(Use MID Description if blank)",
        "FLOC",
        "FLOC Description",
        "SAP \nEquipment #",
        "Equipment Category",
        "CU ID",
        "MID",
        "Sort Field\n(Pole #)",
        "OPCO",
        "Circuit Number",
        "NYSEG Line Number",
        "Address number",
        "Street",
        "City",
        "State",
        "Postal Code",
        "Main Work Center",
        "Service Center",
        "Start-Up Date",
        "System Status",
        "Status for users",
        "Changed On",
        "Construction Year",
        "Construction Month",
        "Technical ID/\nWork Order",
        "Survey Date",
        "Proxy Vert",
        "Phantom Location",
        "Long Text\n(SAP Notes for new equipment or modifications to records)",
        "Manufacturer",
        "Model #",
        "Manufacturer Serial #",
        "Nameplate Image Captured",
        "LFI Device on Correct Segment on Every Node\n",
        "LFI Device Type on Segment\n(UE_CTOUT & UE_DISC ONLY)",
        "LFI Field Tag Present",
        "LFI\nStatus",
        "LFI Device at Circuit Tie Point",
        "SCADA Capability",
        "Tie Point Location",
        "Status",
        "Circuit Associated to Segment",
        "Location of Circuit Node Identified",
        "Conductor Transitions from OH to UG",
        "Riser Present",
        "Segment Associated with Correct Circuit",
        "Equipment on Pole\n(LFI Devices exist on Pole)",
        "Equipment on Pole\n(Non LFI devices exists on Pole)",
        "Laterals Branch Off",
        "Primary Metered Customer Point of Delivery",
        "Model coordinates are accurate and do not diverge by more than 10 meter radius (30 ft)",
        "Pole has Primary Riser",
        "Pole has Primary Conductors",
        "Abandoned Transformer",
        # Same header text as five entries above; it appears twice in this list.
        "Model coordinates are accurate and do not diverge by more than 10 meter radius (30 ft)",
    ]

    sap_updates_headers = [
        "Object Type",
        "Type of SAP Update",
        "SAP Discrepancy",
        "Additional Object Type Notes & Observations",
        "FLOC",
        "FLOC Description",
        "Description of Object Type",
        "SAP \nEquipment #",
        "CU ID",
        "CYME ID",
        "Circuit Number",
        "NYSEG Line Number",
        "Equipment Category",
        "MID",
        "Sort Field\n(Pole #)",
        "OPCO",
        "Address number",
        "Street",
        "City",
        "State",
        "Postal Code",
        "Main Work Center",
        "Service Center",
        "Start-Up Date",
        "System Status",
        "Status for users",
        "Changed On",
        "Construction Year",
        "Construction Month",
        "Technical ID/\nWork Order",
        "Survey Date",
        "Proxy Vert",
        "Phantom Location",
        "Long Text\n(SAP Notes for new equipment or modifications to records)",
        "Manufacturer",
        "Model #",
        "Manufacturer Serial #",
        "Nameplate Image Captured",
        "Capacitor Type",
        "Circuit Switch Number",
        "KVAR",
        "Nominal Voltage Rating",
        "Number of Bushings",
        "Number of Phases",
        "Public or Private",
        "SCADA\nControlled",
        "Status",
        "Capacitor Type",
        "Circuit Switch Number",
        "Controlled",
        "KVAR Total",
        "Nominal Voltage Rating",
        "Number of Capacitors",
        "Number of Phases",
        "Public or Private",
        "SCADA\nControlled",
        "Status",
        "Conductor Length",
        "Conductor Size",
        "Conductor\nType",
        "Insulation Type",
        "Neutral Material",
        "Neutral Size",
        "Nominal Voltage Rating",
        "O/H or U/G",
        "Primary or Secondary",
        "Primary Conductor Material",
        "Public Length",
        "Public or Private",
        "Trailing Span Length",
        "Trailing Span Location",
        "Trans or Dist",
        "Character of Construction",
        "Cutout Rating",
        "Fuse Size",
        "Fuse Type",
        "Material",
        "Nominal Voltage Rating",
        "Phase",
        "Public or Private",
        "State",
        "Type",
        "Amp Rating",
        "Circuit Switch Number",
        "Number of Phases",
        "Phase Designation",
        "Public or Private",
        "Switch Style",
        "Type",
        "Circuit Phase Label",
        "Class",
        "OPCO Owner \n%",
        "Owner",
        "Owner Maintained",
        "Pole Length",
        "Pole Material",
        "Pole\nNumber",
        "Between\nPole #\n(if new)",
        "Between\nPole #\n(if new)",
        "Pole Type",
        "Public or Private",
        "Trans or Dist",
        "Treatment",
        "Circuit Switch Number",
        "Number of Phases",
        "Public or Private",
        "Recloser Acts As",
        "SCADA\nControlled",
        "Trans or\nDist",
        "Circuit Switch Number",
        "KVA",
        "Phase Designation",
        "Public or Private",
        "SCADA\nControlled",
        "Status",
        "Circuit Switch Number",
        "Number of Phases",
        "Public or Private",
        "SCADA\nControlled",
        "Location\n(OH/UG)",
        "Number of Phases",
        "Operating Voltage",
        "Phase Designation",
        "Primary Voltage - Text",
        "Public or Private",
        "Secondary Voltage - Text",
        "Size (KVA)",
        "Status",
        "Subtype\n(Transformer Type)",
        "Circuit Switch Number",
        "Load Break",
        "Normal Position",
        "Number of Phases",
        "Operating Voltage",
        "Phase Designation",
        "Public or Private",
        "Rated kV",
        "State",
        "SCADA\nControlled",
        "Switch Type",
        "Amperage",
        "Fuse Type",
        "Circuit Switch Number",
        "Installation Type",
        "Loadbreak Capability",
        "Nameplate",
        "Nominal Voltage Rating",
        "Public or Private",
        "Switchgear  Distribution Type\n(GIS: Facility ID)",
        "Trans or\nDist",
        "Year Installed",
        "Date Retired/\nAbandoned",
        "Dual Voltage",
        "KVA Rating",
        "Location\n (UG or OH )",
        "Number of Phases",
        "Phase Designation",
        "Primary Voltage - Text",
        "Public or\nPrivate",
        "Secondary Voltage",
        "Secondary Voltage - Text",
        "Size (KVA)",
        # NOTE(review): this header carries literal double-quote characters;
        # confirm it matches the real sheet header and any rename mapping key.
        '"Type, Transformer "\n(GIS: Subtype)',
        "Trans type",
        "Year installed",
        "Foundation Material",
        "OPCO Owner Percent",
        "Public or\nPrivate",
        "TR#",
        "Trans or Dist",
    ]

    gis_updates_headers = [
        "Object Type",
        "Type of GIS Update",
        "GIS Discrepancy",
        "Additional Object Type\nNotes & Observations",
        "FLOC",
        "FLOC Description",
        "Description of Object Type",
        "SAP \nEquipment #",
        "CU ID",
        "CYME ID",
        "Circuit Number",
        "NYSEG Line Number",
        "MID",
        "Sort Field\n(Pole #)",
        "OPCO",
        "Address number",
        "Street",
        "City",
        "State",
        "Zip",
        "Main Work Center",
        "Service Center",
        "Changed On",
        "Technical ID/\nWork Order",
        "Survey Date",
        "Proxy Vert",
        "Manufacturer",
        "Model #",
        "Manufacturer Serial #",
        "Circuit Number",
        "Circuit Switch Number",
        "Nominal Voltage Rating",
        "Number of Phases",
        "Public or Private",
        "SCADA\nControlled",
        "Status",
        "Capacitor Type",
        "Circuit Switch Number",
        "Controlled",
        "KVAR Total",
        "Nominal Voltage Rating",
        "Number of Phases",
        "Public or Private",
        "SCADA\nControlled",
        "Status",
        "Circuit Associated to Segment",
        "Conductor Configuration",
        "Conductor Length",
        "Conductor Size",
        "Conductor\nType",
        "GIS ID \nFrom Structure",
        "GIS ID\nTo Structure",
        "Linetype",
        "Insulation Type",
        "Neutral Material",
        "Neutral Size",
        "Nominal Voltage Rating",
        "Number of Phases",
        "O/H or U/G",
        "Operating Voltage",
        "Phase Designation",
        "Phase Orientation",
        "Primary or Secondary",
        "Primary Conductor Material",
        "Proxy\nIndicator",
        "Proxy Vert From",
        "Proxy Vert To",
        "Public or Private",
        "Segment Associated with Correct Circuit",
        "Trailing Span Location",
        "Voltage \n(Includes Circuit Configuration)",
        "Fuse Size",
        "Fuse Type",
        "Nominal Voltage Rating",
        "Phase",
        "Public or Private",
        "State",
        "Amp Rating",
        "Circuit Switch Number",
        "Disconnect Type",
        "Number of Phases",
        "Phase Designation",
        "Public or Private",
        "Circuit Phase Label",
        "Laterals Branch Off",
        "GeoCoordinates\nLatitude",
        "GeoCoordinates\nLongitude",
        "Model coordinates are accurate and do not diverge by more than 10 meter radius (30 ft)",
        "Pole\nNumber",
        "Between\nPole #\n(if new)",
        "Between\nPole #\n(if new)",
        "Pole Type",
        "Circuit Switch Number",
        "Number of Phases",
        "Public or Private",
        "Recloser Acts As",
        "SCADA\nControlled",
        "Trans or\nDist",
        "Circuit Switch Number",
        "KVA",
        "Phase Designation",
        "Public or Private",
        "SCADA\nControlled",
        "Status",
        "Circuit Switch Number",
        "Number of Phases",
        "Public or Private",
        "SCADA\nControlled",
        "Location\n(OH/UG)",
        "Number of Phases",
        "Operating Voltage",
        "Phase Designation",
        "Pole GIS ID ",
        "Pole GIS ID2*",
        "Primary Voltage - Text",
        "Public or Private",
        "Secondary Voltage - Text",
        "Size (KVA)",
        "Status",
        "Subtype\n(Transformer Type)",
        "Transformer In",
        "Transformer In State",
        "Transformer Out",
        "Circuit Switch Number",
        "Facility ID*\n(child level)",
        "Feeder ID\n(Circuit #)",
        "Feeder ID2\n(Circuit #)",
        "Load Break",
        "Normal Position",
        "Normal Position - A",
        "Normal Position - B",
        "Normal Position -C",
        "Number of Phases",
        "Operating Voltage",
        "Phase Designation",
        "Present Position - A",
        "Present Position - B ",
        "Present Position - C",
        "Public or Private",
        "Rated kV",
        "SCADA\nControlled",
        "Switch Type",
        "Circuit Switch Number",
        "Facility ID",
        "Feeder ID",
        "Feeder ID2",
        "Fuse Type",
        "Nominal Voltage Rating",
        "Operating Voltage\n(SAP field: Amperage)",
        "Public or Private",
        "Switchgear Name",
        "Switch Number",
        "Dual Voltage",
        "KVA Rating",
        "Location\n (UG or OH)",
        "Number of Phases",
        "Phase Designation",
        "Primary Voltage - Text",
        "Proxy Vert",
        "Secondary Voltage",
        "Secondary Voltage - Text",
        "Size (KVA)",
        "SubType",
        "Transformer In State",
        "Transformer Out State",
        "GeoCoordinates\nLatitude",
        "GeoCoordinates\nLongitude",
        "Model coordinates are accurate and do not diverge by more than 10 meter radius (30 ft)",
        "TR#",
    ]

    sap_equipment_report_headers = [
        "Functional Location",
        "FunctLocDescrip.",
        "Super FLOC",
        "Company",
        "Circuit ",  # trailing space; a separate "Circuit" entry appears further down
        "ObjectType",
        "Equipment",
        "Equipment Description",
        "Equipment Category",
        "Sort fld",
        "Mn.wk.ctr",
        "Service Center Name",
        "Addr. no",
        "Street",
        "City",
        "District",
        "Postl Code",
        "Rg",
        "SysStatus",
        "UserStatus",
        "ConY",
        "Start-up",
        "Chngd",
        "Material",
        "Material Description",
        "Mfr",
        "Model no.",
        "SerNo.",
        "TechID",
        "Object number",
        "CU ID",
        "CYME ID",
        "Capacitor Type",
        "Circuit",
        "Circuit No",
        "Circuit Switch Number",
        "KVAR",
        "KVAR total",
        "No of bushings",
        "Nominal Voltage Rating",
        "Normal State",
        "Public or Private",
        "SCADA Controlled",
        "Conductor Length",
        "Size, Conductor",
        "Conductor Type",
        "Insulation Type",
        "No of phases",
        "O/H or U/G",
        "Phase Designation",
        "Primary or Secondary",
        "Public Length",
        "Trans or Dist",
        "Trailing Span location",
        "Fuse Size",
        "Fuse Type",
        # NOTE(review): the ".1" suffix looks like a de-duplicated second
        # "Material" column as produced by the reader -- confirm.
        "Material.1",
        "Amp Rating",
        "Cutout Distribution type",
        "Disconnect Type",
        "Controlled",
        "Owner",
        "OPCO Owner Perc",
        "Pole Class",
        "Pole Length",
        "Pole Material",
        "Pole Type",
        "Treatment",
        "Location (UG or O/H )",
        "Size (KVA)",
        "Recloser Acts As",
        "Kva",
        "Load Break Yes/No",
        "Normal Position",
        "Rated kV",
        "switch style",
        "Switch Type",
        "Amperage",
        "Switchgear Distribution Type",
        "Year installed",
        "Foundation Material",
        "Dual Voltage",
        "Primary Voltage - Text",
        "Secondary Voltage",
        "Secondary Voltage - Text",
        "Trans type",
        "Type, Transformer ",
    ]

    return (
        main_template_headers,
        sap_updates_headers,
        gis_updates_headers,
        sap_equipment_report_headers,
    )

# Column Mapping

In [None]:
def define_column_mappings():

  # Crear un diccionario para mapear nombres de columnas de Pandas a nombres de columnas en PySpark
  column_mapping_main_template = {
    "Source System to be updated": "source_system_to_be_updated",
    "Type of Update": "type_of_update",
    "Additional \nNotes & Observations": "additional_notes_and_observations",
    "Couldn't capture due to external reasons": "couldnt_capture_due_external_reasons",
    "Placed on Exception Report": "placed_on_exception_report",
    "Object Type": "object_type",
    "Description of Object Type\n(Use MID Description if blank)": "description_of_object_type",
    "FLOC": "floc",
    "FLOC Description": "floc_description",
    "SAP \nEquipment #": "sap_equipment_number",
    "Equipment Category": "equipment_category",
    "CU ID": "cu_id",
    "MID": "mid",
    "Sort Field\n(Pole #)": "sort_field_pole_number",
    "OPCO": "opco",
    "Circuit Number": "floc_circuit_number",
    "NYSEG Line Number": "nyseg_line_number",
    "Address number": "address_number",
    "Street": "street",
    "City": "city",
    "State": "state",
    "Postal Code": "postal_code",
    "Main Work Center": "main_work_center",
    "Service Center": "service_center",
    "Start-Up Date": "start_up_date",
    "System Status": "system_status",
    "Status for users": "status_for_users",
    "Changed On": "changed_on",
    "Construction Year": "construction_year",
    "Construction Month": "construction_month",
    "Technical ID/\nWork Order": "technical_id_work_order",
    "Survey Date": "survey_date",
    "Proxy Vert": "proxy_vert",
    "Phantom Location": "phantom_location",
    "Long Text\n(SAP Notes for new equipment or modifications to records)": "long_text",
    "Manufacturer": "manufacturer",
    "Model #": "model_number",
    "Manufacturer Serial #": "manufacturer_serial_number",
    "Nameplate Image Captured": "nameplate_image_captured",
    "LFI Device on Correct Segment on Every Node\n": "lfi_device_on_correct_segment_on_every_node",
    "LFI Device Type on Segment\n(UE_CTOUT & UE_DISC ONLY)": "lfi_device_type_on_segment",
    "LFI Field Tag Present": "lfi_field_tag_present",
    "LFI\nStatus": "lfi_status",
    "LFI Device at Circuit Tie Point": "lfi_device_at_circuit_tie_point",
    "SCADA Capability": "scada_capability",
    "Tie Point Location": "tie_point_location",
    "Status": "status",
    "Circuit Associated to Segment": "ue_condr_circuit_associated_to_segment",
    "Location of Circuit Node Identified": "ue_condr_location_of_circuit_node_identified",
    "Conductor Transitions from OH to UG": "ue_condr_conductor_transitions_from_oh_to_ug",
    "Riser Present": "ue_condr_riser_present",
    "Segment Associated with Correct Circuit": "ue_condr_segment_associated_with_correct_circuit",
    "Equipment on Pole\n(LFI Devices exist on Pole)": "ue_pole_equipment_on_pole_lfi_devices_exists_on_pole",
    "Equipment on Pole\n(Non LFI devices exists on Pole)": "ue_pole_equipment_on_pole_non_lfi_devices_exists_on_pole",
    "Laterals Branch Off": "ue_pole_laterals_branch_off",
    "Primary Metered Customer Point of Delivery": "ue_pole_primary_metered_customer_point_of_delivery",
    "Model coordinates are accurate and do not diverge by more than 10 meter radius (30 ft)": "ue_pole_model_coordinates_are_accurate",
    "Pole has Primary Riser": "ue_pole_pole_has_riser",
    "Pole has Primary Conductors": "ue_pole_pole_has_primary_conductors",
    "Abandoned Transformer": "ue_xfmer_abandoned",
    "Model coordinates are accurate and do not diverge by more than 10 meter radius (30 ft)": "ue_vaulp_model_coordinates_are_accurate"
  }

  column_mapping_sap_updates = {
    "Object Type": "object_type",
    "Type of SAP Update": "type_of_sap_update",
    "SAP Discrepancy": "sap_discrepancy",
    "Additional Object Type Notes & Observations": "additional_object_type_notes_and_observations",
    "FLOC": "floc",
    "FLOC Description": "floc_description",
    "Description of Object Type": "description_of_object_type",
    "SAP \nEquipment #": "sap_equipment_number",
    "CU ID": "cu_id",
    "CYME ID": "cyme_id",
    "Circuit Number": "circuit_number",
    "NYSEG Line Number": "nyseg_line_number",
    "Equipment Category": "equipment_category",
    "MID": "mid",
    "Sort Field\n(Pole #)": "sort_field_pole_number",
    "OPCO": "opco",
    "Address number": "address_number",
    "Street": "street",
    "City": "city",
    "State": "state",
    "Postal Code": "postal_code",
    "Main Work Center": "main_work_center",
    "Service Center": "service_center",
    "Start-Up Date": "start_up_date",
    "System Status": "system_status",
    "Status for users": "status_for_users",
    "Changed On": "changed_on",
    "Construction Year": "construction_year",
    "Construction Month": "construction_month",
    "Technical ID/\nWork Order": "technical_id_work_order",
    "Survey Date": "survey_date",
    "Proxy Vert": "proxy_vert",
    "Phantom Location": "phantom_location",
    "Long Text\n(SAP Notes for new equipment or modifications to records)": "long_text",
    "Manufacturer": "manufacturer",
    "Model #": "model_number",
    "Manufacturer Serial #": "manufacturer_serial_number",
    "Capacitor Type": "ue_capsg_capacitor_type",
    "Circuit Switch Number": "ue_capsg_circuit_switch_number",
    "KVAR": "ue_capsg_kvar",
    "Nominal Voltage Rating": "ue_capsg_nominal_voltage_rating",
    "Number of Bushings": "ue_capsg_number_of_bushings",
    "Number of Phases": "ue_capsg_number_of_phases",
    "Public or Private": "ue_capsg_public_or_private",
    "SCADA\nControlled": "ue_capsg_scada_controlled",
    "Capacitor Type": "ue_cbank_capacitor_type",
    "Circuit Switch Number": "ue_cbank_circuit_switch_number",
    "Controlled": "ue_cbank_controlled",
    "KVAR Total": "ue_cbank_kvar_total",
    "Nominal Voltage Rating": "ue_cbank_nominal_voltage_rating",
    "Number of Phases": "ue_cbank_number_of_phases",
    "Public or Private": "ue_cbank_public_or_private",
    "SCADA\nControlled": "ue_cbank_scada_controlled",
    "Status": "ue_cbank_status",
    "Conductor Length": "ue_condr_conductor_length",
    "Conductor Size": "ue_condr_conductor_size",
    "Conductor\nType": "ue_condr_conductor_type",
    "Insulation Type": "ue_condr_insulation_type",
    "Neutral Material": "ue_condr_neutral_material",
    "Neutral Size": "ue_condr_neutral_size",
    "Nominal Voltage Rating": "ue_condr_nominal_voltage_rating",
    "O/H or U/G": "ue_condr_location_oh_or_ug",
    "Primary or Secondary": "ue_condr_primary_or_secondary",
    "Primary Conductor Material": "ue_condr_primary_conductor_material",
    "Public Length": "ue_condr_public_length",
    "Public or Private": "ue_condr_public_or_private",
    "Trailing Span Length": "ue_condr_trailing_span_length",
    "Trailing Span Location": "ue_condr_trailing_span_location",
    "Trans or Dist": "ue_condr_trans_or_dist",
    "Character of Construction": "ue_ctout_character_of_construction",
    "Cutout Rating": "ue_ctout_cutout_rating",
    "Fuse Size": "ue_ctout_fuse_size",
    "Fuse Type": "ue_ctout_fuse_type",
    "Material": "ue_ctout_material",
    "Nominal Voltage Rating": "ue_ctout_nominal_voltage_rating",
    "Phase": "ue_ctout_phase",
    "Public or Private": "ue_ctout_public_or_private",
    "State": "ue_ctout_state",
    "Type": "ue_ctout_type",
    "Amp Rating": "ue_disc_amp_rating",
    "Circuit Switch Number": "ue_disc_circuit_switch_number",
    "Number of Phases": "ue_disc_number_of_phases",
    "Phase Designation": "ue_disc_phase_designation",
    "Public or Private": "ue_disc_public_or_private",
    "Switch Style": "ue_disc_switch_style",
    "Type": "ue_disc_type",
    "Circuit Phase Label": "ue_pole_circuit_phase_label",
    "Class": "ue_pole_class",
    "OPCO Owner \n%": "ue_pole_opco_owner_percent",
    "Owner": "ue_pole_owner",
    "Owner Maintained": "ue_pole_owner_maintained",
    "Pole Length": "ue_pole_pole_length",
    "Pole Material": "ue_pole_pole_material",
    "Pole\nNumber": "ue_pole_pole_number",
    "Between\nPole #\n(if new)": "ue_pole_between_pole_number_since",
    "Between\nPole #\n(if new)": "ue_pole_between_pole_number_to",
    "Pole Type": "ue_pole_pole_type",
    "Public or Private": "ue_pole_public_or_private",
    "Trans or Dist": "ue_pole_trans_or_dist",
    "Treatment": "ue_pole_treatment",
    "Circuit Switch Number": "ue_recl_circuit_switch_number",
    "Number of Phases": "ue_recl_number_of_phases",
    "Public or Private": "ue_recl_public_or_private",
    "Recloser Acts As": "ue_recl_recloser_acts_as",
    "SCADA\nControlled": "ue_recl_scada_controlled",
    "Trans or\nDist": "ue_recl_trans_or_dist",
    "Circuit Switch Number": "ue_reg_circuit_switch_number",
    "KVA": "ue_reg_kva",
    "Phase Designation": "ue_reg_phase_designation",
    "Public or Private": "ue_reg_public_or_private",
    "SCADA\nControlled": "ue_reg_scada_controlled",
    "Status": "ue_reg_status",
    "Circuit Switch Number": "ue_sectz_circuit_switch_number",
    "Number of Phases": "ue_sectz_number_of_phases",
    "Public or Private": "ue_sectz_public_or_private",
    "SCADA\nControlled": "ue_sectz_scada_controlled",
    "Location\n(OH/UG)": "ue_ratio_location_oh_or_ug",
    "Number of Phases": "ue_ratio_number_of_phases",
    "Operating Voltage": "ue_ratio_operating_voltage",
    "Phase Designation": "ue_ratio_phase_designation",
    "Primary Voltage - Text": "ue_ratio_primary_voltage_text",
    "Public or Private": "ue_ratio_public_or_private",
    "Secondary Voltage - Text": "ue_ratio_secondary_voltage_text",
    "Size (KVA)": "ue_ratio_size_kva",
    "Status": "ue_ratio_status",
    "Subtype\n(Transformer Type)": "ue_ratio_subtype_transformer_type",
    "Circuit Switch Number": "ue_swtch_circuit_switch_number",
    "Load Break": "ue_swtch_load_break",
    "Normal Position": "ue_swtch_normal_position",
    "Number of Phases": "ue_swtch_number_of_phases",
    "Operating Voltage": "ue_swtch_operating_voltage",
    "Phase Designation": "ue_swtch_phase_designation",
    "Public or Private": "ue_swtch_public_or_private",
    "Rated kV": "ue_swtch_rated_kv",
    "State": "ue_swtch_state",
    "SCADA\nControlled": "ue_swtch_scada_controlled",
    "Switch Type": "ue_swtch_switch_type",
    "Amperage": "ue_swtgr_amperage",
    "Fuse Type": "ue_swtgr_fuse_type",
    "Circuit Switch Number": "ue_swtgr_circuit_switch_number",
    "Installation Type": "ue_swtgr_installation_type",
    "Loadbreak Capability": "ue_swtgr_loadbreak_capability",
    "Nameplate": "ue_swtgr_nameplate",
    "Nominal Voltage Rating": "ue_swtgr_nominal_voltage_rating",
    "Public or Private": "ue_swtgr_public_or_private",
    "Switchgear  Distribution Type\n(GIS: Facility ID)": "ue_swtgr_switchgear_distribution_type",
    "Trans or\nDist": "ue_swtgr_trans_or_dist",
    "Year Installed": "ue_swtgr_year_installed",
    "Date Retired/\nAbandoned": "ue_xfmer_date_retired_abandoned",
    "Dual Voltage": "ue_xfmer_dual_voltage",
    "KVA Rating": "ue_xfmer_kva_rating",
    "Location\n (UG or OH )": "ue_xfmer_location_oh_or_ug",
    "Number of Phases": "ue_xfmer_number_of_phases",
    "Phase Designation": "ue_xfmer_phase_designation",
    "Primary Voltage - Text": "ue_xfmer_primary_voltage_text",
    "Public or\nPrivate": "ue_xfmer_public_or_private",
    "Secondary Voltage": "ue_xfmer_secondary_voltage",
    "Secondary Voltage - Text": "ue_xfmer_secondary_voltage_text",
    "Size (KVA)": "ue_xfmer_size_kva",
    "Type, Transformer \n(GIS: Subtype)": "ue_xfmer_transformer_type",
    "Trans type": "ue_xfmer_trans_type",
    "Year installed": "ue_xfmer_year_installed",
    "Foundation Material": "ue_vaulp_foundation_material",
    "OPCO Owner Percent": "ue_vaulp_opco_owner_percent",
    "Public or\nPrivate": "ue_vaulp_public_or_private",
    "TR#": "ue_vaulp_tr_number",
    "Trans or Dist": "ue_vaulp_trans_or_dist"
  }

  column_mapping_gis_updates = {
    "Object Type": "object_type",
    "Type of GIS Update": "type_of_gis_update",
    "GIS Discrepancy": "gis_discrepancy",
    "Additional Object Type\nNotes & Observations": "additional_object_type_notes_and_observations",
    "FLOC": "floc",
    "FLOC Description": "floc_description",
    "Description of Object Type": "description_of_object_type",
    "SAP \nEquipment #": "sap_equipment_number",
    "CU ID": "cu_id",
    "CYME ID": "cyme_id",
    "Circuit Number": "circuit_number",
    "NYSEG Line Number": "nyseg_line_number",
    "MID": "mid",
    "Sort Field\n(Pole #)": "sort_field_pole_number",
    "OPCO": "opco",
    "Address number": "address_number",
    "Street": "street",
    "City": "city",
    "State": "state",
    "Zip": "zip",
    "Main Work Center": "main_work_center",
    "Service Center": "service_center",
    "Changed On": "changed_on",
    "Technical ID/\nWork Order": "technical_id_work_order",
    "Survey Date": "survey_date",
    "Proxy Vert": "proxy_vert",
    "Manufacturer": "manufacturer",
    "Model #": "model_number",
    "Manufacturer Serial #": "manufacturer_serial_number",
    "Circuit Number": "ue_capsg_circuit_number",
    "Circuit Switch Number": "ue_capsg_circuit_switch_number",
    "Nominal Voltage Rating": "ue_capsg_nominal_voltage_rating",
    "Number of Phases": "ue_capsg_number_of_phases",
    "Public or Private": "ue_capsg_public_or_private",
    "SCADA\nControlled": "ue_capsg_scada_controlled",
    "Status": "ue_capsg_status",
    "Capacitor Type": "ue_cbank_capacitor_type",
    "Circuit Switch Number": "ue_cbank_circuit_switch_number",
    "Controlled": "ue_cbank_controlled",
    "KVAR Total": "ue_cbank_kvar_total",
    "Nominal Voltage Rating": "ue_cbank_nominal_voltage_rating",
    "Number of Phases": "ue_cbank_number_of_phases",
    "Public or Private": "ue_cbank_public_or_private",
    "SCADA\nControlled": "ue_cbank_scada_controlled",
    "Status": "ue_cbank_status",
    "Circuit Associated to Segment": "ue_condr_circuit_associated_to_segment",
    "Conductor Configuration": "ue_condr_conductor_configuration",
    "Conductor Length": "ue_condr_conductor_length",
    "Conductor Size": "ue_condr_conductor_size",
    "Conductor\nType": "ue_condr_conductor_type",
    "GIS ID \nFrom Structure": "ue_condr_gis_id_from_structure",
    "GIS ID\nTo Structure": "ue_condr_gis_id_to_structure",
    "Linetype": "ue_condr_linetype",
    "Insulation Type": "ue_condr_insulation_type",
    "Neutral Material": "ue_condr_neutral_material",
    "Neutral Size": "ue_condr_neutral_size",
    "Nominal Voltage Rating": "ue_condr_nominal_voltage_rating",
    "Number of Phases": "ue_condr_number_of_phases",
    "O/H or U/G": "ue_condr_o/h_or_u/g",
    "Operating Voltage": "ue_condr_operating_voltage",
    "Phase Designation": "ue_condr_phase_designation",
    "Phase Orientation": "ue_condr_phase_orientation",
    "Primary or Secondary": "ue_condr_primary_or_secondary",
    "Primary Conductor Material": "ue_condr_primary_conductor_material",
    "Proxy\nIndicator": "ue_condr_proxy_indicator",
    "Proxy Vert From": "ue_condr_proxy_vert_from",
    "Proxy Vert To": "ue_condr_proxy_vert_to",
    "Public or Private": "ue_condr_public_or_private",
    "Segment Associated with Correct Circuit": "ue_condr_segment_associated_with_correct_circuit",
    "Trailing Span Location": "ue_condr_trailing_span_location",
    "Voltage \n(Includes Circuit Configuration)": "ue_condr_voltage_includes_circuit_configuration",
    "Fuse Size": "ue_ctout_fuse_size",
    "Fuse Type": "ue_ctout_fuse_type",
    "Nominal Voltage Rating": "ue_ctout_nominal_voltage_rating",
    "Phase": "ue_ctout_phase",
    "Public or Private": "ue_ctout_public_or_private",
    "State": "ue_ctout_state",
    "Amp Rating": "ue_disc_amp_rating",
    "Circuit Switch Number": "ue_disc_circuit_switch_number",
    "Disconnect Type": "ue_disc_disconnect_type",
    "Number of Phases": "ue_disc_number_of_phases",
    "Phase Designation": "ue_disc_phase_designation",
    "Public or Private": "ue_disc_public_or_private",
    "Circuit Phase Label": "ue_pole_circuit_phase_label",
    "Laterals Branch Off": "ue_pole_laterals_branch_off",
    "GeoCoordinates\nLatitude": "ue_pole_geocoordinates_latitude",
    "GeoCoordinates\nLongitude": "ue_pole_geocoordinates_longitude",
    "Model coordinates are accurate and do not diverge by more than 10 meter radius (30 ft)": "ue_pole_model_coordinates_are_accurate",
    "Pole\nNumber": "ue_pole_pole_number",
    "Between\nPole #\n(if new)": "ue_pole_between_pole_number_since",
    "Between\nPole #\n(if new)": "ue_pole_between_pole_number_to",
    "Pole Type": "ue_pole_pole_type",
    "Circuit Switch Number": "ue_recl_circuit_switch_number",
    "Number of Phases": "ue_recl_number_of_phases",
    "Public or Private": "ue_recl_public_or_private",
    "Recloser Acts As": "ue_recl_recloser_acts_as",
    "SCADA\nControlled": "ue_recl_scada_controlled",
    "Trans or\nDist": "ue_recl_trans_or_dist",
    "Circuit Switch Number": "ue_reg_circuit_switch_number",
    "KVA": "ue_reg_kva",
    "Phase Designation": "ue_reg_phase_designation",
    "Public or Private": "ue_reg_public_or_private",
    "SCADA\nControlled": "ue_reg_scada_controlled",
    "Status": "ue_reg_status",
    "Circuit Switch Number": "ue_sectz_circuit_switch_number",
    "Number of Phases": "ue_sectz_number_of_phases",
    "Public or Private": "ue_sectz_public_or_private",
    "SCADA\nControlled": "ue_sectz_scada_controlled",
    "Location\n(OH/UG)": "ue_ratio_location_oh_or_ug",
    "Number of Phases": "ue_ratio_number_of_phases",
    "Operating Voltage": "ue_ratio_operating_voltage",
    "Phase Designation": "ue_ratio_phase_designation",
    "Pole GIS ID ": "ue_ratio_pole_gis_id_",
    "Pole GIS ID2*": "ue_ratio_pole_gis_id2",
    "Primary Voltage - Text": "ue_ratio_primary_voltage_text",
    "Public or Private": "ue_ratio_public_or_private",
    "Secondary Voltage - Text": "ue_ratio_secondary_voltage_text",
    "Size (KVA)": "ue_ratio_size_kva",
    "Status": "ue_ratio_status",
    "Subtype\n(Transformer Type)": "ue_ratio_subtype_transformer_type",
    "Transformer In": "ue_ratio_transformer_in",
    "Transformer In State": "ue_ratio_transformer_in_state",
    "Transformer Out": "ue_ratio_transformer_out",
    "Circuit Switch Number": "ue_swtch_circuit_switch_number",
    "Facility ID*\n(child level)": "ue_swtch_facility_id_child_level",
    "Feeder ID\n(Circuit #)": "ue_swtch_feeder_id_circuit_number",
    "Feeder ID2\n(Circuit #)": "ue_swtch_feeder_id2_circuit_number",
    "Load Break": "ue_swtch_load_break",
    "Normal Position": "ue_swtch_normal_position",
    "Normal Position - A": "ue_swtch_normal_position_a",
    "Normal Position - B": "ue_swtch_normal_position_b",
    "Normal Position -C": "ue_swtch_normal_position_c",
    "Number of Phases": "ue_swtch_number_of_phases",
    "Operating Voltage": "ue_swtch_operating_voltage",
    "Phase Designation": "ue_swtch_phase_designation",
    "Present Position - A": "ue_swtch_present_position_a",
    "Present Position - B ": "ue_swtch_present_position_b",
    "Present Position - C": "ue_swtch_present_position_c",
    "Public or Private": "ue_swtch_public_or_private",
    "Rated kV": "ue_swtch_rated_kv",
    "SCADA\nControlled": "ue_swtch_scada_controlled",
    "Switch Type": "ue_swtch_switch_type",
    "Circuit Switch Number": "ue_swtgr_circuit_switch_number",
    "Facility ID": "ue_swtgr_facility_id",
    "Feeder ID": "ue_swtgr_feeder_id",
    "Feeder ID2": "ue_swtgr_feeder_id2",
    "Fuse Type": "ue_swtgr_fuse_type",
    "Nominal Voltage Rating": "ue_swtgr_nominal_voltage_rating",
    "Operating Voltage\n(SAP field: Amperage)": "ue_swtgr_operating_voltage",
    "Public or Private": "ue_swtgr_public_or_private",
    "Switchgear Name": "ue_swtgr_switchgear_name",
    "Switch Number": "ue_swtgr_switch_number",
    "Dual Voltage": "ue_xfmer_dual_voltage",
    "KVA Rating": "ue_xfmer_kva_rating",
    "Location\n (UG or OH)": "ue_xfmer_location_oh_or_ug",
    "Number of Phases": "ue_xfmer_number_of_phases",
    "Phase Designation": "ue_xfmer_phase_designation",
    "Primary Voltage - Text": "ue_xfmer_primary_voltage_-_text",
    "Proxy Vert": "ue_xfmer_proxy_vert",
    "Secondary Voltage": "ue_xfmer_secondary_voltage",
    "Secondary Voltage - Text": "ue_xfmer_secondary_voltage_-_text",
    "Size (KVA)": "ue_xfmer_size_kva",
    "SubType": "ue_xfmer_subtype",
    "Transformer In State": "ue_xfmer_transformer_in_state",
    "Transformer Out State": "ue_xfmer_transformer_out_state",
    "GeoCoordinates\nLatitude": "ue_vaulp_geocoordinates_latitude",
    "GeoCoordinates\nLongitude": "ue_vaulp_geocoordinates_longitude",
    "Model coordinates are accurate and do not diverge by more than 10 meter radius (30 ft)": "ue_vaulp_model_coordinates_are_accurate",
    "TR#": "ue_vaulp_tr_number"
  }

  column_mapping_sap_equipment_report = {
    "Functional Location": "floc",
    "FunctLocDescrip.": "floc_description",
    "Super FLOC": "super_floc",
    "Company": "company",
    "Circuit ": "floc_circuit_number",
    "ObjectType": "object_type",
    "Equipment": "sap_equipment_number",
    "Equipment Description": "equipment_description",
    "Equipment Category": "equipment_category",
    "Sort fld": "sort_field_pole_number",
    "Mn.wk.ctr": "main_work_center",
    "Service Center Name": "service_center",
    "Addr. no": "address_number",
    "Street": "street",
    "City": "city",
    "District": "district",
    "Postl Code": "postal_code",
    "Rg": "rg",
    "SysStatus": "system_status",
    "UserStatus": "status_for_users",
    "ConY": "construction_year",
    "Start-up": "start_up_date",
    "Chngd": "changed_on",
    "Material": "material",
    "Material Description": "material_description",
    "Mfr": "manufacturer",
    "Model no.": "model_number",
    "SerNo.": "manufacturer_serial_number",
    "TechID": "technical_id_work_order",
    "Object number": "object_number",
    "CU ID": "cu_id",
    "CYME ID": "cyme_id",
    "Capacitor Type": "capacitor_type",
    "Circuit": "equipment_circuit_number_1",
    "Circuit No": "equipment_circuit_number_2",
    "Circuit Switch Number": "circuit_switch_number",
    "KVAR": "kvar",
    "KVAR total": "kvar_total",
    "No of bushings": "number_of_bushings",
    "Nominal Voltage Rating": "nominal_voltage_rating",
    "Normal State": "normal_state",
    "Public or Private": "public_or_private",
    "SCADA Controlled": "scada_controlled",
    "Conductor Length": "conductor_length",
    "Size, Conductor": "conductor_size",
    "Conductor Type": "conductor_type",
    "Insulation Type": "insulation_type",
    "No of phases": "number_of_phases",
    "O/H or U/G": "location_oh_or_ug",
    "Phase Designation": "phase_designation",
    "Primary or Secondary": "primary_or_secondary",
    "Public Length": "public_length",
    "Trans or Dist": "trans_or_dist",
    "Trailing Span location": "trailing_span_location",
    "Fuse Size": "fuse_size",
    "Fuse Type": "fuse_type",
    "Material.1": "material_2",
    "Amp Rating": "amp_rating",
    "Cutout Distribution type": "cutout_distribution_type",
    "Disconnect Type": "disconnect_type",
    "Controlled": "controlled",
    "Owner": "owner",
    "OPCO Owner Perc": "opco_owner_percent",
    "Pole Class": "pole_class",
    "Pole Length": "pole_length",
    "Pole Material": "pole_material",
    "Pole Type": "pole_type",
    "Treatment": "treatment",
    "Location (UG or O/H )": "location_oh_or_ug_2",
    "Size (KVA)": "size_kva",
    "Recloser Acts As": "recloser_acts_as",
    "Kva": "kva",
    "Load Break Yes/No": "load_break_yes_no",
    "Normal Position": "normal_position",
    "Rated kV": "rated_kv",
    "switch style": "switch_style",
    "Switch Type": "switch_type",
    "Amperage": "amperage",
    "Switchgear Distribution Type": "switchgear_distribution_type",
    "Year installed": "year_installed",
    "Foundation Material": "foundation_material",
    "Dual Voltage": "dual_voltage",
    "Primary Voltage - Text": "primary_voltage_text",
    "Secondary Voltage": "secondary_voltage",
    "Secondary Voltage - Text": "secondary_voltage_text",
    "Trans type": "trans_type",
    "Type Transformer ": "transformer_type"
  }

  return column_mapping_main_template, column_mapping_sap_updates, column_mapping_gis_updates, column_mapping_sap_equipment_report


# Functions related to Partitions

In [None]:
def show_partitions(path_file):
  """Print the distinct ``process_datetime`` partition values found under a parquet path.

  The value is parsed out of each row's source file path (the
  ``process_datetime=<value>`` path segment). The distinct values are printed
  with ``show``; nothing is returned.

  Args:
    path_file: Location of the parquet dataset to inspect.
  """
  partition_pattern = "process_datetime=([^/]+)"

  distinct_partitions = (
      spark.read.parquet(path_file)
      # Tag every row with the path of the file it was read from.
      .withColumn("file_name", F.input_file_name())
      # Capture group 1 is the text after "process_datetime=" up to the next "/".
      .withColumn("process_datetime", F.regexp_extract(F.col("file_name"), partition_pattern, 1))
      .select("process_datetime")
      .distinct()
  )

  # truncate=False keeps the full partition value visible in the printed table.
  distinct_partitions.show(truncate=False)


In [None]:
def get_df_last_partition(zone_path, table_name, partition_column_name):
  """Return only the rows of a table that carry its highest partition value.

  Args:
    zone_path: Base directory of the zone; the table is read from
      ``{zone_path}/{table_name}``.
    table_name: Name of the parquet table directory inside the zone.
    partition_column_name: Column whose maximum value selects the rows to keep.

  Returns:
    A Spark DataFrame restricted to the rows whose ``partition_column_name``
    equals the maximum value of that column.
  """
  table_df = spark.read.parquet(f'{zone_path}/{table_name}')

  # The aggregation yields a single row with a single field: the maximum value.
  max_row = table_df.agg({partition_column_name: "max"}).collect()[0]
  newest_value = max_row[0]

  return table_df.filter(col(partition_column_name) == newest_value)


In [None]:
# def get_last_partition_old(path_file):

#   # Leer el archivo parquet
#   parquet_df = spark.read.parquet(path_file)

#   # Añadir una columna con el nombre del archivo (que contiene la partición)
#   partitions_df = parquet_df.withColumn("file_name", F.input_file_name())

#   # Extraer la partición 'process_datetime' del nombre del archivo
#   partitions_df = partitions_df.withColumn("process_datetime", F.regexp_extract(F.col("file_name"), "process_datetime=([^/]+)", 1))

#   # Encontrar la última partición (máxima 'process_datetime')
#   last_partition = partitions_df.select(F.max("process_datetime")).first()[0]

#   return last_partition


In [None]:
# def get_df_last_partition_old (zone_path, table_name, partition_value):

#   if partition_value == 0:
#     partition_value = get_last_partition(f'{zone_path}/{table_name}')

#   df_last_partition = spark.read.parquet(f'{zone_path}/{table_name}').filter(F.col("process_datetime") == partition_value)

#   return df_last_partition

In [None]:
# import os

# def get_last_partition_from_directories():

#   # Ruta del archivo Parquet particionado
#   trusted_path = "/ruta/a/la/zona/trusted"

#   # Obtener la lista de directorios (particiones) directamente del sistema de archivos
#   partitions_paths = [f.path for f in dbutils.fs.ls(trusted_path) if "process_datetime=" in f.path]

#   # Extraer los valores de 'process_datetime' de los directorios
#   partitions_dates = [p.split("process_datetime=")[-1].rstrip('/') for p in partitions_paths]

#   # Encontrar la última partición
#   ultima_particion = max(partitions_dates)

#   # Leer los datos de la última partición únicamente
#   ultimo_df = spark.read.parquet(f"{trusted_path}/process_datetime={ultima_particion}")

#   # APLICAR TRANSFORMACIONES O FILTROS SI ES NECESARIO
#   # ultimo_df = ultimo_df.filter(...)

#   # Escribir los datos refinados en la zona refined
#   refined_path = "/ruta/a/la/zona/refined"
#   ultimo_df.write.mode("overwrite").parquet(refined_path)

# Read Excel File & Generate Partition

In [None]:
def read_excel_file_and_generate_partition(excel_file_name, sheet_name_to_load, columns_to_select, column_mapping, table_schema, skip_rows, destination_path, destination_table, process_datetime, table_id):
  """Load one Excel sheet and append it as a new partition of a parquet table.

  Args:
    excel_file_name: Path of the Excel workbook to read.
    sheet_name_to_load: Name of the sheet to load.
    columns_to_select: Source column labels to keep, as they appear in the sheet.
    column_mapping: Dict mapping source column labels to target column names.
    table_schema: Schema handed to ``spark.createDataFrame``.
    skip_rows: Forwarded to pandas ``skiprows``.
    destination_path: Base directory of the destination zone.
    destination_table: Table directory name; data is appended to
      ``{destination_path}/{destination_table}``.
    process_datetime: Value stored in the ``process_datetime`` column, which is
      also the partition column of the written data.
    table_id: Prefix for the generated ``id`` column; when empty, the prefix is
      left out.

  Returns:
    None. The result is written to parquet in append mode.
  """
  # Every cell is read as text; only empty strings are treated as missing values.
  sheet_df = pd.read_excel(
      excel_file_name,
      sheet_name=sheet_name_to_load,
      skiprows=skip_rows,
      dtype=str,
      na_values=[""],
      keep_default_na=False,
  )

  # Disabled alternatives kept for reference: fill missing values with empty
  # strings, and re-format the date columns as dd/mm/YYYY.
  #df_pandas.fillna("", inplace=True)
  #df_pandas['start_up_date'] = pd.to_datetime(df_pandas['start_up_date'], dayfirst=True, errors='coerce').dt.strftime('%d/%m/%Y')
  #df_pandas['survey_date'] = pd.to_datetime(df_pandas['survey_date'], dayfirst=True, errors='coerce').dt.strftime('%d/%m/%Y')

  # Keep the requested columns, then rename them with the mapping.
  renamed_df = sheet_df[columns_to_select].rename(columns=column_mapping)

  # Convert to Spark with the declared schema and stamp each row with the partition value.
  stamped_df = (
      spark.createDataFrame(renamed_df, schema=table_schema)
      .withColumn("process_datetime", F.lit(process_datetime))
  )

  # "id" is "<table_id>_<process_datetime>_<generated id>", or without the
  # leading table_id part when table_id is empty.
  id_parts = [lit(process_datetime), monotonically_increasing_id()]
  if len(table_id) != 0:
    id_parts.insert(0, lit(table_id))
  partition_df = stamped_df.withColumn("id", concat_ws("_", *id_parts))

  # Append to the table, partitioned by process_datetime.
  partition_df.write.mode('append').partitionBy('process_datetime').parquet(f'{destination_path}/{destination_table}')




# Generate Partitions

In [None]:
# Partition value for this run: the current timestamp formatted as
# YYYYMMDDHHMMSS and stored as an integer.
process_datetime = int(datetime.now().strftime('%Y%m%d%H%M%S'))

# Per-table configuration: columns to keep, rename mappings and schemas.
(
    columns_to_select_main_template,
    columns_to_select_sap_updates,
    columns_to_select_gis_updates,
    columns_to_select_sap_equipment_report,
) = define_columns_to_select()
(
    column_mapping_main_template,
    column_mapping_sap_updates,
    column_mapping_gis_updates,
    column_mapping_sap_equipment_report,
) = define_column_mappings()
(
    schema_main_template,
    schema_sap_updates,
    schema_gis_updates,
    schema_sap_equipment_report,
) = define_schemas()

# Data-template workbook (path through the shared-drive shortcut).
#excel_file_name = "/content/drive/MyDrive/Colab Notebooks/scenarios/landing/GMEP Data Report_Sample_delivery_Poles_20241011_fixed_with_new_columns.xlsx"
excel_file_name = "/content/drive/MyDrive/scenarios/landing/GMEP Data Report_Sample_delivery_Poles_20241011_fixed_with_new_columns_v2.xlsx"

# Destination zone; the commented lines are earlier / temporary locations.
#destination_path = "/content/drive/MyDrive/scenarios/landing/raw"
#destination_path = "/content/data_lake/raw"
destination_path = "/content/drive/MyDrive/scenarios/raw/"

# One partition per sheet of the data-template workbook.
read_excel_file_and_generate_partition(
    excel_file_name=excel_file_name,
    sheet_name_to_load='Main Template',
    columns_to_select=columns_to_select_main_template,
    column_mapping=column_mapping_main_template,
    table_schema=schema_main_template,
    skip_rows=2,
    destination_path=destination_path,
    destination_table='main_template',
    process_datetime=process_datetime,
    table_id="01",
)
read_excel_file_and_generate_partition(
    excel_file_name=excel_file_name,
    sheet_name_to_load='SAP Updates',
    columns_to_select=columns_to_select_sap_updates,
    column_mapping=column_mapping_sap_updates,
    table_schema=schema_sap_updates,
    skip_rows=1,
    destination_path=destination_path,
    destination_table='sap_updates',
    process_datetime=process_datetime,
    table_id="02",
)
read_excel_file_and_generate_partition(
    excel_file_name=excel_file_name,
    sheet_name_to_load='GIS Updates',
    columns_to_select=columns_to_select_gis_updates,
    column_mapping=column_mapping_gis_updates,
    table_schema=schema_gis_updates,
    skip_rows=1,
    destination_path=destination_path,
    destination_table='gis_updates',
    process_datetime=process_datetime,
    table_id="03",
)

# SAP equipment report workbook (path through the shared-drive shortcut).
#excel_file_name = "/content/drive/MyDrive/Colab Notebooks/scenarios/landing/NYSEG-9301-Lancaster-51000-SAP Equipment Report 9-4-2024_516000_to_522000.xlsx"
excel_file_name = "/content/drive/MyDrive/scenarios/landing/NYSEG-9301-Lancaster-51000-SAP Equipment Report 9-4-2024_516000_to_522000.xlsx"

read_excel_file_and_generate_partition(
    excel_file_name=excel_file_name,
    sheet_name_to_load='EQP WC 51000 04.09.2024',
    columns_to_select=columns_to_select_sap_equipment_report,
    column_mapping=column_mapping_sap_equipment_report,
    table_schema=schema_sap_equipment_report,
    skip_rows=0,
    destination_path=destination_path,
    destination_table='sap_equipment_report',
    process_datetime=process_datetime,
    table_id="04",
)



In [None]:
# Inspect a DataFrame: load the latest partition of a raw table, then print
# its rows and its schema.
# Earlier variants (other locations / tables), kept for reference:
#df_last_partition = get_df_last_partition("/content/drive/MyDrive/Colab Notebooks/scenarios/raw/", "main_template", "process_datetime")
#df_last_partition = get_df_last_partition("/content/data_lake/raw", "sap_updates", "process_datetime")    #.filter(col("sap_equipment_number") == "100015492239")

df_last_partition = get_df_last_partition(
    zone_path="/content/drive/MyDrive/scenarios/raw/",
    table_name="main_template",
    partition_column_name="process_datetime",
)

df_last_partition.show()
df_last_partition.printSchema()



+---------------------------+--------------+---------------------------------+------------------------------------+--------------------------+-----------+--------------------------+--------------------+--------------------+--------------------+------------------+-----+---------+----------------------+-----+-------------------+-----------------+--------------+------+--------+-----+-----------+----------------+--------------------+-------------------+-------------+-------------------+-------------------+-----------------+------------------+-----------------------+-------------------+----------+----------------+---------+--------------------+--------------------+--------------------------+------------------------+-------------------------------------------+--------------------------+---------------------+----------+-------------------------------+-------------------+------------------+------+--------------------------------------+--------------------------------------------+--------------

In [None]:
# Disabled draft kept as a bare string literal: none of the code inside it runs,
# and the cell only echoes the text back as its output. The draft reads the raw
# and quarantine parquet files and left-anti-joins them on "equipment_number".
# NOTE(review): the "has context menu" line looks like a stray paste artifact - confirm.
'''
# Cargar los DataFrames desde los archivos Parquet de las zonas raw y quarantine
df_raw = spark.read.parquet("/ruta/a/tu/raw/parquet/file")
df_quarantine = spark.read.parquet("/ruta/a/tu/quarantine/parquet/file")

# Realizar un anti-join para filtrar los registros de raw cuyo equipment_number no está en quarantine
df_filtered = df_raw.join(df_quarantine, on="equipment_number", how="left_anti")
has context menu
'''

'\n# Cargar los DataFrames desde los archivos Parquet de las zonas raw y quarantine\ndf_raw = spark.read.parquet("/ruta/a/tu/raw/parquet/file")\ndf_quarantine = spark.read.parquet("/ruta/a/tu/quarantine/parquet/file")\n\n# Realizar un anti-join para filtrar los registros de raw cuyo equipment_number no está en quarantine\ndf_filtered = df_raw.join(df_quarantine, on="equipment_number", how="left_anti")\nhas context menu\n'

# Export to Excel

In [None]:
import os
import pandas as pd
# Make sure openpyxl is installed and imported
!pip install openpyxl
import openpyxl

#os.makedirs(os.path.dirname("/content/data_lake/delivery/"), exist_ok=True) #Temporal
os.makedirs(os.path.dirname("content/drive/MyDrive/scenarios/delivery/"), exist_ok=True)

#file_path = f'/content/data_lake/delivery/data_template_{process_datetime}.xlsx'
file_path = f'/content/data_lake/delivery/data_template_20241129192612.xlsx'        #PROCES_DATETIME

#df_pandas = get_df_last_partition("/content/data_lake/trusted/", 'main_template', 'process_datetime')#.toPandas()


with pd.ExcelWriter(file_path, engine='openpyxl') as writer:
    get_df_last_partition("/content/data_lake/trusted/", 'main_template', 'process_datetime').toPandas().to_excel(writer, sheet_name='main_template', index=False)
    get_df_last_partition("/content/data_lake/trusted/", 'sap_updates', 'process_datetime').toPandas().to_excel(writer, sheet_name='sap_updates', index=False)
    get_df_last_partition("/content/data_lake/trusted/", 'gis_updates', 'process_datetime').toPandas().to_excel(writer, sheet_name='gis_updates', index=False)
    get_df_last_partition("/content/data_lake/trusted/", 'sap_equipment_report', 'process_datetime').toPandas().to_excel(writer, sheet_name='sap_equipment_report', index=False)


