<a href="https://colab.research.google.com/github/ugoGS/Py/blob/main/scenarios_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initial configurations for Google Colab and Spark

In [1]:
# Mount Google Drive at /content/drive; the config module, helper notebooks and
# the Drive-hosted data path referenced later all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Colab Config

In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip install pyspark
!pip install -q findspark



In [3]:
import sys

# Drive folder that contains the project's setup_colab helper module.
_gemp_config_dir = '/content/drive/MyDrive/Colab Notebooks/GEMP/config/'
# Guard the append so that re-running this cell does not keep adding duplicate
# entries to sys.path.
if _gemp_config_dir not in sys.path:
    sys.path.append(_gemp_config_dir)

# Project-local module; setup_spark() prints "Findspark configuration completed"
# when it finishes (see the saved cell output).
import setup_colab
setup_colab.setup_spark()

Findspark configuration completed


# Create Spark Session & Import Libraries

In [4]:
import findspark
import pandas as pd
import json
import os
import time
import shutil
import traceback

from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_extract, col, lit, concat_ws, when, length, asc, desc, monotonically_increasing_id, to_date, trim, date_format, expr, lower, upper, concat, date_format
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType, DateType, TimestampType
from pyspark.sql import functions as F
from datetime import datetime

# Create the SparkSession for this notebook, or reuse the active one if it exists.
spark = SparkSession.builder.appName("Data Integrator").getOrCreate()
# Raise spark.sql.debug.maxToStringFields to 1000 — presumably so the very wide
# DataFrames used here are not truncated in Spark's debug/plan strings (confirm).
spark.conf.set("spark.sql.debug.maxToStringFields", 1000)

# Functions

In [33]:
# Execute the shared helper notebooks inside this kernel so their definitions
# become available to the cells below.
# NOTE(review): get_df_last_partition / drop_partition, schema_sap_updates_trusted and
# generate_csv_df used later are presumably defined by these notebooks — confirm.
%run "/content/drive/MyDrive/Colab Notebooks/GEMP/functions/functions_related_to_partitions.ipynb"
%run "/content/drive/MyDrive/Colab Notebooks/GEMP/functions/LSMW_Scripts_functions.ipynb"
%run "/content/drive/MyDrive/Colab Notebooks/GEMP/functions/schema_definitions_v1.ipynb"
%run "/content/drive/MyDrive/Colab Notebooks/GEMP/notebooks/lsmw_export/generate_csv.ipynb"

functions_related_to_partitions: 1.1
schema_definitions_v1: 1.2


# Global Variables

In [25]:
# Timestamp of this run as an integer in YYYYMMDDHHMMSS form.
process_datetime = int(datetime.now().strftime('%Y%m%d%H%M%S'))

# GEMP environment
# landing_path = "/content/drive/MyDrive/Colab Notebooks/GEMP/data/landing"
# raw_path = "/content/drive/MyDrive/Colab Notebooks/GEMP/data/raw"
# NOTE(review): trusted_path is the only path currently taken from the Drive (GEMP)
# environment; every other active path below points at the local /content/data_lake.
# A saved output further down shows PATH_NOT_FOUND for
# /content/data_lake/trusted/sap_updates, so confirm which trusted location is intended.
trusted_path = "/content/drive/MyDrive/Colab Notebooks/GEMP/data/trusted"
# # provisionally we are using the raw path as the source for moving data into trusted
# transient_path = "/content/drive/MyDrive/Colab Notebooks/GEMP/data/raw"
# refined_path = "/content/drive/MyDrive/Colab Notebooks/GEMP/data/refined/"
# logs_path = "/content/drive/MyDrive/Colab Notebooks/GEMP/data/logs/"

# local environment
raw_path = "/content/data_lake/raw"
# transient_path deliberately points at the same folder as raw_path.
transient_path = "/content/data_lake/raw"
#trusted_path = "/content/data_lake/trusted"
# NOTE(review): refined_path and logs_path end with "/", unlike the other paths; the
# saved drop_partition output shows the resulting "logs//data_quality_log" double slash.
refined_path = "/content/data_lake/refined/"
logs_path = "/content/data_lake/logs/"
delivery_path = "/content/data_lake/delivery"



# Get files from trusted zone

In [7]:
# Load one DataFrame per trusted-zone dataset, partitioned by "process_datetime".
# NOTE(review): get_df_last_partition is defined outside this notebook; by its name
# it presumably returns only the most recent partition — confirm.
df_main_template_trusted = get_df_last_partition(trusted_path, "main_template", "process_datetime")
df_sap_updates_trusted = get_df_last_partition(trusted_path, "sap_updates", "process_datetime")
df_gis_updates_trusted = get_df_last_partition(trusted_path, "gis_updates", "process_datetime")
df_sap_equipment_report_trusted = get_df_last_partition(trusted_path, "sap_equipment_report", "process_datetime")


# Dummy data (add rows directly to the trusted df for testing)

In [None]:
# Create dummy rows
from pyspark.sql import Row

# Date shared by start_up_date, changed_on and survey_date in every dummy row
# (10 August 2018; the literal is day-first).
_DUMMY_DATE = datetime.strptime("10/08/2018", "%d/%m/%Y").date()

# Object-type specific attribute columns, all left NULL in the dummy rows.
# The order is significant: it reproduces the field order of the original Row
# literals, which has to line up with schema_sap_updates_trusted.
_DUMMY_NULL_COLUMNS = (
    # UE_CAPSG
    "ue_capsg_capacitor_type",
    "ue_capsg_circuit_switch_number",
    "ue_capsg_kvar",
    "ue_capsg_nominal_voltage_rating",
    "ue_capsg_number_of_bushings",
    "ue_capsg_number_of_phases",
    "ue_capsg_public_or_private",
    "ue_capsg_scada_controlled",
    "ue_capsg_status",
    # UE_CBANK
    "ue_cbank_capacitor_type",
    "ue_cbank_circuit_switch_number",
    "ue_cbank_controlled",
    "ue_cbank_kvar_total",
    "ue_cbank_nominal_voltage_rating",
    "ue_cbank_number_of_capacitors",
    "ue_cbank_number_of_phases",
    "ue_cbank_public_or_private",
    "ue_cbank_scada_controlled",
    "ue_cbank_status",
    # UE_CONDR
    "ue_condr_conductor_length",
    "ue_condr_conductor_size",
    "ue_condr_conductor_type",
    "ue_condr_insulation_type",
    "ue_condr_neutral_material",
    "ue_condr_neutral_size",
    "ue_condr_nominal_voltage_rating",
    "ue_condr_location_oh_or_ug",
    "ue_condr_primary_or_secondary",
    "ue_condr_primary_conductor_material",
    "ue_condr_public_length",
    "ue_condr_public_or_private",
    "ue_condr_trailing_span_length",
    "ue_condr_trailing_span_location",
    "ue_condr_trans_or_dist",
    # UE_CTOUT
    "ue_ctout_character_of_construction",
    "ue_ctout_cutout_rating",
    "ue_ctout_fuse_size",
    "ue_ctout_fuse_type",
    "ue_ctout_material",
    "ue_ctout_nominal_voltage_rating",
    "ue_ctout_phase",
    "ue_ctout_public_or_private",
    "ue_ctout_state",
    "ue_ctout_type",
    # UE_DISC
    "ue_disc_amp_rating",
    "ue_disc_circuit_switch_number",
    "ue_disc_number_of_phases",
    "ue_disc_phase_designation",
    "ue_disc_public_or_private",
    "ue_disc_switch_style",
    "ue_disc_type",
    # UE_POLE
    "ue_pole_circuit_phase_label",
    "ue_pole_class",
    "ue_pole_opco_owner_percent",
    "ue_pole_owner",
    "ue_pole_owner_maintained",
    "ue_pole_pole_length",
    "ue_pole_pole_material",
    "ue_pole_pole_number",
    "ue_pole_between_pole_number_since",
    "ue_pole_between_pole_number_to",
    "ue_pole_pole_type",
    "ue_pole_public_or_private",
    "ue_pole_trans_or_dist",
    "ue_pole_treatment",
    # UE_RECL
    "ue_recl_circuit_switch_number",
    "ue_recl_number_of_phases",
    "ue_recl_public_or_private",
    "ue_recl_recloser_acts_as",
    "ue_recl_scada_controlled",
    "ue_recl_trans_or_dist",
    # UE_REG
    "ue_reg_circuit_switch_number",
    "ue_reg_kva",
    "ue_reg_phase_designation",
    "ue_reg_public_or_private",
    "ue_reg_scada_controlled",
    "ue_reg_status",
    # UE_SECTZ
    "ue_sectz_circuit_switch_number",
    "ue_sectz_number_of_phases",
    "ue_sectz_public_or_private",
    "ue_sectz_scada_controlled",
    # UE_RATIO
    "ue_ratio_location_oh_or_ug",
    "ue_ratio_number_of_phases",
    "ue_ratio_operating_voltage",
    "ue_ratio_phase_designation",
    "ue_ratio_primary_voltage_text",
    "ue_ratio_public_or_private",
    "ue_ratio_secondary_voltage_text",
    "ue_ratio_size_kva",
    "ue_ratio_status",
    "ue_ratio_subtype_transformer_type",
    # UE_SWTCH
    "ue_swtch_circuit_switch_number",
    "ue_swtch_load_break",
    "ue_swtch_normal_position",
    "ue_swtch_number_of_phases",
    "ue_swtch_operating_voltage",
    "ue_swtch_phase_designation",
    "ue_swtch_public_or_private",
    "ue_swtch_rated_kv",
    "ue_swtch_state",
    "ue_swtch_scada_controlled",
    "ue_swtch_switch_type",
    # UE_SWTGR
    "ue_swtgr_amperage",
    "ue_swtgr_fuse_type",
    "ue_swtgr_circuit_switch_number",
    "ue_swtgr_installation_type",
    "ue_swtgr_loadbreak_capability",
    "ue_swtgr_nameplate",
    "ue_swtgr_nominal_voltage_rating",
    "ue_swtgr_public_or_private",
    "ue_swtgr_switchgear_distribution_type",
    "ue_swtgr_trans_or_dist",
    "ue_swtgr_year_installed",
    # UE_XFMER
    "ue_xfmer_date_retired_abandoned",
    "ue_xfmer_dual_voltage",
    "ue_xfmer_kva_rating",
    "ue_xfmer_location_oh_or_ug",
    "ue_xfmer_number_of_phases",
    "ue_xfmer_phase_designation",
    "ue_xfmer_primary_voltage_text",
    "ue_xfmer_public_or_private",
    "ue_xfmer_secondary_voltage",
    "ue_xfmer_secondary_voltage_text",
    "ue_xfmer_size_kva",
    "ue_xfmer_transformer_type",
    "ue_xfmer_trans_type",
    "ue_xfmer_year_installed",
    # UE_VAULP
    "ue_vaulp_foundation_material",
    "ue_vaulp_opco_owner_percent",
    "ue_vaulp_public_or_private",
    "ue_vaulp_tr_number",
    "ue_vaulp_trans_or_dist",
)


def _make_dummy_capsg_row(floc, floc_description, row_id):
    """Build one dummy UE_CAPSG "Add Equipment" row for the sap_updates dataset.

    The dummy rows differ only in the three arguments; every other field is a
    fixed test value or NULL.

    Args:
        floc: Functional location code of the dummy equipment.
        floc_description: Description text stored alongside the FLOC.
        row_id: Value for the ``id`` column.

    Returns:
        A pyspark ``Row`` whose fields appear in the same order as in the
        original hand-written Row literals.
    """
    # Build the fields in an insertion-ordered dict: Row(**kwargs) keeps keyword
    # order, so this reproduces the original field order exactly.
    # NOTE(review): the order is assumed to mirror schema_sap_updates_trusted
    # (defined in another notebook) — confirm before adding or moving a field.
    fields = dict(
        object_type="UE_CAPSG",
        type_of_sap_update="Add Equipment",
        sap_discrepancy="Equipment found in Field, not in SAP",
        additional_object_type_notes=None,
        floc=floc,
        floc_description=floc_description,
        description_of_object_type="CAPACITOR, DIST, 1PH, 7200V, 150KVAR, 2",
        sap_equipment_number=0,
        cu_id=None,
        cyme_id=None,
        circuit_number=3105301,
        nyseg_line_number=None,
        equipment_category="K",
        mid=None,
        sort_field_pole_number="PO=15",
        opco="NYSEG",
        address_number=None,
        street=None,
        city="BOSTON T",
        state=None,
        postal_code=None,
        main_work_center=51000,
        service_center="Lancaster Service Center",
        start_up_date=_DUMMY_DATE,
        system_status="INST",
        status_for_users="UNOP 4000",
        changed_on=_DUMMY_DATE,
        construction_year=2018,
        construction_month="08",
        technical_id_work_order="801000000000",
        survey_date=_DUMMY_DATE,
        proxy_vert=None,
        phantom_location=None,
        long_text=None,
        manufacturer="N/A",
        model_number="N/A",
        manufacturer_serial_number="N/A",
        nameplate_image_captured=None,
    )
    # Append the NULL attribute columns, then the two trailing key columns.
    fields.update(dict.fromkeys(_DUMMY_NULL_COLUMNS))
    fields["id"] = row_id
    fields["process_datetime"] = 20241129192612
    return Row(**fields)


dummy_data = [
    _make_dummy_capsg_row(
        "9301-L0576-1521-0004-ED00009",
        "COLDEN T     -1050         -15",
        "02_20241129192612_8589934608",
    ),
    _make_dummy_capsg_row(
        "9301-L0576-1510-0014-ED00047",
        "BOSTON T -1038 -42",
        "02_20241129192612_8589934609",
    ),
]

dummy_df = spark.createDataFrame(dummy_data, schema_sap_updates_trusted)

In [None]:
# Append the dummy rows to the trusted SAP-updates DataFrame for testing.
# NOTE(review): DataFrame.union matches columns by position, not by name, so
# dummy_df must have the same column order as df_sap_updates_trusted. The cell also
# reassigns its own input, so running it more than once appends the dummy rows again.
df_sap_updates_trusted = df_sap_updates_trusted.union(dummy_df)

# UE_SWTCH - Scenario 2: Device Switch (UE_CTOTS) / Load Break Switch (UE_SWTCH)

> Add blockquote




In [None]:
# Execute the UE_SWTCH scenario 2 notebook inside this kernel.
%run "/content/drive/MyDrive/Colab Notebooks/GEMP/notebooks/scenarios/scenarios_code/ue_switch_02_ue_ctots_load_break_switch.ipynb"

# Uploaded outputs
# Display the data_quality_log read via get_df_last_partition without truncating
# column values (presumably the newest process_datetime partition — confirm).
get_df_last_partition(logs_path, 'data_quality_log', 'process_datetime').show(truncate=False)
# get_df_last_partition(refined_path, 'lsmw_ziusachobjtype', 'process_datetime').show(truncate=False)

+------------------+--------------------------------------+--------------------+-------+----------------+---------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+-------------+---------+----------------------+--------------------------+----------------+
|id                |process_id                            |id_data             |state  |operation       |error_type           |error_desc                                                                                                                                                         |column_name|invalid_value|new_value|recipient             |date_time                 |process_datetime|
+------------------+--------------------------------------+--------------------+-------+----------------+---------------------+-----------------------------------------------------------------------------------

# UE_CAPSG Scenario 1: Serialized Equipment in Field – Not in SAP

In [None]:
# Execute the UE_CAPSG scenario 1 notebook inside this kernel.
%run "/content/drive/MyDrive/Colab Notebooks/GEMP/notebooks/scenarios/scenarios_code/ue_capsg_01_serialized_equipment_in_field_not_in_sap.ipynb"

#Uploaded outputs
# (inspection calls below are disabled; uncomment to view the log / refined output)
#get_df_last_partition(logs_path, 'data_quality_log', 'process_datetime').show(100, truncate=False)
#get_df_last_partition(refined_path, 'lsmw_ziusacrktypeq', 'process_datetime').show(truncate=False)


# Non-Load Break Switch (UE_DISC) - Scenario 2: Disconnect Switch in Field, Not in Records: Non-Serialized Equipment - FLOC Exists

In [None]:
# Execute the UE_DISC scenario 2 notebook inside this kernel.
%run "/content/drive/MyDrive/Colab Notebooks/GEMP/notebooks/scenarios/scenarios_code/ue_disc_02_disconnect_switch_in_field_not_in_records_floc_exist.ipynb"

#Uploaded outputs
# (inspection calls below are disabled; uncomment to view the log / refined output)
#get_df_last_partition(logs_path, 'data_quality_log', 'process_datetime').show(100, truncate=False)
# get_df_last_partition(refined_path, 'ziusacreqworef', 'process_datetime').show(truncate=False)

# Cutouts (UE_CTOUT) & Conductor (UE_CONDR) - Scenario 1: Non-Serialized Equipment (Non Pre-Cap) on Record – Not in Field

In [None]:
# Execute the UE_CONDR scenario 1 notebook inside this kernel.
%run "/content/drive/MyDrive/Colab Notebooks/GEMP/notebooks/scenarios/scenarios_code/ue_condr_01_sap_non_serialized_on_record_not_in_field.ipynb"

#Uploaded outputs
# (inspection calls below are disabled; uncomment to view the log / refined outputs)
#get_df_last_partition(logs_path, 'data_quality_log', 'process_datetime').show(100, truncate=False)
# get_df_last_partition(refined_path, 'lsmw_ziusadiseqchloc', 'process_datetime').show(truncate=False)
# get_df_last_partition(refined_path, 'lsmw_ziusaeqidismant', 'process_datetime').show(truncate=False)

In [None]:
# Execute the UE_CTOUT scenario 1 notebook inside this kernel.
%run "/content/drive/MyDrive/Colab Notebooks/GEMP/notebooks/scenarios/scenarios_code/ue_ctout_01_sap_non_serialized_on_record_not_in_field.ipynb"

# (log inspection disabled; uncomment to view)
#get_df_last_partition(logs_path, 'data_quality_log', 'process_datetime').show(100, truncate=False)


# Standard SAP Scenarios - Scenario 4: Equipment Removed in Field – Showing on FLOC

In [None]:
# Execute the standard SAP scenario 4 notebook inside this kernel.
%run "/content/drive/MyDrive/Colab Notebooks/GEMP/notebooks/scenarios/scenarios_code/standard_sap_scenario_4_equipment_removed_in_field_showing_on_floc.ipynb"

#Uploaded outputs
# Display the data_quality_log read via get_df_last_partition without truncating
# column values (presumably the newest process_datetime partition — confirm).
get_df_last_partition(logs_path, 'data_quality_log', 'process_datetime').show(truncate=False)
# get_df_last_partition(refined_path, 'lsmw_ziusainecequi', 'process_datetime').show(truncate=False)

+------------------+----------------------------------------------------------+----------------------------+-------+----------------+--------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+-----------+-------------+---------+----------------------+--------------------------+----------------+
|id                |process_id                                                |id_data                     |state  |operation       |error_type                      |error_desc                                                                                                                                 |column_name|invalid_value|new_value|recipient             |date_time                 |process_datetime|
+------------------+----------------------------------------------------------+----------------------------+-------+----------------+--------------------------------+--------------

# Serialized equipment in field, not in records


In [34]:
# Execute the "serialized equipment in field, not in records" scenario notebook
# inside this kernel.
%run "/content/drive/MyDrive/Colab Notebooks/GEMP/notebooks/scenarios/scenarios_code/scenario_xx_serialized_equipment_in_field_not_in_records.ipynb"

# Non serialized equipment in field, not in records


In [None]:
#%run "/content/drive/MyDrive/Colab Notebooks/GEMP/notebooks/scenarios/scenarios_code/scenario_xx_non_serialized_equipment_in_field_not_in_records.ipynb"

# CU ID Missing

In [None]:
# Execute the "CU ID missing" scenario notebook inside this kernel.
# NOTE(review): the saved output under this cell shows AnalysisException
# [PATH_NOT_FOUND] for file:/content/data_lake/trusted/sap_updates — check which
# trusted path that notebook reads from before re-running.
%run "/content/drive/MyDrive/Colab Notebooks/GEMP/notebooks/scenarios/scenarios_code/cu_id_missing.ipynb"


AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/content/data_lake/trusted/sap_updates.

AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/content/data_lake/trusted/sap_updates.

# Generate CSV Output Files

In [None]:
# Export CSV output for the logs via generate_csv_df.
# NOTE(review): generate_csv_df is defined outside this notebook (presumably in
# generate_csv.ipynb); the meaning of the two arguments is not visible here — confirm.
generate_csv_df('logs', 'data_lake') #logs

# Test

In [None]:
# Ad-hoc check: list the "Add Equipment" rows of object type UE_DISC, showing only
# a handful of identifying columns.
df_sap_updates_trusted.filter((col("type_of_sap_update") == "Add Equipment") & (col("object_type") == "UE_DISC")) \
  .select("id", "object_type", "floc", "description_of_object_type", "start_up_date", "sort_field_pole_number").show(truncate=False)

# WARNING: destructive and hard-coded. The saved output shows this call deleting
# /content/data_lake/logs//data_quality_log/process_datetime=20250121173807 and
# returning True. Update the partition value (or remove this line) before re-running.
drop_partition(logs_path, 'data_quality_log', 'process_datetime', 20250121173807)






Partición eliminada: /content/data_lake/logs//data_quality_log/process_datetime=20250121173807


True