<a href="https://colab.research.google.com/github/ugoGS/Py/blob/main/raw_to_trusted_v5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# New version of columns within Data Template

# Initial configuration in Google Colab

In [None]:
# Instalar PySpark y findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip install pyspark
!pip install -q findspark




# Java and Spark configuration


In [None]:
import os
import findspark

# Set JAVA_HOME to the Java 8 OpenJDK location
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

# Initialize findspark
findspark.init()



# Mount drive

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# List the contents of the landing folder on the mounted drive
!ls /content/drive/MyDrive/scenarios/landing

'GMEP Data Report_Sample_delivery_Poles_20241011_fixed_with_new_columns.xlsx'
'GMEP Data Report_Sample_delivery_Poles_20241011_fixed.xlsx'
'NYSEG-9301-Lancaster-51000-SAP Equipment Report 9-4-2024_516000_to_522000.xlsx'
 raw


# Create Spark Session & Import Libraries

In [None]:
from pyspark.sql import SparkSession

# Stop any existing session so that a new one can be created afterwards
try:
  spark.stop()
except NameError:
  # No `spark` variable exists yet (e.g. fresh kernel): nothing to stop
  pass
except Exception as stop_error:
  # Best effort: report the problem instead of hiding it, but keep the notebook running
  print(f"Could not stop the existing Spark session: {stop_error}")


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_extract, col, lit, concat_ws, when, length, asc, desc, monotonically_increasing_id
import json
import pandas as pd
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType, DateType
from pyspark.sql import functions as F
from datetime import datetime


# Create (or reuse) a Spark session named "proof of concept"
spark = SparkSession.builder.appName("proof of concept").getOrCreate()
# Set spark.sql.debug.maxToStringFields to 1000 for this session
spark.conf.set("spark.sql.debug.maxToStringFields", 1000)



# Upload file to Colab (manually)

In [None]:
from google.colab import files
import shutil

# Ask the user to upload files manually, then move each one into the landing folder
uploaded = files.upload()
#path = "/content/drive/MyDrive/Colab Notebooks/scenarios/landing"
path = "/content/drive/MyDrive/scenarios/landing" # Shortcut to the shared "scenarios" folder
#path = "/content/data_lake/landing"

# Create the destination folder if it does not exist
os.makedirs(path, exist_ok=True)

# Print the name of every uploaded file and move it into the destination folder
for filename in uploaded.keys():
    print(filename)
    shutil.move(filename, f'{path}/{filename}')


Saving GMEP Data Report_Sample_delivery_Poles_20241011_fixed_with_new_columns.xlsx to GMEP Data Report_Sample_delivery_Poles_20241011_fixed_with_new_columns.xlsx
GMEP Data Report_Sample_delivery_Poles_20241011_fixed_with_new_columns.xlsx


# Functions related to Partitions

In [None]:
def show_partitions(path_file, partition_column_name="process_datetime"):
  """Print the distinct partition values found under a partitioned parquet path.

  The partition value is recovered from the path of the file each row was
  read from (``<partition_column_name>=<value>/...``).

  Args:
    path_file: Path of the parquet dataset to inspect.
    partition_column_name: Name of the partition directory key to look for.
      Defaults to "process_datetime", which keeps the original behavior.

  Returns:
    None. The distinct values are printed with ``DataFrame.show``.
  """
  import re  # local import: only needed to escape the column name inside the regex

  # Read the parquet dataset
  parquet_df = spark.read.parquet(path_file)

  # Add a column with the source file path (the path contains the partition)
  partitions_df = parquet_df.withColumn("file_name", F.input_file_name())

  # Extract the partition value from the file path
  partition_pattern = re.escape(partition_column_name) + "=([^/]+)"
  partitions_df = partitions_df.withColumn(
      partition_column_name,
      F.regexp_extract(F.col("file_name"), partition_pattern, 1))

  # Show the unique partition values
  partitions_df.select(partition_column_name).distinct().show(truncate=False)


In [None]:
def get_df_last_partition(zone_path, table_name, partition_column_name):
  """Return the rows of a table that belong to its highest partition value.

  Args:
    zone_path: Base path of the zone that holds the table.
    table_name: Name of the table folder under ``zone_path``.
    partition_column_name: Column whose maximum value selects the rows to keep.

  Returns:
    A DataFrame filtered to the rows where ``partition_column_name`` equals
    the maximum value of that column.
  """
  table_df = spark.read.parquet(f'{zone_path}/{table_name}')

  # The aggregation yields a single row; its first field is the maximum value
  max_row = table_df.agg({partition_column_name: "max"}).collect()[0]
  newest_partition = max_row[0]

  return table_df.filter(col(partition_column_name) == newest_partition)


In [None]:
# def get_last_partition_old(path_file):

#   # Leer el archivo parquet
#   parquet_df = spark.read.parquet(path_file)

#   # Añadir una columna con el nombre del archivo (que contiene la partición)
#   partitions_df = parquet_df.withColumn("file_name", F.input_file_name())

#   # Extraer la partición 'process_datetime' del nombre del archivo
#   partitions_df = partitions_df.withColumn("process_datetime", F.regexp_extract(F.col("file_name"), "process_datetime=([^/]+)", 1))

#   # Encontrar la última partición (máxima 'process_datetime')
#   last_partition = partitions_df.select(F.max("process_datetime")).first()[0]

#   return last_partition


In [None]:
# def get_df_last_partition_old (zone_path, table_name, partition_value):

#   if partition_value == 0:
#     partition_value = get_last_partition(f'{zone_path}/{table_name}')

#   df_last_partition = spark.read.parquet(f'{zone_path}/{table_name}').filter(F.col("process_datetime") == partition_value)

#   return df_last_partition

In [None]:
# import os

# def get_last_partition_from_directories():

#   # Ruta del archivo Parquet particionado
#   trusted_path = "/ruta/a/la/zona/trusted"

#   # Obtener la lista de directorios (particiones) directamente del sistema de archivos
#   partitions_paths = [f.path for f in dbutils.fs.ls(trusted_path) if "process_datetime=" in f.path]

#   # Extraer los valores de 'process_datetime' de los directorios
#   partitions_dates = [p.split("process_datetime=")[-1].rstrip('/') for p in partitions_paths]

#   # Encontrar la última partición
#   ultima_particion = max(partitions_dates)

#   # Leer los datos de la última partición únicamente
#   ultimo_df = spark.read.parquet(f"{trusted_path}/process_datetime={ultima_particion}")

#   # APLICAR TRANSFORMACIONES O FILTROS SI ES NECESARIO
#   # ultimo_df = ultimo_df.filter(...)

#   # Escribir los datos refinados en la zona refined
#   refined_path = "/ruta/a/la/zona/refined"
#   ultimo_df.write.mode("overwrite").parquet(refined_path)

# Read Excel File & Generate Partition

In [None]:
def read_excel_file_and_generate_partition(excel_file_name, sheet_name_to_load, columns_to_select, column_mapping, table_schema, skip_rows, destination_path, destination_table, process_datetime, table_id):
  """Load one Excel sheet and append it as a new parquet partition.

  Args:
    excel_file_name: Path of the Excel file to read.
    sheet_name_to_load: Sheet to load from the workbook.
    columns_to_select: Source column names to keep (before renaming).
    column_mapping: Mapping of source column names to destination names.
    table_schema: Spark schema applied to the renamed pandas DataFrame.
    skip_rows: Value forwarded to ``pd.read_excel(skiprows=...)``.
    destination_path: Base path where the table is written.
    destination_table: Table folder name under ``destination_path``.
    process_datetime: Value stored in the ``process_datetime`` column, which
      is also the partition key of the written parquet dataset.
    table_id: Optional prefix for the generated ``id`` column. When it is
      ``None`` or empty the id is ``<process_datetime>_<row id>``; otherwise
      it is ``<table_id>_<process_datetime>_<row id>``.

  Returns:
    None. The data is appended to ``{destination_path}/{destination_table}``.
  """
  # Load the Excel sheet into pandas. Every cell is read as text (dtype=str) and only
  # empty strings are treated as missing values; missing values are not replaced.
  df_pandas = pd.read_excel(excel_file_name, sheet_name=sheet_name_to_load, skiprows=skip_rows, dtype=str, na_values=[""], keep_default_na=False)

  # Keep only the requested columns, then rename them using the mapping
  df_pandas = df_pandas[columns_to_select].rename(columns=column_mapping)

  # Convert the pandas DataFrame to a Spark DataFrame with the given schema
  df_spark = spark.createDataFrame(df_pandas, schema=table_schema)

  # Add the 'process_datetime' column with the value received by the caller
  df_spark = df_spark.withColumn("process_datetime", F.lit(process_datetime))

  # Build the 'id' column: [table_id_]process_datetime_<row id>.
  # The row id comes from monotonically_increasing_id(): unique and increasing,
  # but not guaranteed to be consecutive.
  id_parts = [lit(process_datetime), monotonically_increasing_id()]
  if table_id is not None and str(table_id) != "":
    id_parts.insert(0, lit(str(table_id)))
  df_spark = df_spark.withColumn("id", concat_ws("_", *id_parts))

  # Append the data as a partition of the parquet dataset
  df_spark.write.mode('append').partitionBy('process_datetime').parquet(f'{destination_path}/{destination_table}')




---
# RAW TO TRUSTED
---

# Tests

In [None]:
# Scratch checks kept disabled: everything between the triple quotes is a plain
# string literal, so none of the statements inside it are executed.
#from pyspark.sql.functions import col, when, regexp_extract
'''
get_df_last_partition("/content/data_lake/raw", 'main_template', 'process_datetime').select("main_work_center").show()


df_last_partition = get_df_last_partition("/content/data_lake/raw", 'main_template', 'process_datetime')

year_pattern = r"(\d{4})"

df_last_partition_int = df_last_partition.withColumn(
    "construction_year",
    when(regexp_extract(col("construction_year"), year_pattern, 1) != "",
         regexp_extract(col("construction_year"), year_pattern, 1))  # Extraer año si existe
    .otherwise(col("construction_year")))  # Mantener el valor original si ya es un año

df_last_partition_int_year = df_last_partition_int.filter(col("construction_year") == "1900")

df_last_partition_int_year.select("construction_year").show()


df_last_partition_int_year = df_main_template_raw_converting_int.filter((col("construction_year") == "1900") | (col("sap_equipment_number") == 0))


'''

+----------------+
|main_work_center|
+----------------+
|           51000|
|           51000|
|           51000|
|           51000|
|           51000|
|           51000|
|           51000|
|           51000|
|           51000|
|           51000|
|           51000|
|           51000|
|           51000|
|           51000|
|           51002|
|           51002|
|           51002|
|           51002|
|           51002|
|           51002|
+----------------+
only showing top 20 rows



'\ndf_last_partition = get_df_last_partition("/content/data_lake/raw", \'main_template\', \'process_datetime\')\n\nyear_pattern = r"(\\d{4})"\n\ndf_last_partition_int = df_last_partition.withColumn(\n    "construction_year",\n    when(regexp_extract(col("construction_year"), year_pattern, 1) != "", \n         regexp_extract(col("construction_year"), year_pattern, 1))  # Extraer año si existe\n    .otherwise(col("construction_year")))  # Mantener el valor original si ya es un año\n\ndf_last_partition_int_year = df_last_partition_int.filter(col("construction_year") == "1900")\n\ndf_last_partition_int_year.select("construction_year").show()  \n\n'

# Trusted schema definitions

In [None]:
def define_trusted_schemas():

  schema_main_template = StructType([
    StructField('source_system_to_be_updated', StringType(), True),
    StructField('type_of_update', StringType(), True),
    StructField('additional_notes_and_observations', StringType(), True),
    StructField('couldnt_capture_due_external_reasons', StringType(), True),
    StructField('placed_on_exception_report', StringType(), True),
    StructField('object_type', StringType(), True),
    StructField('description_of_object_type', StringType(), True),
    StructField('floc', StringType(), True),
    StructField('floc_description', StringType(), True),
    StructField('sap_equipment_number', LongType(), True),
    StructField('equipment_category', StringType(), True),
    StructField('cu_id', StringType(), True),
    StructField('mid', StringType(), True),
    StructField('sort_field_pole_number', StringType(), True),
    StructField('opco', StringType(), True),
    StructField('floc_circuit_number', IntegerType(), True),
    StructField('nyseg_line_number', StringType(), True),
    StructField('address_number', StringType(), True),
    StructField('street', StringType(), True),
    StructField('city', StringType(), True),
    StructField('state', StringType(), True),
    StructField('postal_code', StringType(), True),
    StructField('main_work_center', IntegerType(), True),
    StructField('service_center', StringType(), True),
    StructField('start_up_date', DateType(), True),
    StructField('system_status', StringType(), True),
    StructField('status_for_users', StringType(), True),
    StructField('changed_on', DateType(), True),
    StructField('construction_year', IntegerType(), True),
    StructField('construction_month', IntegerType(), True),
    StructField('technical_id_work_order', StringType(), True),
    StructField('survey_date', DateType(), True),
    StructField('proxy_vert', StringType(), True),
    StructField('phantom_location', StringType(), True),
    StructField('long_text', StringType(), True),
    StructField('manufacturer', StringType(), True),
    StructField('model_number', StringType(), True),
    StructField('manufacturer_serial_number', StringType(), True),
    StructField('nameplate_image_captured', StringType(), True),
    StructField('lfi_device_on_correct_segment_on_every_node', StringType(), True),
    StructField('lfi_device_type_on_segment', StringType(), True),
    StructField('lfi_field_tag_present', StringType(), True),
    StructField('lfi_status', StringType(), True),
    StructField('lfi_device_at_circuit_tie_point', StringType(), True),
    StructField('scada_capability', StringType(), True),
    StructField('tie_point_location', StringType(), True),
    StructField('status', StringType(), True),
    StructField('ue_condr_circuit_associated_to_segment', StringType(), True),
    StructField('ue_condr_location_of_circuit_node_identified', StringType(), True),
    StructField('ue_condr_conductor_transitions_from_oh_to_ug', StringType(), True),
    StructField('ue_condr_riser_present', StringType(), True),
    StructField('ue_condr_segment_associated_with_correct_circuit', StringType(), True),
    StructField('ue_pole_equipment_on_pole_lfi_devices_exists_on_pole', StringType(), True),
    StructField('ue_pole_equipment_on_pole_non_lfi_devices_exists_on_pole', StringType(), True),
    StructField('ue_pole_laterals_branch_off', StringType(), True),
    StructField('ue_pole_primary_metered_customer_point_of_delivery', StringType(), True),
    StructField('ue_pole_model_coordinates_are_accurate', StringType(), True),
    StructField('ue_pole_pole_has_riser', StringType(), True),
    StructField('ue_pole_pole_has_primary_conductors', StringType(), True),
    StructField('ue_xfmer_abandoned', StringType(), True),
    StructField('ue_vaulp_model_coordinates_are_accurate', StringType(), True),
    StructField('id', StringType(), True),
    StructField('process_datetime', LongType(), True)
  ])

  schema_sap_updates = StructType([
    StructField('object_type', StringType(), True),
    StructField('type_of_sap_update', StringType(), True),
    StructField('sap_discrepancy', StringType(), True),
    StructField('additional_object_type_notes_and_observations', StringType(), True),
    StructField('floc', StringType(), True),
    StructField('floc_description', StringType(), True),
    StructField('description_of_object_type', StringType(), True),
    StructField('sap_equipment_number', LongType(), True),
    StructField('cu_id', StringType(), True),
    StructField('cyme_id', StringType(), True),
    StructField('circuit_number', IntegerType(), True),
    StructField('nyseg_line_number', StringType(), True),
    StructField('equipment_category', StringType(), True),
    StructField('mid', StringType(), True),
    StructField('sort_field_pole_number', StringType(), True),
    StructField('opco', StringType(), True),
    StructField('address_number', StringType(), True),
    StructField('street', StringType(), True),
    StructField('city', StringType(), True),
    StructField('state', StringType(), True),
    StructField('postal_code', StringType(), True),
    StructField('main_work_center', IntegerType(), True),
    StructField('service_center', StringType(), True),
    StructField('start_up_date', DateType(), True),
    StructField('system_status', StringType(), True),
    StructField('status_for_users', StringType(), True),
    StructField('changed_on', DateType(), True),
    StructField('construction_year', IntegerType(), True),
    StructField('construction_month', IntegerType(), True),
    StructField('technical_id_work_order', StringType(), True),
    StructField('survey_date', DateType(), True),
    StructField('proxy_vert', StringType(), True),
    StructField('phantom_location', StringType(), True),
    StructField('long_text', StringType(), True),
    StructField('manufacturer', StringType(), True),
    StructField('model_number', StringType(), True),
    StructField('manufacturer_serial_number', StringType(), True),
    StructField('nameplate_image_captured', StringType(), True),
    StructField('ue_capsg_capacitor_type', StringType(), True),
    StructField('ue_capsg_circuit_switch_number', StringType(), True),
    StructField('ue_capsg_kvar', StringType(), True),
    StructField('ue_capsg_nominal_voltage_rating', StringType(), True),
    StructField('ue_capsg_number_of_bushings', StringType(), True),
    StructField('ue_capsg_number_of_phases', StringType(), True),
    StructField('ue_capsg_public_or_private', StringType(), True),
    StructField('ue_capsg_scada_controlled', StringType(), True),
    StructField("ue_capsg_status", StringType(), True),
    StructField('ue_cbank_capacitor_type', StringType(), True),
    StructField('ue_cbank_circuit_switch_number', StringType(), True),
    StructField('ue_cbank_controlled', StringType(), True),
    StructField('ue_cbank_kvar_total', StringType(), True),
    StructField('ue_cbank_nominal_voltage_rating', StringType(), True),
    StructField("ue_cbank_number_of_capacitors", StringType(), True),
    StructField('ue_cbank_number_of_phases', StringType(), True),
    StructField('ue_cbank_public_or_private', StringType(), True),
    StructField('ue_cbank_scada_controlled', StringType(), True),
    StructField('ue_cbank_status', StringType(), True),
    StructField('ue_condr_conductor_length', StringType(), True),
    StructField('ue_condr_conductor_size', StringType(), True),
    StructField('ue_condr_conductor_type', StringType(), True),
    StructField('ue_condr_insulation_type', StringType(), True),
    StructField('ue_condr_neutral_material', StringType(), True),
    StructField('ue_condr_neutral_size', StringType(), True),
    StructField('ue_condr_nominal_voltage_rating', StringType(), True),
    StructField('ue_condr_location_oh_or_ug', StringType(), True),
    StructField('ue_condr_primary_or_secondary', StringType(), True),
    StructField('ue_condr_primary_conductor_material', StringType(), True),
    StructField('ue_condr_public_length', StringType(), True),
    StructField('ue_condr_public_or_private', StringType(), True),
    StructField('ue_condr_trailing_span_length', StringType(), True),
    StructField('ue_condr_trailing_span_location', StringType(), True),
    StructField('ue_condr_trans_or_dist', StringType(), True),
    StructField('ue_ctout_character_of_construction', StringType(), True),
    StructField('ue_ctout_cutout_rating', StringType(), True),
    StructField('ue_ctout_fuse_size', StringType(), True),
    StructField('ue_ctout_fuse_type', StringType(), True),
    StructField('ue_ctout_material', StringType(), True),
    StructField('ue_ctout_nominal_voltage_rating', StringType(), True),
    StructField('ue_ctout_phase', StringType(), True),
    StructField('ue_ctout_public_or_private', StringType(), True),
    StructField('ue_ctout_state', StringType(), True),
    StructField('ue_ctout_type', StringType(), True),
    StructField('ue_disc_amp_rating', StringType(), True),
    StructField('ue_disc_circuit_switch_number', StringType(), True),
    StructField('ue_disc_number_of_phases', StringType(), True),
    StructField('ue_disc_phase_designation', StringType(), True),
    StructField('ue_disc_public_or_private', StringType(), True),
    StructField('ue_disc_switch_style', StringType(), True),
    StructField('ue_disc_type', StringType(), True),
    StructField('ue_pole_circuit_phase_label', StringType(), True),
    StructField('ue_pole_class', StringType(), True),
    StructField('ue_pole_opco_owner_percent', StringType(), True),
    StructField('ue_pole_owner', StringType(), True),
    StructField('ue_pole_owner_maintained', StringType(), True),
    StructField('ue_pole_pole_length', StringType(), True),
    StructField('ue_pole_pole_material', StringType(), True),
    StructField('ue_pole_pole_number', StringType(), True),
    StructField('ue_pole_between_pole_number_since', StringType(), True),
    StructField('ue_pole_between_pole_number_to', StringType(), True),
    StructField('ue_pole_pole_type', StringType(), True),
    StructField('ue_pole_public_or_private', StringType(), True),
    StructField('ue_pole_trans_or_dist', StringType(), True),
    StructField('ue_pole_treatment', StringType(), True),
    StructField('ue_recl_circuit_switch_number', StringType(), True),
    StructField('ue_recl_number_of_phases', StringType(), True),
    StructField('ue_recl_public_or_private', StringType(), True),
    StructField('ue_recl_recloser_acts_as', StringType(), True),
    StructField('ue_recl_scada_controlled', StringType(), True),
    StructField('ue_recl_trans_or_dist', StringType(), True),
    StructField('ue_reg_circuit_switch_number', StringType(), True),
    StructField('ue_reg_kva', StringType(), True),
    StructField('ue_reg_phase_designation', StringType(), True),
    StructField('ue_reg_public_or_private', StringType(), True),
    StructField('ue_reg_scada_controlled', StringType(), True),
    StructField('ue_reg_status', StringType(), True),
    StructField('ue_sectz_circuit_switch_number', StringType(), True),
    StructField('ue_sectz_number_of_phases', StringType(), True),
    StructField('ue_sectz_public_or_private', StringType(), True),
    StructField('ue_sectz_scada_controlled', StringType(), True),
    StructField('ue_ratio_location_oh_or_ug', StringType(), True),
    StructField('ue_ratio_number_of_phases', StringType(), True),
    StructField('ue_ratio_operating_voltage', StringType(), True),
    StructField('ue_ratio_phase_designation', StringType(), True),
    StructField('ue_ratio_primary_voltage_text', StringType(), True),
    StructField('ue_ratio_public_or_private', StringType(), True),
    StructField('ue_ratio_secondary_voltage_text', StringType(), True),
    StructField('ue_ratio_size_kva', StringType(), True),
    StructField('ue_ratio_status', StringType(), True),
    StructField('ue_ratio_subtype_transformer_type', StringType(), True),
    StructField('ue_swtch_circuit_switch_number', StringType(), True),
    StructField('ue_swtch_load_break', StringType(), True),
    StructField('ue_swtch_normal_position', StringType(), True),
    StructField('ue_swtch_number_of_phases', StringType(), True),
    StructField('ue_swtch_operating_voltage', StringType(), True),
    StructField('ue_swtch_phase_designation', StringType(), True),
    StructField('ue_swtch_public_or_private', StringType(), True),
    StructField('ue_swtch_rated_kv', StringType(), True),
    StructField('ue_swtch_state', StringType(), True),
    StructField('ue_swtch_scada_controlled', StringType(), True),
    StructField('ue_swtch_switch_type', StringType(), True),
    StructField('ue_swtgr_amperage', StringType(), True),
    StructField('ue_swtgr_fuse_type', StringType(), True),
    StructField('ue_swtgr_circuit_switch_number', StringType(), True),
    StructField('ue_swtgr_installation_type', StringType(), True),
    StructField('ue_swtgr_loadbreak_capability', StringType(), True),
    StructField('ue_swtgr_nameplate', StringType(), True),
    StructField('ue_swtgr_nominal_voltage_rating', StringType(), True),
    StructField('ue_swtgr_public_or_private', StringType(), True),
    StructField('ue_swtgr_switchgear_distribution_type', StringType(), True),
    StructField('ue_swtgr_trans_or_dist', StringType(), True),
    StructField('ue_swtgr_year_installed', StringType(), True),
    StructField('ue_xfmer_date_retired_abandoned', StringType(), True),
    StructField('ue_xfmer_dual_voltage', StringType(), True),
    StructField('ue_xfmer_kva_rating', StringType(), True),
    StructField('ue_xfmer_location_oh_or_ug', StringType(), True),
    StructField('ue_xfmer_number_of_phases', StringType(), True),
    StructField('ue_xfmer_phase_designation', StringType(), True),
    StructField('ue_xfmer_primary_voltage_text', StringType(), True),
    StructField('ue_xfmer_public_or_private', StringType(), True),
    StructField('ue_xfmer_secondary_voltage', StringType(), True),
    StructField('ue_xfmer_secondary_voltage_text', StringType(), True),
    StructField('ue_xfmer_size_kva', StringType(), True),
    StructField('ue_xfmer_transformer_type', StringType(), True),
    StructField('ue_xfmer_trans_type', StringType(), True),
    StructField('ue_xfmer_year_installed', StringType(), True),
    StructField('ue_vaulp_foundation_material', StringType(), True),
    StructField('ue_vaulp_opco_owner_percent', StringType(), True),
    StructField('ue_vaulp_public_or_private', StringType(), True),
    StructField('ue_vaulp_tr_number', StringType(), True),
    StructField('ue_vaulp_trans_or_dist', StringType(), True),
    StructField('id', StringType(), True),
    StructField('process_datetime', LongType(), True)
  ])

  schema_gis_updates = StructType([
    StructField('object_type', StringType(), True),
    StructField('type_of_gis_update', StringType(), True),
    StructField('gis_discrepancy', StringType(), True),
    StructField('additional_object_type_notes_and_observations', StringType(), True),
    StructField('floc', StringType(), True),
    StructField('floc_description', StringType(), True),
    StructField('description_of_object_type', StringType(), True),
    StructField('sap_equipment_number', LongType(), True),
    StructField('cu_id', StringType(), True),
    StructField('cyme_id', StringType(), True),
    StructField('circuit_number', IntegerType(), True),
    StructField('nyseg_line_number', StringType(), True),
    StructField('mid', StringType(), True),
    StructField('sort_field_pole_number', StringType(), True),
    StructField('opco', StringType(), True),
    StructField('address_number', StringType(), True),
    StructField('street', StringType(), True),
    StructField('city', StringType(), True),
    StructField('state', StringType(), True),
    StructField('zip', StringType(), True),
    StructField('main_work_center', IntegerType(), True),
    StructField('service_center', StringType(), True),
    StructField('changed_on', DateType(), True),
    StructField('technical_id_work_order', StringType(), True),
    StructField('survey_date', DateType(), True),
    StructField('proxy_vert', StringType(), True),
    StructField('manufacturer', StringType(), True),
    StructField('model_number', StringType(), True),
    StructField('manufacturer_serial_number', StringType(), True),
    StructField('ue_capsg_circuit_number', StringType(), True),
    StructField('ue_capsg_circuit_switch_number', StringType(), True),
    StructField('ue_capsg_nominal_voltage_rating', StringType(), True),
    StructField('ue_capsg_number_of_phases', StringType(), True),
    StructField('ue_capsg_public_or_private', StringType(), True),
    StructField('ue_capsg_scada_controlled', StringType(), True),
    StructField('ue_capsg_status', StringType(), True),
    StructField('ue_cbank_capacitor_type', StringType(), True),
    StructField('ue_cbank_circuit_switch_number', StringType(), True),
    StructField('ue_cbank_controlled', StringType(), True),
    StructField('ue_cbank_kvar_total', StringType(), True),
    StructField('ue_cbank_nominal_voltage_rating', StringType(), True),
    StructField('ue_cbank_number_of_phases', StringType(), True),
    StructField('ue_cbank_public_or_private', StringType(), True),
    StructField('ue_cbank_scada_controlled', StringType(), True),
    StructField('ue_cbank_status', StringType(), True),
    StructField('ue_condr_circuit_associated_to_segment', StringType(), True),
    StructField('ue_condr_conductor_configuration', StringType(), True),
    StructField('ue_condr_conductor_length', StringType(), True),
    StructField('ue_condr_conductor_size', StringType(), True),
    StructField('ue_condr_conductor_type', StringType(), True),
    StructField('ue_condr_gis_id_from_structure', StringType(), True),
    StructField('ue_condr_gis_id_to_structure', StringType(), True),
    StructField('ue_condr_linetype', StringType(), True),
    StructField('ue_condr_insulation_type', StringType(), True),
    StructField('ue_condr_neutral_material', StringType(), True),
    StructField('ue_condr_neutral_size', StringType(), True),
    StructField('ue_condr_nominal_voltage_rating', StringType(), True),
    StructField('ue_condr_number_of_phases', StringType(), True),
    StructField('ue_condr_location_oh_or_ug', StringType(), True),
    StructField('ue_condr_operating_voltage', StringType(), True),
    StructField('ue_condr_phase_designation', StringType(), True),
    StructField('ue_condr_phase_orientation', StringType(), True),
    StructField('ue_condr_primary_or_secondary', StringType(), True),
    StructField('ue_condr_primary_conductor_material', StringType(), True),
    StructField('ue_condr_proxy_indicator', StringType(), True),
    StructField('ue_condr_proxy_vert_from', StringType(), True),
    StructField('ue_condr_proxy_vert_to', StringType(), True),
    StructField('ue_condr_public_or_private', StringType(), True),
    StructField('ue_condr_segment_associated_with_correct_circuit', StringType(), True),
    StructField('ue_condr_trailing_span_location', StringType(), True),
    StructField('ue_condr_voltage_includes_circuit_configuration', StringType(), True),
    StructField('ue_ctout_fuse_size', StringType(), True),
    StructField('ue_ctout_fuse_type', StringType(), True),
    StructField('ue_ctout_nominal_voltage_rating', StringType(), True),
    StructField('ue_ctout_phase', StringType(), True),
    StructField('ue_ctout_public_or_private', StringType(), True),
    StructField('ue_ctout_state', StringType(), True),
    StructField('ue_disc_amp_rating', StringType(), True),
    StructField('ue_disc_circuit_switch_number', StringType(), True),
    StructField('ue_disc_disconnect_type', StringType(), True),
    StructField('ue_disc_number_of_phases', StringType(), True),
    StructField('ue_disc_phase_designation', StringType(), True),
    StructField('ue_disc_public_or_private', StringType(), True),
    StructField('ue_pole_circuit_phase_label', StringType(), True),
    StructField('ue_pole_laterals_branch_off', StringType(), True),
    StructField('ue_pole_geocoordinates_latitude', StringType(), True),
    StructField('ue_pole_geocoordinates_longitude', StringType(), True),
    StructField('ue_pole_model_coordinates_are_accurate', StringType(), True),
    StructField('ue_pole_pole_number', StringType(), True),
    StructField('ue_pole_between_pole_number_since', StringType(), True),
    StructField('ue_pole_between_pole_number_to', StringType(), True),
    StructField('ue_pole_pole_type', StringType(), True),
    StructField('ue_recl_circuit_switch_number', StringType(), True),
    StructField('ue_recl_number_of_phases', StringType(), True),
    StructField('ue_recl_public_or_private', StringType(), True),
    StructField('ue_recl_recloser_acts_as', StringType(), True),
    StructField('ue_recl_scada_controlled', StringType(), True),
    StructField('ue_recl_trans_or_dist', StringType(), True),
    StructField('ue_reg_circuit_switch_number', StringType(), True),
    StructField('ue_reg_kva', StringType(), True),
    StructField('ue_reg_phase_designation', StringType(), True),
    StructField('ue_reg_public_or_private', StringType(), True),
    StructField('ue_reg_scada_controlled', StringType(), True),
    StructField('ue_reg_status', StringType(), True),
    StructField('ue_sectz_circuit_switch_number', StringType(), True),
    StructField('ue_sectz_number_of_phases', StringType(), True),
    StructField('ue_sectz_public_or_private', StringType(), True),
    StructField('ue_sectz_scada_controlled', StringType(), True),
    StructField('ue_ratio_location_oh_or_ug', StringType(), True),
    StructField('ue_ratio_number_of_phases', StringType(), True),
    StructField('ue_ratio_operating_voltage', StringType(), True),
    StructField('ue_ratio_phase_designation', StringType(), True),
    StructField('ue_ratio_pole_gis_id_', StringType(), True),
    StructField('ue_ratio_pole_gis_id2', StringType(), True),
    StructField('ue_ratio_primary_voltage_text', StringType(), True),
    StructField('ue_ratio_public_or_private', StringType(), True),
    StructField('ue_ratio_secondary_voltage_text', StringType(), True),
    StructField('ue_ratio_size_kva', StringType(), True),
    StructField('ue_ratio_status', StringType(), True),
    StructField('ue_ratio_subtype_transformer_type', StringType(), True),
    StructField('ue_ratio_transformer_in', StringType(), True),
    StructField('ue_ratio_transformer_in_state', StringType(), True),
    StructField('ue_ratio_transformer_out', StringType(), True),
    StructField('ue_swtch_circuit_switch_number', StringType(), True),
    StructField('ue_swtch_facility_id_child_level', StringType(), True),
    StructField('ue_swtch_feeder_id_circuit_number', StringType(), True),
    StructField('ue_swtch_feeder_id2_circuit_number', StringType(), True),
    StructField('ue_swtch_load_break', StringType(), True),
    StructField('ue_swtch_normal_position', StringType(), True),
    StructField('ue_swtch_normal_position_a', StringType(), True),
    StructField('ue_swtch_normal_position_b', StringType(), True),
    StructField('ue_swtch_normal_position_c', StringType(), True),
    StructField('ue_swtch_number_of_phases', StringType(), True),
    StructField('ue_swtch_operating_voltage', StringType(), True),
    StructField('ue_swtch_phase_designation', StringType(), True),
    StructField('ue_swtch_present_position_a', StringType(), True),
    StructField('ue_swtch_present_position_b', StringType(), True),
    StructField('ue_swtch_present_position_c', StringType(), True),
    StructField('ue_swtch_public_or_private', StringType(), True),
    StructField('ue_swtch_rated_kv', StringType(), True),
    StructField('ue_swtch_scada_controlled', StringType(), True),
    StructField('ue_swtch_switch_type', StringType(), True),
    StructField('ue_swtgr_circuit_switch_number', StringType(), True),
    StructField('ue_swtgr_facility_id', StringType(), True),
    StructField('ue_swtgr_feeder_id', StringType(), True),
    StructField('ue_swtgr_feeder_id2', StringType(), True),
    StructField('ue_swtgr_fuse_type', StringType(), True),
    StructField('ue_swtgr_nominal_voltage_rating', StringType(), True),
    StructField('ue_swtgr_operating_voltage', StringType(), True),
    StructField('ue_swtgr_public_or_private', StringType(), True),
    StructField('ue_swtgr_switchgear_name', StringType(), True),
    StructField('ue_swtgr_switch_number', StringType(), True),
    StructField('ue_xfmer_dual_voltage', StringType(), True),
    StructField('ue_xfmer_kva_rating', StringType(), True),
    StructField('ue_xfmer_location_oh_or_ug', StringType(), True),
    StructField('ue_xfmer_number_of_phases', StringType(), True),
    StructField('ue_xfmer_phase_designation', StringType(), True),
    StructField('ue_xfmer_primary_voltage_text', StringType(), True),
    StructField('ue_xfmer_proxy_vert', StringType(), True),
    StructField('ue_xfmer_secondary_voltage', StringType(), True),
    StructField('ue_xfmer_secondary_voltage_text', StringType(), True),
    StructField('ue_xfmer_size_kva', StringType(), True),
    StructField('ue_xfmer_subtype', StringType(), True),
    StructField('ue_xfmer_transformer_in_state', StringType(), True),
    StructField('ue_xfmer_transformer_out_state', StringType(), True),
    StructField('ue_vaulp_geocoordinates_latitude', StringType(), True),
    StructField('ue_vaulp_tr_number', StringType(), True),
    StructField('ue_vaulp_geocoordinates_longitude', StringType(), True),
    StructField('ue_vaulp_model_coordinates_are_accurate', StringType(), True),
    StructField('id', StringType(), True),
    StructField('process_datetime', LongType(), True)
  ])

  schema_sap_equipment_report = StructType([
    StructField("floc", StringType(), True),
    StructField("floc_description", StringType(), True),
    StructField("super_floc", StringType(), True),
    StructField("company", StringType(), True),
    StructField("floc_circuit_number", StringType(), True),#Int
    StructField("object_type", StringType(), True),
    StructField("sap_equipment_number", LongType(), True),
    StructField("equipment_description", StringType(), True),
    StructField("equipment_category", StringType(), True),
    StructField("sort_field_pole_number", StringType(), True),
    StructField("main_work_center", StringType(), True),#Int
    StructField("service_center", StringType(), True),
    StructField("address_number", StringType(), True),
    StructField("street", StringType(), True),
    StructField("city", StringType(), True),
    StructField("district", StringType(), True),
    StructField("postal_code", StringType(), True),
    StructField("rg", StringType(), True),
    StructField("system_status", StringType(), True),
    StructField("status_for_users", StringType(), True),
    StructField("construction_year", StringType(), True),
    StructField("start_up_date", DateType(), True),
    StructField("changed_on", DateType(), True),
    StructField("material", StringType(), True),
    StructField("material_description", StringType(), True),
    StructField("manufacturer", StringType(), True),
    StructField("model_number", StringType(), True),
    StructField("manufacturer_serial_number", StringType(), True),
    StructField("technical_id_work_order", StringType(), True),
    StructField("object_number", StringType(), True),
    StructField("cu_id", StringType(), True),
    StructField("cyme_id", StringType(), True),
    StructField("capacitor_type", StringType(), True),
    StructField("equipment_circuit_number_1", StringType(), True),
    StructField("equipment_circuit_number_2", StringType(), True),
    StructField("circuit_switch_number", StringType(), True),
    StructField("kvar", StringType(), True),
    StructField("kvar_total", StringType(), True),
    StructField("number_of_bushings", StringType(), True),
    StructField("nominal_voltage_rating", StringType(), True),
    StructField("normal_state", StringType(), True),
    StructField("public_or_private", StringType(), True),
    StructField("scada_controlled", StringType(), True),
    StructField("conductor_length", StringType(), True),
    StructField("conductor_size", StringType(), True),
    StructField("conductor_type", StringType(), True),
    StructField("insulation_type", StringType(), True),
    StructField("number_of_phases", StringType(), True),
    StructField("location_oh_or_ug", StringType(), True),
    StructField("phase_designation", StringType(), True),
    StructField("primary_or_secondary", StringType(), True),
    StructField("public_length", StringType(), True),
    StructField("trans_or_dist", StringType(), True),
    StructField("trailing_span_location", StringType(), True),
    StructField("fuse_size", StringType(), True),
    StructField("fuse_type", StringType(), True),
    StructField("material_2", StringType(), True),
    StructField("amp_rating", StringType(), True),
    StructField("cutout_distribution_type", StringType(), True),
    StructField("disconnect_type", StringType(), True),
    StructField("controlled", StringType(), True),
    StructField("owner", StringType(), True),
    StructField("opco_owner_percent", StringType(), True),
    StructField("pole_class", StringType(), True),
    StructField("pole_length", StringType(), True),
    StructField("pole_material", StringType(), True),
    StructField("pole_type", StringType(), True),
    StructField("treatment", StringType(), True),
    StructField("location_oh_or_ug_2", StringType(), True),
    StructField("size_kva", StringType(), True),
    StructField("recloser_acts_as", StringType(), True),
    StructField("kva", StringType(), True),
    StructField("load_break_yes_no", StringType(), True),
    StructField("normal_position", StringType(), True),
    StructField("rated_kv", StringType(), True),
    StructField("switch_style", StringType(), True),
    StructField("switch_type", StringType(), True),
    StructField("amperage", StringType(), True),
    StructField("switchgear_distribution_type", StringType(), True),
    StructField("year_installed", StringType(), True),
    StructField("foundation_material", StringType(), True),
    StructField("dual_voltage", StringType(), True),
    StructField("primary_voltage_text", StringType(), True),
    StructField("secondary_voltage", StringType(), True),
    StructField("secondary_voltage_text", StringType(), True),
    StructField("trans_type", StringType(), True),
    StructField("transformer_type", StringType(), True),
    StructField('id', StringType(), True),
    StructField("process_datetime", LongType(), False)
  ])

  return schema_main_template, schema_sap_updates, schema_gis_updates, schema_sap_equipment_report

# Generate DFs

In [None]:
# generate partition value
#process_datetime = int(datetime.now().strftime('%Y%m%d%H%M%S'))
#process_datetime = 20241128182012

In [None]:
# Root of the raw zone (shared-drive shortcut to "scenarios").
# Alternative personal location (Lucio): "/content/drive/MyDrive/Colab Notebooks/scenarios/raw/"
raw_path = "/content/drive/MyDrive/scenarios/raw"

# Target schemas of the trusted zone, one per dataset.
(schema_main_template,
 schema_sap_updates,
 schema_gis_updates,
 schema_sap_equipment_report) = define_trusted_schemas()

# Build one raw DataFrame per dataset from its latest partition, in the same
# order as before: main_template, sap_updates, gis_updates, sap_equipment_report.
partition_column = 'process_datetime'
(df_main_template_raw,
 df_sap_updates_raw,
 df_gis_updates_raw,
 df_sap_equipment_report_raw) = [
    get_df_last_partition(raw_path, dataset_name, partition_column)
    for dataset_name in ('main_template', 'sap_updates', 'gis_updates', 'sap_equipment_report')
]

In [None]:
# Disabled scratch code: it is wrapped in a triple-quoted string so that none of
# it executes. Because the string is not the last expression of the cell, it is
# evaluated and discarded without producing any output.
'''
df_gis_updates_raw.show()

print(process_datetime)

show_partitions("/content/drive/MyDrive/scenarios/raw/main_template")

df = spark.read.parquet("/content/drive/MyDrive/scenarios/raw/main_template")
last_partition = df.agg({"process_datetime": "max"}).collect()[0][0]
df_last_partition = df.filter(col("process_datetime") == last_partition)
print(last_partition)

# Definir la ubicación del archivo y el tipo de archivo
file_location = "/content/drive/MyDrive/scenarios/raw/main_template"
file_type = "delta"

# Leer el archivo Parquet en un DataFrame
df = spark.read.format(file_type).load(file_location)

# Mostrar los primeros registros del DataFrame
df.show()
'''

# Print the column names, types and nullability of the raw sap_updates frame
# before any cleansing is applied.
df_sap_updates_raw.printSchema()

root
 |-- object_type: string (nullable = true)
 |-- type_of_sap_update: string (nullable = true)
 |-- sap_discrepancy: string (nullable = true)
 |-- additional_object_type_notes_and_observations: string (nullable = true)
 |-- floc: string (nullable = true)
 |-- floc_description: string (nullable = true)
 |-- description_of_object_type: string (nullable = true)
 |-- sap_equipment_number: long (nullable = true)
 |-- cu_id: string (nullable = true)
 |-- cyme_id: string (nullable = true)
 |-- circuit_number: integer (nullable = true)
 |-- nyseg_line_number: string (nullable = true)
 |-- equipment_category: string (nullable = true)
 |-- mid: string (nullable = true)
 |-- sort_field_pole_number: string (nullable = true)
 |-- opco: string (nullable = true)
 |-- address_number: string (nullable = true)
 |-- street: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- main_work_center: integer (nullabl

# Integer data cleansing

In [None]:
#_______________shared helpers_________________

# Captures the first run of four digits, e.g. the "1900" in
# "If unknown use: 1/1/1900".
year_pattern = r"(\d{4})"

# Year stored when no four-digit year can be extracted. It must be a plain
# number: the column is cast to int right afterwards, and a value such as
# "1/1/1900" would become NULL (or raise when ANSI SQL mode is enabled).
# This matches the "1900" already used for sap_equipment_report below.
unknown_year = "1900"


def _normalize_equipment_number(df):
    """Return `df` with sap_equipment_number set to 0 where it equals "New", otherwise cast to long."""
    return df.withColumn(
        "sap_equipment_number",
        when(col("sap_equipment_number") == "New", 0)
        .otherwise(col("sap_equipment_number").cast(LongType())))


def _extract_construction_year(df):
    """Return `df` with construction_year reduced to its first four-digit group, or `unknown_year` if none."""
    extracted_year = regexp_extract(col("construction_year"), year_pattern, 1)
    return df.withColumn(
        "construction_year",
        # regexp_extract yields "" when nothing matches; NULL inputs also fall
        # through to the otherwise() branch.
        when(extracted_year != "", extracted_year).otherwise(unknown_year))


def _cast_to_int(df, columns):
    """Return `df` with every column named in `columns` cast to int."""
    for column_name in columns:
        df = df.withColumn(column_name, col(column_name).cast("int"))
    return df


#_______________main_template_________________

df_main_template_raw_converting = _normalize_equipment_number(df_main_template_raw)
df_main_template_raw_converting = _extract_construction_year(df_main_template_raw_converting)
df_main_template_raw_converting = _cast_to_int(
    df_main_template_raw_converting,
    ["floc_circuit_number", "main_work_center", "construction_year", "construction_month"])


#_______________sap_updates_________________

df_sap_updates_raw = _normalize_equipment_number(df_sap_updates_raw)
df_sap_updates_raw = _extract_construction_year(df_sap_updates_raw)
df_sap_updates_raw = _cast_to_int(
    df_sap_updates_raw,
    ["circuit_number", "main_work_center", "construction_year", "construction_month"])


#_______________gis_updates_________________

# gis_updates gets no construction_year handling here.
df_gis_updates_raw = _normalize_equipment_number(df_gis_updates_raw)
df_gis_updates_raw = _cast_to_int(df_gis_updates_raw, ["circuit_number", "main_work_center"])


#_______________sap_equipment_report_________________

df_sap_equipment_report_raw = _normalize_equipment_number(df_sap_equipment_report_raw)
df_sap_equipment_report_raw = _cast_to_int(df_sap_equipment_report_raw, ["floc_circuit_number"])

# construction_year stays a string for this dataset; only the "Missing"
# placeholder is replaced by the unknown-year value.
df_sap_equipment_report_raw = df_sap_equipment_report_raw.withColumn(
    "construction_year",
    when(col("construction_year") == "Missing", unknown_year).otherwise(col("construction_year")))



In [None]:
# Preview the start_up_date values of sap_updates (default: first 20 rows).
# Alternative check kept for reference:
#   df_main_template_raw_converting.select("construction_year").distinct().show()
df_sap_updates_raw.select(col("start_up_date")).show()


+-------------------+
|      start_up_date|
+-------------------+
|1900-01-01 00:00:00|
|2018-08-10 00:00:00|
|2018-08-10 00:00:00|
|2018-08-10 00:00:00|
|2018-08-10 00:00:00|
|2018-08-10 00:00:00|
|2018-08-10 00:00:00|
|2018-08-10 00:00:00|
|              36708|
|2018-08-10 00:00:00|
|2018-08-10 00:00:00|
|2018-08-10 00:00:00|
|1900-01-01 00:00:00|
|1900-01-01 00:00:00|
|1900-01-01 00:00:00|
|1954-01-01 00:00:00|
|1983-01-01 00:00:00|
|1954-01-01 00:00:00|
|1954-01-01 00:00:00|
|2003-01-01 00:00:00|
+-------------------+
only showing top 20 rows



# Date data cleansing

In [None]:
from pyspark.sql.functions import to_date, date_format, expr

# Day zero used to convert all-digit values into dates
# (presumably Excel serial day numbers, e.g. "36708" - confirm against the source files).
date_base = "1899-12-30"

# Layout of the regular date strings found in the raw data.
timestamp_format = "yyyy-MM-dd HH:mm:ss"

# Date stored for the "TBD" placeholder.
unknown_date = "1900-01-01"


def _clean_mixed_date(column_name):
    """Build a DateType expression for a string column holding mixed date representations.

    - "TBD"             -> `unknown_date`
    - all-digit strings -> `date_base` plus that many days
    - anything else     -> parsed with `timestamp_format` (NULL when it does not match)

    Each branch is converted on its own and the outer to_date() is called
    WITHOUT a format. Re-parsing the whole CASE result with `timestamp_format`
    would turn the first two branches into NULL, because "1900-01-01" and the
    date produced by date_add do not match that layout.
    """
    raw_value = col(column_name)
    serial_as_date = to_date(expr(f"date_add('{date_base}', cast({column_name} as int))"))
    return to_date(
        when(raw_value == "TBD", unknown_date)
        .when(raw_value.rlike("^[0-9]+$"), serial_as_date)
        .otherwise(to_date(raw_value, timestamp_format)))


# Dates are kept as DateType; apply date_format(..., "dd/MM/yyyy") only if a
# formatted string is needed for presentation.

#_______________main_template_________________

df_main_template_raw_converting = df_main_template_raw_converting.withColumn(
    "start_up_date", to_date(col("start_up_date"), timestamp_format))
df_main_template_raw_converting = df_main_template_raw_converting.withColumn(
    "survey_date", to_date(col("survey_date"), timestamp_format))
df_main_template_raw_converting = df_main_template_raw_converting.withColumn(
    "changed_on", _clean_mixed_date("changed_on"))

#_______________sap_updates_________________

df_sap_updates_raw = df_sap_updates_raw.withColumn(
    "survey_date", to_date(col("survey_date"), timestamp_format))
df_sap_updates_raw = df_sap_updates_raw.withColumn(
    "start_up_date", _clean_mixed_date("start_up_date"))
df_sap_updates_raw = df_sap_updates_raw.withColumn(
    "changed_on", _clean_mixed_date("changed_on"))

#_______________gis_updates_________________

df_gis_updates_raw = df_gis_updates_raw.withColumn(
    "survey_date", to_date(col("survey_date"), timestamp_format))
df_gis_updates_raw = df_gis_updates_raw.withColumn(
    "changed_on", _clean_mixed_date("changed_on"))

#_______________sap_equipment_report_________________

df_sap_equipment_report_raw = df_sap_equipment_report_raw.withColumn(
    "start_up_date", to_date(col("start_up_date"), timestamp_format))
df_sap_equipment_report_raw = df_sap_equipment_report_raw.withColumn(
    "changed_on", _clean_mixed_date("changed_on"))


In [None]:
# Inspect the cleansed sap_equipment_report frame next to its target schema
# before the schema is applied in the following cell.
#
# Disabled experiments kept for reference:
#   df_main_template_raw_converting = df_main_template_raw_converting.na.fill("")  # .fillna("", inplace=True)
#   df_main_template_raw_converting.select("start_up_date", "survey_date").show(35)
#   df_sap_updates_trusted = spark.createDataFrame(df_sap_updates_raw.rdd, schema_sap_updates)

df_sap_equipment_report_raw.show()
df_sap_equipment_report_raw.printSchema()

# One target StructField per line, for comparison with the schema printed above.
for target_field in schema_sap_equipment_report.fields:
    print(target_field)



+--------------------+--------------------+--------------------+--------------------+-------------------+-----------+--------------------+---------------------+------------------+----------------------+----------------+--------------------+--------------+---------------+--------+--------+-----------+---+-------------+----------------+-----------------+-------------+----------+---------+--------------------+------------+------------+--------------------------+-----------------------+--------------------+-----+------------------+--------------+--------------------------+--------------------------+---------------------+----+----------+------------------+----------------------+------------+-----------------+----------------+----------------+--------------+--------------+---------------+----------------+-----------------+-----------------+--------------------+-------------+-------------+----------------------+---------+---------+----------+----------+------------------------+---------------

# Send cleaned Parquet files to trusted

In [None]:
# Root of the trusted zone (shared-drive shortcut to "scenarios").
# Other locations used previously:
#   "/content/data_lake/trusted"                                      (temporary)
#   "/content/drive/MyDrive/Colab Notebooks/scenarios/trusted/"       (Lucio)
trusted_path = "/content/drive/MyDrive/scenarios/trusted"

# Rebuild each cleansed frame from its RDD under the target schema.
# NOTE(review): this pairs columns with schema fields by position - confirm
# that the raw column order matches the schema definitions.
df_main_template_trusted = spark.createDataFrame(
    df_main_template_raw_converting.rdd, schema_main_template)
df_sap_updates_trusted = spark.createDataFrame(
    df_sap_updates_raw.rdd, schema_sap_updates)
df_gis_updates_trusted = spark.createDataFrame(
    df_gis_updates_raw.rdd, schema_gis_updates)
df_sap_equipment_report_trusted = spark.createDataFrame(
    df_sap_equipment_report_raw.rdd, schema_sap_equipment_report)

# Append every dataset to its own folder, partitioned by process_datetime.
trusted_outputs = (
    ('main_template', df_main_template_trusted),
    ('sap_updates', df_sap_updates_trusted),
    ('gis_updates', df_gis_updates_trusted),
    ('sap_equipment_report', df_sap_equipment_report_trusted),
)
for dataset_name, trusted_df in trusted_outputs:
    (trusted_df.write
        .mode('append')
        .partitionBy('process_datetime')
        .parquet(f'{trusted_path}/{dataset_name}'))


# Read Last Partition

In [None]:
# Show the first 35 rows of the trusted sap_equipment_report frame built in the
# previous cell.
df_sap_equipment_report_trusted.show(35)

# Disabled partition checks, wrapped in a string so they do not run. Because
# this string is the last expression of the cell, the notebook echoes its repr
# as the cell output.
'''
# zone_path = "/content/gdrive/MyDrive/__ntt_proof_of_concept/raw"
zone_path = "/content/drive/MyDrive/Colab Notebooks/scenarios/raw"

get_df_partition(zone_path, 'mailast_n_template', 'process_datetime').show()

get_df_last_partition(zone_path, 'sap_updates', 'process_datetime').show()

get_df_last_partition(zone_path, 'gis_updates', 'process_datetime').show()
get_df_last_partition(zone_path, 'sap_equipment_report', 'process_datetime').show()
'''

+--------------------+--------------------+--------------------+--------------------+-------------------+-----------+--------------------+---------------------+------------------+----------------------+----------------+--------------------+--------------+---------------+-------------+--------+-----------+---+-------------+-------------------+-----------------+-------------+----------+---------+--------------------+------------+------------+--------------------------+-----------------------+--------------------+---------------+------------------+--------------+--------------------------+--------------------------+---------------------+----+----------+------------------+----------------------+------------+-----------------+----------------+----------------+--------------+--------------+---------------+----------------+-----------------+-----------------+--------------------+-------------+-------------+----------------------+---------+---------+----------+----------+----------------------

'\n# zone_path = "/content/gdrive/MyDrive/__ntt_proof_of_concept/raw"\nzone_path = "/content/drive/MyDrive/Colab Notebooks/scenarios/raw"\n\nget_df_partition(zone_path, \'mailast_n_template\', \'process_datetime\').show()\n\nget_df_last_partition(zone_path, \'sap_updates\', \'process_datetime\').show()\n\nget_df_last_partition(zone_path, \'gis_updates\', \'process_datetime\').show()\nget_df_last_partition(zone_path, \'sap_equipment_report\', \'process_datetime\').show()\n'

# Export to Excel

In [None]:
import os
import pandas as pd
# Make sure openpyxl is installed and imported
!pip install openpyxl
import openpyxl

# Workbook that receives one sheet per trusted dataset.
# NOTE(review): the timestamp in the file name is hard-coded; the dynamic form
# was f'/content/data_lake/delivery/data_template_{process_datetime}.xlsx'.
file_path = '/content/data_lake/delivery/data_template_20241129192612.xlsx'

# Create the folder the workbook is actually written to. The previous call
# created the relative path "content/drive/MyDrive/scenarios/delivery" (no
# leading slash), which is a different directory, so ExcelWriter could fail
# because the real output folder did not exist.
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Trusted zone the sheets are read from.
# NOTE(review): the write cell stores trusted data under `trusted_path`
# ("/content/drive/MyDrive/scenarios/trusted"); confirm which location this
# export is meant to read.
trusted_source_path = "/content/data_lake/trusted/"

# Dataset folder names, reused as sheet names, in workbook order.
export_datasets = ['main_template', 'sap_updates', 'gis_updates', 'sap_equipment_report']

with pd.ExcelWriter(file_path, engine='openpyxl') as writer:
    for dataset_name in export_datasets:
        # Latest partition -> pandas -> one sheet named after the dataset.
        (get_df_last_partition(trusted_source_path, dataset_name, 'process_datetime')
            .toPandas()
            .to_excel(writer, sheet_name=dataset_name, index=False))



