# OBJETIVO
Este notebook genera los scripts SQL de carga (seeds) para las tablas normalizadas que no dependen de otras tablas.
Todos los scripts se guardan en la carpeta seeds, que debe estar en el mismo nivel que la carpeta que contiene este notebook.

# Tablas
- AREAS_CONOCIMIENTO
- GENEROS
- JORNADAS
- MODALIDADES
- NIVELES_FORMACION
- RANGOS_EDAD
- REQUISITOS_INGRESO
- TIPOS_ACREDITACION
- TIPOS_EDUCACION
- TIPOS_INSTITUCION
- TIPOS_PLAN
- VIAS_INGRESO

# CONSTRAINTS
Las versiones de Oracle anteriores a 23c no admiten un INSERT ... VALUES con varias filas en una sola sentencia, por lo que los scripts generan un INSERT por fila.

In [2]:
# READ EXCEL - TRANSFORM TO CSV
import pandas as pd 
import os

# Load the source workbook; `df` is the DataFrame the seed-generating cells read from.
file_path = '../data/matriculas_ed_superior_nuble_2021.xlsx'
df = pd.read_excel(file_path)

# Keep a CSV copy next to the workbook, but never overwrite an existing one.
csv_path = '../data/matriculas_ed_superior_nuble_2021.csv'
if not os.path.exists(csv_path):
    df.to_csv(csv_path, index=False)
else:
    print('File already exists, exiting without overwriting.')

File already exists, exiting without overwriting.


In [3]:
# HELPER FUNCTIONS
def get_reset_identity_query(table_name, id_column):
    """Build the ALTER TABLE statement that restarts an identity column at 1.

    Args:
        table_name: Name of the table that owns the identity column.
        id_column: Name of the identity column to modify.

    Returns:
        A single SQL statement, terminated with ``;``.
    """
    template = 'ALTER TABLE {} MODIFY {} GENERATED ALWAYS AS IDENTITY (START WITH 1);'
    return template.format(table_name, id_column)

In [15]:
# RANGO EDAD
# Writes ../seeds/rangos_edad.sql: TRUNCATE, identity reset, one INSERT per
# distinct age range, then a SELECT to inspect the loaded rows.
col = 'RANGO EDAD'
uniques = df[col].dropna().unique()
print(f'Unique values in {col}: {len(uniques)}')

# Strip before de-duplicating and sorting, so values that differ only by
# surrounding whitespace produce a single INSERT.
age_ranges = sorted({value.strip() for value in uniques})

sql_queries = []
for value in age_ranges:
    # Each value is expected to be "<min> <separator> <max>". split() without
    # an argument tolerates repeated whitespace between the tokens.
    parts = value.split()
    if len(parts) != 3:
        raise ValueError(f'Unexpected {col} value: {value!r}')
    min_age, _separator, max_age = parts
    if max_age.lower() in ('mas', 'más'):
        max_age = '100'  # assuming 100 as an arbitrary upper limit for "more than"
    # Double single quotes so the description is always a valid SQL literal.
    description = value.replace("'", "''")
    sql_queries.append(
        'INSERT INTO RANGOS_EDAD (EDAD_MINIMA, EDAD_MAXIMA, RANGO_EDAD_DESCRIPCION) '
        f"VALUES ({min_age}, {max_age}, '{description}');"
    )

truncate_query = 'TRUNCATE TABLE RANGOS_EDAD;'
reset_identity = get_reset_identity_query('RANGOS_EDAD', 'RANGO_EDAD_ID')
select_query = 'SELECT * FROM RANGOS_EDAD;'
# Same layout as the other seed scripts: TRUNCATE, identity reset, INSERTs,
# a blank line, and the final SELECT.
all_queries = '\n'.join([truncate_query, reset_identity] + sql_queries + [f'\n{select_query}'])

# Mode 'w' creates the file if it does not exist and replaces it otherwise.
# The encoding is pinned so accented characters do not depend on the
# platform's default locale.
file_name = '../seeds/rangos_edad.sql'
with open(file_name, 'w', encoding='utf-8') as f:
    f.write(all_queries)

print('Data processing complete.')

Unique values in RANGO EDAD: 6
Data processing complete.


In [6]:
# TIPOS ACREDITACION
# Writes ../seeds/tipos_acreditacion.sql: TRUNCATE, identity reset, one INSERT
# per accreditation type, then a SELECT to inspect the loaded rows.
# NOTE(review): the table-driven cell also lists TIPOS_ACREDITACION and writes
# the same file with a different column list; whichever cell runs last wins.
col = 'ACREDITACION INSTITUCIONAL'
uniques = df[col].dropna().unique()
print(f'Unique values in {col}: {len(uniques)}')

# Recognised descriptions and the ESTADO_ACREDITACION flag stored for each.
estado_by_description = {'ACREDITADA': 1, 'NO ACREDITADA': 0}
# Anything else is stored under a single catch-all description.
unknown_description = 'SIN ESPECIFICAR'
# NOTE(review): -1 is used as the "unknown" flag; confirm the column accepts it.
unknown_estado = -1

# Resolve the flag per value (never carried over from a previous iteration).
# Keying by description collapses several unrecognised raw values into one row.
rows = {}
for value in uniques:
    description = value.strip().upper()
    if description in estado_by_description:
        rows[description] = estado_by_description[description]
    else:
        rows[unknown_description] = unknown_estado

sql_queries = [
    'INSERT INTO TIPOS_ACREDITACION (ESTADO_ACREDITACION, TIPO_ACREDITACION_DESCRIPCION) '
    f"VALUES ({estado}, '{description}');"
    for description, estado in sorted(rows.items())
]

truncate_query = 'TRUNCATE TABLE TIPOS_ACREDITACION;'
reset_identity = get_reset_identity_query('TIPOS_ACREDITACION', 'TIPO_ACREDITACION_ID')
select_query = 'SELECT * FROM TIPOS_ACREDITACION;'
# Same layout as the other seed scripts: TRUNCATE, identity reset, INSERTs,
# a blank line, and the final SELECT.
all_queries = '\n'.join([truncate_query, reset_identity] + sql_queries + [f'\n{select_query}'])

# Mode 'w' creates the file if it does not exist and replaces it otherwise.
# The encoding is pinned so the output does not depend on the platform locale.
file_name = '../seeds/tipos_acreditacion.sql'
with open(file_name, 'w', encoding='utf-8') as f:
    f.write(all_queries)

print('Data processing complete.')

Unique values in ACREDITACION INSTITUCIONAL: 2
Data processing complete.


In [21]:
# GENEROS
# Writes ../seeds/generos.sql: TRUNCATE, identity reset, one INSERT per
# distinct gender name, then a SELECT to inspect the loaded rows.
# NOTE(review): the table-driven cell also lists GENEROS and writes the same
# file with an extra GENERO_DESCRIPCION column; whichever cell runs last wins.
col = 'GENERO'
uniques = df[col].dropna().unique()
print(f'Unique values in {col}: {len(uniques)}')

# Strip before de-duplicating and sorting, so values that differ only by
# surrounding whitespace produce a single INSERT.
gender_names = sorted({value.strip() for value in uniques})

# Single quotes are doubled so every name is a valid SQL string literal.
sql_queries = [
    "INSERT INTO GENEROS (GENERO_NOMBRE) VALUES ('{}');".format(name.replace("'", "''"))
    for name in gender_names
]

truncate_query = 'TRUNCATE TABLE GENEROS;'
reset_identity = get_reset_identity_query('GENEROS', 'GENERO_ID')
select_query = 'SELECT * FROM GENEROS;'
# Layout: TRUNCATE, identity reset, INSERTs, a blank line, and the final SELECT.
all_queries = '\n'.join([truncate_query, reset_identity] + sql_queries + [f'\n{select_query}'])

# Mode 'w' creates the file if it does not exist and replaces it otherwise.
# The encoding is pinned so the output does not depend on the platform locale.
file_name = '../seeds/generos.sql'
with open(file_name, 'w', encoding='utf-8') as f:
    f.write(all_queries)

print('Data processing complete.')

Unique values in GENERO: 2
Data processing complete.


In [20]:
# One entry per lookup table, as (table_name, data_col, suffix_col):
#   table_name - target table; also used, lower-cased, as the seed file name
#   data_col   - column of `df` that holds the distinct values
#   suffix_col - prefix for the <suffix>_ID / _NOMBRE / _DESCRIPCION column names
_table_specs = [
    ('AREAS_CONOCIMIENTO', 'AREA CONOCIMIENTO', 'AREA_CONOCIMIENTO'),
    ('GENEROS', 'GENERO', 'GENERO'),
    ('JORNADAS', 'JORNADA', 'JORNADA'),
    ('MODALIDADES', 'MODALIDAD', 'MODALIDAD'),
    ('NIVELES_FORMACION', 'NIVEL DE ESTUDIO CARRERA', 'NIVEL_FORMACION'),
    ('REQUISITOS_INGRESO', 'REQUISITO INGRESO', 'REQUISITO_INGRESO'),
    ('TIPOS_ACREDITACION', 'ACREDITACION INSTITUCIONAL', 'TIPO_ACREDITACION'),
    ('TIPOS_EDUCACION', 'NIVEL CARRERA', 'TIPO_EDUCACION'),
    ('TIPOS_INSTITUCION', 'TIPO DE INSTITUCION', 'TIPO_INSTITUCION'),
    ('TIPOS_PLAN', 'TIPO PLAN CARRERA', 'TIPO_PLAN'),
    ('VIAS_INGRESO', 'VIA DE INGRESO', 'VIA_INGRESO'),
]

tables_data = [
    {'table_name': table, 'data_col': source_col, 'suffix_col': suffix}
    for table, source_col, suffix in _table_specs
]



# For every entry in tables_data, write ../seeds/<table_name>.sql containing:
# TRUNCATE, identity reset, one INSERT per distinct value, then a SELECT.
# NOTE(review): GENEROS and TIPOS_ACREDITACION also have dedicated cells that
# write the same files with different column lists; whichever runs last wins.
for table_info in tables_data:
    table_name = table_info['table_name']
    suffix = table_info['suffix_col']
    # Column names are derived from the per-table suffix.
    name_col = f'{suffix}_NOMBRE'
    desc_col = f'{suffix}_DESCRIPCION'
    id_col = f'{suffix}_ID'
    col = table_info['data_col']
    uniques = df[col].dropna().unique()
    print(f'Unique values in {col}: {len(uniques)}')

    # Strip before de-duplicating and sorting, so values that differ only by
    # surrounding whitespace produce a single INSERT.
    names = sorted({value.strip() for value in uniques})

    # Single quotes are doubled so every name is a valid SQL string literal.
    # The description column is left NULL.
    sql_queries = [
        "INSERT INTO {} ({}, {}) VALUES ('{}', NULL);".format(
            table_name, name_col, desc_col, name.replace("'", "''")
        )
        for name in names
    ]

    truncate_query = f'TRUNCATE TABLE {table_name};'
    reset_identity = get_reset_identity_query(table_name, id_col)
    select_query = f'SELECT * FROM {table_name};'
    # Layout: TRUNCATE, identity reset, INSERTs, a blank line, and the final SELECT.
    all_queries = '\n'.join([truncate_query, reset_identity] + sql_queries + [f'\n{select_query}'])

    # Mode 'w' creates the file if it does not exist and replaces it otherwise.
    # The encoding is pinned so accented characters do not depend on the
    # platform's default locale.
    file_name = f'../seeds/{table_name.lower()}.sql'
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(all_queries)

    print(f'Data processing for table {table_name} complete.')

Unique values in AREA CONOCIMIENTO: 10
Data processing for table AREAS_CONOCIMIENTO complete.
Unique values in GENERO: 2
Data processing for table GENEROS complete.
Unique values in JORNADA: 5
Data processing for table JORNADAS complete.
Unique values in MODALIDAD: 3
Data processing for table MODALIDADES complete.
Unique values in NIVEL DE ESTUDIO CARRERA: 3
Data processing for table NIVELES_FORMACION complete.
Unique values in REQUISITO INGRESO: 5
Data processing for table REQUISITOS_INGRESO complete.
Unique values in ACREDITACION INSTITUCIONAL: 2
Data processing for table TIPOS_ACREDITACION complete.
Unique values in NIVEL CARRERA: 5
Data processing for table TIPOS_EDUCACION complete.
Unique values in TIPO DE INSTITUCION: 4
Data processing for table TIPOS_INSTITUCION complete.
Unique values in TIPO PLAN CARRERA: 3
Data processing for table TIPOS_PLAN complete.
Unique values in VIA DE INGRESO: 11
Data processing for table VIAS_INGRESO complete.
