# Database Refactor
This notebook refactors the cleaned CSV files into a database. The new database uses a different schema from the original CSV files so that the data is easier to query and analyze. The new schema will have the following tables:

Import requirements

In [1]:
import pandas as pd

In [11]:
# Load the cleaned locations table from disk and preview its first rows
ubicaciones = pd.read_csv(
    filepath_or_buffer='../databases/cleaned-data/ubicaciones_cleaned.csv',
)
ubicaciones.head(5)

Unnamed: 0,ID_UBICACIO,DS_UBICACIO,ID_EDIFICI,CAPACIDAD
0,Q1/1003,aula q1/1003 (dues portes),Q,70
1,Q3/0013,aula q3/0013,Q,33
2,Q4/1013,aula q4/1013 (dues portes),Q,69
3,Q1/0007,aula d'informàtica a (2 portes,Q,68
4,Q6/2008,laboratori,Q,20


In [12]:
# Distinct building codes that appear in the locations table
edficis = ubicaciones['ID_EDIFICI'].unique()

# Build the "buildings" table in a single chain:
#   - one row per distinct building code, stored in "id"
#   - "name" is the code prefixed with "Edifici "
#   - rows are ordered by "id", which then becomes the index
buildings = (
    pd.DataFrame({'id': edficis})
    .assign(name=lambda frame: 'Edifici ' + frame['id'].astype(str))
    .sort_values(by='id')
    .set_index('id')
)

buildings.head()

Unnamed: 0_level_0,name
id,Unnamed: 1_level_1
B,Edifici B
C,Edifici C
Q,Edifici Q


In [10]:
# Mapping from the source column names to the "sites" table column names
site_columns = {
    'ID_UBICACIO': 'id',
    'DS_UBICACIO': 'name',
    'CAPACIDAD': 'capacity',
}

# Build the "sites" table in a single chain:
#   - keep only the mapped source columns and rename them
#   - flag every row with is_active = True
#   - order the rows by "id", which then becomes the index
sites = (
    ubicaciones[list(site_columns)]
    .rename(columns=site_columns)
    .assign(is_active=True)
    .sort_values(by='id')
    .set_index('id')
)

sites.head(5)

Unnamed: 0_level_0,name,capacity,is_active
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B0/-106,aula d'informàtica a,97,True
B0/-124,aula informatica b,60,True
B0/-126,aula informatica c,65,True
B1/-1026,aula 1,32,True
B1/-1028,aula 2,57,True


In [14]:
# Load the merged group-calendar table from disk and preview its first rows
calendar_group = pd.read_csv(
    filepath_or_buffer='../databases/cleaned-data/calendario_grupos_merged.csv',
)
calendar_group.head(5)

Unnamed: 0,ID_GRUPO,ID_FECHA_GRUPO,ID_HORA_INICIO,ID_HORA_FIN,ID_CURSO_ACADEMICO,ID_ASIGNATURA,ID_TIPO_DOCENCIA,ID_COD_GRUPO,ID_PERIODO_DOCENTE,IND_ALUMNOS_GRUPO_PREV,IND_ALUMNOS_GRUPO_REAL,IND_HORAS_PREVISTAS
0,2024-0-115-102708-54-311,2025-07-04,930,1030,2024,102708,54,311,1,45,50,6.0
1,2024-0-115-102764-54-472,2025-07-04,1700,1900,2024,102764,54,472,1,37,44,50.0
2,2024-0-115-104554-54-1,2025-07-03,1700,1900,2024,104554,54,1,1,23,21,12.0
3,2024-0-115-102764-54-472,2025-06-27,1700,1900,2024,102764,54,472,1,37,44,50.0
4,2024-0-115-102708-54-311,2025-06-27,930,1030,2024,102708,54,311,1,45,50,6.0
