# Database Refactor
This notebook refactors the cleaned CSV files into a database. The new database uses a different schema from the original CSV files to make the data easier to query and analyze. The new schema includes the following tables:

- `buildings`
- `spaces`
- `subjects`
- `subject_groups`
- `subject_group_sessions`

Import requirements

In [52]:
import pandas as pd

In [78]:
# Load the cleaned "ubicaciones" CSV into a DataFrame and preview its first rows
ubicaciones = pd.read_csv(filepath_or_buffer='../databases/cleaned-data/ubicaciones_cleaned.csv')
ubicaciones.head(n=5)

Unnamed: 0,ID_UBICACIO,DS_UBICACIO,ID_EDIFICI,CAPACIDAD
0,Q1/1003,aula q1/1003 (dues portes),Q,70
1,Q3/0013,aula q3/0013,Q,33
2,Q4/1013,aula q4/1013 (dues portes),Q,69
3,Q1/0007,aula d'informàtica a (2 portes,Q,68
4,Q6/2008,laboratori,Q,20


In [79]:
# Load the merged group-calendar CSV into a DataFrame and preview its first rows
calendar_group = pd.read_csv(filepath_or_buffer='../databases/cleaned-data/calendario_grupos_merged.csv')
calendar_group.head(n=5)

Unnamed: 0,ID_GRUPO,ID_FECHA_GRUPO,ID_HORA_INICIO,ID_HORA_FIN,ID_CURSO_ACADEMICO,ID_ASIGNATURA,ID_TIPO_DOCENCIA,ID_COD_GRUPO,ID_PERIODO_DOCENTE,IND_ALUMNOS_GRUPO_PREV,IND_ALUMNOS_GRUPO_REAL,IND_HORAS_PREVISTAS
0,2024-0-115-102708-54-311,2025-07-04,930,1030,2024,102708,54,311,1,45,50,6.0
1,2024-0-115-102764-54-472,2025-07-04,1700,1900,2024,102764,54,472,1,37,44,50.0
2,2024-0-115-104554-54-1,2025-07-03,1700,1900,2024,104554,54,1,1,23,21,12.0
3,2024-0-115-102764-54-472,2025-06-27,1700,1900,2024,102764,54,472,1,37,44,50.0
4,2024-0-115-102708-54-311,2025-06-27,930,1030,2024,102708,54,311,1,45,50,6.0


In [80]:
# Distinct building codes present in the "ID_EDIFICI" column
edficis = ubicaciones['ID_EDIFICI'].unique()

# One row per building: "id" is the raw code and "name" is the code prefixed
# with "Edifici ". Rows are ordered by code and "id" becomes the index.
buildings = (
    pd.DataFrame({'id': edficis})
    .assign(name=lambda frame: 'Edifici ' + frame['id'].astype(str))
    .sort_values(by='id')
    .set_index('id')
)

buildings.head()

Unnamed: 0_level_0,name
id,Unnamed: 1_level_1
B,Edifici B
C,Edifici C
Q,Edifici Q


In [82]:
# Build the "spaces" table from the locations data.
# Source column -> target column:
#   ID_UBICACIO -> id, DS_UBICACIO -> name, ID_EDIFICI -> building_id, CAPACIDAD -> capacity
spaces = (
    ubicaciones
    .rename(columns={
        'ID_UBICACIO': 'id',
        'DS_UBICACIO': 'name',
        'ID_EDIFICI': 'building_id',
        'CAPACIDAD': 'capacity',
    })
    # Keep only the renamed columns, in the order the table expects
    [['id', 'name', 'building_id', 'capacity']]
    # Every space is flagged as active
    .assign(is_active=True)
    # Order by "id" and use it as the index
    .sort_values(by='id')
    .set_index('id')
)

spaces.head(5)

Unnamed: 0_level_0,name,building_id,capacity,is_active
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
B0/-106,aula d'informàtica a,B,97,True
B0/-124,aula informatica b,B,60,True
B0/-126,aula informatica c,B,65,True
B1/-1026,aula 1,B,32,True
B1/-1028,aula 2,B,57,True


In [76]:
# Build the "subjects" table (indexed by "id", with "building_id", "name" and "period").

# The composite group id looks like 2024-0-115-102708-54-311; its third
# number (115 in the example) is the building code.
group_ids = calendar_group['ID_GRUPO']

# expand=False returns a Series (one capture group) instead of a one-column
# DataFrame, so it can be assigned and mapped directly.
building_codes = group_ids.str.extract(r'[\d]+-[\d]+-([\d]+)-[\d]+-[\d]+-[\d]+', expand=False)

# Store the extracted code on calendar_group as "ID_EDIFICI"
# (this adds a column to the shared calendar_group frame).
calendar_group['ID_EDIFICI'] = building_codes

# Distinct building codes found in the group ids
building_ids = building_codes.unique()

# Relationship between the extracted building code and the real building id
building_relations = {'115': "Q"}

# Assemble the table; codes missing from building_relations map to NaN
subjects = pd.DataFrame({
    'id': calendar_group['ID_ASIGNATURA'],
    'building_id': building_codes.map(building_relations),
    'name': 'Asignatura ' + calendar_group['ID_ASIGNATURA'].astype(str),
    'period': calendar_group['ID_PERIODO_DOCENTE'],
})

subjects = (
    subjects
    # Drop only the rows whose building could not be mapped; a bare dropna()
    # would also discard rows with a missing value in any other column.
    .dropna(subset=['building_id'])
    # Many calendar rows share the same subject; keep one row per combination
    .drop_duplicates()
    # Order by "id" and use it as the index
    .sort_values(by='id')
    .set_index('id')
)

subjects

Unnamed: 0_level_0,building_id,name,period
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
44728,Q,Asignatura 44728,1
44729,Q,Asignatura 44729,1
44730,Q,Asignatura 44730,1
44731,Q,Asignatura 44731,1
44732,Q,Asignatura 44732,1
...,...,...,...
106939,Q,Asignatura 106939,1
106940,Q,Asignatura 106940,1
106941,Q,Asignatura 106941,1
106942,Q,Asignatura 106942,1


In [74]:
# Build the "subject_groups" table with the columns "id", "subject_id", "year" and "duration".
# "duration" is only declared here; this cell leaves it empty.
subject_groups = pd.DataFrame(columns=['id', 'subject_id', 'year', 'duration'])

# Target column -> source column of calendar_group
group_column_sources = {
    'id': 'ID_COD_GRUPO',
    'subject_id': 'ID_ASIGNATURA',
    'year': 'ID_CURSO_ACADEMICO',
}
for target_column, source_column in group_column_sources.items():
    subject_groups[target_column] = calendar_group[source_column]

# NOTE(review): "id" comes from ID_COD_GRUPO, which repeats across subjects
# (the displayed output shows id 1 for several subject_id values), so the
# resulting index is not unique - confirm this is intended.
subject_groups = (
    subject_groups
    # Collapse repeated (id, subject_id, year) combinations
    .drop_duplicates()
    # Order by "id" and use it as the index
    .sort_values(by='id')
    .set_index('id')
)

subject_groups

Unnamed: 0_level_0,subject_id,year,duration
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,44757,2024,
1,101739,2024,
1,101738,2024,
1,44733,2024,
1,44732,2024,
...,...,...,...
812,104345,2024,
813,104352,2024,
813,104339,2024,
813,104345,2024,


In [60]:
# Preview the first rows of calendar_group in its current state
calendar_group.head(n=5)

Unnamed: 0,ID_GRUPO,ID_FECHA_GRUPO,ID_HORA_INICIO,ID_HORA_FIN,ID_CURSO_ACADEMICO,ID_ASIGNATURA,ID_TIPO_DOCENCIA,ID_COD_GRUPO,ID_PERIODO_DOCENTE,IND_ALUMNOS_GRUPO_PREV,IND_ALUMNOS_GRUPO_REAL,IND_HORAS_PREVISTAS,ID_EDIFICI
0,2024-0-115-102708-54-311,2025-07-04,930,1030,2024,102708,54,311,1,45,50,6.0,115
1,2024-0-115-102764-54-472,2025-07-04,1700,1900,2024,102764,54,472,1,37,44,50.0,115
2,2024-0-115-104554-54-1,2025-07-03,1700,1900,2024,104554,54,1,1,23,21,12.0,115
3,2024-0-115-102764-54-472,2025-06-27,1700,1900,2024,102764,54,472,1,37,44,50.0,115
4,2024-0-115-102708-54-311,2025-06-27,930,1030,2024,102708,54,311,1,45,50,6.0,115


In [70]:
def _hhmm_to_clock(hhmm):
    """Format HHMM values (e.g. 930) as zero-padded "HH:MM" strings (e.g. "09:30")."""
    padded = hhmm.astype(str).str.zfill(4)
    return padded.str[:2] + ':' + padded.str[2:]


# Build the "subject_group_sessions" table with the columns "id", "subject_group_id",
# "space_id", "day", "start", "end", "planned_capacity", "real_capacity" and "is_morning"
# (a "duration" column is appended further down).
# NOTE(review): "space_id" is declared but never filled in this cell, so it stays empty.
subject_group_sessions = pd.DataFrame(columns=['id', 'subject_group_id', 'space_id', 'day', 'start', 'end', 'planned_capacity', 'real_capacity', 'is_morning'])

# Target column -> source column of calendar_group
session_column_sources = {
    'subject_group_id': 'ID_COD_GRUPO',
    'day': 'ID_FECHA_GRUPO',
    'start': 'ID_HORA_INICIO',
    'end': 'ID_HORA_FIN',
    'planned_capacity': 'IND_ALUMNOS_GRUPO_PREV',
    'real_capacity': 'IND_ALUMNOS_GRUPO_REAL',
}
for session_column, calendar_column in session_column_sources.items():
    subject_group_sessions[session_column] = calendar_group[calendar_column]

# A session is a morning session when its HHMM start time is before 1430 (14:30)
subject_group_sessions['is_morning'] = calendar_group['ID_HORA_INICIO'].astype(int) < 1430

# Replace the calendar date with its day of the week (0 = Monday ... 6 = Sunday)
subject_group_sessions['day'] = pd.to_datetime(subject_group_sessions['day'], format='%Y-%m-%d').dt.dayofweek

# Render start and end as "HH:MM" strings (930 -> "09:30")
subject_group_sessions['start'] = _hhmm_to_clock(subject_group_sessions['start'])
subject_group_sessions['end'] = _hhmm_to_clock(subject_group_sessions['end'])

# Session duration in seconds, computed as end time minus start time
clock_format = '%H:%M'
session_length = (
    pd.to_datetime(subject_group_sessions['end'], format=clock_format)
    - pd.to_datetime(subject_group_sessions['start'], format=clock_format)
)
subject_group_sessions['duration'] = session_length.dt.total_seconds().astype(int)

# NOTE(review): rows carry only the group code, not the subject, so identical
# sessions of different subjects sharing a group code collapse into one row
# here - confirm this is intended.
subject_group_sessions = (
    subject_group_sessions
    .drop_duplicates()
    .sort_values(by=['subject_group_id', 'day', 'start', 'end'])
)

# Number the remaining rows from 1 and use that number as the "id" index
subject_group_sessions['id'] = range(1, len(subject_group_sessions) + 1)
subject_group_sessions = subject_group_sessions.set_index('id')

subject_group_sessions

Unnamed: 0_level_0,subject_group_id,space_id,day,start,end,planned_capacity,real_capacity,is_morning,duration
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1,,0,09:30,11:30,17,18,True,7200
2,1,,0,11:30,12:30,17,18,True,3600
3,1,,0,12:30,14:00,17,18,True,5400
4,1,,0,12:30,14:30,20,16,True,7200
5,1,,0,15:00,17:00,74,70,False,7200
...,...,...,...,...,...,...,...,...,...
1306,812,,4,13:30,14:30,30,34,True,3600
1307,813,,0,15:00,17:00,20,23,False,7200
1308,813,,2,08:30,11:30,20,21,True,10800
1309,813,,2,15:00,17:00,17,20,False,7200


In [75]:
# Total session duration (in seconds) for each "subject_group_id"
subject_group_sessions_duration = (
    subject_group_sessions['duration']
    .groupby(subject_group_sessions['subject_group_id'])
    .sum()
)

# Fill the "duration" column of subject_groups; values are aligned on the
# subject_groups index ("id"), which holds the same group codes.
# NOTE(review): the total is per group code only, so every subject sharing a
# group code receives the same summed duration (visible in the displayed
# output, where all rows with id 1 show the same value) - confirm this is intended.
subject_groups['duration'] = subject_group_sessions_duration

subject_groups

Unnamed: 0_level_0,subject_id,year,duration
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,44757,2024,369000
1,101739,2024,369000
1,101738,2024,369000
1,44733,2024,369000
1,44732,2024,369000
...,...,...,...
812,104345,2024,79200
813,104352,2024,28800
813,104339,2024,28800
813,104345,2024,28800
