# Data Cleaning and Preparation

## Initial data exploration and cleaning

In [1]:
import pandas as pd
import json

In [2]:
# Read the raw export explicitly as UTF-8. Without an explicit encoding,
# open() falls back to a platform-dependent default, which can garble
# non-ASCII characters such as the "ä" in teacher names.
with open('ICT_Bachelors_ENG.json', encoding='utf-8') as f:
    data = json.load(f)

# Each top-level item wraps one course realization under data -> realizationById.
courses = [item['data']['realizationById'] for item in data]
# Flatten nested dicts into dotted column names (e.g. 'seats.min').
courses_df = pd.json_normalize(courses)

courses_df.head()

Unnamed: 0,id,code,title,credits,degreeProgrammes,studentGroups,__typename,objective,teachingMethods,content,...,seats.min,seats.max,seats.__typename,createdFromProgramme.id,createdFromProgramme.title,createdFromProgramme.code,createdFromProgramme.link,createdFromProgramme.__typename,office,createdFromProgramme
0,TTC2070-3016,TTC2070-3016,Project Management and Practices,4.0,"[{'id': '82940', 'title': 'Bachelor's Degree P...","[{'id': '119696', 'code': 'TTV22S5', 'title': ...",Realization,Purpose:\r\nWorking in project format is very ...,- Flipped Learning \r\n- independent study\r\n...,- basics of project work\r\n- roles of actors ...,...,20.0,70.0,Seats,50107.0,Bachelor's Degree Programme in Information and...,TTV2022SS,,Curriculum,,
1,,,,,,,,,,,...,,,,,,,,,,
2,TTC2040-3019,TTC2040-3019,Introduction to IoT systems,3.0,"[{'id': '82940', 'title': 'Bachelor's Degree P...","[{'id': '104696', 'code': 'TIC22S1', 'title': ...",Realization,Purpose and objectives:\r\nYou recognize the p...,,Parts of IoT value chain\r\n- Sensing solution...,...,20.0,35.0,Seats,50088.0,Bachelor's Degree Programme in Information and...,TIC2022SS,,Curriculum,,
3,HTGP0060-3005,HTGP0060-3005,Collaboration Tools,2.0,"[{'id': '5158', 'title': 'Bachelor's Degree Pr...","[{'id': '131501', 'code': 'HTK23S1', 'title': ...",Realization,Software development is teamwork. In order to ...,- Lectures / real-time excercises in auditoriu...,Tools for communication\r\nTools for task mana...,...,20.0,100.0,Seats,51876.0,Bachelor's Degree Programme in Business Inform...,HTG2023SS,,Curriculum,,
4,HTGP0500-3005,HTGP0500-3005,Digital Photography and Video Production,5.0,"[{'id': '5158', 'title': 'Bachelor's Degree Pr...","[{'id': '104065', 'code': 'HTG22S1', 'title': ...",Realization,The object of the course\r\nDo you want to kno...,The primary mode of delivery for this course a...,In the course you will be focused on the basic...,...,20.0,30.0,Seats,49926.0,Bachelor's Degree Programme in Business Inform...,HTG2022SS,,Curriculum,,


### Getting some information on courses and their teachers

**This is important information, used in testing whether the junction dataframe I create later is correct**

Checking which courses are taught by a specific teacher:

In [3]:
def _taught_by_kotkansalo(teachers):
    """Return True when the cell is a list containing a teacher named 'Jouko Kotkansalo'."""
    if not isinstance(teachers, list):
        # Cells that are not lists never match.
        return False
    return any(entry['name'] == 'Jouko Kotkansalo' for entry in teachers)


taught_mask = courses_df['teachers'].apply(_taught_by_kotkansalo)
filtered_df = courses_df[taught_mask]

print('Courses where Jouko Kotkansalo is the teacher: \n\n', filtered_df['title'])

Courses where Jouko Kotkansalo is the teacher: 

 2          Introduction to IoT systems
82             Future IoT Technologies
84     Digital Technology and Hardware
134            Future IoT Technologies
Name: title, dtype: object


Checking the teachers for a course that I know has more than one teacher:

In [4]:
# Select the realization by its code and take the nested teacher list of the first match.
course_row = courses_df.loc[courses_df['code'] == 'HTGP0500-3005']
teachers_list = course_row['teachers'].to_numpy()[0]

# Print one teacher name per line.
for teacher_name in (entry['name'] for entry in teachers_list):
    print(teacher_name)

Kalle Raijonkari
Jari Kuskelin


### Continuation of cleaning

In [5]:
# Remove rows in which every single field is missing.
courses_df = courses_df.dropna(how='all')
print(f'Courses after dropping empty rows: {courses_df.shape[0]}')

Courses after dropping empty rows: 159


In [6]:
# List every flattened column name so the useful ones can be selected.
courses_df.columns

Index(['id', 'code', 'title', 'credits', 'degreeProgrammes', 'studentGroups',
       '__typename', 'objective', 'teachingMethods', 'content',
       'learningMaterial', 'teachingLanguage', 'teachers', 'teachingMethod',
       'qualifications', 'evaluation', 'employerConnections', 'examSchedule',
       'internationalConnections', 'workload', 'contentScheduling',
       'informationOfCourse', 'furtherInformation', 'approveRejectDescription',
       'evaluationScale', 'curriculum', 'relatedOffering', 'educationalFields',
       'timing.start', 'timing.end', 'enrollment.start', 'enrollment.end',
       'office.id', 'office.code', 'office.title', 'office.__typename',
       'unit.id', 'unit.code', 'unit.title', 'unit.__typename',
       'teachingMethodFull.online', 'teachingMethodFull.contact',
       'teachingMethodFull.__typename', 'seats.min', 'seats.max',
       'seats.__typename', 'createdFromProgramme.id',
       'createdFromProgramme.title', 'createdFromProgramme.code',
       'crea

Useful columns in the data:

- id
- code
- title
- credits
- __typename
- objective
- teachingMethods
- content
- unit.title
- learningMaterial
- teachers
- teachingMethod
- teachingMethodFull.online
- teachingMethodFull.contact
- qualifications
- evaluation
- employerConnections
- examSchedule
- internationalConnections
- workload
- contentScheduling
- informationOfCourse
- furtherInformation
- evaluationScale
- seats.min
- seats.max
- relatedOffering

In [7]:
# Whitelist of columns that survive into the cleaned course table.
columns_to_keep = [
    'id', 'code', 'title', 'credits', '__typename',
    'objective', 'teachingMethods', 'content', 'unit.title',
    'learningMaterial', 'teachers', 'teachingMethod',
    'teachingMethodFull.online', 'teachingMethodFull.contact',
    'qualifications', 'evaluation', 'employerConnections',
    'examSchedule', 'internationalConnections', 'workload',
    'contentScheduling', 'informationOfCourse', 'furtherInformation',
    'evaluationScale', 'seats.min', 'seats.max', 'relatedOffering',
]

# Everything that is not on the whitelist gets dropped.
keep_lookup = set(columns_to_keep)
columns_to_drop = [name for name in courses_df.columns if name not in keep_lookup]
courses_df = courses_df.drop(columns=columns_to_drop)

In [8]:
# Map the camelCase / dotted source column names to snake_case names.
# Each source column appears exactly once: a repeated key in a dict literal is
# silently collapsed by Python, which can hide copy-paste mistakes.
column_renames = {
    '__typename': 'type_name',
    'teachingMethods': 'teaching_methods',
    'teachingMethod': 'teaching_method',
    'learningMaterial': 'learning_material',
    'unit.title': 'unit_title',
    'teachingMethodFull.online': 'teaching_method_online',
    'teachingMethodFull.contact': 'teaching_method_contact',
    'employerConnections': 'employer_connections',
    'examSchedule': 'exam_schedule',
    'internationalConnections': 'international_connections',
    'contentScheduling': 'content_scheduling',
    'informationOfCourse': 'course_information',
    'furtherInformation': 'further_information',
    'evaluationScale': 'evaluation_scale',
    'seats.min': 'min_seats',
    'seats.max': 'max_seats',
    'relatedOffering': 'related_offering',
}

# Reassign instead of mutating in place so the cell reads as a plain transformation.
courses_df = courses_df.rename(columns=column_renames)

Checking whether `id` and `code` are always the same; if they are, we can drop `id` and use `code` as the primary key:

In [9]:
# Boolean mask of rows whose id differs from their code.
id_differs_from_code = courses_df['id'].ne(courses_df['code'])
rows_where_unequal = len(courses_df[id_differs_from_code])
print(f'Number of rows where id and code are not equal: {rows_where_unequal}')

Number of rows where id and code are not equal: 0


In [10]:
# `id` duplicates `code` on every row, so only `code` is kept.
courses_df = courses_df.drop(columns='id')

Rename `code` column to `course_code`:

In [11]:
# Give the key column its final name.
code_rename = {'code': 'course_code'}
courses_df = courses_df.rename(columns=code_rename)

## Database design

Finding out which columns contain nested objects (lists or dicts), and making them into their own dataframes, which will be separate tables in the database:

In [12]:
def _is_nested(value):
    """Return True for list or dict cells."""
    return isinstance(value, (list, dict))


# A column counts as nested when at least one of its cells holds a list or dict.
nested_mask = courses_df.map(_is_nested).any()
object_cols = list(courses_df.columns[nested_mask])

print(f'Object columns: {object_cols}')

Object columns: ['teachers', 'related_offering']


In [13]:
# Explode the two nested columns into flat record lists: one record per
# (course, teacher) pair and one per (course, related offering) pair.
teachers_data = []
related_offering_data = []

for _, row in courses_df.iterrows():
    course_code = row['course_code']

    # Only iterate real lists. A missing cell can be NaN rather than None, and
    # NaN passes an `is not None` check but cannot be iterated.
    teachers = row['teachers']
    if isinstance(teachers, list):
        for teacher in teachers:
            teachers_data.append({
                'name': teacher['name'],
                'type': teacher['__typename'],
                'course_code': course_code,
            })

    related_offerings = row['related_offering']
    if isinstance(related_offerings, list):
        for related_offering in related_offerings:
            related_offering_data.append({
                'related_offering_code': related_offering['code'],
                'title': related_offering['title'],
                'link': related_offering['link'],
                'type': related_offering['__typename'],
                'course_code': course_code,
            })

# Explicit column lists keep both frames well-formed even when no records were collected.
teachers_df = pd.DataFrame(teachers_data, columns=['name', 'type', 'course_code'])
related_offerings_df = pd.DataFrame(
    related_offering_data,
    columns=['related_offering_code', 'title', 'link', 'type', 'course_code'],
)

In [14]:
# Preview the flattened teacher records: one row per (teacher, course) pair.
teachers_df.head()

Unnamed: 0,name,type,course_code
0,Marko Rintamäki,Teacher,TTC2070-3016
1,Jouko Kotkansalo,Teacher,TTC2040-3019
2,Juha-Tapio Teno,Teacher,HTGP0060-3005
3,Kalle Raijonkari,Teacher,HTGP0500-3005
4,Jari Kuskelin,Teacher,HTGP0500-3005


In [15]:
# Preview the flattened related-offering records: one row per (offering, course) pair.
related_offerings_df.head()

Unnamed: 0,related_offering_code,title,link,type,course_code
0,TTK20S1OTL,Ohjelmistotuotanto ja laadunvarmistus,,Curriculum,TTC2070-3016
1,TTK21KOHJ,Ohjelmistoalan osaajaksi,,Curriculum,TTC2070-3016
2,TTK22KOHJ,Ohjelmistoalan osaajaksi,,Curriculum,TTC2070-3016
3,EXSTBUSINESSINFORMATIONTECH,Exchange Studies: Business Information Technol...,,Curriculum,HTGP0500-3005
4,STUDYABROAD,Incoming students - Special Study Programmes i...,,Curriculum,HTGP0500-3005


Dropping `teachers` and `related_offering` from the course data, as they now have their own dataframes.

In [16]:
# The nested columns now live in teachers_df and related_offerings_df.
extracted_columns = ['teachers', 'related_offering']
courses_df = courses_df.drop(columns=extracted_columns)

In [17]:
# Lift the column display limit so no column is elided. This pandas option is
# global, so it also affects every DataFrame displayed after this cell.
pd.set_option('display.max_columns', None)
courses_df.head()

Unnamed: 0,course_code,title,credits,type_name,objective,teaching_methods,content,learning_material,teaching_method,qualifications,evaluation,employer_connections,exam_schedule,international_connections,workload,content_scheduling,course_information,further_information,evaluation_scale,unit_title,teaching_method_online,teaching_method_contact,min_seats,max_seats
0,TTC2070-3016,Project Management and Practices,4.0,Realization,Purpose:\r\nWorking in project format is very ...,- Flipped Learning \r\n- independent study\r\n...,- basics of project work\r\n- roles of actors ...,Materials in the Gitlab environment.\r\n\r\nht...,Online,,,- visiting lecturers\r\n- projektiharjoitus,https://ttc2070.pages.labranet.jamk.fi/en/7-Ex...,-,One credit (1 Cr) corresponds to an average of...,After course info-day you can proceed freely d...,http://ttc2070.pages.labranet.jamk.fi/en,avoinExecution of course relays heavily on an ...,Pass/Fail,School of Technology,4,0,20.0,70.0
2,TTC2040-3019,Introduction to IoT systems,3.0,Realization,Purpose and objectives:\r\nYou recognize the p...,,Parts of IoT value chain\r\n- Sensing solution...,,Contact,Basics in Programming\r\nData Networks\r\nUse ...,,,,,,,,,0-5,School of Technology,0,3,20.0,35.0
3,HTGP0060-3005,Collaboration Tools,2.0,Realization,Software development is teamwork. In order to ...,- Lectures / real-time excercises in auditoriu...,Tools for communication\r\nTools for task mana...,Course material written by the teacher as well...,Contact,Basic computer skills are required. You must b...,,,,,Approximately 54 hours.,,,Avoin amk 10 (included in the total capacity),Pass/Fail,School of Business,0,2,20.0,100.0
4,HTGP0500-3005,Digital Photography and Video Production,5.0,Realization,The object of the course\r\nDo you want to kno...,The primary mode of delivery for this course a...,In the course you will be focused on the basic...,"Long B. Complete Digital Photography, 2018\r\n...",Contact,Basics for digital media -course needs to be p...,,Possible special lectures from the experts con...,There will be no exam during this course. The ...,,Course workload is 5 ECTS = approx. 135 hours....,,This course is focused for the degree students...,Edufutura 5\r\nAvoin amk 5,0-5,School of Business,0,5,20.0,30.0
5,HTGP0120-3002,Business Basics,2.0,Realization,Understanding the business is essential for ev...,"This implementation utilizes reverse learning,...",Marketing Basics\r\nFinancial Management\r\nFi...,- Teacher's materials and slides\r\n- 1336/199...,Contact,-,,,The final exam is on week 43. On first contact...,,One credit corresponds to an average of 27 hou...,Weekly lessons / workshops.,,The course grade is determined as follows:\r\n...,0-5,School of Business,0,2,20.0,80.0


Drop `evaluation` column, as it's empty:

In [18]:
# Remove the `evaluation` column from the course table.
courses_df = courses_df.drop(columns='evaluation')

Finding out whether some courses have multiple teachers:

In [19]:
# Flag every teacher row whose course_code occurs more than once (all occurrences are kept).
shares_course = teachers_df.duplicated(subset=['course_code'], keep=False)
co_taught_rows = teachers_df[shares_course]

rows_where_unequal = len(co_taught_rows)
print(f'Number of rows where different teachers have the same course code: {rows_where_unequal}')

# Examples
co_taught_rows.head(10)

Number of rows where different teachers have the same course code: 58


Unnamed: 0,name,type,course_code
3,Kalle Raijonkari,Teacher,HTGP0500-3005
4,Jari Kuskelin,Teacher,HTGP0500-3005
6,Risto Koskenkorva,Teacher,HTGP0130-3003
7,Kalle Raijonkari,Teacher,HTGP0130-3003
8,Mika Karhulahti,Teacher,HTGP0130-3003
9,Ilari Miikkulainen,Teacher,HTGP0130-3003
22,Juha Peltomäki,Teacher,TTC8070-3006
23,Antti Häkkinen,Teacher,TTC8070-3006
29,Kalle Raijonkari,Teacher,KDMP0410-3004
30,Ilari Miikkulainen,Teacher,KDMP0410-3004


Finding out whether some teachers teach multiple courses:

In [20]:
# Flag every row whose teacher name occurs more than once (all occurrences are kept).
name_repeats = teachers_df.duplicated('name', keep=False)
duplicate_rows = teachers_df[name_repeats]

# Print some examples
examples_by_name = duplicate_rows.sort_values('name')
print(examples_by_name.head())

                    name     type    course_code
191     Alison Doolittle  Teacher  HTGP0010-3004
167     Alison Doolittle  Teacher  ZZPP0520-3223
66      Alison Doolittle  Teacher  ZZPP0420-3174
63   Anastasiia Mikhlina  Teacher  ZW00BS75-3003
187  Anastasiia Mikhlina  Teacher  ZW00BS75-3005


Creating a new dataframe storing the relationships between courses and teachers, which will be a junction table in the database:

In [21]:
# Create a unique teachers_df DataFrame where each teacher has a unique ID
unique_teachers_df = teachers_df[['name']].drop_duplicates()
unique_teachers_df['id'] = range(1, len(unique_teachers_df) + 1)

# Create a junction table by mapping the teacher names in the original DataFrame to their corresponding IDs
junction_df = teachers_df[['name', 'course_code']].copy()
junction_df = junction_df.merge(unique_teachers_df, on='name')
junction_df = junction_df[['id', 'course_code']]
junction_df.columns = ['teacher_id', 'course_code']

print(junction_df.head())

   teacher_id   course_code
0           1  TTC2070-3016
1           2  TTC2040-3019
2           2  TTC8850-3004
3           2  TTC1060-3029
4           2  TTC8850-3003


In [22]:
# Preview the de-duplicated teachers together with their generated ids.
unique_teachers_df.head()

Unnamed: 0,name,id
0,Marko Rintamäki,1
1,Jouko Kotkansalo,2
2,Juha-Tapio Teno,3
3,Kalle Raijonkari,4
4,Jari Kuskelin,5


In [23]:
# All junction rows that belong to teacher id 2.
is_teacher_two = junction_df['teacher_id'].eq(2)
junction_df.loc[is_teacher_two]

Unnamed: 0,teacher_id,course_code
1,2,TTC2040-3019
2,2,TTC8850-3004
3,2,TTC1060-3029
4,2,TTC8850-3003


In [24]:
# The same teacher looked up by name in the original teacher records.
is_kotkansalo = teachers_df['name'].eq('Jouko Kotkansalo')
teachers_df.loc[is_kotkansalo]

Unnamed: 0,name,type,course_code
1,Jouko Kotkansalo,Teacher,TTC2040-3019
96,Jouko Kotkansalo,Teacher,TTC8850-3004
98,Jouko Kotkansalo,Teacher,TTC1060-3029
159,Jouko Kotkansalo,Teacher,TTC8850-3003


In [25]:
# Look up the full course rows for the four course codes linked to this teacher.
linked_course_codes = ['TTC2040-3019', 'TTC8850-3004', 'TTC1060-3029', 'TTC8850-3003']
courses_df.loc[courses_df['course_code'].isin(linked_course_codes)]

Unnamed: 0,course_code,title,credits,type_name,objective,teaching_methods,content,learning_material,teaching_method,qualifications,employer_connections,exam_schedule,international_connections,workload,content_scheduling,course_information,further_information,evaluation_scale,unit_title,teaching_method_online,teaching_method_contact,min_seats,max_seats
2,TTC2040-3019,Introduction to IoT systems,3.0,Realization,Purpose and objectives:\r\nYou recognize the p...,,Parts of IoT value chain\r\n- Sensing solution...,,Contact,Basics in Programming\r\nData Networks\r\nUse ...,,,,,,,,0-5,School of Technology,0,3,20.0,35.0
82,TTC8850-3004,Future IoT Technologies,5.0,Realization,"Course objectives\r\nIn the course, you will l...",,"Comparison of neural networks (RL, Supervised,...",,Contact,Johdanto IoT-järjestelmiin,,,,,,,,0-5,School of Technology,0,5,0.0,35.0
84,TTC1060-3029,Digital Technology and Hardware,5.0,Realization,"After taking this course, you will understand ...",,Number systems\r\n- binary number\r\n- floatin...,,Contact,,,,,,,,,0-5,School of Technology,0,5,20.0,35.0
134,TTC8850-3003,Future IoT Technologies,5.0,Realization,"Course objectives\r\nIn the course, you will l...",,"Comparison of neural networks (RL, Supervised,...",,Online,Johdanto IoT-järjestelmiin,,,,,,,,0-5,School of Technology,5,0,0.0,35.0


These are the same courses we discovered earlier as being taught by this teacher, so the junction table is correct.

In [26]:
# Junction rows for the course checked earlier, which has more than one teacher.
is_photo_course = junction_df['course_code'].eq('HTGP0500-3005')
junction_df.loc[is_photo_course]

Unnamed: 0,teacher_id,course_code
7,4,HTGP0500-3005
17,5,HTGP0500-3005


Renaming `id` column in `unique_teachers_df` to `teacher_id` and inspecting the names for this course:

In [27]:
# Use the same key name as the junction table.
unique_teachers_df = unique_teachers_df.rename(columns={'id': 'teacher_id'})

In [28]:
# Resolve the two teacher ids found for the course back to names.
ids_for_course = [4, 5]
unique_teachers_df.loc[unique_teachers_df['teacher_id'].isin(ids_for_course)]

Unnamed: 0,name,teacher_id
3,Kalle Raijonkari,4
4,Jari Kuskelin,5


These also line up with the names we found earlier!

## Final inspection of dataframes, and saving to .csv

In [29]:
# Final look at the cleaned course table before export.
courses_df.head()

Unnamed: 0,course_code,title,credits,type_name,objective,teaching_methods,content,learning_material,teaching_method,qualifications,employer_connections,exam_schedule,international_connections,workload,content_scheduling,course_information,further_information,evaluation_scale,unit_title,teaching_method_online,teaching_method_contact,min_seats,max_seats
0,TTC2070-3016,Project Management and Practices,4.0,Realization,Purpose:\r\nWorking in project format is very ...,- Flipped Learning \r\n- independent study\r\n...,- basics of project work\r\n- roles of actors ...,Materials in the Gitlab environment.\r\n\r\nht...,Online,,- visiting lecturers\r\n- projektiharjoitus,https://ttc2070.pages.labranet.jamk.fi/en/7-Ex...,-,One credit (1 Cr) corresponds to an average of...,After course info-day you can proceed freely d...,http://ttc2070.pages.labranet.jamk.fi/en,avoinExecution of course relays heavily on an ...,Pass/Fail,School of Technology,4,0,20.0,70.0
2,TTC2040-3019,Introduction to IoT systems,3.0,Realization,Purpose and objectives:\r\nYou recognize the p...,,Parts of IoT value chain\r\n- Sensing solution...,,Contact,Basics in Programming\r\nData Networks\r\nUse ...,,,,,,,,0-5,School of Technology,0,3,20.0,35.0
3,HTGP0060-3005,Collaboration Tools,2.0,Realization,Software development is teamwork. In order to ...,- Lectures / real-time excercises in auditoriu...,Tools for communication\r\nTools for task mana...,Course material written by the teacher as well...,Contact,Basic computer skills are required. You must b...,,,,Approximately 54 hours.,,,Avoin amk 10 (included in the total capacity),Pass/Fail,School of Business,0,2,20.0,100.0
4,HTGP0500-3005,Digital Photography and Video Production,5.0,Realization,The object of the course\r\nDo you want to kno...,The primary mode of delivery for this course a...,In the course you will be focused on the basic...,"Long B. Complete Digital Photography, 2018\r\n...",Contact,Basics for digital media -course needs to be p...,Possible special lectures from the experts con...,There will be no exam during this course. The ...,,Course workload is 5 ECTS = approx. 135 hours....,,This course is focused for the degree students...,Edufutura 5\r\nAvoin amk 5,0-5,School of Business,0,5,20.0,30.0
5,HTGP0120-3002,Business Basics,2.0,Realization,Understanding the business is essential for ev...,"This implementation utilizes reverse learning,...",Marketing Basics\r\nFinancial Management\r\nFi...,- Teacher's materials and slides\r\n- 1336/199...,Contact,-,,The final exam is on week 43. On first contact...,,One credit corresponds to an average of 27 hou...,Weekly lessons / workshops.,,The course grade is determined as follows:\r\n...,0-5,School of Business,0,2,20.0,80.0


In [30]:
# Final look at the unique-teacher table before export.
unique_teachers_df.head()

Unnamed: 0,name,teacher_id
0,Marko Rintamäki,1
1,Jouko Kotkansalo,2
2,Juha-Tapio Teno,3
3,Kalle Raijonkari,4
4,Jari Kuskelin,5


In [31]:
# Final look at the teacher-course junction table before export.
junction_df.head()

Unnamed: 0,teacher_id,course_code
0,1,TTC2070-3016
1,2,TTC2040-3019
2,2,TTC8850-3004
3,2,TTC1060-3029
4,2,TTC8850-3003


**I won't be using related offerings in my application at the moment, so I'll leave this out. I want to create my own recommendations in a later feature.**

In [32]:
# Shown for reference only: this frame is not written to CSV in this notebook.
related_offerings_df.head()

Unnamed: 0,related_offering_code,title,link,type,course_code
0,TTK20S1OTL,Ohjelmistotuotanto ja laadunvarmistus,,Curriculum,TTC2070-3016
1,TTK21KOHJ,Ohjelmistoalan osaajaksi,,Curriculum,TTC2070-3016
2,TTK22KOHJ,Ohjelmistoalan osaajaksi,,Curriculum,TTC2070-3016
3,EXSTBUSINESSINFORMATIONTECH,Exchange Studies: Business Information Technol...,,Curriculum,HTGP0500-3005
4,STUDYABROAD,Incoming students - Special Study Programmes i...,,Curriculum,HTGP0500-3005


Modifying datatypes to match the database:

In [33]:
# Inspect the current dtypes before casting.
courses_df.dtypes

course_code                   object
title                         object
credits                      float64
type_name                     object
objective                     object
teaching_methods              object
content                       object
learning_material             object
teaching_method               object
qualifications                object
employer_connections          object
exam_schedule                 object
international_connections     object
workload                      object
content_scheduling            object
course_information            object
further_information           object
evaluation_scale              object
unit_title                    object
teaching_method_online        object
teaching_method_contact       object
min_seats                    float64
max_seats                    float64
dtype: object

In [34]:
# Columns stored as floating-point numbers; every other column is text.
numeric_cols = ['credits', 'min_seats', 'max_seats', 'teaching_method_online', 'teaching_method_contact']

# Cast each column exactly once from a single source of truth. Previously the
# two teaching_method_* columns were first cast to string and then to float
# because the string-cast exclusion list did not match the float list.
target_dtypes = {
    col: ('float' if col in numeric_cols else 'string')
    for col in courses_df.columns
}
courses_df = courses_df.astype(target_dtypes)

courses_df.dtypes

course_code                  string[python]
title                        string[python]
credits                             float64
type_name                    string[python]
objective                    string[python]
teaching_methods             string[python]
content                      string[python]
learning_material            string[python]
teaching_method              string[python]
qualifications               string[python]
employer_connections         string[python]
exam_schedule                string[python]
international_connections    string[python]
workload                     string[python]
content_scheduling           string[python]
course_information           string[python]
further_information          string[python]
evaluation_scale             string[python]
unit_title                   string[python]
teaching_method_online              float64
teaching_method_contact             float64
min_seats                           float64
max_seats                       

In [35]:
# Inspect the teacher table dtypes before casting.
unique_teachers_df.dtypes

name          object
teacher_id     int64
dtype: object

In [36]:
# Store teacher names with the pandas string dtype; teacher_id is left unchanged.
teacher_dtypes = {'name': 'string'}
unique_teachers_df = unique_teachers_df.astype(teacher_dtypes)
unique_teachers_df.dtypes

name          string[python]
teacher_id             int64
dtype: object

In [37]:
# Inspect the junction table dtypes before casting.
junction_df.dtypes

teacher_id      int64
course_code    object
dtype: object

In [38]:
# Store course codes with the pandas string dtype; teacher_id is left unchanged.
junction_dtypes = {'course_code': 'string'}
junction_df = junction_df.astype(junction_dtypes)
junction_df.dtypes

teacher_id              int64
course_code    string[python]
dtype: object

In [39]:
# Put the key column first.
teacher_column_order = ['teacher_id', 'name']
unique_teachers_df = unique_teachers_df.loc[:, teacher_column_order]
unique_teachers_df.head()

Unnamed: 0,teacher_id,name
0,1,Marko Rintamäki
1,2,Jouko Kotkansalo
2,3,Juha-Tapio Teno
3,4,Kalle Raijonkari
4,5,Jari Kuskelin


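Dropping `teacher_id` from the teachers dataframe, as the database will generate this automatically. Note that `junction.csv` still refers to these IDs (numbered from 1 in row order), so the teachers must be inserted in this same order, into a table whose generated IDs start at 1, for the references to line up: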

In [41]:
# Leave only the `name` column in the teachers table.
unique_teachers_df = unique_teachers_df.drop(columns='teacher_id')

Finally, saving the dataframes to .csv files:

In [42]:
# Write each table to its own CSV file, without the pandas index and with a
# backslash as the escape character.
exports = {
    'courses.csv': courses_df,
    'teachers.csv': unique_teachers_df,
    'junction.csv': junction_df,
}
for csv_name, frame in exports.items():
    frame.to_csv(csv_name, index=False, escapechar='\\')