# Gender dataset cleaning pipeline 1st iteration

## 1) Cleaning datasets from 2016 to 2022

In [42]:
# Importing libraries
import mysql.connector
from mysql.connector import Error
import os
import pandas as pd

In [43]:
# Clean one yearly gender export and store the cleaned copy as a new CSV file
def clean_and_save_dataframe(year, input_dir="gender", output_dir="./gender_cleaned/"):
    """Strip study-start suffixes from the programme names of one yearly file.

    Reads ``<input_dir>/accepted_gender_<year>.csv`` (the first three lines of
    the file are skipped), removes the Danish and English study-start suffixes
    from the ``OptNavn`` column and writes the result, without the index, to
    ``<output_dir>/accepted_gender_clean_<year>.csv``.

    Parameters
    ----------
    year : int
        Year used to build both the input and the output file name.
    input_dir : str, optional
        Directory holding the raw yearly files. Defaults to ``"gender"``.
    output_dir : str, optional
        Directory receiving the cleaned files; it is created when missing.
        Defaults to ``"./gender_cleaned/"``.

    Returns
    -------
    None
        When the input file does not exist a message is printed and nothing
        is written.
    """
    # Suffixes that only describe the study-start term, not the programme
    study_start_suffixes = (
        ', Studiestart: sommerstart',
        ', Studiestart: vinterstart',
        ', Studiestart: sommer- og vinterstart',
        ', Study start: Summer start',
        ', Study start: Winter start',
    )

    csv_file = os.path.join(input_dir, f"accepted_gender_{year}.csv")
    try:
        gender_df = pd.read_csv(csv_file, skiprows=3)
    except FileNotFoundError:
        # Best effort: report the missing year and let the caller continue.
        # Only the read is guarded so that write problems are not misreported
        # as a missing input file.
        print(f"File for year {year} not found.")
        return

    for suffix in study_start_suffixes:
        # regex=False: the suffixes are literal text, and the default value of
        # `regex` differs between pandas versions
        gender_df['OptNavn'] = gender_df['OptNavn'].str.replace(suffix, '', regex=False)

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(output_dir, exist_ok=True)

    file_path = os.path.join(output_dir, f'accepted_gender_clean_{year}.csv')
    gender_df.to_csv(file_path, index=False)

# Clean every yearly export, 2016 through 2022 inclusive (range end is exclusive)
for year in range(2016, 2022 + 1):
    clean_and_save_dataframe(year)


## 2) Connecting to SQL server

In [44]:
# Reusable helper that connects to a MySQL server (no database selected)
def create_server_connection(host_name, user_name, user_password):
    """Open a connection to a MySQL server.

    Parameters
    ----------
    host_name : str
        Passed to ``mysql.connector.connect`` as ``host``.
    user_name : str
        Passed as ``user``.
    user_password : str
        Passed as ``passwd``.

    Returns
    -------
    The object returned by ``mysql.connector.connect``, or ``None`` when the
    connector raises ``Error`` (the error is printed, not re-raised).
    """
    login = {
        "host": host_name,
        "user": user_name,
        "passwd": user_password,
    }
    try:
        server = mysql.connector.connect(**login)
    except Error as exc:
        print(f"Error: '{exc}'")
        return None

    print("MySQL Database connection successful")
    return server
# Smoke-test the helper, then release the connection instead of leaving an
# unreferenced connection open for the rest of the kernel session.
# NOTE(review): the credentials are hard-coded; consider reading them from
# environment variables or a prompt instead.
probe_connection = create_server_connection('localhost', 'root', '1234')
if probe_connection is not None:
    probe_connection.close()

MySQL Database connection successful


<mysql.connector.connection.MySQLConnection at 0x226e8e23f40>

### 2.1) Creating database and connecting

In [45]:
# Helper that runs a database-creating statement on an open server connection
def create_database(connection, query):
    """Execute ``query`` on a cursor obtained from ``connection``.

    Prints "Database created successfully" when the statement runs, or the
    error text when the connector raises ``Error``. Nothing is returned and
    the cursor is not explicitly closed.

    Parameters
    ----------
    connection
        Object exposing ``cursor()``.
    query : str
        SQL statement to execute.
    """
    db_cursor = connection.cursor()
    try:
        db_cursor.execute(query)
    except Error as exc:
        print(f"Error: '{exc}'")
    else:
        print("Database created successfully")

# 1. Connect to the MySQL server (no database selected yet)
# NOTE(review): the credentials are hard-coded; consider reading them from
# environment variables or a prompt instead.
connection = create_server_connection('localhost', 'root', '1234')

# 2. SQL that creates the project database. IF NOT EXISTS keeps this cell
#    re-runnable: a plain CREATE DATABASE fails once the database exists.
#    The spelling "distibution" is kept because the other cells connect to
#    the database under exactly this name.
create_database_query = "CREATE DATABASE IF NOT EXISTS university_distibution"

# 3. Run the statement through the helper defined above
create_database(connection, create_database_query)

MySQL Database connection successful
Database created successfully


Connecting to database

In [46]:
# Helper that connects to one specific database on a MySQL server
def create_db_connection(host_name, user_name, user_password, db_name):
    """Open a connection to the database ``db_name`` on a MySQL server.

    Parameters
    ----------
    host_name : str
        Passed to ``mysql.connector.connect`` as ``host``.
    user_name : str
        Passed as ``user``.
    user_password : str
        Passed as ``passwd``.
    db_name : str
        Passed as ``database``.

    Returns
    -------
    The object returned by ``mysql.connector.connect``, or ``None`` when the
    connector raises ``Error`` (the error is printed, not re-raised).
    """
    login = {
        "host": host_name,
        "user": user_name,
        "passwd": user_password,
        "database": db_name,
    }
    try:
        db_link = mysql.connector.connect(**login)
    except Error as exc:
        print(f"Error: '{exc}'")
        return None

    print("MySQL Database connection successful")
    return db_link

# Verify that the new database is reachable, then release the probe connection
# instead of leaving an unreferenced connection open.
# NOTE(review): the credentials are hard-coded; consider reading them from
# environment variables or a prompt instead.
db_probe = create_db_connection('localhost', 'root', '1234', 'university_distibution')
if db_probe is not None:
    db_probe.close()

MySQL Database connection successful


<mysql.connector.connection.MySQLConnection at 0x226e8e20100>

Defining execute functions

In [47]:
# Helper that executes one SQL statement and commits it
def execute_query(connection, query):
    """Run ``query`` on a cursor from ``connection`` and commit.

    Prints "Query successful" after the commit, or the error text when the
    connector raises ``Error`` during execute or commit. Nothing is returned
    and the cursor is not explicitly closed.

    Parameters
    ----------
    connection
        Object exposing ``cursor()`` and ``commit()``.
    query : str
        SQL statement to execute.
    """
    db_cursor = connection.cursor()
    try:
        db_cursor.execute(query)
        connection.commit()
    except Error as exc:
        print(f"Error: '{exc}'")
    else:
        print("Query successful")

Creating tables 

In [48]:
# Create the raw per-year gender table
def table_creator(year):
    """Create the table ``gender_<year>`` unless it already exists.

    The statement is sent with ``execute_query`` over the notebook-level
    ``connection``. ``IF NOT EXISTS`` makes the cell re-runnable: a plain
    CREATE TABLE fails as soon as the table is present.

    Parameters
    ----------
    year : int
        Year suffix of the table name.
    """
    create_gender_table = f"""
    CREATE TABLE IF NOT EXISTS gender_{year} (
        InstNr int,
        InstNavn varchar(200),
        OptNr int,
        OptNavn varchar(200),
        Mand int,
        Kvinder int,
        I_alt int
    );
    """
    # Run the statement on the shared connection
    execute_query(connection, create_gender_table)

# Open the project-database connection that table_creator() sends its SQL over
connection = create_db_connection(
    host_name="localhost",
    user_name="root",
    user_password="1234",
    db_name='university_distibution',
)

# Create one table per year, 2016 through 2022 inclusive (range end is exclusive)
for year in range(2016, 2022 + 1):
    table_creator(year)

MySQL Database connection successful
Query successful
Query successful
Query successful
Query successful
Query successful
Query successful
Query successful


## 3) Loading data from .csv into tables from 2016-2022

In [49]:
# Statement that switches on the server-wide local_infile option
# NOTE(review): csv_loader() issues LOAD DATA INFILE without the LOCAL keyword,
# while local_infile governs LOCAL loads - confirm this setting is still needed.
set_global = "\nSET GLOBAL local_infile=1;\n"

# Reconnect to the project database and apply the setting
connection = create_db_connection(
    host_name="localhost",
    user_name="root",
    user_password="1234",
    db_name='university_distibution',
)
execute_query(connection, set_global)

MySQL Database connection successful
Query successful


In [50]:

# Load one cleaned yearly CSV file into the table of the same year
def csv_loader(
    year,
    csv_dir='C:/Projects/SQL_Portfolio project/Videre_gaaende_uddannelser/python/gender_cleaned',
):
    """Bulk-load ``accepted_gender_clean_<year>.csv`` into ``gender_<year>``.

    Builds a ``LOAD DATA INFILE`` statement (comma-separated fields, optional
    double-quote enclosure, CR+LF line endings, header line ignored) and sends
    it with ``execute_query`` over the notebook-level ``connection``.

    The table has no key, so running the loader again for the same year
    appends the rows a second time.

    Parameters
    ----------
    year : int
        Year used for both the file name and the table name.
    csv_dir : str, optional
        Directory containing the cleaned CSV files. Use forward slashes; the
        value is placed inside a SQL string literal. Defaults to the path the
        notebook was originally written for.
    """
    # Tolerate a trailing slash so the joined path never contains "//"
    csv_path = f"{csv_dir.rstrip('/')}/accepted_gender_clean_{year}.csv"

    # Python turns the \r\n escape below into real CR+LF characters inside the
    # SQL string literal, which is what the line terminator clause receives
    load_csv_gender = f"""
    LOAD DATA INFILE
        '{csv_path}'
    INTO TABLE
        gender_{year}
    FIELDS TERMINATED BY ','
    ENCLOSED BY '"'
    LINES TERMINATED BY '\r\n'
    IGNORE 1 LINES;
    """
    # Run the statement on the shared connection
    execute_query(connection, load_csv_gender)
    
# Reconnect to the project database; csv_loader() sends its SQL over `connection`
connection = create_db_connection(
    host_name="localhost",
    user_name="root",
    user_password="1234",
    db_name='university_distibution',
)

# Load every cleaned file, 2016 through 2022 inclusive (range end is exclusive)
for year in range(2016, 2022 + 1):
    csv_loader(year)

MySQL Database connection successful
Query successful
Query successful
Query successful
Query successful
Query successful
Query successful
Query successful


## 4) Data analysis

### 4.1) Academic Major Gender Distribution

Creating a new view for each year that groups the rows by institution and programme name (`OptNavn`), sums the number of students, and shows the gender distribution both in absolute numbers and as a percentage share.

In [51]:
# Create the per-programme gender distribution view for one year
def gender_dist_creator(year):
    """Create or replace the view ``gender_dist_<year>``.

    The view groups ``gender_<year>`` by institution number, institution name
    and programme name and exposes, per group: the highest ``optnr``, the
    summed male, female and total counts, and the male/female share of the
    total in percent rounded to two decimals.

    ``CREATE OR REPLACE`` keeps the cell re-runnable: a plain CREATE VIEW
    fails once the view exists. The statement is sent with ``execute_query``
    over the notebook-level ``connection``.

    Parameters
    ----------
    year : int
        Year suffix of the source table, the view and its column aliases.
    """
    gender_view = f"""
    CREATE OR REPLACE VIEW gender_dist_{year} AS
    Select
        instnr,
        InstNavn as instname,
        Max(optnr) as OptNr,
        optnavn as optname,
        sum(mand) as male_{year},
        round(sum(mand)/sum(i_alt)*100,2) as male_pct_{year},
        sum(kvinder) as female_{year},
        round(sum(kvinder)/sum(i_alt)*100,2) as female_pct_{year},
        sum(i_alt) as total_{year}
    from gender_{year}
    group by
        InstNr,
        InstNavn,
        OptNavn
    order by
        InstNr;
    """
    execute_query(connection, gender_view)

# Build the per-programme view for each year, 2016 through 2022 inclusive
for year in range(2016, 2022 + 1):
    gender_dist_creator(year)

Query successful
Query successful
Query successful
Query successful
Query successful
Query successful
Query successful


### 4.2) University Gender Distribution

Grouping all academic majors from the same university and calculating the gender distribution.

In [55]:
# Create the per-university gender distribution view for one year
def gender_dist_uni(year):
    """Create or replace the view ``gender_dist_uni_<year>``.

    The view aggregates ``gender_dist_<year>`` by institution number and name
    and exposes the summed male, female and total counts together with the
    male/female share of the total in percent rounded to two decimals.

    ``CREATE OR REPLACE`` keeps the cell re-runnable: a plain CREATE VIEW
    fails once the view exists. The statement is sent with ``execute_query``
    over the notebook-level ``connection``.

    Parameters
    ----------
    year : int
        Year suffix of the source view, the new view and its column aliases.
    """
    gender_uni_view = f"""
    CREATE OR REPLACE VIEW gender_dist_uni_{year} as
    select
        instnr,
        instname,
        sum(male_{year}) as male_{year},
        round(sum(male_{year})/sum(total_{year})*100,2) as male_pct_{year},
        sum(female_{year}) as female_{year},
        round(sum(female_{year})/sum(total_{year})*100,2) as female_pct_{year},
        sum(total_{year}) as total_{year}
    from gender_dist_{year}
    group by
        instnr, instname
    order by
        instnr;
    """
    execute_query(connection, gender_uni_view)

# Build the per-university view for each year, 2016 through 2022 inclusive
for year in range(2016, 2022 + 1):
    gender_dist_uni(year)

Query successful
Query successful
Query successful
Query successful
Query successful
Query successful
Query successful


### 4.3) Total Gender Distribution

Grouping the rows by university and calculating each individual university's share of the total number of students.

In [61]:
# Create the view with each university's share of all students for one year
def student_dist(year):
    """Create or replace the view ``total_student_<year>``.

    The view groups ``gender_<year>`` by institution number and name and
    exposes the institution's summed total together with that total as a
    percentage (rounded to two decimals) of the sum over the whole table,
    ordered by the total in descending order.

    ``CREATE OR REPLACE`` keeps the cell re-runnable: a plain CREATE VIEW
    fails once the view exists. The statement is sent with ``execute_query``
    over the notebook-level ``connection``.

    Parameters
    ----------
    year : int
        Year suffix of the source table, the view and its column aliases.
    """
    student_dist_view = f"""
    CREATE OR REPLACE VIEW total_student_{year} as
    SELECT
        instnr,
        instnavn,
        round(sum(i_alt) / (select sum(i_alt) from gender_{year}) * 100,2) as pct_{year},
        SUM(i_alt) AS Total_{year}
    FROM gender_{year}
    GROUP BY
        instnr,
        instnavn
    Order by total_{year} desc;
    """
    execute_query(connection, student_dist_view)
    
# Build the student-share view for each year, 2016 through 2022 inclusive
for year in range(2016, 2022 + 1):
    student_dist(year)


Query successful
Query successful
Query successful
Query successful
Query successful
Query successful
Query successful
