In [60]:
import os
import pandas as pd

In [61]:

def extract(raw_dir='raw'):
    """Read the raw CSV exports into DataFrames.

    Parameters
    ----------
    raw_dir : str, default 'raw'
        Directory that holds the source CSV files. The default keeps the
        original behaviour of reading from ``raw/`` relative to the current
        working directory.

    Returns
    -------
    dict
        Maps each table name ('Employees', 'Users', 'EmployeeDept', 'Skills',
        'EmployeeSkillProficiency', 'Assessments', 'Courses',
        'EmployeeCertifications') to the DataFrame read from its CSV file.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when one of the files is missing.
    """
    # Table name -> CSV file name. Dict order determines the order of the
    # returned mapping.
    csv_files = {
        'Employees': 'employees.csv',
        'Users': 'users.csv',
        'EmployeeDept': 'employee_dept.csv',
        'Skills': 'skills.csv',
        'EmployeeSkillProficiency': 'employee_skill_proficiency.csv',
        'Assessments': 'assessments.csv',
        'Courses': 'courses.csv',
        'EmployeeCertifications': 'employee_certifications.csv',
    }

    return {
        table_name: pd.read_csv(os.path.join(raw_dir, file_name))
        for table_name, file_name in csv_files.items()
    }


In [62]:
# Read every raw CSV into a dict of DataFrames keyed by table name.
tables = extract()

In [63]:
def transform(tables):
    """Clean and normalise the extracted tables.

    Parameters
    ----------
    tables : dict
        Mapping of table name to DataFrame, as returned by ``extract()``.
        Must contain the keys 'Employees', 'Users', 'EmployeeDept', 'Skills',
        'EmployeeSkillProficiency', 'Assessments', 'Courses' and
        'EmployeeCertifications'.

    Returns
    -------
    dict
        The same eight keys mapped to cleaned copies of the input frames.
        The input frames are not modified, so the function can be called
        repeatedly on the same ``tables`` dict.

    Raises
    ------
    KeyError
        If a required table or column is missing.

    Notes
    -----
    * Date columns are parsed with ``errors='coerce'``: unparseable values
      become ``NaT`` instead of raising.
    * ``experience`` becomes a nullable ``Int64`` column holding the first
      run of digits of the original text; rows without digits become ``<NA>``.
    """
    # --- Employees ---------------------------------------------------------
    # Work on a copy: mutating the caller's frame would make a second call
    # fail (``.str`` on an already-numeric ``experience`` column).
    df_employees = tables['Employees'].copy()
    for col in ('birth_date', 'hire_date'):
        df_employees[col] = pd.to_datetime(df_employees[col], errors='coerce')
    # "3 years" -> 3. ``expand=False`` yields a Series; the nullable Int64
    # dtype keeps missing/unparseable rows as <NA> instead of raising the way
    # ``astype(int)`` does on NaN.
    experience_digits = df_employees['experience'].str.extract(r'(\d+)', expand=False)
    df_employees['experience'] = pd.to_numeric(experience_digits, errors='coerce').astype('Int64')
    df_employees['gender'] = df_employees['gender'].str.capitalize()
    df_employees['activeStatus'] = df_employees['activeStatus'].str.capitalize()

    # --- Users -------------------------------------------------------------
    df_users = tables['Users'].copy()
    df_users['created_at'] = pd.to_datetime(df_users['created_at'], errors='coerce')

    # --- EmployeeDept ------------------------------------------------------
    df_employee_dept = tables['EmployeeDept'].copy()
    for col in ('from', 'to'):
        df_employee_dept[col] = pd.to_datetime(df_employee_dept[col], errors='coerce')
    df_employee_dept['activeStatus'] = df_employee_dept['activeStatus'].str.capitalize()

    # --- Skills ------------------------------------------------------------
    df_skills = tables['Skills'].copy()
    for col in ('skillName', 'category'):
        df_skills[col] = df_skills[col].str.strip()

    # --- EmployeeSkillProficiency -------------------------------------------
    df_emp_skill_prof = tables['EmployeeSkillProficiency'].copy()
    # Force proficiency into the closed range [1, 5].
    df_emp_skill_prof['proficiency'] = df_emp_skill_prof['proficiency'].clip(lower=1, upper=5)

    # --- Assessments -------------------------------------------------------
    df_assessments = tables['Assessments'].copy()
    df_assessments['due_date'] = pd.to_datetime(df_assessments['due_date'], errors='coerce')
    # Cap each score at its own row's max_score (and floor it at 0) before
    # computing the percentage, so the percentage never exceeds 100.
    df_assessments['assessment_score'] = df_assessments['assessment_score'].clip(
        lower=0, upper=df_assessments['max_score']
    )
    df_assessments['assessment_percentage'] = (
        df_assessments['assessment_score'] / df_assessments['max_score']
    ) * 100

    # --- Courses -----------------------------------------------------------
    df_courses = tables['Courses'].copy()
    # ``Series.replace`` only matches whole cell values, so "4 weeks" was left
    # untouched; ``.str.replace`` removes the suffix inside each string.
    df_courses['duration'] = df_courses['duration'].str.replace(" weeks", "", regex=False)

    # --- EmployeeCertifications --------------------------------------------
    df_emp_certifications = tables['EmployeeCertifications'].copy()
    for col in ('startDate', 'completionDate'):
        df_emp_certifications[col] = pd.to_datetime(df_emp_certifications[col], errors='coerce')

    # Return the transformed tables
    return {
        'Employees': df_employees,
        'Users': df_users,
        'EmployeeDept': df_employee_dept,
        'Skills': df_skills,
        'EmployeeSkillProficiency': df_emp_skill_prof,
        'Assessments': df_assessments,
        'Courses': df_courses,
        'EmployeeCertifications': df_emp_certifications,
    }

In [64]:
# Plain-text dump of the `tables` dict (one DataFrame repr per table name).
print(tables)

{'Employees':      empID        empName            desgination experience  birth_date  \
0        1  Alice Johnson      Software Engineer    3 years  1990-05-15   
1        2      Bob Smith  Sr. Software Engineer    5 years  1988-08-22   
2        3  Charlie Brown      Solutions Enabler    4 years  1992-01-10   
3        4   Diana Prince   Solutions Consultant    6 years  1985-12-30   
4        5  Evelyn Harper    Principal Architect   10 years  1982-09-14   
..     ...            ...                    ...        ...         ...   
145    146     Owen Scott      Solutions Enabler    3 years  1993-02-14   
146    147     Emma Clark    Principal Architect   10 years  1984-09-15   
147    148     Max Taylor  Sr. Software Engineer    6 years  1986-10-04   
148    149     Lucy Lewis      Software Engineer    2 years  1995-03-30   
149    150    Henry Brown      Software Engineer     1 year  1996-11-25   

      hire_date  salary  gender activeStatus           created_at  
0    2021-06-01  

In [65]:

# Run the cleaning steps defined in transform() over the extracted tables.
transformed_tables = transform(tables)

In [66]:
def load(tables, folder_name='transformed_tables'):
    """Write each table to ``<folder_name>/<table name>.csv``.

    Parameters
    ----------
    tables : dict
        Mapping of table name to DataFrame. The key is used as the file stem.
    folder_name : str, default 'transformed_tables'
        Output directory. It is created (including missing parent
        directories) when it does not exist; an existing directory is reused
        and files with the same name are overwritten.

    Returns
    -------
    None
        The function only writes files and prints one "Saved: ..." line per
        table.
    """
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the folder in between.
    os.makedirs(folder_name, exist_ok=True)

    # Save each DataFrame as a CSV file in the specified folder; the index is
    # not written.
    for table_name, df in tables.items():
        file_path = os.path.join(folder_name, f"{table_name}.csv")
        df.to_csv(file_path, index=False)
        print(f"Saved: {file_path}")

In [67]:

# Write every transformed table to <folder>/<table name>.csv (default folder: transformed_tables).
load(transformed_tables)

Saved: transformed_tables\Employees.csv
Saved: transformed_tables\Users.csv
Saved: transformed_tables\EmployeeDept.csv
Saved: transformed_tables\Skills.csv
Saved: transformed_tables\EmployeeSkillProficiency.csv
Saved: transformed_tables\Assessments.csv
Saved: transformed_tables\Courses.csv
Saved: transformed_tables\EmployeeCertifications.csv
