In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Notebook-wide plot styling: seaborn's "darkgrid" style with the "paper" context.
sns.set_style("darkgrid")
sns.set_context("paper")

In [2]:
# Load the staging extracts.
# Every staging CSV sits in the same directory, so the directory is spelled
# out once and each file path is derived from it.
STAGING_MART_DIR = r"D:\Final Project\Data Engineering\Staging\mart"

assessments_file_path = STAGING_MART_DIR + r"\stg_assessments.csv"
courses_file_path = STAGING_MART_DIR + r"\stg_courses.csv"
course_skill_mapping_file_path = STAGING_MART_DIR + r"\stg_courses_skills.csv"
skills_file_path = STAGING_MART_DIR + r"\stg_skills.csv"
employee_skill_proficiency_file_path = STAGING_MART_DIR + r"\stg_employee_skill_proficiency.csv"
employee_file_path = STAGING_MART_DIR + r"\stg_employees.csv"

# One DataFrame per staging file, read with pandas' default CSV settings.
assessments_df = pd.read_csv(filepath_or_buffer=assessments_file_path)
courses_df = pd.read_csv(filepath_or_buffer=courses_file_path)
course_skill_df = pd.read_csv(filepath_or_buffer=course_skill_mapping_file_path)
skills_df = pd.read_csv(filepath_or_buffer=skills_file_path)
employee_skill_proficiency_df = pd.read_csv(filepath_or_buffer=employee_skill_proficiency_file_path)
employee_df = pd.read_csv(filepath_or_buffer=employee_file_path)

## Create the course and skill mapping table

In [3]:
# Preview the courses table; note its key column is spelled "CourseID",
# unlike the lowercase "courseid" used by the mapping table.
courses_df.head()

Unnamed: 0,CourseID,CourseName,CourseDuration,Description
0,1,Introduction to Data Science,4,Learn the basics of data science
1,2,Machine Learning Basics,6,Understand the fundamentals of machine learnin...
2,3,Deep Learning with TensorFlow,8,A deep dive into deep learning concepts using ...
3,4,Statistical Analysis with R,5,Learn statistical methods and their applicatio...
4,5,Data Visualization with Python,3,Visualize data effectively using Python librar...


In [4]:
# Preview the skills lookup; "skillid" is the column later used to join it.
skills_df.head()

Unnamed: 0,skillid,skillname,category
0,1,Data Analysis,Data Science
1,2,Machine Learning,Data Science
2,3,Statistical Analysis,Data Science
3,4,Data Visualization,Data Science
4,5,Deep Learning,Data Science


In [5]:
# Preview the course-to-skill mapping (courseid / skillid pairs).
course_skill_df.head()

Unnamed: 0,courseid,skillid
0,1,1
1,1,2
2,1,4
3,1,6
4,1,8


In [6]:
# Build the course/skill lookup: attach each skill to the courses it is mapped
# to, then bring in the course details. Both joins use pandas' default inner
# join. The courses table spells its key "CourseID" while the mapping table
# uses "courseid", hence the left_on/right_on pair.
courses_skills_combined_df = (
    skills_df
    .merge(course_skill_df, on="skillid")
    .merge(courses_df, left_on="courseid", right_on="CourseID")
    [["skillid", "courseid", "skillname", "CourseName", "category"]]
)
courses_skills_combined_df.head()

Unnamed: 0,skillid,courseid,skillname,CourseName,category
0,1,1,Data Analysis,Introduction to Data Science,Data Science
1,1,5,Data Analysis,Data Visualization with Python,Data Science
2,1,7,Data Analysis,Advanced Data Analysis Techniques,Data Science
3,1,9,Data Analysis,Data Ethics and Privacy,Data Science
4,2,1,Machine Learning,Introduction to Data Science,Data Science


## Employee and skill proficiency table

In [9]:
# Preview the employee table; "empid" is the column later used to join it.
employee_df.head()

Unnamed: 0,empid,empname,designation,experience,birth_date,hire_date,salary,gender,activestatus,created_at
0,1,Alice Johnson,Software Engineer,3,1990-05-15,2021-06-01,75000,Female,Active,2021-06-01 09:00:00
1,2,Bob Smith,Sr. Software Engineer,5,1988-08-22,2019-03-15,90000,Male,Active,2019-03-15 10:00:00
2,3,Charlie Brown,Solutions Enabler,4,1992-01-10,2020-07-20,85000,Male,Inactive,2020-07-20 11:00:00
3,4,Diana Prince,Solutions Consultant,6,1985-12-30,2018-04-10,110000,Female,Active,2018-04-10 12:00:00
4,5,Evelyn Harper,Principal Architect,10,1982-09-14,2015-11-11,150000,Female,Active,2015-11-11 13:00:00


In [8]:
# Second look at the skills lookup before joining it to employee proficiency.
skills_df.head()

Unnamed: 0,skillid,skillname,category
0,1,Data Analysis,Data Science
1,2,Machine Learning,Data Science
2,3,Statistical Analysis,Data Science
3,4,Data Visualization,Data Science
4,5,Deep Learning,Data Science


In [10]:
# Preview the proficiency rows (empid / skillid / proficiency).
employee_skill_proficiency_df.head()

Unnamed: 0,empid,skillid,proficiency
0,1,1,5
1,1,2,4
2,1,3,3
3,1,4,5
4,1,5,4


In [13]:
# Employee skill-proficiency view: employees are joined to their proficiency
# rows on "empid", then to the skills lookup on "skillid" (default inner
# joins). Only the readable name columns and the proficiency are kept.
employee_skill_merged_df = (
    employee_df
    .merge(employee_skill_proficiency_df, on="empid")
    .merge(skills_df, on="skillid")
    [["empname", "skillname", "proficiency"]]
)
employee_skill_merged_df.head()

Unnamed: 0,empname,skillname,proficiency
0,Alice Johnson,Data Analysis,5
1,Alice Johnson,Machine Learning,4
2,Alice Johnson,Statistical Analysis,3
3,Alice Johnson,Data Visualization,5
4,Alice Johnson,Deep Learning,4


## Create a table of the courses each employee has taken and their assessment scores

In [14]:
# Second look at the employee table before joining it to the assessments.
employee_df.head()

Unnamed: 0,empid,empname,designation,experience,birth_date,hire_date,salary,gender,activestatus,created_at
0,1,Alice Johnson,Software Engineer,3,1990-05-15,2021-06-01,75000,Female,Active,2021-06-01 09:00:00
1,2,Bob Smith,Sr. Software Engineer,5,1988-08-22,2019-03-15,90000,Male,Active,2019-03-15 10:00:00
2,3,Charlie Brown,Solutions Enabler,4,1992-01-10,2020-07-20,85000,Male,Inactive,2020-07-20 11:00:00
3,4,Diana Prince,Solutions Consultant,6,1985-12-30,2018-04-10,110000,Female,Active,2018-04-10 12:00:00
4,5,Evelyn Harper,Principal Architect,10,1982-09-14,2015-11-11,150000,Female,Active,2015-11-11 13:00:00


In [15]:
# Second look at the courses table before joining it to the assessments.
courses_df.head()

Unnamed: 0,CourseID,CourseName,CourseDuration,Description
0,1,Introduction to Data Science,4,Learn the basics of data science
1,2,Machine Learning Basics,6,Understand the fundamentals of machine learnin...
2,3,Deep Learning with TensorFlow,8,A deep dive into deep learning concepts using ...
3,4,Statistical Analysis with R,5,Learn statistical methods and their applicatio...
4,5,Data Visualization with Python,3,Visualize data effectively using Python librar...


In [16]:
# Preview the assessments table.
# NOTE(review): "empid" displays as a float (1.0), which usually means the
# column contains missing values - confirm before relying on it as a join key.
assessments_df.head()

Unnamed: 0,assessmentid,empid,courseid,skillid,assessment_score,max_score,due_date,test_taken,test_completed
0,1,1.0,1,1,85.0,100,2024-12-31,2024-11-01,2024-11-15
1,2,1.0,1,2,90.0,100,2024-12-31,2024-11-05,2024-11-15
2,3,2.0,1,3,75.0,100,2024-12-31,2024-11-10,2024-11-20
3,4,1.0,2,1,88.0,100,2024-12-31,2024-11-01,2024-11-15
4,5,3.0,2,2,80.0,100,2024-12-31,2024-11-02,2024-11-16


In [20]:
# Assessment fact view: each assessment row gets its course details
# ("courseid" matched to the courses table's "CourseID") and then the
# employee who took it (matched on "empid"). Both joins are pandas' default
# inner joins, so assessments without a matching course or employee are dropped.
employee_courses_assessments_df = (
    assessments_df
    .merge(courses_df, left_on="courseid", right_on="CourseID")
    .merge(employee_df, on="empid")
    [[
        "empname", "empid", "assessmentid", "courseid", "skillid", "CourseName",
        "assessment_score", "max_score", "due_date", "test_taken", "test_completed",
    ]]
)
employee_courses_assessments_df.head()

Unnamed: 0,empname,empid,assessmentid,courseid,skillid,CourseName,assessment_score,max_score,due_date,test_taken,test_completed
0,Alice Johnson,1.0,1,1,1,Introduction to Data Science,85.0,100,2024-12-31,2024-11-01,2024-11-15
1,Alice Johnson,1.0,2,1,2,Introduction to Data Science,90.0,100,2024-12-31,2024-11-05,2024-11-15
2,Bob Smith,2.0,3,1,3,Introduction to Data Science,75.0,100,2024-12-31,2024-11-10,2024-11-20
3,Alice Johnson,1.0,4,2,1,Machine Learning Basics,88.0,100,2024-12-31,2024-11-01,2024-11-15
4,Charlie Brown,3.0,5,2,2,Machine Learning Basics,80.0,100,2024-12-31,2024-11-02,2024-11-16


In [22]:
# Parse each date column once and reuse the parsed values below.
due_dates = pd.to_datetime(employee_courses_assessments_df["due_date"])
start_dates = pd.to_datetime(employee_courses_assessments_df["test_taken"])
completion_dates = pd.to_datetime(employee_courses_assessments_df["test_completed"])

# Slack between completing the test and its due date.
employee_courses_assessments_df["daysLeft_to_Deadline"] = due_dates - completion_dates
# Elapsed time between the "test_taken" and "test_completed" dates.
employee_courses_assessments_df["timeTaken_to_Complete"] = completion_dates - start_dates
employee_courses_assessments_df.head()

Unnamed: 0,empname,empid,assessmentid,courseid,skillid,CourseName,assessment_score,max_score,due_date,test_taken,test_completed,daysLeft_to_Deadline,timeTaken_to_Complete
0,Alice Johnson,1.0,1,1,1,Introduction to Data Science,85.0,100,2024-12-31,2024-11-01,2024-11-15,46 days,14 days
1,Alice Johnson,1.0,2,1,2,Introduction to Data Science,90.0,100,2024-12-31,2024-11-05,2024-11-15,46 days,10 days
2,Bob Smith,2.0,3,1,3,Introduction to Data Science,75.0,100,2024-12-31,2024-11-10,2024-11-20,41 days,10 days
3,Alice Johnson,1.0,4,2,1,Machine Learning Basics,88.0,100,2024-12-31,2024-11-01,2024-11-15,46 days,14 days
4,Charlie Brown,3.0,5,2,2,Machine Learning Basics,80.0,100,2024-12-31,2024-11-02,2024-11-16,45 days,14 days


## Storing the fact tables

In [24]:
# Persist the fact tables as CSV files under the relative "mart/" directory,
# leaving out the DataFrame index.
# The course/skill fact table must be the joined frame built earlier
# (courses_skills_combined_df); the raw staging mapping course_skill_df only
# holds the courseid/skillid pairs and is not the "combined" table.
courses_skills_combined_df.to_csv(path_or_buf = r"mart/fct_courses_skills_combined.csv", index = False)
employee_skill_merged_df.to_csv(path_or_buf = r"mart/fct_employee_skills_combined.csv", index = False)
employee_courses_assessments_df.to_csv(path_or_buf = r"mart/fct_employee_assessments_courses_combined.csv", index = False)