In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [2]:
# Step 1: Data Acquisition
# Location of the raw HR dataset. NOTE: this is an absolute, machine-specific
# Windows path; anyone running the notebook elsewhere has to edit it here.
DATA_PATH = "C:\\Users\\user\\Desktop\\Data Science\\HRDataset_v14.csv"

# Load the raw CSV into the working DataFrame used by every later cell.
df = pd.read_csv(DATA_PATH)

In [3]:
# Step 2: Data Quality Issues
# Print two per-column reports, one after the other:
#   1. the number of missing values in each column
#   2. the dtype pandas inferred for each column
print(df.isna().sum(), df.dtypes, sep="\n")

Employee_Name                   0
EmpID                           0
MarriedID                       0
MaritalStatusID                 0
GenderID                        0
EmpStatusID                     0
DeptID                          0
PerfScoreID                     0
FromDiversityJobFairID          0
Salary                          0
Termd                           0
PositionID                      0
Position                        0
State                           0
Zip                             0
DOB                             0
Sex                             0
MaritalDesc                     0
CitizenDesc                     0
HispanicLatino                  0
RaceDesc                        0
DateofHire                      0
DateofTermination             207
TermReason                      0
EmploymentStatus                0
Department                      0
ManagerName                     0
ManagerID                       8
RecruitmentSource               0
PerformanceSco

In [5]:
# Step 3: Cleaning Strategy
# ManagerID is an identifier, not a measurement: averaging it produces a
# fractional value that matches no real manager. Missing IDs are therefore
# recovered from existing IDs rather than computed arithmetically.

# Step 4: Data Cleaning
# First choice: copy the ID recorded on other rows that share the same
# ManagerName. transform("first") yields each group's first non-null ID,
# aligned to df's index, so it can be used directly as the fill value.
known_manager_ids = df.groupby("ManagerName")["ManagerID"].transform("first")
df["ManagerID"] = df["ManagerID"].fillna(known_manager_ids)

# Fallback for managers whose ID is not recorded on any row: use the most
# frequent existing ID so the column still ends up without missing values.
# NOTE(review): this guarantees a valid ID but not the correct manager; confirm
# whether a sentinel value would be preferable for such rows.
imputer = SimpleImputer(strategy="most_frequent")
# fit_transform returns an (n_rows, 1) array; ravel() flattens it to the 1-D
# shape a single DataFrame column expects.
df["ManagerID"] = imputer.fit_transform(df[["ManagerID"]]).ravel()

In [None]:
# Step 5: Data Transformation
# Feature engineering: completed years between each hire date and today.
# read_csv was called without parse_dates, so DateofHire holds strings; it has
# to be converted to datetimes first, because subtracting a string column from
# a Timestamp raises a TypeError. The conversion is kept in a separate variable
# so the original DateofHire column is left untouched.
hire_dates = pd.to_datetime(df["DateofHire"])

# Elapsed whole days, integer-divided by 365. Leap days are ignored, so this is
# an approximation of completed years.
df["years_of_service"] = (pd.to_datetime("today") - hire_dates).dt.days // 365

In [11]:
# Aggregation
# Average salary per department: one row per Department value, with a single
# "Salary" column holding that department's mean.
department_summary = df.groupby("Department")[["Salary"]].mean()


In [13]:
# Step 6: Validation
# Show the first rows of the cleaned/transformed frame, followed by the
# per-department salary summary, for a visual sanity check.
print(df.head(), department_summary, sep="\n")

              Employee_Name  EmpID  MarriedID  MaritalStatusID  GenderID  \
0       Adinolfi, Wilson  K  10026          0                0         1   
1  Ait Sidi, Karthikeyan     10084          1                1         1   
2         Akinkuolie, Sarah  10196          1                1         0   
3              Alagbe,Trina  10088          1                1         0   
4          Anderson, Carol   10069          0                2         0   

   EmpStatusID  DeptID  PerfScoreID  FromDiversityJobFairID  Salary  ...  \
0            1       5            4                       0   62506  ...   
1            5       3            3                       0  104437  ...   
2            5       5            3                       0   64955  ...   
3            1       5            3                       0   64991  ...   
4            5       5            3                       0   50825  ...   

      ManagerName  ManagerID RecruitmentSource PerformanceScore  \
0  Michael Albert  

In [14]:
# Step 7: Documentation
# The cleaning and transformation steps are described in the comments of the
# cells above.

# Step 8: Dataset Presentation
# Write the cleaned and transformed frame to a CSV in the current working
# directory; index=False leaves the row index out of the file.
df.to_csv(path_or_buf="cleaned_employee_data.csv", index=False)