# Precleaning inspect

### Set up & validate

In [2]:
import os
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv
load_dotenv("prod.env", override=True)


user = os.getenv("ORACLE_USER")
password = os.getenv("ORACLE_PASSWORD")
host = os.getenv("ORACLE_HOST")
service = os.getenv("ORACLE_SERVICE")

conn_str = f"oracle+oracledb://{user}:{password}@{host}:1521/{service}"
print(conn_str)
print("user:", os.getenv("ORACLE_USER"))

engine = create_engine(conn_str)

query = """
SELECT * FROM dual
"""

df = pd.read_sql(query, engine)
df.head()

oracle+oracledb://hr_dwh:oracle@srv2.lan:1521/free
user: hr_dwh


Unnamed: 0,dummy
0,X


## Departments

## Employees

In [None]:
query = """
SELECT COUNT(*), COUNT(DISTINCT EMPLOYEE_ID) FROM raw_employees
"""
df = pd.read_sql(query, engine)
df.head()


Unnamed: 0,COUNT(1),COUNT(DISTINCTEMPLOYEE_ID)
0,107,107


In [4]:
query = """
SELECT 
DEPARTMENT_ID, DEPARTMENT_NAME, MANAGER_ID, LOCATION_ID, COUNT(*)
FROM raw_departments
GROUP BY DEPARTMENT_ID, DEPARTMENT_NAME, MANAGER_ID, LOCATION_ID
HAVING COUNT(*) > 1
"""
df = pd.read_sql(query, engine)
df.head()

Unnamed: 0,department_id,department_name,manager_id,location_id,COUNT(*)
0,200,Operations,,1700,2


In [8]:
query = """
SELECT 
  D.department_id
 ,D.department_name
FROM raw_departments D
WHERE department_name IS NULL or trim(department_name) = ''
"""
df = pd.read_sql(query, engine)
df.head()

Unnamed: 0,department_id,department_name
0,210,


In [13]:
query = """
SELECT 
  round(100*count(D.department_id) / count(*),2) "department_id%nulls"
 ,round(100*count(D.department_name)/ count(*),2) "department_name%nulls"
 ,round(100*count(d.manager_id)/ count(*),2) "manager_id%nulls"
 ,round(100*count(d.location_id)/ count(*),2) "location_id%nulls"
FROM raw_departments D
"""
df = pd.read_sql(query, engine)
df.head()


Unnamed: 0,department_id%nulls,department_name%nulls,manager_id%nulls,location_id%nulls
0,100,96.43,39.29,96.43


In [18]:
query = """
SELECT
   'department_name' as label
  ,COUNT(*) AS total
  ,SUM(CASE WHEN department_name IS NULL THEN 1 ELSE 0 END) AS nulls
  ,SUM(CASE WHEN department_name = '' THEN 1 ELSE 0 END) AS empty_str
  ,SUM(CASE WHEN TRIM(department_name) = '' THEN 1 ELSE 0 END) AS empty_or_space
  ,SUM(CASE WHEN department_name IS NOT NULL AND TRIM(department_name) != '' THEN 1 ELSE 0 END) AS filled
FROM  raw_departments D
union all
--manager_id
SELECT
  'manager_id'
  ,COUNT(*) AS total
  ,SUM(CASE WHEN manager_id IS NULL THEN 1 ELSE 0 END) AS nulls
  ,SUM(CASE WHEN manager_id = '' THEN 1 ELSE 0 END) AS empty_str
  ,SUM(CASE WHEN TRIM(manager_id) = '' THEN 1 ELSE 0 END) AS empty_or_space
  ,SUM(CASE WHEN manager_id IS NOT NULL AND TRIM(manager_id) != '' THEN 1 ELSE 0 END) AS filled
FROM raw_departments D
union all
SELECT
  'location_id'
  ,COUNT(*) AS total
  ,SUM(CASE WHEN location_id IS NULL THEN 1 ELSE 0 END) AS nulls
  ,SUM(CASE WHEN location_id = '' THEN 1 ELSE 0 END) AS empty_str
  ,SUM(CASE WHEN TRIM(location_id) = '' THEN 1 ELSE 0 END) AS empty_or_space
  ,SUM(CASE WHEN location_id IS NOT NULL AND TRIM(location_id) != '' THEN 1 ELSE 0 END) AS filled
FROM raw_departments D
"""
df = pd.read_sql(query, engine)
df.head()

Unnamed: 0,label,total,nulls,empty_str,empty_or_space,filled
0,department_name,28,1,0,0,0
1,manager_id,28,17,0,0,0
2,location_id,28,1,0,0,0
