# Precleaning inspect

### Set up & validate

In [3]:
import os
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv
load_dotenv("prod.env", override=True)


user = os.getenv("ORACLE_USER")
password = os.getenv("ORACLE_PASSWORD")
host = os.getenv("ORACLE_HOST")
service = os.getenv("ORACLE_SERVICE")

conn_str = f"oracle+oracledb://{user}:{password}@{host}:1521/{service}"
print(conn_str)
print("user:", os.getenv("ORACLE_USER"))

engine = create_engine(conn_str)

query = """
SELECT * FROM dual
"""

df = pd.read_sql(query, engine)
df.head()

oracle+oracledb://hr_dwh:oracle@srv2.lan:1521/free
user: hr_dwh


Unnamed: 0,dummy
0,X


## Cleaning - Departments data

### Structure and data types

In [14]:
query = """
SELECT column_name, data_type, nullable
FROM all_tab_columns
WHERE lower(table_name) = 'raw_departments'
  AND owner = 'HR_DWH'
ORDER BY column_id
"""
df = pd.read_sql(query, con=engine)
df


Unnamed: 0,column_name,data_type,nullable
0,DEPARTMENT_ID,NUMBER,Y
1,DEPARTMENT_NAME,VARCHAR2,Y
2,MANAGER_ID,VARCHAR2,Y
3,LOCATION_ID,VARCHAR2,Y


TODO:
- [ ] cast MANAGER_ID to NUMBER
- [ ] cast LOCATION_ID to NUMBER

In [23]:
query = """
SELECT 
   cast(department_id as number)
 , department_name
 , cast(manager_id as number)
 , cast(location_id as number)
FROM raw_departments
"""
print('DatabaseError: ORA-01722: unable to convert string value containing X to a number: MANAGER_ID')
# df = pd.read_sql(query, engine)
# df.head()

DatabaseError: ORA-01722: unable to convert string value containing X to a number: MANAGER_ID


In [31]:
query = """
SELECT
   'department_name' as label
  ,COUNT(*) AS total
  ,SUM(CASE WHEN REGEXP_LIKE(trim(department_name),'\d+$') THEN 1 ELSE 0 END) only_digit
  ,SUM(CASE WHEN department_name IS NULL THEN 1 ELSE 0 END) nulls
FROM  raw_departments D
union all
SELECT
  'manager_id'
  ,COUNT(*) AS total
  ,SUM(CASE WHEN REGEXP_LIKE(trim(manager_id),'\d+$') THEN 1 ELSE 0 END)
  ,SUM(CASE WHEN manager_id IS NULL THEN 1 ELSE 0 END)
FROM raw_departments D
union all
SELECT
  'location_id'
  ,COUNT(*) AS total
  ,SUM(CASE WHEN REGEXP_LIKE(trim(location_id),'\d+$')  THEN 1 ELSE 0 END)
  ,SUM(CASE WHEN location_id IS NULL THEN 1 ELSE 0 END)
FROM raw_departments D
"""
df = pd.read_sql(query, engine)
df.head()

Unnamed: 0,label,total,only_digit,nulls
0,department_name,28,0,1
1,manager_id,28,10,17
2,location_id,28,26,1


In [36]:
query = """
SELECT 
    d.department_id,
    d.location_id
FROM raw_departments d
WHERE NOT EXISTS (
    SELECT 1
    FROM raw_departments x
    WHERE REGEXP_LIKE(TRIM(x.location_id), '^\d+$')
      AND x.department_id = d.department_id
)
"""
df = pd.read_sql(query, engine)
df.head()


Unnamed: 0,department_id,location_id
0,70,
1,130,abc


TODO
- [ ] fix location-id values

In [68]:
query = """
SELECT 
    d.department_id
    ,d.manager_id
FROM raw_departments d
WHERE NOT EXISTS (
    SELECT 1
    FROM raw_departments x
    WHERE 
    REGEXP_LIKE(TRIM(d.manager_id), '^\d+$')
     AND x.department_id = d.department_id
)
"""
df = pd.read_sql(query, engine)
df

Unnamed: 0,department_id,manager_id
0,40,XXX
1,120,
2,130,
3,140,
4,150,
5,160,
6,170,
7,180,
8,190,
9,200,


TODO
- [ ] fix manager id values, xxx etc

In [None]:
print("""
SELECT 
 d.department_id
,d.manager_id
FROM hr.departments d
WHERE d.department_id = 40
-----
40	203
""")


SELECT 
 d.department_id
,d.manager_id
FROM hr.departments d
WHERE d.department_id = 40
      


### Integrity and uniqueness checks

In [12]:
query = """
SELECT COUNT(*), COUNT(DISTINCT DEPARTMENT_ID) 
FROM raw_departments
"""
df = pd.read_sql(query, engine)
df.head()


Unnamed: 0,COUNT(*),COUNT(DISTINCTDEPARTMENT_ID)
0,28,27


In [4]:
query = """
SELECT 
DEPARTMENT_ID, DEPARTMENT_NAME, MANAGER_ID, LOCATION_ID, COUNT(*)
FROM raw_departments
GROUP BY DEPARTMENT_ID, DEPARTMENT_NAME, MANAGER_ID, LOCATION_ID
HAVING COUNT(*) > 1
"""
df = pd.read_sql(query, engine)
df.head()

Unnamed: 0,department_id,department_name,manager_id,location_id,COUNT(*)
0,200,Operations,,1700,2


In [8]:
query = """
SELECT 
  D.department_id
 ,D.department_name
FROM raw_departments D
WHERE department_name IS NULL or trim(department_name) = ''
"""
df = pd.read_sql(query, engine)
df.head()

Unnamed: 0,department_id,department_name
0,210,


TODO
- [ ] non unique department_id = 200

### Nulls and blank values

In [None]:
query = """
SELECT 
  round(100*count(D.department_id) / count(*),2) "department_id%nulls"
 ,round(100*count(D.department_name)/ count(*),2) "department_name%nulls"
 ,round(100*count(d.manager_id)/ count(*),2) "manager_id%nulls"
 ,round(100*count(d.location_id)/ count(*),2) "location_id%nulls"
FROM raw_departments D
"""
df = pd.read_sql(query, engine)
df.head()


Unnamed: 0,department_id%nulls,department_name%nulls,manager_id%nulls,location_id%nulls
0,100,96.43,39.29,96.43


In [74]:
query = """
SELECT 
    count(*)
FROM raw_departments D
WHERE  department_name  is null and manager_id  is null and location_id is null
"""
df = pd.read_sql(query, engine)
df.head()

Unnamed: 0,COUNT(*)
0,0


In [20]:
query = """
SELECT
   'department_id' as label
  ,COUNT(*) AS total
  ,SUM(CASE WHEN department_id IS NULL THEN 1 ELSE 0 END) AS nulls
  ,SUM(CASE WHEN department_id = '' THEN 1 ELSE 0 END) AS empty_str
  ,SUM(CASE WHEN TRIM(department_id) = '' THEN 1 ELSE 0 END) AS empty_or_space
  ,SUM(CASE WHEN department_id IS NOT NULL AND TRIM(department_id) != '' THEN 1 ELSE 0 END) AS filled
FROM  raw_departments D
union all
SELECT
   'department_name' as label
  ,COUNT(*) AS total
  ,SUM(CASE WHEN department_name IS NULL THEN 1 ELSE 0 END) AS nulls
  ,SUM(CASE WHEN department_name = '' THEN 1 ELSE 0 END) AS empty_str
  ,SUM(CASE WHEN TRIM(department_name) = '' THEN 1 ELSE 0 END) AS empty_or_space
  ,SUM(CASE WHEN department_name IS NOT NULL AND TRIM(department_name) != '' THEN 1 ELSE 0 END) AS filled
FROM  raw_departments D
union all
--manager_id
SELECT
  'manager_id'
  ,COUNT(*) AS total
  ,SUM(CASE WHEN manager_id IS NULL THEN 1 ELSE 0 END) AS nulls
  ,SUM(CASE WHEN manager_id = '' THEN 1 ELSE 0 END) AS empty_str
  ,SUM(CASE WHEN TRIM(manager_id) = '' THEN 1 ELSE 0 END) AS empty_or_space
  ,SUM(CASE WHEN manager_id IS NOT NULL AND TRIM(manager_id) != '' THEN 1 ELSE 0 END) AS filled
FROM raw_departments D
union all
SELECT
  'location_id'
  ,COUNT(*) AS total
  ,SUM(CASE WHEN location_id IS NULL THEN 1 ELSE 0 END) AS nulls
  ,SUM(CASE WHEN location_id = '' THEN 1 ELSE 0 END) AS empty_str
  ,SUM(CASE WHEN TRIM(location_id) = '' THEN 1 ELSE 0 END) AS empty_or_space
  ,SUM(CASE WHEN location_id IS NOT NULL AND TRIM(location_id) != '' THEN 1 ELSE 0 END) AS filled
FROM raw_departments D
"""
df = pd.read_sql(query, engine)
df.head()

Unnamed: 0,label,total,nulls,empty_str,empty_or_space,filled
0,department_id,28,0,0,0,0
1,department_name,28,1,0,0,0
2,manager_id,28,17,0,0,0
3,location_id,28,1,0,0,0


In [76]:
query = """
SELECT
  *
FROM  raw_departments D
WHERE department_name is null or location_id is null
"""
df = pd.read_sql(query, engine)
df.head()

Unnamed: 0,department_id,department_name,manager_id,location_id
0,70,Public Relations,204.0,
1,210,,,1700.0


TODO
- [ ] Fix null values

In [70]:
query = """
SELECT
   'department_id' as label
  ,COUNT(*) AS total
  ,SUM(CASE WHEN REGEXP_LIKE(department_id,'\D') THEN 1 ELSE 0 END) AS nulls
  ,SUM(CASE WHEN department_id = '' THEN 1 ELSE 0 END) AS empty_str
FROM  raw_departments D
"""
df = pd.read_sql(query, engine)
df.head()

Unnamed: 0,label,total,nulls,empty_str
0,department_id,28,0,0


In [27]:
query = """
SELECT 
    department_id
    ,manager_id
FROM raw_departments D 
WHERE REGEXP_LIKE (manager_id, '\D')
"""
df = pd.read_sql(query, engine)
df.head()

Unnamed: 0,department_id,manager_id
0,40,XXX


TODO 
- [ ] fix: manager_id is not number here