# Exploratory data analysis - DuckDB Experiments

### Set up & validate

In [2]:
import pandas as pd
import duckdb as db

# Load the departments seed file into a pandas DataFrame.
df = pd.read_csv(filepath_or_buffer="hr_dwh/seeds/departments_dbt.csv")

In [14]:
# Expose the pandas DataFrame to DuckDB under the name 'dep' so it can be queried with SQL.
# NOTE(review): the view appears to stay bound to the object passed here; if `df` is
# re-read later, this cell presumably has to be re-run for 'dep' to reflect it — confirm.
db.register('dep',df)

<duckdb.duckdb.DuckDBPyConnection at 0x7f72ba9ed230>

In [153]:
# Show the column types DuckDB infers when it reads the CSV file directly.
csv_schema_sql = "DESCRIBE SELECT * FROM read_csv_auto('hr_dwh/seeds/departments_dbt.csv')"
db.sql(csv_schema_sql)


┌─────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│   column_name   │ column_type │  null   │   key   │ default │  extra  │
│     varchar     │   varchar   │ varchar │ varchar │ varchar │ varchar │
├─────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ DEPARTMENT_ID   │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ DEPARTMENT_NAME │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ MANAGER_ID      │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ LOCATION_ID     │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
└─────────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘

In [203]:
# Print the pandas view of the frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   DEPARTMENT_ID    28 non-null     int64 
 1   DEPARTMENT_NAME  27 non-null     object
 2   MANAGER_ID       11 non-null     object
 3   LOCATION_ID      27 non-null     object
dtypes: int64(1), object(3)
memory usage: 1.0+ KB


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the frame.
df.describe()

Unnamed: 0,DEPARTMENT_ID
count,28.0
mean,142.142857
std,78.709835
min,10.0
25%,77.5
50%,145.0
75%,202.5
max,270.0


**Dataframe preview**

In [166]:
# Preview the first ten rows of the pandas frame.
df.head(n=10)

Unnamed: 0,DEPARTMENT_ID,DEPARTMENT_NAME,MANAGER_ID,LOCATION_ID
0,10,Administration,200,1700.0
1,20,Marketing,201,1800.0
2,30,Purchasing,114,1700.0
3,40,Human Resources,XXX,2400.0
4,50,shipping,121,1500.0
5,60,IT,103,1400.0
6,70,Public Relations,204,
7,80,Sales,145,2500.0
8,90,Executive,100,1700.0
9,100,Finance,108,1700.0


**Table preview**

In [154]:
# Column names and types of the registered 'dep' view, returned as a DataFrame.
dep_schema_sql = "desc dep"
db.sql(dep_schema_sql).df()

Unnamed: 0,column_name,column_type,null,key,default,extra
0,DEPARTMENT_ID,BIGINT,YES,,,
1,DEPARTMENT_NAME,VARCHAR,YES,,,
2,MANAGER_ID,DOUBLE,YES,,,
3,LOCATION_ID,BIGINT,YES,,,


In [172]:
# Pull up to 100 rows from the registered view and show the first ten.
dep_preview_sql = "SELECT * FROM dep LIMIT 100"
db.sql(dep_preview_sql).df().head(n=10)

Unnamed: 0,DEPARTMENT_ID,DEPARTMENT_NAME,MANAGER_ID,LOCATION_ID
0,10,Administration,200.0,1700
1,20,Marketing,201.0,1800
2,30,Purchasing,114.0,1700
3,40,Human Resources,203.0,2400
4,50,Shipping,121.0,1500
5,60,IT,103.0,1400
6,70,Public Relations,204.0,2700
7,80,Sales,145.0,2500
8,90,Executive,100.0,1700
9,100,Finance,108.0,1700


### Duplicates 

**Try**

In [178]:
# Search the registered view for fully duplicated rows: NULLs are first replaced
# with sentinel values, then rows are grouped on all four columns and only
# groups that occur more than once are kept.
coalesced_dup_sql = '''
       WITH cleaned_dep AS (
       SELECT 
         COALESCE(DEPARTMENT_ID, -1) as DEPARTMENT_ID
        ,COALESCE(DEPARTMENT_NAME, 'Unknown') AS DEPARTMENT_NAME
        ,COALESCE(MANAGER_ID, -1) AS MANAGER_ID
        ,COALESCE(LOCATION_ID, -1) AS LOCATION_ID
         FROM dep
       )
       SELECT 
         DEPARTMENT_ID
        ,DEPARTMENT_NAME
        ,MANAGER_ID
        ,LOCATION_ID
        ,COUNT(*)
       FROM cleaned_dep 
       GROUP BY 1,2,3,4
      HAVING count(*) > 1
       '''
db.sql(coalesced_dup_sql).df().head(n=10)

Unnamed: 0,DEPARTMENT_ID,DEPARTMENT_NAME,MANAGER_ID,LOCATION_ID,count_star()


In [179]:
# Same duplicate search on the registered view, this time grouping the raw
# columns without replacing NULLs first.
raw_dup_sql = '''
SELECT 
    department_id AS department_id
  , department_name AS department_name
  , manager_id AS manager_id
  , location_id AS location_id
  , COUNT(*) AS ile
FROM dep
GROUP BY 1, 2, 3, 4
HAVING COUNT(*) > 1
'''
db.sql(raw_dup_sql).df()

Unnamed: 0,department_id,department_name,manager_id,location_id,ile


**Where is my duplicate row?**

**In DuckDB table?**

Let's debug...

In [None]:
# Debug step: list every row of the NULL-replaced view for department 200
# to see how many copies the registered view actually holds.
dep200_debug_sql = '''
       WITH cleaned_dep AS (
       SELECT 
         COALESCE(DEPARTMENT_ID, -1) as DEPARTMENT_ID
        ,COALESCE(DEPARTMENT_NAME, 'Unknown') AS DEPARTMENT_NAME
        ,COALESCE(MANAGER_ID, -1) AS MANAGER_ID
        ,COALESCE(LOCATION_ID, -1) AS LOCATION_ID
         FROM dep
       )
       SELECT 
         *
       FROM cleaned_dep 
       WHERE DEPARTMENT_ID = 200 --debug here
       '''
db.sql(dep200_debug_sql).df()

Unnamed: 0,DEPARTMENT_ID,DEPARTMENT_NAME,MANAGER_ID,LOCATION_ID
0,200,Operations,-1.0,1700


**Be careful - CSV != REGISTER_TABLE**

Direct read csv

In [None]:
# Read department 200 straight from the CSV file, bypassing the registered view.
csv_dep200_sql = "SELECT DEP.* FROM read_csv_auto('hr_dwh/seeds/departments_dbt.csv') AS DEP WHERE DEP.DEPARTMENT_ID = 200"
db.sql(csv_dep200_sql).df()

Unnamed: 0,DEPARTMENT_ID,DEPARTMENT_NAME,MANAGER_ID,LOCATION_ID
0,200,Operations,,1700
1,200,Operations,,1700


Table

In [None]:
# Read department 200 from the registered 'dep' view for comparison with the CSV.
view_dep200_sql = "SELECT DEP.* FROM dep AS DEP WHERE DEP.DEPARTMENT_ID = 200"
db.sql(view_dep200_sql).df()

Unnamed: 0,DEPARTMENT_ID,DEPARTMENT_NAME,MANAGER_ID,LOCATION_ID
0,200,Operations,,1700


OK:

In [171]:
# Run the duplicate search against the CSV file itself rather than the view.
csv_dup_sql = "SELECT DEP.*, COUNT(*) FROM read_csv_auto('hr_dwh/seeds/departments_dbt.csv') AS DEP GROUP BY 1,2,3,4 HAVING count(*) > 1"
db.sql(csv_dup_sql)

┌───────────────┬─────────────────┬────────────┬─────────────┬──────────────┐
│ DEPARTMENT_ID │ DEPARTMENT_NAME │ MANAGER_ID │ LOCATION_ID │ count_star() │
│     int64     │     varchar     │  varchar   │   varchar   │    int64     │
├───────────────┼─────────────────┼────────────┼─────────────┼──────────────┤
│           200 │ Operations      │ NULL       │ 1700        │            2 │
└───────────────┴─────────────────┴────────────┴─────────────┴──────────────┘

**Postmortem - Solution**

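At first glance it looks as if the registered table deleted the row with the duplicated PK automatically. However, `dep` also reports different column types (`DOUBLE`/`BIGINT` instead of `object`) and different values (`203.0` instead of `XXX`, `Shipping` instead of `shipping`) than the current `df`, so `dep` was most likely registered from an earlier version of the DataFrame and never refreshed. Registering the current `df` under a new name brings the duplicate back: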

In [185]:
# Register a copy of the current DataFrame under a new view name and re-check department 200.
# NOTE(review): df.info() already reports a RangeIndex, so reset_index(drop=True) presumably
# only produces a copy; the duplicate likely reappears because the *current* df is what gets
# registered here — confirm by registering `df` directly.
df_copy = df.reset_index(drop=True)
db.register('dep_auto_index', df_copy)
db.sql("SELECT DEP.* FROM dep_auto_index AS DEP WHERE DEP.DEPARTMENT_ID = 200").df()

Unnamed: 0,DEPARTMENT_ID,DEPARTMENT_NAME,MANAGER_ID,LOCATION_ID
0,200,Operations,,1700
1,200,Operations,,1700


### Uniqueness & Other stats

**Count rows**

In [157]:
# Total number of rows visible through the registered 'dep' view.
dep_rowcount_sql = "SELECT COUNT(*) FROM dep"
db.sql(dep_rowcount_sql).df()

Unnamed: 0,count_star()
0,27


**Is ID unique?**

In [188]:
# Compare total rows with distinct DEPARTMENT_ID values in the registered view;
# equal numbers mean the ID is unique there.
dep_uniqueness_sql = "SELECT count(*), count(distinct DEPARTMENT_ID) FROM dep"
db.sql(dep_uniqueness_sql).df()

Unnamed: 0,count_star(),count(DISTINCT DEPARTMENT_ID)
0,27,27


In [190]:
# Same uniqueness comparison, but computed on the CSV file read directly.
csv_uniqueness_sql = "SELECT COUNT(*), count(distinct department_id) FROM read_csv_auto('hr_dwh/seeds/departments_dbt.csv')"
db.sql(csv_uniqueness_sql)

┌──────────────┬───────────────────────────────┐
│ count_star() │ count(DISTINCT department_id) │
│    int64     │             int64             │
├──────────────┼───────────────────────────────┤
│           28 │                            27 │
└──────────────┴───────────────────────────────┘

**Nulls or empty**

In [202]:
# Flag NULLs per column (1 = NULL, 0 = present) for every row read from the CSV.
# The wrapping sub-select and its WHERE filter are commented out inside the SQL,
# so all rows are returned together with their flags.
null_flags_sql = '''
   --   SELECT * FROM (
       SELECT
       dep.*
       ,CASE WHEN dep.department_id is null then 1 else 0 end as isnull_depid 
       ,CASE WHEN dep.DEPARTMENT_NAME is null then 1 else 0 end as isnull_name
       ,CASE WHEN dep.MANAGER_ID is null then 1 else 0 end as isnull_man
       ,CASE WHEN dep.LOCATION_ID is null then 1 else 0 end as isnull_loc
       FROM read_csv_auto('hr_dwh/seeds/departments_dbt.csv') as dep
    --   )
    --   WHERE 
    --   isnull_depid = 1 OR isnull_name = 1 OR isnull_man = 1  OR isnull_loc = 1
       '''
db.sql(null_flags_sql)

┌───────────────┬──────────────────┬────────────┬─────────────┬──────────────┬─────────────┬────────────┬────────────┐
│ DEPARTMENT_ID │ DEPARTMENT_NAME  │ MANAGER_ID │ LOCATION_ID │ isnull_depid │ isnull_name │ isnull_man │ isnull_loc │
│     int64     │     varchar      │  varchar   │   varchar   │    int32     │    int32    │   int32    │   int32    │
├───────────────┼──────────────────┼────────────┼─────────────┼──────────────┼─────────────┼────────────┼────────────┤
│            10 │ Administration   │ 200        │ 1700        │            0 │           0 │          0 │          0 │
│            20 │ Marketing        │ 201        │ 1800        │            0 │           0 │          0 │          0 │
│            30 │ Purchasing       │ 114        │ 1700        │            0 │           0 │          0 │          0 │
│            40 │ Human Resources  │ XXX        │ 2400        │            0 │           0 │          0 │          0 │
│            50 │ shipping         │ 121        