# Exploratory data analysis - Technical - OS Level Analysis

## Departments CSV

In [None]:
# Relative path of the departments CSV inspected in this section;
# the shell cells read it as $dep_path.
dep_path = "hr_dwh/seeds/raw_departments.csv"

In [None]:
# OS-level sanity check of the departments CSV: detected file type, permissions and
# modification time, line count, disk usage and the header line.
# Every command addresses the file through $dep_path, so all reported facts
# describe the same file the variable points at.
!echo var=$dep_path
!echo File-check: $(file $dep_path)
!echo Last Update: $(ls -lh $dep_path)
# wc -l counts every line of the file, the header line included.
!echo Row number: $(wc -l $dep_path)
!echo Size: $(du -h $dep_path)
!echo 'Headers:'
!head -n 1 $dep_path

var=hr_dwh/seeds/departments_dbt.csv


File-check: hr_dwh/seeds/departments_dbt.csv: CSV text
Last Update: -rwxrwxrwx 1 arek arek 766 Apr 1 14:10 hr_dwh/seeds/departments_dbt.csv
Row number: 29 hr_dwh/seeds/departments_dbt.csv
Size: 4.0K hr_dwh/seeds/departments_dbt.csv
Headers:
"DEPARTMENT_ID","DEPARTMENT_NAME","MANAGER_ID","LOCATION_ID"


In [None]:
# Structural check: print the number of comma-separated fields on every line and
# tally the distinct counts; a consistently shaped file yields a single tally row.
# The file is addressed through $dep_path so this check follows the variable.
# NOTE(review): awk splits on every comma, so a quoted value that itself contains
# a comma would be counted as an extra field — confirm no such values exist.
!echo 'Validate structure - count number of columns:'
!awk -F',' '{print NF}' $dep_path | sort | uniq -c

Validate structure - count number of columns:
     29 4


In [133]:
# Eyeball the raw file: first 5 lines, last 5 lines, then a window around the middle.
# `column -t -s ','` lays the comma-separated fields out as an aligned table.
# NOTE(review): the saved output suggests rows with empty fields are rendered with
# fewer columns (values appear shifted left) — confirm before reading values by column.
!echo "Head & Tail of csv"
!echo "-----------"
!head -5 $dep_path | column -t -s ',' && echo ... && tail -5 $dep_path | column -t -s ','
!echo
!echo "Middle of csv"
!echo "-----------"
# sed prints lines (line_count / 2 - 3) through (line_count / 2 + 3); the bounds are
# computed by shell arithmetic from `wc -l`, while $dep_path is filled in by IPython.
!sed -n "$(( $(wc -l < $dep_path) / 2 - 3 )),$(( $(wc -l < $dep_path) / 2 + 3 ))p" $dep_path | column -t -s ','

Head & Tail of csv
-----------
"DEPARTMENT_ID"  "DEPARTMENT_NAME"  "MANAGER_ID"  "LOCATION_ID"
10               "Administration"   200           1700
20               "Marketing"        201           1800
30               "Purchasing"       114           1700
40               "Human Resources"  XXX           2400
...
230  "IT Helpdesk"         1700
240  "Government Sales"    1700
250  "Retail Sales"        1700
260  "Recruiting"          1700
270  "Payroll"             1700

Middle of csv
-----------
100  "Finance"               108   1700
110  "Accounting"            205   1700
120  "Treasury"                    1700
130  "Corporate Tax"               abc
140  "Control And Credit"          1700
150  "Shareholder Services"  null  1700
160  "Benefits"                    1700


In [3]:
import pandas as pd
import duckdb as db

# Load the departments CSV with pandas' default type inference. The path comes from
# dep_path so this cell reads the same file as the OS-level checks.
df = pd.read_csv(dep_path)

In [4]:
# Expose the pandas DataFrame to DuckDB under the name `dep` so the SQL cells
# (`desc dep`, `SELECT * FROM dep`) can query it.
db.register('dep',df)

<duckdb.duckdb.DuckDBPyConnection at 0x7fe426829a30>

In [5]:
# Show the column types DuckDB infers when it reads the CSV file itself via
# read_csv_auto, i.e. without going through the pandas DataFrame.
# The path is interpolated from dep_path (a notebook constant, not user input) so the
# query targets the same file as the other cells; a path containing a single quote
# would need escaping.
db.sql(f"DESCRIBE SELECT * FROM read_csv_auto('{dep_path}')")


┌─────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│   column_name   │ column_type │  null   │   key   │ default │  extra  │
│     varchar     │   varchar   │ varchar │ varchar │ varchar │ varchar │
├─────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ DEPARTMENT_ID   │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ DEPARTMENT_NAME │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ MANAGER_ID      │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ LOCATION_ID     │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
└─────────────────┴─────────────┴─────────┴─────────┴─────────┴─────────┘

In [203]:
# Column dtypes and non-null counts as inferred by pandas for the departments frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   DEPARTMENT_ID    28 non-null     int64 
 1   DEPARTMENT_NAME  27 non-null     object
 2   MANAGER_ID       11 non-null     object
 3   LOCATION_ID      27 non-null     object
dtypes: int64(1), object(3)
memory usage: 1.0+ KB


In [6]:
 # Summary statistics of the departments frame (describe() covers numeric columns by default).
 # NOTE(review): this cell is indented by one space; it ran under IPython, but a
 # plain-Python export of the notebook would reject the indent — consider removing it.
 df.describe()

Unnamed: 0,DEPARTMENT_ID
count,28.0
mean,142.142857
std,78.709835
min,10.0
25%,77.5
50%,145.0
75%,202.5
max,270.0


**Table preview**

In [154]:
# Schema of the registered `dep` view as DuckDB reports it, converted to a
# pandas DataFrame with .df() for display.
db.sql("desc dep").df()

Unnamed: 0,column_name,column_type,null,key,default,extra
0,DEPARTMENT_ID,BIGINT,YES,,,
1,DEPARTMENT_NAME,VARCHAR,YES,,,
2,MANAGER_ID,DOUBLE,YES,,,
3,LOCATION_ID,BIGINT,YES,,,


In [172]:
# Preview the first 10 rows of the registered `dep` view.
# The limit is applied once, in SQL, so only the rows that are displayed are
# converted to pandas (previously 100 rows were fetched and then cut to 10).
db.sql("SELECT * FROM dep LIMIT 10").df()

Unnamed: 0,DEPARTMENT_ID,DEPARTMENT_NAME,MANAGER_ID,LOCATION_ID
0,10,Administration,200.0,1700
1,20,Marketing,201.0,1800
2,30,Purchasing,114.0,1700
3,40,Human Resources,203.0,2400
4,50,Shipping,121.0,1500
5,60,IT,103.0,1400
6,70,Public Relations,204.0,2700
7,80,Sales,145.0,2500
8,90,Executive,100.0,1700
9,100,Finance,108.0,1700


## Employees CSV

**OS Level Analysis**

In [None]:
# Relative path of the employees CSV inspected in this section;
# the shell cells read it as $emp_path.
emp_path = "data/emp.csv"

In [121]:
# OS-level sanity check of the employees CSV: detected file type, permissions and
# modification time, line count, disk usage, per-line field counts and the header line.
!echo var=$emp_path
!echo File-check: $(file $emp_path)
!echo Last Update: $(ls -lh $emp_path)
# wc -l counts every line of the file, the header line included.
!echo Row number: $(wc -l $emp_path)
!echo Size: $(du -h $emp_path)
# NOTE(review): the saved output shows this step interrupted (^C); the same
# field-count check is run again in the following cell.
!echo 'Validate structure - count number of columns:'
!awk -F',' '{print NF}' $emp_path | sort | uniq -c
!echo 'Headers:'
!head -n 1 $emp_path

var=data/emp.csv
File-check: data/emp.csv: CSV text
Last Update: -rwxrwxrwx 1 arek arek 8.9K Mar 31 03:14 data/emp.csv
Row number: 107 data/emp.csv
Size: 12K data/emp.csv
Validate structure - count number of columns:
^C
Headers:
"EMPLOYEE_ID","FIRST_NAME","LAST_NAME","EMAIL","PHONE_NUMBER","HIRE_DATE","JOB_ID","SALARY","COMMISSION_PCT","MANAGER_ID","DEPARTMENT_ID"


In [140]:
# Structural check: print the number of comma-separated fields on every line and
# tally the distinct counts; a consistently shaped file yields a single tally row.
# The file is addressed through $emp_path so this check follows the variable.
# NOTE(review): awk splits on every comma, so a quoted value that itself contains
# a comma would be counted as an extra field — confirm no such values exist.
!echo 'Validate structure - count number of columns:'
!awk -F',' '{print NF}' $emp_path | sort | uniq -c

Validate structure - count number of columns:
    108 11


In [119]:
# Eyeball the raw file: first 5 lines, last 5 lines, then a window around the middle.
# `column -t -s ','` lays the comma-separated fields out as an aligned table.
# NOTE(review): the saved output suggests rows with empty fields are rendered with
# fewer columns (values appear shifted left) — confirm before reading values by column.
!echo "Head & Tail of csv"
!echo "-----------"
!head -5 $emp_path | column -t -s ',' && echo ... && tail -5 $emp_path | column -t -s ','
!echo
!echo "Middle of csv"
!echo "-----------"
# sed prints lines (line_count / 2 - 3) through (line_count / 2 + 3); the bounds are
# computed by shell arithmetic from `wc -l`, while $emp_path is filled in by IPython.
!sed -n "$(( $(wc -l < $emp_path) / 2 - 3 )),$(( $(wc -l < $emp_path) / 2 + 3 ))p" $emp_path | column -t -s ','

Head & Tail of csv


-----------
"EMPLOYEE_ID"  "FIRST_NAME"  "LAST_NAME"  "EMAIL"    "PHONE_NUMBER"    "HIRE_DATE"  "JOB_ID"   "SALARY"  "COMMISSION_PCT"  "MANAGER_ID"  "DEPARTMENT_ID"
100            "Steven"      "King"       "SKING"    "1.515.555.0100"  17-JUN-13    "AD_PRES"  24000                                     90
101            "Neena"       "Yang"       "NYANG"    "1.515.555.0101"  21-SEP-15    "AD_VP"    17000                       100           90
102            "Lex"         "Garcia"     "LGARCIA"  "1.515.555.0102"  13-JAN-11    "AD_VP"    17000                       100           90
103            "Alexander"   "James"      "AJAMES"   "1.590.555.0103"  03-JAN-16    "IT_PROG"  9000                        102           60
...
202  "Pat"      "Davis"    "PDAVIS"    "1.603.555.0167"  17-AUG-15  "MK_REP"      6000     201  20
203  "Susan"    "Jacobs"   "SJACOBS"   "1.515.555.0168"  07-JUN-12  "HR_REP"      6500     101  40
204  "Hermann"  "Brown"    "HBROWN"    "1.515.555.0169"  07-JUN-12  "PR_R