In [1]:
import great_expectations as ge

In [2]:
import numpy as np
import pandas as pd
import os 
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last
# one, so cells that end with several expressions show all of their results.
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# The diagnosis extract is tab-delimited despite its .csv suffix.
dx_df = ge.read_csv("../../rwd_data_quality/data/diagnosis.csv", sep="\t")

# Preview the first few rows.
dx_df.head()

Unnamed: 0,pt_id,dos,dx_code
0,13151,2018-04-10,H35.3231
1,13151,2018-05-22,H35.3231
2,13151,2018-07-17,H35.3220
3,13151,2018-09-25,H35.3220
4,13151,2018-12-18,H35.3220


In [4]:
# Number of diagnosis rows recorded per `dos` value, most frequent first.
dx_df["dos"].value_counts()

2016-06-29    60
2016-01-08    50
2016-08-16    48
2015-12-21    48
2015-11-23    45
              ..
2017-11-27     1
2015-08-17     1
2019-11-14     1
2019-10-31     1
2019-11-15     1
Name: dos, Length: 685, dtype: int64

## Diagnosis code of age-related macular degeneration 

* [ICD-10-CM](https://mdinteractive.com/MIPS_ophthalmology)

In [5]:
# Pattern for the macular-degeneration diagnosis codes accepted in `dx_code`:
#   * H35.32 / h35.32 with an optional dot (ICD-10-CM, see the link above)
#   * 362.52 with an optional dot (presumably the ICD-9-CM equivalent -- confirm)
#   * 414173003 (presumably the SNOMED CT concept id -- confirm)
# The dots are escaped so that "optional dot" means a literal ".", not any
# character (unescaped, "H35.?32" would also accept e.g. "H35832").  The
# character class [Hh] replaces the duplicated upper/lower-case alternatives,
# and the unused capture groups are dropped.
wamd_re = r"[Hh]35\.?32|362\.?52|414173003"

# mostly=1 requires every non-null value to match the pattern.
dx_df.expect_column_values_to_match_regex("dx_code", wamd_re, mostly=1)

{
  "result": {
    "element_count": 4672,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 3379,
    "unexpected_percent": 72.32448630136986,
    "unexpected_percent_total": 72.32448630136986,
    "unexpected_percent_nonmissing": 72.32448630136986,
    "partial_unexpected_list": [
      "312912001",
      "312912001",
      "H35.3112",
      "312912001",
      "H43.811",
      "H34.12",
      "H35.81",
      "H02.403",
      "362.02",
      "362.02",
      "362.02",
      "362.02",
      "374.3",
      "H02.403",
      "H43.812",
      "H04.123",
      "H35.81",
      "H25.13",
      "H25.13",
      "H43.813"
    ]
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "success": false
}

In [8]:
# Every diagnosis row must carry a diagnosis code.
dx_df.expect_column_values_to_not_be_null(column="dx_code")

{
  "result": {
    "element_count": 4672,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "partial_unexpected_list": []
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "success": true
}

## 2. Assessment of Patient Data

In [9]:
# The patient extract is tab-delimited despite its .csv suffix.
pt_df = ge.read_csv("../../rwd_data_quality/data/patient.csv", sep="\t")

# Preview the first rows, then report (rows, columns).
pt_df.head()
pt_df.shape

Unnamed: 0,patientid,sex
0,13151,Female
1,43463,Female
2,42834,Male
3,37531,Female
4,31613,Female


(945, 2)

In [10]:
# `sex` must be one of the canonical labels; mostly=1 tolerates no exceptions,
# so differently-cased or abbreviated spellings are reported as unexpected.
pt_df.expect_column_values_to_be_in_set(
    column="sex",
    value_set=["Male", "Female", "Not Reported"],
    mostly=1,
)

{
  "result": {
    "element_count": 945,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 24,
    "unexpected_percent": 2.5396825396825395,
    "unexpected_percent_total": 2.5396825396825395,
    "unexpected_percent_nonmissing": 2.5396825396825395,
    "partial_unexpected_list": [
      "female",
      "f",
      "male",
      "female",
      "female",
      "f",
      "female",
      "female",
      "male",
      "male",
      "male",
      "female",
      "male",
      "male",
      "male",
      "male",
      "male",
      "f",
      "M",
      "female"
    ]
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "success": false
}

## 3. Assessment of Procedure Data

In [11]:
# The procedure extract is tab-delimited despite its .csv suffix.
pc_df = ge.read_csv("../../rwd_data_quality/data/procedure.csv", sep="\t")

# Preview the first rows, then report (rows, columns).
pc_df.head()
pc_df.shape

Unnamed: 0,patient_id,dos,procedure_code,mod
0,10026,12/14/19,2027F,
1,10026,11/13/19,92014,25.0
2,10026,11/6/19,G8427,
3,10026,10/2/19,92134,
4,10026,9/23/19,92014,25.0


(5222, 4)

In [12]:
# Completeness check: each procedure column is required to be fully populated
# (mostly=1).  The calls stay as separate top-level expressions so that each
# validation result is echoed, in this order.
pc_df.expect_column_values_to_not_be_null(column="mod", mostly=1)
pc_df.expect_column_values_to_not_be_null(column="patient_id", mostly=1)
pc_df.expect_column_values_to_not_be_null(column="dos", mostly=1)
pc_df.expect_column_values_to_not_be_null(column="procedure_code", mostly=1)

{
  "result": {
    "element_count": 5222,
    "unexpected_count": 3760,
    "unexpected_percent": 72.00306396016852,
    "unexpected_percent_total": 72.00306396016852,
    "partial_unexpected_list": []
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "success": false
}

{
  "result": {
    "element_count": 5222,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "partial_unexpected_list": []
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "success": true
}

{
  "result": {
    "element_count": 5222,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "partial_unexpected_list": []
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "success": true
}

{
  "result": {
    "element_count": 5222,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "partial_unexpected_list": []
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "success": true
}