In [2]:
# Import Libraries
import great_expectations as gx
import pandas as pd

In [4]:
# Read the cleaned CSV into pandas, then hand the frame to Great Expectations
df = pd.read_csv(filepath_or_buffer='P2M3_wawan_data_clean.csv')
gx_df = gx.from_pandas(df)

In [6]:
# Preview the first three rows of the loaded frame
df.head(n=3)

Unnamed: 0,personid,gender,age,occupation,sleepduration,qualityofsleep,physicalactivitylevel,stresslevel,bmicategory,bloodpressure,heartrate,dailysteps,sleepdisorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,


In [13]:
# Print column names, non-null counts and dtypes before defining expectations
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   personid               374 non-null    int64  
 1   gender                 374 non-null    object 
 2   age                    374 non-null    int64  
 3   occupation             374 non-null    object 
 4   sleepduration          374 non-null    float64
 5   qualityofsleep         374 non-null    int64  
 6   physicalactivitylevel  374 non-null    int64  
 7   stresslevel            374 non-null    int64  
 8   bmicategory            374 non-null    object 
 9   bloodpressure          374 non-null    object 
 10  heartrate              374 non-null    int64  
 11  dailysteps             374 non-null    int64  
 12  sleepdisorder          155 non-null    object 
dtypes: float64(1), int64(7), object(5)
memory usage: 38.1+ KB


In [7]:
# Expectation 1: every `personid` value must be unique (no duplicates)
gx_df.expect_column_values_to_be_unique(column='personid')

{
  "result": {
    "element_count": 374,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [8]:
# Expectation 2: `qualityofsleep` values must lie between 1 and 10
gx_df.expect_column_values_to_be_between(
    column='qualityofsleep',
    min_value=1,
    max_value=10,
)

{
  "result": {
    "element_count": 374,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [11]:
# Expectation 3: `bmicategory` may only take one of the listed labels
# (both 'Normal' and 'Normal Weight' are accepted)
gx_df.expect_column_values_to_be_in_set(
    column='bmicategory',
    value_set=['Overweight', 'Normal', 'Obese', 'Normal Weight'],
)

{
  "result": {
    "element_count": 374,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [33]:
# Expectation 4: `stresslevel` must be stored as one of the listed types
gx_df.expect_column_values_to_be_in_type_list(
    column='stresslevel',
    type_list=['int64', 'float'],
)

{
  "result": {
    "observed_value": "int64"
  },
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [19]:
# Expectation 5: `personid` must have no missing (null) values
gx_df.expect_column_values_to_not_be_null(column='personid')

{
  "result": {
    "element_count": 374,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "partial_unexpected_list": []
  },
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [32]:
# Expectation 6: `occupation` must not be empty or whitespace-only.
# The pattern ^\s*$ only matches strings made up entirely of whitespace (or nothing),
# so values that merely contain a space, such as "Software Engineer", still pass.
gx_df.expect_column_values_to_not_match_regex(column='occupation', regex=r'^\s*$')


{
  "result": {
    "element_count": 374,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [37]:
# Expectation 7: `gender` must never use the labels 'Woman' or 'Man'
gx_df.expect_column_values_to_not_be_in_set(
    column='gender',
    value_set=['Woman', 'Man'],
)

{
  "result": {
    "element_count": 374,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}