In [17]:
import pandas as pd
import great_expectations as ge
from great_expectations.dataset import PandasDataset


In [40]:
# Load the cleaned CSV export into a pandas DataFrame.
# NOTE(review): the file name ends in ".csv.csv" — confirm the doubled extension
# matches the file on disk and is not a typo.
df = pd.read_csv('data_clean.csv.csv')

In [41]:
# Summarize the schema: column names, non-null counts, and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            9994 non-null   int64  
 1   ship_mode     9994 non-null   object 
 2   segment       9994 non-null   object 
 3   country       9994 non-null   object 
 4   city          9994 non-null   object 
 5   state         9994 non-null   object 
 6   postal_code   9994 non-null   int64  
 7   region        9994 non-null   object 
 8   category      9994 non-null   object 
 9   sub_category  9994 non-null   object 
 10  sales         9994 non-null   float64
 11  quantity      9994 non-null   int64  
 12  discount      9994 non-null   float64
 13  profit        9994 non-null   float64
 14  created_at    9994 non-null   object 
dtypes: float64(3), int64(3), object(9)
memory usage: 1.1+ MB


In [8]:
# Thin PandasDataset subclass: a hook for project-specific expectations.
class CustomDataset(PandasDataset):
    """Great Expectations dataset wrapper; no custom expectations are defined yet."""


# Wrap the loaded DataFrame so expectations can be called on it directly.
ge_df = CustomDataset(df)

In [23]:
# Convert the pandas DataFrame to a Great Expectations dataset.
# NOTE(review): `ge_df = CustomDataset(df)` in the cell above already wraps the same
# DataFrame, and the expectation cells visible below only use `df_ge` — consider
# keeping a single wrapper.
df_ge = ge.from_pandas(df)

In [24]:
# Expect every value in the "id" column to be unique.
result_unique = df_ge.expect_column_values_to_be_unique(column="id")
result_unique

{
  "success": true,
  "result": {
    "element_count": 9994,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [27]:
# Every "quantity" value must lie between 0 and 300.
result_between = df_ge.expect_column_values_to_be_between(
    column="quantity",
    min_value=0,
    max_value=300,
)
result_between

{
  "success": true,
  "result": {
    "element_count": 9994,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [30]:
# Every "category" value must be one of the three known product categories.
result_in_set = df_ge.expect_column_values_to_be_in_set(
    column="category",
    value_set=[
        "Technology",
        "Furniture",
        "Office Supplies",
    ],
)
result_in_set

{
  "success": true,
  "result": {
    "element_count": 9994,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [31]:
# The "quantity" column must hold integers (checked against type name "int").
result_in_type = df_ge.expect_column_values_to_be_of_type(
    column="quantity",
    type_="int",
)
result_in_type

{
  "success": true,
  "result": {
    "observed_value": "int64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [33]:
# The dataset must contain a "segment" column.
result_column_exist = df_ge.expect_column_to_exist(
    column="segment",
)
result_column_exist


{
  "success": true,
  "result": {},
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [37]:
# Expect every value in the "id" column to be unique.
# NOTE(review): the variable name suggests a multi-column uniqueness check, but this
# call validates only the single column "id" — the same check stored in
# `result_unique` earlier. If uniqueness across several columns is intended, a
# compound-column expectation (e.g. `expect_compound_columns_to_be_unique` with a
# `column_list` — TODO confirm availability in the installed version) is needed.
result_multi_unique = df_ge.expect_column_values_to_be_unique(column="id")
result_multi_unique

{
  "success": true,
  "result": {
    "element_count": 9994,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [39]:
# Expect the most common value of "segment" to be one of the known segments.
# NOTE(review): the name `result_column_type` suggests a dtype check, but this call
# validates the column's most common value, not its type — confirm intent and
# consider renaming the variable.
result_column_type = df_ge.expect_column_most_common_value_to_be_in_set(column="segment", value_set=["Consumer", "Corporate", "Home Office"])
result_column_type

{
  "success": true,
  "result": {
    "observed_value": [
      "Consumer"
    ],
    "element_count": 9994,
    "missing_count": null,
    "missing_percent": null
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}