In [1]:
# import libraries
from great_expectations.data_context import FileDataContext
import great_expectations as gx

In [2]:
# Create a file-backed Great Expectations context rooted at the current directory
context = FileDataContext.create(project_root_dir="./")

In [3]:
# Name under which the pandas datasource is registered in the context
ds_name = 'data_clean'

# Register the pandas datasource. add_or_update (instead of add) keeps this
# cell re-runnable: the file-backed context persists the datasource, so a
# plain add of an already-registered name is expected to fail on a second run.
datasource = context.sources.add_or_update_pandas(ds_name)

# Name of the CSV asset inside the datasource
asset_name = 'data1'

# CSV location, relative to the notebook's working directory, so the notebook
# is not tied to one machine's absolute path.
# NOTE(review): assumes the CSV sits next to this notebook — confirm.
path = 'P2M3_adriel_julius_sutanto_data_clean.csv'

# Attach the CSV file to the datasource as an asset
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path)

# Build the batch request that the validator consumes
batch_request = asset.build_batch_request()

In [4]:
# Register (or overwrite) the expectation suite used throughout this notebook
expectation_suite_name = 'expectation-1'
context.add_or_update_expectation_suite(
    expectation_suite_name=expectation_suite_name
)

# Tie the batch request to the suite through a validator
validator = context.get_validator(
    expectation_suite_name=expectation_suite_name,
    batch_request=batch_request,
)

# Preview the first rows of the batch
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0.1,Unnamed: 0,rank,name,platform,year,genre,publisher,n_sales,e_sales,j_sales,other_sales,global_sales
0,0,1,Wii Sports,Wii,2006,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,1,2,Super Mario Bros.,NES,1985,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,2,3,Mario Kart Wii,Wii,2008,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,3,4,Wii Sports Resort,Wii,2009,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,4,5,Pokemon Red/Pokemon Blue,GB,1996,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [5]:
# Column `name` must not contain null values
validator.expect_column_values_to_not_be_null(column='name')

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 16598,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {}
}

From this validation, it can be confirmed that the column name does not contain any missing values. Note that this expectation only checks the column name, not the entire dataset.

In [6]:
# Column `rank` must not contain duplicate values
validator.expect_column_values_to_be_unique(column='rank')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 16598,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {}
}

From this validation, it can be confirmed that the column rank does not contain any duplicate values. Since the column rank gives each video game's position in the sales ranking, no two games share the same rank; in other words, every game in the list holds its own position in the ranking (this check alone does not prove that every game has a different sales amount).

In [7]:
# Every value of `year` must lie between 1980 and 2020
validator.expect_column_values_to_be_between(
    column='year',
    min_value=1980,
    max_value=2020,
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 16598,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {}
}

From this validation, it can be confirmed that the values of the column year are within the range of 1980 to 2020. Since the column year gives the year in which a game was released, this means that all games in this list were released between 1980 and 2020.

In [8]:
# Every value of `genre` must be one of the listed genre labels
validator.expect_column_values_to_be_in_set(
    column='genre',
    value_set=[
        'Platform', 'Role-Playing', 'Sports', 'Misc',
        'Racing', 'Action', 'Puzzle', 'Simulation',
        'Shooter', 'Fighting', 'Adventure', 'Strategy',
    ],
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 16598,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {}
}

From this validation, it can be confirmed that the column genre contains only values from the 12 genres listed above. Since the column genre gives the genre under which a game is categorized, this means that every game is assigned to one of these 12 classes.

In [9]:
# Column `global_sales` must be stored as an integer or float type
validator.expect_column_values_to_be_in_type_list(
    column='global_sales',
    type_list=['integer', 'float'],
)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "float64"
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {}
}

From this validation, it can be confirmed that the column global_sales has a numeric data type (the observed type is float64). Since the column global_sales gives the total worldwide sales of a game, this means that all data entries are of type integer or float.

In [10]:
# Standard deviation of `n_sales` must fall between 0 and 1
validator.expect_column_stdev_to_be_between(
    column='n_sales',
    min_value=0,
    max_value=1,
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 0.8166830292988796
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {}
}

In [11]:
# Standard deviation of `j_sales` must fall between 0 and 1
validator.expect_column_stdev_to_be_between(
    column='j_sales',
    min_value=0,
    max_value=1,
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 0.30929064808220297
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {}
}

In [12]:
# Standard deviation of `e_sales` must fall between 0 and 1
validator.expect_column_stdev_to_be_between(
    column='e_sales',
    min_value=0,
    max_value=1,
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 0.5053512312869116
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {}
}

Having checked the standard deviations of n_sales, e_sales, and j_sales, all of them are below 1. This means that sales in North America, the European Union, and Japan all have a low standard deviation. In other words, the data points of these columns are clustered near the mean of each respective column. Therefore, the variance of each of these columns is also low.

In [13]:
# The table must contain exactly 16598 rows
validator.expect_table_row_count_to_equal(value=16598)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 16598
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {}
}

From this validation, it can be confirmed that the dataset contains exactly 16598 rows.

In [14]:
# Smallest value of `j_sales` must fall between 0 and 0.1
validator.expect_column_min_to_be_between(
    column='j_sales',
    min_value=0,
    max_value=0.1,
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 0.0
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {}
}

In [15]:
# Smallest value of `n_sales` must fall between 0 and 0.1
validator.expect_column_min_to_be_between(
    column='n_sales',
    min_value=0,
    max_value=0.1,
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 0.0
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {}
}

In [16]:
# Smallest value of `e_sales` must fall between 0 and 0.1
validator.expect_column_min_to_be_between(
    column='e_sales',
    min_value=0,
    max_value=0.1,
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 0.0
  },
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {}
}

Having checked the minimum values of n_sales, e_sales, and j_sales, each column contains data points with a value of 0. This indicates that there are games that did not sell at all in the North American market, the European market, or the Japanese market.

In [17]:
# Persist the validator's expectations into the suite, keeping any that failed
validator.save_expectation_suite(
    discard_failed_expectations=False,
)

In [18]:
# Register (or update) a checkpoint named 'checkpoint_1' built from the validator
checkpoint_1 = context.add_or_update_checkpoint(
    validator=validator,
    name='checkpoint_1',
)

In [19]:
# Run the checkpoint and keep its result for inspection
checkpoint_result = checkpoint_1.run()
# Bare expression so the notebook displays the result as the cell output
checkpoint_result

Calculating Metrics:   0%|          | 0/35 [00:00<?, ?it/s]

{
  "run_id": {
    "run_time": "2024-01-29T17:10:20.574454+08:00",
    "run_name": null
  },
  "run_results": {
    "ValidationResultIdentifier::expectation-1/__none__/20240129T091020.574454Z/data_clean-data1": {
      "validation_result": {
        "evaluation_parameters": {},
        "success": true,
        "statistics": {
          "evaluated_expectations": 12,
          "successful_expectations": 12,
          "unsuccessful_expectations": 0,
          "success_percent": 100.0
        },
        "results": [
          {
            "success": true,
            "result": {
              "element_count": 16598,
              "unexpected_count": 0,
              "unexpected_percent": 0.0,
              "partial_unexpected_list": [],
              "partial_unexpected_counts": [],
              "partial_unexpected_index_list": []
            },
            "exception_info": {
              "raised_exception": false,
              "exception_traceback": null,
              "exception_me