```
=================================================
Program ini bertujuan untuk memastikan kualitas data dalam proses ETL dengan validasi otomatis menggunakan Great Expectations, sehingga data lebih bersih, akurat, dan efisien untuk analisis.
=================================================
```

# **Data Ethics & Data Validation**

In [144]:
pip install -q "great-expectations==0.18.19"

Note: you may need to restart the kernel to use updated packages.


In [165]:
# Bring in the file-backed Great Expectations data context class
from great_expectations.data_context import FileDataContext

# Build the data context for this Great Expectations project,
# using the current working directory as the project root
context = FileDataContext.create(
    project_root_dir='./',
)

In [None]:
# Bring in the file-backed Great Expectations data context class
from great_expectations.data_context import FileDataContext

# Build the data context for this Great Expectations project,
# using the current working directory as the project root
context = FileDataContext.create(project_root_dir='./')

# Register the pandas datasource under a unique name.
# add_or_update_pandas is used instead of add_pandas so this cell can be
# re-run: with a file-backed context the datasource is expected to persist
# between runs, and re-adding an existing name with add_pandas fails.
datasource_name = 'csv-data-goods'
datasource = context.sources.add_or_update_pandas(datasource_name)

# Name of the data asset and location of the cleaned CSV file.
# The path is a raw string because the Windows separators form sequences
# (\B, \#, \p, \d) that are invalid escapes in a normal string literal;
# they raise warnings on recent Python versions and would silently corrupt
# the path if a folder name started with a letter such as n or t.
asset_name = 'goods_data'
path_to_data = r"D:\Bootcamp\#MILESTONE 3\p2-ftds024-hck-m3-yinkasinulingga\dags\data_clean.csv"  # adjust this path to your environment
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build the batch request used to fetch the data
batch_request = asset.build_batch_request()

# Create (or overwrite) the expectation suite that will hold the validations
expectation_suite_name = 'expectation-goods-dataset'
context.add_or_update_expectation_suite(expectation_suite_name)

# Create the validator from the batch request and the expectation suite
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name
)

# Show the first rows of the data being validated
validator.head()


Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,location,number_of_cows,farm_size,date,product_name,brand,quantity,price_per_unit,total_value,shelf_life,quantity_sold,price_per_unit_sold,total_revenue,customer_location,sales_channel,quantity_in_stock,id
0,Uttar Pradesh,44,Large,2021-12-01,Milk,Amul,687,42.61,29293.5228,22,558,39.24,21895.92,Kerala,Wholesale,129,1
1,Tamil Nadu,24,Medium,2022-02-28,Yogurt,Dodla Dairy,503,36.5,18377.02,30,256,33.81,8655.36,Madhya Pradesh,Online,247,2
2,Telangana,89,Small,2019-06-09,Cheese,Britannia Industries,823,26.52,21835.5072,72,601,28.92,17380.92,Rajasthan,Online,222,3
3,Maharashtra,21,Medium,2020-12-14,Buttermilk,Mother Dairy,147,83.85,12390.5145,11,145,83.07,12045.15,Jharkhand,Retail,2,4
4,Telangana,51,Medium,2019-01-07,Curd,Raj,593,85.54,50803.9168,5,74,84.75,6271.5,Gujarat,Retail,519,5


## **Expectation 1 : Expect Column Values to be Unique**


In [167]:
# The 'id' column must not contain any duplicate values
validator.expect_column_values_to_be_unique(column='id')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 4258,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## **Expectation 2 : Expect Column Values to be Between Minimum Value and Maximum Value**

In [168]:
# Every `quantity_sold` value must lie within the range 1 to 960
validator.expect_column_values_to_be_between(
    column='quantity_sold',
    min_value=1,
    max_value=960,
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 4258,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## **Expectation 3 : Expect Column Values to be in Set**

In [169]:
# The `location` column may only contain one of the listed values
validator.expect_column_values_to_be_in_set(
    column='location',
    value_set=[
        'Telangana', 'Uttar Pradesh', 'Tamil Nadu', 'Maharashtra',
        'Karnataka', 'Bihar', 'West Bengal', 'Madhya Pradesh',
        'Chandigarh', 'Delhi', 'Gujarat', 'Kerala', 'Jharkhand',
        'Rajasthan', 'Haryana',
    ],
)


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 4258,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [170]:
# Allowed value set for each categorical column, keyed by column name
expectations = {
    'farm_size': ['Medium', 'Large', 'Small'],
    'brand': [
        'Dodla Dairy', 'Amul', 'Britannia Industries', 'Mother Dairy',
        'Raj', 'Dynamix Dairies', 'Sudha', 'Passion Cheese', 'Warana',
        'Palle2patnam', 'Parag Milk Foods',
    ],
    'product_name': [
        'Ice Cream', 'Milk', 'Yogurt', 'Cheese', 'Buttermilk', 'Curd',
        'Paneer', 'Lassi', 'Ghee', 'Butter',
    ],
    'customer_location': [
        'Madhya Pradesh', 'Kerala', 'Rajasthan', 'Jharkhand', 'Gujarat',
        'Karnataka', 'Haryana', 'Tamil Nadu', 'West Bengal', 'Telangana',
        'Chandigarh', 'Maharashtra', 'Delhi', 'Bihar', 'Uttar Pradesh',
    ],
    'sales_channel': ['Wholesale', 'Online', 'Retail'],
}

# Check each column against its allowed values and print whether it passed
for column in expectations:
    allowed_values = expectations[column]
    result = validator.expect_column_values_to_be_in_set(column, allowed_values)
    print(f"Expectation for '{column}': {result['success']}")


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Expectation for 'farm_size': True


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Expectation for 'brand': True


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Expectation for 'product_name': True


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Expectation for 'customer_location': True


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Expectation for 'sales_channel': True


## **Expectation 4 : Expect Column Values to be in Type List**

In [171]:
# The `number_of_cows` column must have the integer dtype `int64`
validator.expect_column_values_to_be_in_type_list(
    column='number_of_cows',
    type_list=['int64'],
)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "int64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [172]:
# Quantity-related columns that must all have the integer dtype `int64`
expectations_quantity = [
    'quantity',
    'quantity_sold',
    'quantity_in_stock',
]

# Apply the type expectation to each column and print whether it passed
for column in expectations_quantity:
    result = validator.expect_column_values_to_be_in_type_list(
        column,
        ['int64'],
    )
    print(f"Expectation for '{column}': {result['success']}")

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Expectation for 'quantity': True


Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Expectation for 'quantity_sold': True


Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Expectation for 'quantity_in_stock': True


In [173]:
# The `price_per_unit` column must have a floating-point dtype
validator.expect_column_values_to_be_in_type_list(
    column='price_per_unit',
    type_list=['float'],
)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "float64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## **Expectation 5 : Expect the Column to Not Contain Missing Values**

In [174]:
# The `location` column must not contain null (missing) values
validator.expect_column_values_to_not_be_null(column='location')

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 4258,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [175]:
# Check every column of the active batch for null values.
# The loop variable itself is passed to the expectation: the previous version
# passed the unrelated global `column` left over from an earlier cell, so the
# same single column was validated on every iteration while the printed label
# changed, making each per-column result misleading.
for column_name in validator.active_batch.data.dataframe.columns:
    expectation = validator.expect_column_values_to_not_be_null(column_name)
    print(f"Expectation result for '{column_name}': {expectation['success']}")

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Expectation result for 'location': True


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Expectation result for 'number_of_cows': True


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Expectation result for 'farm_size': True


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Expectation result for 'date': True


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Expectation result for 'product_name': True


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Expectation result for 'brand': True


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Expectation result for 'quantity': True


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Expectation result for 'price_per_unit': True


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Expectation result for 'total_value': True


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Expectation result for 'shelf_life': True


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Expectation result for 'quantity_sold': True


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Expectation result for 'price_per_unit_sold': True


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Expectation result for 'total_revenue': True


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Expectation result for 'customer_location': True


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Expectation result for 'sales_channel': True


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Expectation result for 'quantity_in_stock': True


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Expectation result for 'id': True


## **Expectation 6 : Expect the Table Column Count to be Equal**

In [None]:
# The table must have exactly 17 columns
validator.expect_table_column_count_to_equal(
    value=17,
)

Calculating Metrics:   0%|          | 0/3 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 17
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## **Expectation 7 : Expect the Values in Column A to be Greater Than or Equal to Column B**

In [176]:
# For every row, `quantity` must be greater than or equal to `quantity_sold`
# (or_equal=True allows the two values to match)
validator.expect_column_pair_values_A_to_be_greater_than_B(
    column_A="quantity", column_B="quantity_sold", or_equal=True,
)

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 4258,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}