# Import Libraries

In [1]:
import numpy as np
import pandas as pd
from great_expectations.data_context import FileDataContext
from langdetect import DetectorFactory, detect

# Load data

In [2]:
# Combine the scraped job-listing CSV exports into a single DataFrame
df1 = pd.read_csv('job1to5.csv')
df2 = pd.read_csv('job6to7.csv')
df3 = pd.read_csv('job8to10.csv')
df4 = pd.read_csv('job11to15.csv')
df5 = pd.read_csv('job16to20.csv')
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# each part keeps its own 0-based labels, so row labels repeat across files
# and label-based selection on `df` becomes ambiguous.
df = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)

# Cleaning Data

In [3]:
# Keep only the first 61 characters of each job URL
# NOTE(review): presumably this cuts off the trailing query parameters so the
# same posting compares equal across scrapes - confirm the length of 61.
df['job_url'] = df['job_url'].str.slice(stop=61)

In [4]:
# Report the number of fully duplicated rows, then the missing-value count
# for each column
duplicate_row_count = df.duplicated().sum()
missing_per_column = df.isna().sum()
print(duplicate_row_count)
print(missing_per_column)

6832
company_name       435
job_location       435
job_title          435
job_description    435
job_url            435
dtype: int64


In [5]:
# Remove duplicated rows, then rows containing any missing value.
# Rebinding `df` to the chained result (instead of mutating it with
# inplace=True) keeps the step a single readable expression.
df = df.drop_duplicates().dropna()

In [6]:
# Seed langdetect before the first detection: its algorithm is randomized, so
# without a fixed seed the same text can be labelled differently between runs.
DetectorFactory.seed = 0


# Helper used to detect the language of each text in the dataset
def detect_lang(text):
    """Return the language code that langdetect assigns to ``text``.

    Returns the string "unknown" when detection raises, so that a single
    undetectable value does not abort processing of the whole column.
    """
    try:
        return detect(text)
    except Exception:
        # Deliberately broad best-effort fallback, but narrower than a bare
        # ``except:`` so KeyboardInterrupt / SystemExit still propagate.
        return "unknown"

# Add a new column holding the detected language of each job description
df['lang'] = df['job_description'].map(detect_lang)

In [7]:
# Show how many rows were detected for each language
lang_counts = df['lang'].value_counts()
lang_counts

lang
en       3497
id        427
vi         12
zh-cn       8
de          7
es          2
nl          2
th          1
fr          1
ko          1
Name: count, dtype: int64

In [8]:
# Keep only the rows detected as English and renumber them from zero
english_rows = df.query('lang == "en"')
df_en_only = english_rows.reset_index(drop=True)
df_en_only

Unnamed: 0,company_name,job_location,job_title,job_description,job_url,lang
0,English 1,"Jakarta, Indonesia",Hiring Now! Entry-Level English Teacher (ESL) ...,About the job\nJoin English 1 (formerly EF Eng...,https://www.linkedin.com/jobs/search/?currentJ...,en
1,Path Finder Investment,Jakarta Metropolitan Area,Marketing Strategist,About the job\nMarketing Strategist\nThe Canop...,https://www.linkedin.com/jobs/search/?currentJ...,en
2,deVere Group,"Jakarta, Indonesia",Wealth Management Advisor - Jakarta,About the job\nJoin Our Team as a Financial Sa...,https://www.linkedin.com/jobs/search/?currentJ...,en
3,Media Minds,"Central Jakarta, Jakarta, Indonesia",Service Technician,About the job\nCompany Description\n AAA Tradi...,https://www.linkedin.com/jobs/search/?currentJ...,en
4,SLB,"North Cikarang, West Java, Indonesia",Sales Representative - Valve Services,About the job\nAbout Us\n\nWe are a global tec...,https://www.linkedin.com/jobs/search/?currentJ...,en
...,...,...,...,...,...,...
3492,PT Good Sale Tech,"Jakarta, Indonesia",Talent Acquisition & Employer Branding,About the job\nGet To Know Us!\nPT. Good Sale ...,https://www.linkedin.com/jobs/search/?currentJ...,en
3493,Triputra Edukasi Nusantara (TEN),"Setiabudi, Jakarta, Indonesia",Information Technology Specialist,About the job\nCompany Description\n Triputra ...,https://www.linkedin.com/jobs/search/?currentJ...,en
3494,Confidential Careers,"Makassar, South Sulawesi, Indonesia",Area Business Manager,About the job\nJob Description\nRepresent the ...,https://www.linkedin.com/jobs/search/?currentJ...,en
3495,BNI Ventures,Jakarta Metropolitan Area,Marketing & Corporate Communication,About the job\nCompany Description\nBNI Ventur...,https://www.linkedin.com/jobs/search/?currentJ...,en


In [9]:
# Write the English-only rows to CSV (the row index is not written)
df_en_only.to_csv(
    path_or_buf='data_clean.csv',
    index=False,
)

# SkillMatch, Data Validation Great Expectations

In [10]:
# File-backed Great Expectations data context rooted at ./gx_data_context
context = FileDataContext.create(
    project_root_dir='./gx_data_context',
)

# Connect to data source

In [11]:
# Give a name to a Datasource. This name must be unique between Datasources.
datasource_name = 'csv-data-job'
# add_or_update_pandas (rather than add_pandas) lets this cell be re-run:
# the file-backed context persists the datasource on disk, and add_pandas
# raises when a datasource with this name already exists.
datasource = context.sources.add_or_update_pandas(datasource_name)

# Give a name to a data asset and point it at the cleaned CSV
asset_name = 'job-november'
path_to_data = 'data_clean.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build the batch request used later to create the validator
batch_request = asset.build_batch_request()

# Create Expectation Suite

In [12]:
# Create (or overwrite) the expectation suite
expectation_suite_name = 'expectation-job-dataset'
context.add_or_update_expectation_suite(
    expectation_suite_name=expectation_suite_name,
)

# Create a validator that ties the batch request to the suite above
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)

# Preview the first rows the validator loaded
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,company_name,job_location,job_title,job_description,job_url,lang
0,English 1,"Jakarta, Indonesia",Hiring Now! Entry-Level English Teacher (ESL) ...,About the job\nJoin English 1 (formerly EF Eng...,https://www.linkedin.com/jobs/search/?currentJ...,en
1,Path Finder Investment,Jakarta Metropolitan Area,Marketing Strategist,About the job\nMarketing Strategist\nThe Canop...,https://www.linkedin.com/jobs/search/?currentJ...,en
2,deVere Group,"Jakarta, Indonesia",Wealth Management Advisor - Jakarta,About the job\nJoin Our Team as a Financial Sa...,https://www.linkedin.com/jobs/search/?currentJ...,en
3,Media Minds,"Central Jakarta, Jakarta, Indonesia",Service Technician,About the job\nCompany Description\n AAA Tradi...,https://www.linkedin.com/jobs/search/?currentJ...,en
4,SLB,"North Cikarang, West Java, Indonesia",Sales Representative - Valve Services,About the job\nAbout Us\n\nWe are a global tec...,https://www.linkedin.com/jobs/search/?currentJ...,en


## Expectations

In [13]:
# Expectation 1: every row must be unique across the combination of the five
# scraped columns (i.e. no duplicated job postings)

unique_key_columns = [
    'company_name',
    'job_location',
    'job_title',
    'job_description',
    'job_url',
]
validator.expect_compound_columns_to_be_unique(column_list=unique_key_columns)

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 3497,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [14]:
# Expectation 2: the dataset must contain a `job_description` column

validator.expect_column_to_exist(
    column='job_description',
)

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

{
  "success": true,
  "result": {},
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [15]:
# Expectation 3: columns must not contain missing values
# (checked one column per cell, starting here with `company_name`)

validator.expect_column_values_to_not_be_null(column='company_name')

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 3497,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [16]:
# `job_location` must not contain missing values
validator.expect_column_values_to_not_be_null(column='job_location')

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 3497,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [17]:
# `job_title` must not contain missing values
validator.expect_column_values_to_not_be_null(column='job_title')

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 3497,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [18]:
# `job_description` must not contain missing values
validator.expect_column_values_to_not_be_null(column='job_description')

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 3497,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [19]:
# `job_url` must not contain missing values.
# This cell previously repeated the `job_description` check, which left
# `job_url` as the only scraped column without a not-null expectation.
validator.expect_column_values_to_not_be_null(column='job_url')

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 3497,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### Save expectations suite

In [20]:
# Persist the validator's expectations into the suite, keeping any that failed
validator.save_expectation_suite(
    discard_failed_expectations=False,
)

## Checkpoint

In [21]:
# Create (or overwrite) a checkpoint bound to the validator

checkpoint_1 = context.add_or_update_checkpoint(name='checkpoint_1', validator=validator)

In [22]:
# Execute the checkpoint and display the validation result

checkpoint_result = checkpoint_1.run()
checkpoint_result

Calculating Metrics:   0%|          | 0/29 [00:00<?, ?it/s]

{
  "run_id": {
    "run_name": null,
    "run_time": "2025-11-07T01:15:39.243307+07:00"
  },
  "run_results": {
    "ValidationResultIdentifier::expectation-job-dataset/__none__/20251106T181539.243307Z/csv-data-job-job-november": {
      "validation_result": {
        "success": true,
        "results": [
          {
            "success": true,
            "expectation_config": {
              "expectation_type": "expect_compound_columns_to_be_unique",
              "kwargs": {
                "column_list": [
                  "company_name",
                  "job_location",
                  "job_title",
                  "job_description",
                  "job_url"
                ],
                "batch_id": "csv-data-job-job-november"
              },
              "meta": {}
            },
            "result": {
              "element_count": 3497,
              "unexpected_count": 0,
              "unexpected_percent": 0.0,
              "partial_unexpected_list": [],
  

## Data Docs
Selanjutnya, Data Docs dibuat agar hasil validasi dapat ditampilkan sebagai dashboard HTML. Berkasnya tersimpan di dalam folder `gx_data_context` yang dibuat saat inisialisasi data context, tepatnya di `gx_data_context/gx/uncommitted/data_docs/local_site/index.html`.

In [23]:
# Build the Data Docs site; the call returns a mapping of site name to the
# URL of its index page

data_docs_sites = context.build_data_docs()
data_docs_sites

{'local_site': 'file://d:\\Hactiv8\\final project\\Final project\\Data\\gx_data_context\\gx\\uncommitted/data_docs/local_site/index.html'}