# 01 - Data Cleaning

**Team3 - Singapore Jobs Analytics**

This notebook handles loading and cleaning the raw SGJobData dataset (~1.05M rows).

**Objectives:**
- Load the raw CSV dataset and verify its structure
- Identify and remove empty/invalid rows (NULL IDs, placeholder salaries)
- Analyze missing values and data quality issues
- Export the cleaned dataset for downstream analysis

**Tools:** DuckDB (in-memory OLAP), Pandas, Matplotlib, Seaborn

---
## 1. Setup & Data Loading

In [9]:
import duckdb
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os

# Notebook-wide settings: silence library warnings and give every figure
# the same seaborn theme, default size, and resolution.
warnings.filterwarnings('ignore')
sns.set_theme(style='whitegrid', palette='muted')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'figure.dpi': 100,
})

# Single in-memory DuckDB connection shared by every cell below
con = duckdb.connect(':memory:')
print('DuckDB version:', duckdb.__version__)

DuckDB version: 1.1.3


In [10]:
# Load the raw CSV into a staging table exactly as delivered (no filters).
# sample_size=-1 asks DuckDB to use the whole file, not a sample, when
# inferring column types.
# CREATE OR REPLACE keeps this cell idempotent: re-running it rebuilds the
# table instead of failing because jobs_staging already exists.
con.execute("""
    CREATE OR REPLACE TABLE jobs_staging AS
    SELECT * FROM read_csv_auto('../data/raw/SGJobData.csv', header=true, sample_size=-1)
""")

# Row count of the untouched data; later cells use it as the denominator
# for every "% of total" figure.
raw_count = con.execute('SELECT COUNT(*) FROM jobs_staging').fetchone()[0]
print(f'Loaded {raw_count:,} rows into jobs_staging (raw, unfiltered)')

Loaded 1,048,585 rows into jobs_staging (raw, unfiltered)


In [11]:
# Report the raw table's shape: the row count from the load cell plus the
# number of columns the catalog lists for jobs_staging.
column_count_sql = "SELECT COUNT(*) FROM information_schema.columns WHERE table_name = 'jobs_staging'"
(col_count,) = con.execute(column_count_sql).fetchone()
print(f'Raw dataset: {raw_count:,} rows x {col_count} columns')

Raw dataset: 1,048,585 rows x 22 columns


---
## 2. Identify & Remove Empty/Invalid Rows

Before analysis, we inspect the raw data for empty, placeholder, or invalid rows and clean them.
This step ensures downstream analysis is not skewed by garbage data.

In [5]:
# Step 1: Count how many raw rows match each "empty / invalid" pattern
print('=== Empty / Invalid Row Analysis ===\n')

# (SQL predicate, human-readable label) pairs evaluated against jobs_staging
checks = [
    ("metadata_jobPostId IS NULL", "NULL job ID (no identifier)"),
    ("title IS NULL OR TRIM(title) = ''", "Empty title"),
    ("postedCompany_name IS NULL OR TRIM(postedCompany_name) = ''", "Empty company name"),
    ("salary_minimum = 0 AND salary_maximum = 0", "Both salaries = 0"),
    ("average_salary = 0", "Zero average salary"),
    ("salary_minimum = 1 AND salary_maximum = 1 AND average_salary = 1", "Placeholder salary (min=1, max=1, avg=1)"),
    ("numberOfVacancies = 0", "Zero vacancies"),
    ("categories IS NULL OR TRIM(categories) = '' OR categories = '[]'", "Empty categories"),
]


def count_staging_rows(predicate):
    """Return the number of jobs_staging rows for which `predicate` holds."""
    query = f"SELECT COUNT(*) FROM jobs_staging WHERE {predicate}"
    return con.execute(query).fetchone()[0]


# One summary record per check, printed as it is computed
results = []
for predicate, issue_label in checks:
    matched = count_staging_rows(predicate)
    share = matched / raw_count * 100
    results.append({'Issue': issue_label, 'Row Count': matched, '% of Total': round(share, 2)})
    print(f'  {issue_label:45s} {matched:>10,} rows ({share:.2f}%)')

print(f'\n  {"Total raw rows":45s} {raw_count:>10,}')

# Tabular form of the same results; reused by the chart in the next cell
issues_df = pd.DataFrame(results)
issues_df

=== Empty / Invalid Row Analysis ===

  NULL job ID (no identifier)                        3,988 rows (0.38%)
  Empty title                                        3,988 rows (0.38%)
  Empty company name                                 3,988 rows (0.38%)
  Both salaries = 0                                  3,988 rows (0.38%)
  Zero average salary                                3,988 rows (0.38%)
  Placeholder salary (min=1, max=1, avg=1)           1,804 rows (0.17%)
  Zero vacancies                                     3,988 rows (0.38%)
  Empty categories                                   3,988 rows (0.38%)

  Total raw rows                                 1,048,585


Unnamed: 0,Issue,Row Count,% of Total
0,NULL job ID (no identifier),3988,0.38
1,Empty title,3988,0.38
2,Empty company name,3988,0.38
3,Both salaries = 0,3988,0.38
4,Zero average salary,3988,0.38
5,"Placeholder salary (min=1, max=1, avg=1)",1804,0.17
6,Zero vacancies,3988,0.38
7,Empty categories,3988,0.38


In [None]:
# Horizontal bar chart of the issue counts collected in issues_df
fig, ax = plt.subplots(figsize=(12, 5))
plot_issues = issues_df[issues_df['Row Count'] > 0].sort_values('Row Count')


def severity_color(row_total):
    """Red above 1% of raw rows, amber above 100 rows, green otherwise."""
    if row_total > raw_count * 0.01:
        return '#EF4444'
    if row_total > 100:
        return '#F59E0B'
    return '#10B981'


colors = [severity_color(row_total) for row_total in plot_issues['Row Count']]
bars = ax.barh(plot_issues['Issue'], plot_issues['Row Count'], color=colors)

# Annotate each bar with its count and share, nudged just past the bar end
label_offset = raw_count * 0.003
for rect, share in zip(bars, plot_issues['% of Total']):
    ax.text(rect.get_width() + label_offset, rect.get_y() + rect.get_height()/2,
            f'{int(rect.get_width()):,} ({share}%)', va='center', fontsize=9)

ax.set_xlabel('Number of Rows')
ax.set_title('Data Quality Issues Found in Raw Data')
plt.tight_layout()
plt.show()

In [None]:
# Step 2: Inspect the empty rows — are they all the same set?
print('=== Do the empty rows overlap? ===\n')

# Rows that are empty on every criterion at once. The query itself requires
# a NULL job ID, so these rows are always a subset of the NULL-ID rows.
overlap = con.execute("""
    SELECT COUNT(*) as cnt FROM jobs_staging
    WHERE metadata_jobPostId IS NULL
      AND (title IS NULL OR TRIM(title) = '')
      AND salary_minimum = 0 AND salary_maximum = 0
      AND (postedCompany_name IS NULL OR TRIM(postedCompany_name) = '')
      AND numberOfVacancies = 0
""").fetchone()[0]
null_id_count = con.execute(
    "SELECT COUNT(*) FROM jobs_staging WHERE metadata_jobPostId IS NULL"
).fetchone()[0]
print(f'Rows matching ALL empty criteria simultaneously: {overlap:,}')
print(f'Rows with NULL job ID:                           {null_id_count:,}')

# Only state that the two sets coincide when the counts actually agree;
# previously the conclusion was printed regardless of the numbers.
if overlap == null_id_count:
    print(f'=> The {overlap:,} completely empty rows are the SAME set as NULL job ID rows.\n')
else:
    print(f'=> WARNING: {null_id_count - overlap:,} NULL job ID rows are NOT empty on every '
          'criterion; review them before dropping rows by NULL job ID.\n')

# Preview sample empty rows
print('=== Sample of completely empty rows ===')
sample_empty = con.execute("""
    SELECT title, postedCompany_name, salary_minimum, salary_maximum,
           average_salary, numberOfVacancies, categories, metadata_jobPostId
    FROM jobs_staging
    WHERE metadata_jobPostId IS NOT NULL IS FALSE
    LIMIT 5
""".replace('IS NOT NULL IS FALSE', 'IS NULL')).fetchdf()
display(sample_empty)

# Placeholder salary rows (min=1, max=1, avg=1) have a job ID, so they
# survive the NULL-ID filter. The predicate is shared by the sample and the
# count so the two can never drift apart.
placeholder_where = (
    "metadata_jobPostId IS NOT NULL "
    "AND salary_minimum = 1 AND salary_maximum = 1 AND average_salary = 1"
)

print('\n=== Sample of placeholder salary rows (min=1, max=1, avg=1) ===')
sample_placeholder = con.execute(f"""
    SELECT title, postedCompany_name, salary_minimum, salary_maximum,
           average_salary, positionLevels, categories
    FROM jobs_staging
    WHERE {placeholder_where}
    LIMIT 5
""").fetchdf()
display(sample_placeholder)

placeholder_count = con.execute(
    f"SELECT COUNT(*) FROM jobs_staging WHERE {placeholder_where}"
).fetchone()[0]
print(f'\nTotal placeholder salary rows: {placeholder_count:,} ({placeholder_count/raw_count*100:.2f}%)')
print('These are real jobs where salary was not disclosed (SGD $1 is a placeholder).')

In [None]:
# Step 3: Clean the data — remove empty rows and placeholder salary rows
print('=== Data Cleaning ===\n')

# Keep a row only if it has a job ID and is not a 1/1/1 salary placeholder.
# - COALESCE(..., FALSE) makes the placeholder test NULL-safe: a bare
#   NOT (...) can evaluate to NULL when a salary column is NULL (e.g.
#   NULL/1/1), and WHERE would then drop the row silently without it being
#   counted under either removal reason below.
# - CREATE OR REPLACE lets this cell be re-run without a
#   "table already exists" error.
con.execute("""
    CREATE OR REPLACE TABLE jobs_raw AS
    SELECT * FROM jobs_staging
    WHERE metadata_jobPostId IS NOT NULL                  -- Remove completely empty rows
      AND NOT COALESCE(salary_minimum = 1
                       AND salary_maximum = 1
                       AND average_salary = 1, FALSE)     -- Remove placeholder salary rows
""")

cleaned_count = con.execute('SELECT COUNT(*) FROM jobs_raw').fetchone()[0]
total_removed = raw_count - cleaned_count
# Legacy name: this has always held the TOTAL removed (both reasons), not
# just the empty rows. Kept as an alias in case other cells reference it.
empty_removed = total_removed

# Per-reason breakdown, counted on the staging table
null_id_removed = con.execute(
    "SELECT COUNT(*) FROM jobs_staging WHERE metadata_jobPostId IS NULL"
).fetchone()[0]
placeholder_removed = con.execute("""
    SELECT COUNT(*) FROM jobs_staging
    WHERE metadata_jobPostId IS NOT NULL
      AND salary_minimum = 1 AND salary_maximum = 1 AND average_salary = 1
""").fetchone()[0]

# The two reasons are mutually exclusive (NULL ID vs. non-NULL ID), so they
# must account for every removed row.
assert null_id_removed + placeholder_removed == total_removed, (
    f'Removal breakdown ({null_id_removed:,} + {placeholder_removed:,}) '
    f'does not match total removed ({total_removed:,})'
)

print(f'Cleaning steps applied:')
print(f'  1. Removed rows with NULL job ID (completely empty): {null_id_removed:,} rows')
print(f'  2. Removed placeholder salary rows (min=1, max=1, avg=1): {placeholder_removed:,} rows')
print(f'  ---')
print(f'  Total rows removed: {total_removed:,} ({total_removed/raw_count*100:.2f}%)')
print(f'  Remaining rows:     {cleaned_count:,}')

# Denominator used by the missing-value analysis cells
row_count = cleaned_count

In [None]:
# Step 4: Verify cleaning — issue counts in jobs_staging vs jobs_raw
print('=== Before vs After Cleaning ===\n')

# (SQL predicate, label) pairs evaluated against both tables
verify_checks = [
    ("metadata_jobPostId IS NULL", "NULL job ID"),
    ("title IS NULL OR TRIM(title) = ''", "Empty title"),
    ("postedCompany_name IS NULL OR TRIM(postedCompany_name) = ''", "Empty company"),
    ("salary_minimum = 0 AND salary_maximum = 0", "Both salaries = 0"),
    ("salary_minimum = 1 AND salary_maximum = 1 AND average_salary = 1", "Placeholder salary (1/1/1)"),
    ("average_salary = 0", "Zero average salary"),
    ("numberOfVacancies = 0", "Zero vacancies"),
]


def rows_matching(table, predicate):
    """Count the rows of `table` that satisfy the SQL `predicate`."""
    return con.execute(f"SELECT COUNT(*) FROM {table} WHERE {predicate}").fetchone()[0]


rule = '=' * 78
print(f'{"Check":<40s} {"Before":>12s} {"After":>12s} {"Removed":>12s}')
print(rule)
for predicate, check_label in verify_checks:
    before = rows_matching('jobs_staging', predicate)
    after = rows_matching('jobs_raw', predicate)
    print(f'{check_label:<40s} {before:>12,} {after:>12,} {before - after:>12,}')

print(rule)
print(f'{"TOTAL ROWS":<40s} {raw_count:>12,} {cleaned_count:>12,} {raw_count - cleaned_count:>12,}')

# Two-panel figure: overall kept/removed split, then removals by reason
fig, axes = plt.subplots(1, 2, figsize=(14, 4))
kept_ax, reasons_ax = axes

# Left panel — pie chart: kept vs removed
removed = raw_count - cleaned_count
kept_ax.pie(
    [cleaned_count, removed],
    labels=[f'Kept\n{cleaned_count:,}', f'Removed\n{removed:,}'],
    colors=['#10B981', '#EF4444'],
    autopct='%1.2f%%',
    startangle=90,
    textprops={'fontsize': 11},
)
kept_ax.set_title('Data Cleaning: Rows Kept vs Removed')

# Right panel — bar per removal reason, using the counts from the cleaning cell
removal_data = pd.DataFrame({
    'Reason': ['Empty rows\n(NULL job ID)', 'Placeholder salary\n(min=max=avg=1)'],
    'Count': [null_id_removed, placeholder_removed]
})
bars = reasons_ax.bar(removal_data['Reason'], removal_data['Count'], color=['#EF4444', '#F59E0B'])
for rect in bars:
    bar_top = rect.get_height()
    reasons_ax.text(rect.get_x() + rect.get_width()/2., bar_top,
                    f'{int(bar_top):,}', ha='center', va='bottom', fontsize=11, fontweight='bold')
reasons_ax.set_ylabel('Rows Removed')
reasons_ax.set_title('Breakdown of Removed Rows')

plt.tight_layout()
plt.show()

### Data Cleaning Summary

**Raw data:** 1,048,585 rows loaded from `SGJobData.csv`

**Rows removed:**
1. **3,988 completely empty rows** — All fields are NULL/zero: no job ID, no title, no company, no salary. These are garbage rows with no usable information.
2. **1,804 placeholder salary rows** — `salary_minimum = 1`, `salary_maximum = 1`, `average_salary = 1`. These are real job postings where the employer chose not to disclose salary. The SGD $1 value is a system placeholder that would distort salary analysis.

**Rows kept (not removed):**
- **Zero views / zero applications** — Valid for newly posted or recently closed jobs. Removing them would bias engagement analysis.
- **Zero experience required** — Valid for entry-level positions (10.96% of data).
- **Salary min=1, max > 1** — Valid jobs with non-placeholder salaries (e.g., internships with $1 min and $1,000 max).

**Cleaned dataset:** 1,042,793 rows (99.45% retained) → stored in `jobs_raw` table for all downstream analysis.

---
## 3. Missing Value Analysis

In [None]:
# Per-column count of "missing" cells in jobs_raw, where missing means
# NULL, empty string, or the empty JSON array literal '[]'
schema = con.execute('DESCRIBE jobs_raw').fetchdf()
columns = schema['column_name'].tolist()

# One SUM(CASE ...) aggregate per column, aliased to the column's own name
missing_expr = (
    'SUM(CASE WHEN "{name}" IS NULL'
    ' OR CAST("{name}" AS VARCHAR) = \'\''
    ' OR CAST("{name}" AS VARCHAR) = \'[]\''
    ' THEN 1 ELSE 0 END) as "{name}"'
)
null_queries = [missing_expr.format(name=column) for column in columns]
null_sql = "SELECT " + ", ".join(null_queries) + " FROM jobs_raw"
null_counts = con.execute(null_sql).fetchdf()

# Reshape the single wide result row into one row per column, then add the
# percentage of cleaned rows (row_count) and sort ascending by it
null_df = null_counts.T.reset_index()
null_df.columns = ['column', 'missing_count']
null_df = (
    null_df
    .assign(missing_pct=lambda frame: (frame['missing_count'] / row_count * 100).round(2))
    .sort_values('missing_pct', ascending=True)
)

print('Missing value check (NULL + empty string + empty JSON array):')
print(f'Total rows: {row_count:,}\n')
null_df

In [None]:
# Horizontal bar chart of missing percentages
fig, ax = plt.subplots(figsize=(10, 8))


def missing_color(pct_missing):
    """Red above 50% missing, amber above 10%, green otherwise."""
    if pct_missing > 50:
        return '#EF4444'
    if pct_missing > 10:
        return '#F59E0B'
    return '#10B981'


colors = [missing_color(pct_missing) for pct_missing in null_df['missing_pct']]
ax.barh(null_df['column'], null_df['missing_pct'], color=colors)
ax.set_xlabel('% Missing (NULL + empty)')
ax.set_title('Missing / Empty Values by Column')
ax.axvline(x=50, color='red', linestyle='--', alpha=0.5, label='50% threshold')
ax.legend()

# Label each bar with its percentage; bars are drawn in null_df row order
for bar_position, pct_missing in enumerate(null_df['missing_pct']):
    ax.text(pct_missing + 0.5, bar_position, f'{pct_missing}%', va='center', fontsize=8)

plt.tight_layout()
plt.show()


def value_distribution(field):
    """Return count and percentage share of each distinct `field` value in jobs_raw."""
    return con.execute(f"""
        SELECT "{field}", COUNT(*) as cnt,
               ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as pct
        FROM jobs_raw
        GROUP BY "{field}"
        ORDER BY cnt DESC
    """).fetchdf()


# Value distribution for key text fields
print('\n--- Unique value counts for key fields ---')
key_fields = ['positionLevels', 'employmentTypes', 'salary_type', 'status_jobStatus']
for field in key_fields:
    dist = value_distribution(field)
    print(f'\n{field} ({len(dist)} unique values):')
    print(dist.to_string(index=False))

### Missing Value Findings

- **100% missing:** `occupationId` is entirely NULL across all 1M+ rows — this column is unusable and excluded from analysis.
- **0% missing:** All other 21 columns are fully populated with no NULLs, empty strings, or empty JSON arrays.
- **`positionLevels`** and **`employmentTypes`** are stored as plain strings (e.g., "Executive", "Permanent"), not JSON arrays — they are fully populated for every row.
- **Salary fields:** `salary_minimum`, `salary_maximum`, and `average_salary` have no nulls.
- **Core fields** (`metadata_jobPostId`, `title`, `postedCompany_name`) have zero nulls, ensuring strong coverage for primary analysis.
- **Engagement metrics** (`views`, `applications`) are fully populated, enabling reliable engagement analysis.

---
## 4. Data Quality Checks

In [None]:
# Job IDs that occur on more than one row of jobs_raw, with their row counts
duplicate_id_sql = """
    SELECT metadata_jobPostId, COUNT(*) as cnt
    FROM jobs_raw
    GROUP BY metadata_jobPostId
    HAVING COUNT(*) > 1
"""
duplicates = con.execute(duplicate_id_sql).fetchdf()

n_duplicate_ids = len(duplicates)
print(f'Duplicate job IDs: {n_duplicate_ids:,}')
if n_duplicate_ids > 0:
    # Rows beyond the first occurrence of each duplicated ID
    surplus_rows = duplicates["cnt"].sum() - n_duplicate_ids
    print(f'Total duplicate rows: {surplus_rows:,}')
    print(duplicates.head(10))

In [None]:
# Rows whose maximum salary is below the minimum, as a count and as a share
# of rows that have both salary bounds present
invalid_salary_sql = """
    SELECT COUNT(*) as invalid_count,
           ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM jobs_raw WHERE salary_minimum IS NOT NULL AND salary_maximum IS NOT NULL), 2) as pct
    FROM jobs_raw
    WHERE salary_maximum < salary_minimum
      AND salary_minimum IS NOT NULL
      AND salary_maximum IS NOT NULL
"""
invalid_salary = con.execute(invalid_salary_sql).fetchdf()
print('Invalid salary ranges (max < min):', invalid_salary, sep='\n')

In [None]:
# Count rows where each list-like text field is NULL, '', or '[]'
json_quality_sql = """
    SELECT
        SUM(CASE WHEN categories IS NULL OR categories = '' OR categories = '[]' THEN 1 ELSE 0 END) as empty_categories,
        SUM(CASE WHEN positionLevels IS NULL OR positionLevels = '' OR positionLevels = '[]' THEN 1 ELSE 0 END) as empty_position_levels,
        SUM(CASE WHEN employmentTypes IS NULL OR employmentTypes = '' OR employmentTypes = '[]' THEN 1 ELSE 0 END) as empty_employment_types,
        COUNT(*) as total
    FROM jobs_raw
"""
json_quality = con.execute(json_quality_sql).fetchdf()

# Convert each raw count into a percentage of all rows, rounded to 2 decimals
empty_field_cols = ['empty_categories', 'empty_position_levels', 'empty_employment_types']
total = json_quality['total'].iloc[0]
json_quality_pct = json_quality.assign(**{
    name: (json_quality[name] / total * 100).round(2)
    for name in empty_field_cols
})
print('Empty JSON fields (% of total rows):')
json_quality_pct[empty_field_cols]

In [None]:
# Count and percentage share of each salary_type value, most frequent first
salary_type_sql = """
    SELECT salary_type, COUNT(*) as cnt,
           ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as pct
    FROM jobs_raw
    GROUP BY salary_type
    ORDER BY cnt DESC
"""
salary_types = con.execute(salary_type_sql).fetchdf()
print('Salary type distribution:')
salary_types

---
## 5. Export Cleaned Data

Export the cleaned dataset to `data/processed/` for use by downstream notebooks (EDA, Feature Engineering).

In [None]:
# Write the cleaned jobs_raw table to the processed-data folder as Parquet
export_dir = '../data/processed'
export_path = f'{export_dir}/jobs_cleaned.parquet'
os.makedirs(export_dir, exist_ok=True)

con.execute(f"""
    COPY jobs_raw TO '{export_path}' (FORMAT PARQUET)
""")

# Read the file back and compare its row count with the cleaned table
(exported_count,) = con.execute(f"SELECT COUNT(*) FROM read_parquet('{export_path}')").fetchone()
print(f'Exported {exported_count:,} rows to {export_path}')
print(f'Matches cleaned count: {exported_count == cleaned_count}')

In [None]:
# Close the DuckDB connection opened in the setup cell
con.close()
completion_message = 'Data cleaning complete. Cleaned data exported to data/processed/jobs_cleaned.parquet'
print(completion_message)