#### ETL Pipeline: Extract and Transform
#### DSA 2040A - Lab 3 & Lab 4
#### Student Name: Yar Deng Kuot
#### Student ID: [Your Student ID]

In [None]:
import pandas as pd
import numpy as np

def generate_datasets():
    """Build the sample order datasets and persist them as raw CSV files.

    Two frames are created from hard-coded sample records: a "full" load of
    ten rows (including a repeated order, missing values and mixed date
    formats) and an "incremental" load of three rows. Each frame is written
    to the working directory ('raw_full.csv' / 'raw_incremental.csv')
    without the index.

    Returns:
        tuple: ``(full_data, incremental_data)`` as pandas DataFrames.
    """
    # Column-oriented sample records for the full load.
    full_columns = {
        'order_id': [1001, 1002, 1003, 1004, 1005, 1006, 1003, 1007, 1008, 1009],
        'customer_id': [
            'C001', 'C002', 'C003', 'C004', 'C005',
            'C006', 'C003', 'C007', 'C008', 'C009',
        ],
        'order_date': [
            '2023-10-01', '10/15/2023', '2023-11-01', None, '2023-11-05',
            '2023-11-10', '2023-11-01', '2023/12/01', '2024-01-01', '2024-01-02',
        ],
        'quantity': [2, 5, 3, 0, np.nan, 4, 3, 2, 1, 6],
        'unit_price': [20.0, 15.5, 25.0, 10.0, 30.0, np.nan, 25.0, 50.0, 22.5, 18.0],
        'product_category': [
            'Electronics', 'Clothing', 'Electronics', 'Books', None,
            'Furniture', 'Electronics', 'Books', 'Clothing', 'Electronics',
        ],
    }

    # Column-oriented sample records for the incremental load.
    incremental_columns = {
        'order_id': [1010, 1011, 1012],
        'customer_id': ['C010', 'C011', 'C012'],
        'order_date': ['2024-01-03', '2024/01/04', '2024-01-05'],
        'quantity': [3, 2, np.nan],
        'unit_price': [40.0, np.nan, 15.0],
        'product_category': ['Furniture', 'Electronics', 'Books'],
    }

    full_data = pd.DataFrame(full_columns)
    incremental_data = pd.DataFrame(incremental_columns)

    # Persist the untouched raw extracts next to the script.
    for frame, target in (
        (full_data, 'raw_full.csv'),
        (incremental_data, 'raw_incremental.csv'),
    ):
        frame.to_csv(target, index=False)

    return full_data, incremental_data

def transform_data(df):
    """Clean, enrich, and normalize an orders DataFrame.

    Steps, in order:
      1. Drop rows missing 'order_date', 'quantity' or 'product_category'.
      2. Keep only the first row for each 'order_id'.
      3. Fill missing 'unit_price' values with the column mean.
      4. Add 'total_price' = quantity * unit_price.
      5. Rewrite 'order_date' as 'YYYY-MM-DD' strings and cast 'quantity'
         to int and 'unit_price' to float.

    Args:
        df: DataFrame with at least the columns 'order_id', 'order_date',
            'quantity', 'unit_price' and 'product_category'. It is not
            modified.

    Returns:
        A new DataFrame with the transformations applied. Dates that cannot
        be parsed end up as missing values in 'order_date'.

    Raises:
        KeyError: If one of the required columns is absent.
    """
    # 1. Cleaning: drop incomplete rows and repeated orders. The explicit
    #    copy gives an independent frame, so the column assignments below
    #    neither touch the caller's data nor raise SettingWithCopyWarning.
    cleaned = (
        df.dropna(subset=['order_date', 'quantity', 'product_category'])
        .drop_duplicates(subset=['order_id'])
        .copy()
    )

    # Impute only after de-duplication so a repeated order is not counted
    # twice when computing the mean price.
    mean_price = cleaned['unit_price'].mean()
    cleaned['unit_price'] = cleaned['unit_price'].fillna(mean_price)

    # 2. Enrichment: add total_price column.
    cleaned['total_price'] = cleaned['quantity'] * cleaned['unit_price']

    # 3. Structural: standardize dates and convert data types.
    #    The column mixes formats ('2023-10-01', '10/15/2023', '2023/12/01').
    #    A single vectorized to_datetime call may lock onto the first value's
    #    format and coerce the others to NaT, so each value is parsed on its
    #    own; the outer call just turns the results into a datetime Series.
    parsed_each = cleaned['order_date'].map(
        lambda value: pd.to_datetime(value, errors='coerce')
    )
    cleaned['order_date'] = (
        pd.to_datetime(parsed_each, errors='coerce').dt.strftime('%Y-%m-%d')
    )
    cleaned['quantity'] = cleaned['quantity'].astype(int)
    cleaned['unit_price'] = cleaned['unit_price'].astype(float)

    return cleaned

def main():
    """Drive the ETL run: extract both datasets, transform them, save results.

    Prints the raw frames, then for each dataset prints the transformed
    frame and writes it to 'transformed_full.csv' or
    'transformed_incremental.csv' (index omitted).
    """
    print("Starting ETL Pipeline...")

    # Extract step: build (and save) the raw inputs.
    raw_full, raw_incremental = generate_datasets()
    for heading, frame in (
        ("\nFull Dataset:", raw_full),
        ("\nIncremental Dataset:", raw_incremental),
    ):
        print(heading)
        print(frame)

    # Transform step: full load first, then the incremental load.
    jobs = (
        ("\nTransformed Full Dataset:", raw_full, 'transformed_full.csv'),
        ("\nTransformed Incremental Dataset:", raw_incremental, 'transformed_incremental.csv'),
    )
    for heading, frame, destination in jobs:
        result = transform_data(frame)
        print(heading)
        print(result)
        result.to_csv(destination, index=False)

    print("\nETL Pipeline completed. Outputs saved as 'transformed_full.csv' and 'transformed_incremental.csv'.")

# Run the ETL pipeline only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


#### Missing Values Analysis

In [None]:
# Check Missing Values for ETL Pipeline

import pandas as pd
import numpy as np

def generate_datasets():
    """Build the sample order datasets used for the missing-values check.

    Both frames are created in memory from hard-coded sample records; nothing
    is written to disk.

    Returns:
        tuple: ``(full_data, incremental_data)`` as pandas DataFrames with
        ten and three rows respectively.
    """
    # Column-oriented sample records for the full load.
    full_columns = {
        'order_id': [1001, 1002, 1003, 1004, 1005, 1006, 1003, 1007, 1008, 1009],
        'customer_id': [
            'C001', 'C002', 'C003', 'C004', 'C005',
            'C006', 'C003', 'C007', 'C008', 'C009',
        ],
        'order_date': [
            '2023-10-01', '10/15/2023', '2023-11-01', None, '2023-11-05',
            '2023-11-10', '2023-11-01', '2023/12/01', '2024-01-01', '2024-01-02',
        ],
        'quantity': [2, 5, 3, 0, np.nan, 4, 3, 2, 1, 6],
        'unit_price': [20.0, 15.5, 25.0, 10.0, 30.0, np.nan, 25.0, 50.0, 22.5, 18.0],
        'product_category': [
            'Electronics', 'Clothing', 'Electronics', 'Books', None,
            'Furniture', 'Electronics', 'Books', 'Clothing', 'Electronics',
        ],
    }

    # Column-oriented sample records for the incremental load.
    incremental_columns = {
        'order_id': [1010, 1011, 1012],
        'customer_id': ['C010', 'C011', 'C012'],
        'order_date': ['2024-01-03', '2024/01/04', '2024-01-05'],
        'quantity': [3, 2, np.nan],
        'unit_price': [40.0, np.nan, 15.0],
        'product_category': ['Furniture', 'Electronics', 'Books'],
    }

    return pd.DataFrame(full_columns), pd.DataFrame(incremental_columns)

def check_missing_values(df, dataset_name):
    """Report missing values in ``df`` on stdout and in a text file.

    Prints per-column missing counts, per-column missing percentages, the
    rows containing at least one missing value, and the overall total. The
    counts, percentages and total are also written to
    ``missing_values_<dataset_name lower-cased>.txt`` in the working
    directory.

    Args:
        df: DataFrame to inspect.
        dataset_name: Label used in the printed headings and the file name.
    """
    # Compute the NaN mask once and derive every statistic from it.
    nan_mask = df.isna()
    per_column = nan_mask.sum()
    per_column_pct = (per_column / len(df)) * 100
    incomplete_rows = df[nan_mask.any(axis=1)]
    grand_total = per_column.sum()

    print(f"\nChecking Missing Values in {dataset_name} Dataset:")

    print("\nMissing Values per Column:")
    print(per_column)

    print("\nPercentage of Missing Values per Column:")
    print(per_column_pct)

    print("\nRows with Missing Values:")
    print(incomplete_rows)

    print(f"\nTotal Missing Values in {dataset_name} Dataset: {grand_total}")

    # Assemble the whole report first, then write it in a single call.
    report_parts = [
        f"Missing Values Summary for {dataset_name} Dataset\n\n",
        "Missing Values per Column:\n",
        per_column.to_string(),
        "\n\nPercentage of Missing Values per Column:\n",
        per_column_pct.to_string(),
        "\n\nTotal Missing Values: {}\n".format(grand_total),
    ]
    with open(f'missing_values_{dataset_name.lower()}.txt', 'w') as report_file:
        report_file.write("".join(report_parts))

def main():
    """Run the missing-values report for the full and incremental datasets."""
    print("Starting Missing Values Analysis...")

    # generate_datasets() returns (full, incremental); pair each with its label.
    frames = generate_datasets()
    for frame, label in zip(frames, ("Full", "Incremental")):
        check_missing_values(frame, label)

    print("\nMissing values analysis completed. Summaries saved as 'missing_values_full.txt' and 'missing_values_incremental.txt'.")

# Run the missing-values analysis only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()