# SharePoint Export Pipeline

ETL pipeline that:
1. Extracts data from SharePoint CSV and Salesforce Excel exports
2. Transforms data with cleaning, product explosion, and metrics calculation
3. Loads data into Snowflake: the raw SharePoint table via incremental MERGE or full refresh, and the Salesforce and transformed tables via full refresh

---

## 1. Configuration & Imports

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import snowflake.connector
from snowflake.connector.pandas_tools import write_pandas
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.backends import default_backend
import os
from datetime import datetime
import logging
from typing import List, Dict, Tuple

# Import product mappings from separate file
from product_mappings import PRODUCT_CONFIGS

# Root logging setup: INFO and above, each record stamped with time, logger
# name and level. The module-level `logger` is shared by every cell below.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

print("✓ Imports complete")

In [None]:
# ============================================================================
# SNOWFLAKE CONFIGURATION
# ============================================================================

# Keyword arguments for snowflake.connector.connect(). User name and password
# are read from the SF_USERNAME / SF_PW environment variables (None if unset).
SNOWFLAKE_CONFIG = dict(
    account="uhgdwaas.east-us-2.azure",
    user=os.getenv('SF_USERNAME'),
    password=os.getenv('SF_PW'),
    role="AZU_SDRP_CSZNB_PRD_DEVELOPER_ROLE",
    warehouse="CSZNB_PRD_ANALYTICS_XS_WH",
    database='CSZNB_PRD_OA_DEV_DB',
    schema='BASE',
)

# ============================================================================
# TABLE NAMES
# ============================================================================

# SOURCE_TABLE receives the cleaned SharePoint rows, TARGET_TABLE the
# product-level (exploded) rows, SALESFORCE_TABLE the Salesforce export.
SOURCE_TABLE = 'SHAREPOINT_ANALYTIC_REQUESTS'
TARGET_TABLE = 'FOCUSED_ANALYTIC_REQUESTS'
SALESFORCE_TABLE = 'SALESFORCE_INITIATIVES'

# ============================================================================
# FILE PATHS
# ============================================================================

# Both exports are resolved relative to the current user's home directory.
SHAREPOINT_EXPORT_PATH = Path.home() / "Library/CloudStorage/OneDrive-UHG/Projects/SharePoint/exports/sharepoint_requests.csv"
SALESFORCE_EXPORT_PATH = Path.home() / "Library/CloudStorage/OneDrive-UHG/Projects/SharePoint/exports/salesforce_exports.xlsx"

# ============================================================================
# BUSINESS LOGIC CONSTANTS
# ============================================================================

# STATUS values that mark a product as still open (drives PRODUCT_OPEN).
OPEN_STATUS = ['Not Started', 'In Progress', 'Waiting']
# An open product whose DAYS_ON_STATUS exceeds this is flagged NEEDS_ATTENTION.
DAYS_ON_STATUS_THRESHOLD = 14

# Client type mapping: code (as a string, because clean_data casts the column
# to str before mapping) -> display label. Codes '7' and '8' both map to 'N/A'.
CLIENT_TYPE_MAPPING = {
    '1': 'Optum Direct NBEA',
    '2': 'Optum/UHC Cross Carrier NBEA',
    '3': 'UHC NBEA',
    '4': 'Optum Direct',  # was misspelled 'Opum Direct'
    '5': 'UHC Cross Carrier',
    '6': 'Prospective',
    '7': 'N/A',
    '8': 'N/A'
}

# Boolean columns
# Per-product flag columns of the SharePoint export. clean_data fills their
# nulls with False and, for rows without PRODUCTS_REQUESTED, joins the names of
# the set flags in this list order -- so the order here is visible in output.
BOOLEAN_COLUMNS = [
    "BARIATRIC", "BH", "CGP", "CSP", "DM", "KIDNEY", "TRANSPLANT", "CHD", "VAD",
    "NICU", "MATERNITY", "FERTILITY", "FOCUSED_ANALYTICS", "OUTPATIENT_REHAB",
    "OHS", "FCR_PROFESSIONAL", "CKS", "CKD", "CARDIOLOGY", "DME", "INPATIENT_REHAB",
    "SPINE_PAIN_JOINT", "SPECIALTY_REDIRECTION", "MEDICAL_REBATES_ONBOARDING",
    "BRS", "DATA_INTAKE", "DATA_QAVC", "SPECIALTY_FUSION", "MBO_IMPLEMENTATION",
    "MSPN_IMPLEMENTATION", "VARIABLE_COPAY", "ACCUMULATOR_ADJUSTMENT",
    "SMGP", "SGP", "SECOND_MD", "KAIA", "MBO_PRESALES", "MSPN_PRESALES",
    "MEDICAL_REBATES_PREDEAL", "MAVEN", "CAR_REPORT", "MSK_MSS",
    "FCR_FACILITY", "RADIATION_ONCOLOGY", "VIRTA_HEALTH", "SMO_PRESALES",
    "SMO_IMPLEMENTATION", "SBO_HEALTH_TRUST_PRESALES", "SBO_HEALTH_TRUST_IMPLEMENTATION",
    "CORE_SBO", "ENHANCE_SBO", "OPTUM_GUIDE", "CYLINDER_HEALTH", "RESOURCE_BRIDGE",
    "PHS", "CANCER", "PODIMETRICS"
]

# Quick sanity summary of what was loaded; PRODUCT_CONFIGS comes from the
# separate product_mappings module imported in the first cell.
print("✓ Configuration loaded")
print(f"  - Snowflake database: {SNOWFLAKE_CONFIG['database']}")
print(f"  - Product configs: {len(PRODUCT_CONFIGS)} products")
print(f"  - Boolean columns: {len(BOOLEAN_COLUMNS)} columns")

## 2. Data Extraction Functions

In [None]:
def extract_sharepoint(file_path: Path) -> pd.DataFrame:
    """Read the SharePoint CSV export and upper-case its column names.

    Args:
        file_path: Location of the exported CSV file.

    Returns:
        The CSV contents as a DataFrame whose column labels are upper-cased.

    Raises:
        FileNotFoundError: Re-raised (after logging) when the file is missing.
    """
    logger.info(f"Loading SharePoint export from {file_path}...")

    try:
        # low_memory=False avoids the mixed-type warnings of chunked parsing
        sharepoint_df = pd.read_csv(file_path, low_memory=False)
    except FileNotFoundError:
        logger.error(f"SharePoint file not found at {file_path}")
        raise

    # Downstream code addresses columns by their upper-case names
    sharepoint_df.columns = sharepoint_df.columns.str.upper()

    row_count = len(sharepoint_df)
    column_preview = sharepoint_df.columns.tolist()[:10]  # first 10 only
    logger.info(f"Loaded {row_count} SharePoint records")
    logger.info(f"Columns: {column_preview}...")

    return sharepoint_df


def extract_salesforce(file_path: Path) -> pd.DataFrame:
    """Read the first sheet of the Salesforce Excel export.

    This loader is best-effort: a missing or unreadable file is logged and an
    empty frame with the columns ``SALESFORCE_ID`` and ``HAS_VALUE`` is
    returned instead of raising.

    Args:
        file_path: Location of the exported Excel workbook.

    Returns:
        The sheet contents with upper-cased column names, or the empty
        two-column frame described above when the file could not be read.
    """
    logger.info(f"Loading Salesforce export from {file_path}...")

    fallback_columns = ['SALESFORCE_ID', 'HAS_VALUE']

    try:
        # sheet_name=0 -> the first sheet of the workbook
        salesforce_df = pd.read_excel(file_path, sheet_name=0)
    except FileNotFoundError:
        logger.warning(f"Salesforce file not found at {file_path}")
        return pd.DataFrame(columns=fallback_columns)
    except Exception as e:
        logger.error(f"Error reading Salesforce file: {e}")
        return pd.DataFrame(columns=fallback_columns)

    # Same upper-case convention as the SharePoint extract
    salesforce_df.columns = salesforce_df.columns.str.upper()

    record_count = len(salesforce_df)
    logger.info(f"Loaded {record_count} Salesforce records")
    return salesforce_df


print("✓ Extraction functions defined")

## 3. Data Cleaning & Transformation Functions

In [None]:
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Clean and normalize the raw SharePoint data.

    Steps, in order:
      1. Replace CLIENT_TYPE_DETAIL codes with labels from CLIENT_TYPE_MAPPING;
         values without a mapping are kept as they were.
      2. Set SPINE_PAIN_JOINT from a case-insensitive text match on
         PRODUCTS_REQUESTED.
      3. Fill nulls in every BOOLEAN_COLUMNS column with False.
      4. Where PRODUCTS_REQUESTED is null, build it from the names of the set
         boolean columns (title-cased, comma separated), or 'None' if no flag
         is set.

    Args:
        df: Raw SharePoint frame with upper-cased column names.

    Returns:
        A cleaned copy; the caller's frame is not modified.

    Raises:
        KeyError: If CLIENT_TYPE_DETAIL, PRODUCTS_REQUESTED or any
            BOOLEAN_COLUMNS column is missing.
    """
    logger.info("Cleaning and normalizing data...")

    # Work on a copy so the caller's raw frame is left untouched
    df = df.copy()

    # Map client types. A numeric code column that contains blanks is read as
    # float, so codes render as '4.0'; drop that suffix so they still match
    # the string keys of CLIENT_TYPE_MAPPING.
    raw_client_type = df['CLIENT_TYPE_DETAIL']
    client_type_keys = raw_client_type.astype(str).str.replace(r'\.0$', '', regex=True)
    df['CLIENT_TYPE_DETAIL'] = (
        client_type_keys
        .map(CLIENT_TYPE_MAPPING)
        .fillna(raw_client_type)
    )

    # Detect Spine Pain & Joint from product text
    df['SPINE_PAIN_JOINT'] = df['PRODUCTS_REQUESTED'].str.contains(
        "Spine Pain & Joint", case=False, na=False
    )

    # Fill null values in boolean columns
    df[BOOLEAN_COLUMNS] = df[BOOLEAN_COLUMNS].fillna(False)

    # Populate PRODUCTS_REQUESTED from boolean columns where null
    mask_null = df['PRODUCTS_REQUESTED'].isnull()
    if mask_null.any():
        # Loop-invariant: the title-cased labels are the same for every row
        product_labels = pd.Index(BOOLEAN_COLUMNS).str.title()
        flag_rows = df.loc[mask_null, BOOLEAN_COLUMNS].to_numpy()

        products_list = []
        for row_vals in flag_rows:
            # Normalise to a real boolean mask so 0/1 or object-typed flags are
            # used as a mask rather than as positional indices
            selected = product_labels[row_vals.astype(bool)]
            products_list.append(', '.join(selected) if len(selected) else 'None')

        df.loc[mask_null, 'PRODUCTS_REQUESTED'] = products_list

    logger.info("Data cleaning complete")
    return df


print("✓ Cleaning function defined")

In [None]:
def transform_products(df: pd.DataFrame, df_salesforce: pd.DataFrame) -> pd.DataFrame:
    """Transform wide-format request data into product-level records.

    Runs the three transformation stages in order: explode each request into
    one row per requested product, enrich with Salesforce data, then compute
    the derived metric columns.

    Args:
        df: Cleaned SharePoint frame (one row per request).
        df_salesforce: Salesforce export; may be empty.

    Returns:
        The product-level frame with Salesforce and metric columns added.
    """
    logger.info("Starting product transformation...")

    # Explode products into separate rows
    df_exploded = _explode_products(df)

    # Enrich with Salesforce data
    df_enriched = _enrich_with_salesforce(df_exploded, df_salesforce)

    # Calculate metrics
    df_enriched = _calculate_metrics(df_enriched)

    # Report the size of the frame actually returned; the merge in the
    # enrichment step can change the row count relative to df_exploded.
    logger.info(f"Transformation complete: {len(df_enriched)} product-level records created")
    return df_enriched


def _explode_products(df: pd.DataFrame) -> pd.DataFrame:
    """Explode wide-format data into product-level records.

    For every request row and every entry of PRODUCT_CONFIGS -- a
    ``(product_name, category, field, start_col, end_col, status_col)`` tuple --
    one output record is produced when the row's ``field`` flag is set.

    Args:
        df: Cleaned SharePoint frame (one row per request).

    Returns:
        One row per (request, requested product). The column set is fixed, so
        an input that yields no records still returns a frame with the
        expected (empty) columns.
    """
    output_columns = [
        'ID', 'TITLE', 'REQUEST_DATE', 'CLIENT', 'MARKET', 'REQUESTOR',
        'CLIENT_TYPE', 'OVERALL_STATUS', 'PRODUCTS_REQUESTED', 'SALESFORCE_ID',
        'PRODUCT', 'PRODUCT_CATEGORY', 'START_DATE', 'COMPLETE_DATE', 'STATUS',
        'STATUS_CHANGE_DATE', 'CLOSED_DATE', 'PTRR',
    ]

    # Log available columns for debugging
    logger.info(f"Available columns in source data: {df.columns.tolist()[:20]}...")  # First 20

    # Whether a flag column exists is the same for every row, so filter the
    # configs once instead of re-checking inside the row loop (cfg[2] is `field`).
    active_configs = [cfg for cfg in PRODUCT_CONFIGS if cfg[2] in df.columns]

    records = []
    for _, row in df.iterrows():
        for product_name, category, field, start_col, end_col, status_col in active_configs:
            flag = row[field]
            # NaN is truthy in Python; a missing flag means "not requested"
            if pd.isna(flag) or not flag:
                continue
            records.append({
                'ID': row.get('ID'),
                'TITLE': row.get('TITLE'),
                'REQUEST_DATE': row.get('REQUEST_DATE'),
                'CLIENT': row.get('CLIENT'),
                'MARKET': row.get('MARKET'),
                'REQUESTOR': row.get('REQUESTOR'),
                'CLIENT_TYPE': row.get('CLIENT_TYPE_DETAIL'),
                'OVERALL_STATUS': row.get('OVERALL_STATUS'),
                'PRODUCTS_REQUESTED': row.get('PRODUCTS_REQUESTED'),
                'SALESFORCE_ID': row.get('SALESFORCE_ID'),
                'PRODUCT': product_name,
                'PRODUCT_CATEGORY': category,
                # Per-product columns named by the config; .get yields None
                # when the source frame has no such column
                'START_DATE': row.get(start_col),
                'COMPLETE_DATE': row.get(end_col),
                'STATUS': row.get(status_col),
                'STATUS_CHANGE_DATE': row.get('STATUS_CHANGE_DATE'),
                'CLOSED_DATE': row.get('CLOSED_DATE'),
                'PTRR': row.get('PTRR')
            })

    # Explicit columns keep the schema stable even when no record was produced
    df_products = pd.DataFrame(records, columns=output_columns)
    logger.info(f"Exploded {len(df)} requests into {len(df_products)} product records")

    # Log sample of first record for debugging
    if len(df_products) > 0:
        logger.info(f"Sample record columns: {df_products.columns.tolist()}")
        logger.info(f"Sample values - TITLE: {df_products.iloc[0]['TITLE']}, CLIENT: {df_products.iloc[0]['CLIENT']}")

    return df_products


def _enrich_with_salesforce(df: pd.DataFrame, df_salesforce: pd.DataFrame) -> pd.DataFrame:
    """Left-join HAS_VALUE from the Salesforce export onto the product records.

    The join key is SALESFORCE_ID. Enrichment is skipped (``df`` is returned
    unchanged) when there is no Salesforce data or a required column is
    missing on either side.

    Args:
        df: Product-level records.
        df_salesforce: Salesforce export; may be None or empty. Not modified.

    Returns:
        ``df`` with a HAS_VALUE column added when enrichment was possible,
        otherwise ``df`` unchanged. The row count of ``df`` is preserved.
    """
    if df_salesforce is None or df_salesforce.empty:
        logger.warning("No Salesforce data available for enrichment")
        return df

    # Normalize Salesforce column names on a renamed copy, so the caller's
    # frame is not mutated
    salesforce = df_salesforce.rename(columns=lambda name: str(name).upper())

    required = ['SALESFORCE_ID', 'HAS_VALUE']
    missing = [col for col in required if col not in salesforce.columns]
    if 'SALESFORCE_ID' not in df.columns or missing:
        logger.warning(
            f"Skipping Salesforce enrichment: required columns unavailable "
            f"(missing from Salesforce data: {missing})"
        )
        return df

    # Build a one-row-per-key lookup. Null keys are dropped because merge
    # matches NaN to NaN, and duplicate keys would multiply product rows.
    lookup = salesforce[required].dropna(subset=['SALESFORCE_ID'])
    duplicate_count = int(lookup.duplicated(subset='SALESFORCE_ID').sum())
    if duplicate_count:
        logger.warning(
            f"Dropping {duplicate_count} duplicate SALESFORCE_ID rows before merge (keeping first)"
        )
        lookup = lookup.drop_duplicates(subset='SALESFORCE_ID', keep='first')

    df = df.merge(lookup, on='SALESFORCE_ID', how='left')
    logger.info("Enriched with Salesforce data")

    return df


def _calculate_metrics(df: pd.DataFrame) -> pd.DataFrame:
    """Add the derived metric columns to the product-level frame.

    Columns added (the frame is modified in place and also returned):
    DAYS_OPEN, PRODUCT_TAT, COMPLETED_PRODUCT, REQUEST_TYPE, REQUEST_YEAR,
    PRODUCT_OPEN, DAYS_ON_STATUS, NEEDS_ATTENTION, HAS_VALUE (only when
    absent) and URL.

    Args:
        df: Product-level records.

    Returns:
        The same frame with the metric columns populated.
    """
    now = pd.Timestamp.now()

    # Parse whichever date columns are present; unparseable values become NaT
    for date_col in ('REQUEST_DATE', 'START_DATE', 'COMPLETE_DATE',
                     'STATUS_CHANGE_DATE', 'CLOSED_DATE'):
        if date_col not in df.columns:
            continue
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

    request_date = df['REQUEST_DATE']
    status = df['STATUS']

    # Whole days since the request was made
    df['DAYS_OPEN'] = (now - request_date).dt.days

    # Product turnaround time: whole days from start to completion
    df['PRODUCT_TAT'] = (df['COMPLETE_DATE'] - df['START_DATE']).dt.days

    # Completed products
    df['COMPLETED_PRODUCT'] = status.isin(['Complete', 'Completed'])

    # Request type is the first [bracketed] token of the title
    df['REQUEST_TYPE'] = df['TITLE'].str.extract(r'\[(.*?)\]')[0]
    df['REQUEST_YEAR'] = request_date.dt.year

    # Open products are those whose status is one of OPEN_STATUS
    df['PRODUCT_OPEN'] = status.isin(OPEN_STATUS)

    # Whole days on the current status; a missing change date counts as 0
    days_on_status = (now - df['STATUS_CHANGE_DATE']).dt.days
    df['DAYS_ON_STATUS'] = days_on_status.fillna(0).astype(int)

    # Needs attention: open and on the same status longer than the threshold
    over_threshold = df['DAYS_ON_STATUS'] > DAYS_ON_STATUS_THRESHOLD
    df['NEEDS_ATTENTION'] = df['PRODUCT_OPEN'] & over_threshold

    # HAS_VALUE only exists when the Salesforce enrichment ran
    if 'HAS_VALUE' not in df.columns:
        df['HAS_VALUE'] = None

    def _request_url(request_id):
        """Return the SharePoint display-form link for an ID, or None if missing."""
        if pd.notna(request_id):
            return f"https://sharepoint.com/sites/analytics/Lists/Requests/DispForm.aspx?ID={request_id}"
        return None

    df['URL'] = df['ID'].apply(_request_url)

    logger.info("Calculated all metrics")
    return df


print("✓ Transformation functions defined")

## 4. Snowflake Functions

In [None]:
def get_snowflake_connection():
    """Establish a connection to Snowflake using SNOWFLAKE_CONFIG.

    If the MY_SF_PKEY environment variable holds an unencrypted PEM private
    key, key-pair authentication is used: the key is converted to DER/PKCS8
    bytes and passed as ``private_key``, and the password entry is omitted.
    Otherwise the connection falls back to the user/password in
    SNOWFLAKE_CONFIG.

    Returns:
        An open Snowflake connection; the caller is responsible for closing it.
    """
    logger.info("Connecting to Snowflake...")

    # Copy so the module-level config is never mutated
    connect_args = dict(SNOWFLAKE_CONFIG)

    # Optional: Load private key for key-pair authentication
    pkey_pem = os.getenv("MY_SF_PKEY")
    if pkey_pem:
        pkey = serialization.load_pem_private_key(
            pkey_pem.encode("utf-8"),
            password=None,
            backend=default_backend()
        )
        # Previously the key was loaded but never handed to connect(); the
        # connector expects the key as DER-encoded PKCS8 bytes.
        connect_args['private_key'] = pkey.private_bytes(
            encoding=serialization.Encoding.DER,
            format=serialization.PrivateFormat.PKCS8,
            encryption_algorithm=serialization.NoEncryption(),
        )
        connect_args.pop('password', None)
        logger.info("Using key-pair authentication")

    conn = snowflake.connector.connect(**connect_args)
    logger.info("Successfully connected to Snowflake")
    return conn


def normalize_dates(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
    """Render date columns as 'YYYY-MM-DD' strings for loading into Snowflake.

    A column is treated as a date when its name ends with ``_DATE`` (or is one
    of the explicitly listed date columns). Values are parsed with
    ``pd.to_datetime(errors='coerce')``; anything unparseable becomes None.
    The input frame is not modified.

    Returns:
        ``(normalized_copy, date_column_names)`` -- the names are those columns
        that were successfully converted, in frame order.
    """
    normalized = df.copy()
    converted = []
    known_date_columns = ['REQUEST_DATE', 'START_DATE', 'COMPLETE_DATE',
                          'CLOSED_DATE', 'STATUS_CHANGE_DATE']

    def _iso_or_none(value):
        """Format a timestamp as YYYY-MM-DD; NaT/NaN becomes None."""
        return value.strftime('%Y-%m-%d') if pd.notna(value) else None

    for col in normalized.columns:
        is_date_column = col.endswith('_DATE') or col in known_date_columns
        if not is_date_column:
            continue

        try:
            # Parse first (handles strings, floats, NaT, ...), then format.
            # The result is forced to object dtype so it is uploaded as text.
            parsed = pd.to_datetime(normalized[col], errors='coerce')
            normalized[col] = parsed.apply(_iso_or_none).astype('object')
            converted.append(col)
        except Exception as e:
            logger.warning(f"Could not convert {col} to date string: {e}")

    if converted:
        logger.info(f"Normalized {len(converted)} date columns to string format")

    return normalized, converted


def create_table_with_types(conn, table_name: str, df: pd.DataFrame, date_columns: List[str]):
    """Create ``table_name`` (if it does not exist) with column types derived from ``df``.

    Type mapping per column:
      * listed in ``date_columns`` -> DATE
      * integer dtype              -> NUMBER(38,0)
      * other numeric dtype        -> NUMBER(38,6)
      * everything else            -> VARCHAR

    Column identifiers are double-quoted so names containing spaces, reserved
    words or other special characters produce valid DDL, and so the DDL agrees
    with write_pandas, which quotes identifiers by default. For plain
    upper-case names the quoted and unquoted forms are the same identifier.

    Table creation is best-effort: a failure is logged as a warning and not
    raised.

    Args:
        conn: Open Snowflake connection.
        table_name: Unqualified table name; database and schema come from
            SNOWFLAKE_CONFIG.
        df: Frame whose columns and dtypes define the table schema.
        date_columns: Names of the columns to declare as DATE.
    """
    database = SNOWFLAKE_CONFIG['database']
    schema = SNOWFLAKE_CONFIG['schema']

    def _quoted(identifier) -> str:
        """Double-quote an identifier, escaping embedded double quotes."""
        return '"' + str(identifier).replace('"', '""') + '"'

    # Build column definitions
    column_defs = []
    for col in df.columns:
        if col in date_columns:
            sql_type = "DATE"
        elif pd.api.types.is_numeric_dtype(df[col]):
            # NOTE(review): bool-dtype columns also satisfy is_numeric_dtype
            # (but not is_integer_dtype), so they are declared NUMBER(38,6) --
            # confirm that is intended before changing it.
            if pd.api.types.is_integer_dtype(df[col]):
                sql_type = "NUMBER(38,0)"
            else:
                sql_type = "NUMBER(38,6)"
        else:
            # Use VARCHAR for all other columns
            sql_type = "VARCHAR"
        column_defs.append(f"{_quoted(col)} {sql_type}")

    columns_sql = ",\n    ".join(column_defs)

    create_sql = f"""
    CREATE TABLE IF NOT EXISTS {database}.{schema}.{table_name} (
        {columns_sql}
    );
    """

    logger.info(f"Creating table {table_name} with {len(date_columns)} DATE columns...")

    cur = conn.cursor()
    try:
        cur.execute(create_sql)
        logger.info(f"Successfully created table {table_name}")
    except Exception as e:
        logger.warning(f"Table creation failed (may already exist): {e}")
    finally:
        # Always release the cursor, whatever happened above
        cur.close()


print("✓ Snowflake connection functions defined")

In [None]:
def load_incremental(conn, df: pd.DataFrame, table_name: str, match_key: str = 'ID'):
    """Upsert ``df`` into ``table_name`` through a staging table and a MERGE.

    Rows are written to ``<table_name>_STAGING`` and merged into the target on
    ``match_key``: matching rows have every other column updated, and rows
    without a match are inserted. The staging table is dropped after a
    successful merge. Rows absent from ``df`` are left untouched in the target.

    Args:
        conn: Open Snowflake connection.
        df: Rows to upsert; date columns are normalised before loading.
        table_name: Unqualified target table name.
        match_key: Column used to match staging rows to target rows.

    Raises:
        ValueError: If ``match_key`` is not a column of ``df``.
        RuntimeError: If write_pandas reports failure for the staging load.
    """
    # Fail before touching Snowflake rather than with a SQL error mid-load
    if match_key not in df.columns:
        raise ValueError(f"Match key '{match_key}' not found in DataFrame columns")

    staging_table = f"{table_name}_STAGING"
    database = SNOWFLAKE_CONFIG['database']
    schema = SNOWFLAKE_CONFIG['schema']
    target_fqn = f"{database}.{schema}.{table_name}"
    staging_fqn = f"{database}.{schema}.{staging_table}"

    def _quoted(identifier) -> str:
        """Double-quote an identifier, escaping embedded double quotes."""
        return '"' + str(identifier).replace('"', '""') + '"'

    # Normalize dates
    df, date_columns = normalize_dates(df)

    # Ensure target table exists with proper DATE column types
    logger.info(f"Ensuring target table {table_name} exists with proper schema...")
    create_table_with_types(conn, table_name, df, date_columns)

    # Merging zero rows is a no-op, so skip the staging round-trip entirely
    if df.empty:
        logger.info(f"No rows to merge into {table_name}; skipping incremental load")
        return

    cur = conn.cursor()
    try:
        logger.info(f"Creating staging table for {table_name}...")
        # Drop staging table if it exists (e.g. left over from a failed run)
        cur.execute(f"DROP TABLE IF EXISTS {staging_fqn};")

        # Create staging table with same schema as target
        create_table_with_types(conn, staging_table, df, date_columns)

        logger.info(f"Loading {len(df)} rows into staging...")
        # Load data into pre-created table
        success, _nchunks, _nrows, _ = write_pandas(
            conn, df, staging_table,
            auto_create_table=False,
            overwrite=False
        )

        if not success:
            raise RuntimeError("Failed to write to staging table")

        logger.info("Merging data...")

        # Build dynamic MERGE SQL. Identifiers are quoted so unusual column
        # names stay valid; plain upper-case names are unaffected by quoting.
        key_sql = _quoted(match_key)
        all_columns = [_quoted(col) for col in df.columns]
        update_cols = [col for col in all_columns if col != key_sql]

        insert_cols = ", ".join(all_columns)
        insert_vals = ", ".join(f"source.{col}" for col in all_columns)

        # With only the key column there is nothing to update; an empty
        # UPDATE SET would be invalid SQL, so the clause is omitted.
        matched_clause = ""
        if update_cols:
            update_set_clause = ", ".join(f"target.{col} = source.{col}" for col in update_cols)
            matched_clause = f"""WHEN MATCHED THEN
            UPDATE SET
            {update_set_clause}"""

        merge_sql = f"""
        MERGE INTO {target_fqn} AS target
        USING {staging_fqn} AS source
        ON target.{key_sql} = source.{key_sql}
        {matched_clause}
        WHEN NOT MATCHED THEN
            INSERT ({insert_cols})
            VALUES ({insert_vals});
        """

        cur.execute(merge_sql)
        # MERGE returns a single row of per-action counts (inserted, updated);
        # add them up so the log reflects every affected row.
        merge_counts = cur.fetchone()
        rows_affected = (
            sum(count for count in merge_counts if isinstance(count, int))
            if merge_counts else 0
        )
        logger.info(f"Merge complete: {rows_affected} rows affected")

        # Clean up staging table
        cur.execute(f"DROP TABLE IF EXISTS {staging_fqn};")
    finally:
        # Release the cursor even when a step above raised
        cur.close()

    logger.info(f"Incremental load complete for {table_name}")


def load_full_refresh(conn, df: pd.DataFrame, table_name: str):
    """Replace the contents of ``table_name`` with ``df`` (TRUNCATE, then load).

    The table is created first if it does not exist. The truncate and the
    upload are separate statements, so a failed upload leaves the table empty.

    Args:
        conn: Open Snowflake connection.
        df: Rows to load; date columns are normalised before loading.
        table_name: Unqualified target table name.

    Raises:
        RuntimeError: If write_pandas reports failure.
    """
    database = SNOWFLAKE_CONFIG['database']
    schema = SNOWFLAKE_CONFIG['schema']

    # Normalize dates
    df, date_columns = normalize_dates(df)

    # Ensure table exists with proper DATE column types
    logger.info(f"Ensuring target table {table_name} exists with proper schema...")
    create_table_with_types(conn, table_name, df, date_columns)

    cur = conn.cursor()
    try:
        logger.info(f"Truncating {table_name} (full reload)...")
        cur.execute(f"TRUNCATE TABLE {database}.{schema}.{table_name};")
    finally:
        # The cursor is only needed for the truncate; release it even on error
        cur.close()

    # A full refresh with zero rows is just the truncate; there is nothing to
    # hand to write_pandas.
    if df.empty:
        logger.info(f"No rows to upload; {table_name} left empty")
        return

    logger.info(f"Uploading {len(df)} rows to {table_name}...")
    # Load data into pre-created table
    success, _nchunks, nrows, _ = write_pandas(
        conn, df, table_name,
        auto_create_table=False,
        overwrite=False
    )

    # Same failure handling as the staging load in load_incremental
    if not success:
        raise RuntimeError(f"Failed to write to {table_name}")

    logger.info(f"Successfully uploaded {nrows} rows to {table_name}")


print("✓ Snowflake loading functions defined")

## 5. Main Pipeline Execution

In [None]:
def run_pipeline(incremental: bool = True, dry_run: bool = False):
    """
    Execute full pipeline: extract, clean, transform, then load to Snowflake.

    The Salesforce table is only refreshed when the Salesforce extract returned
    rows. extract_salesforce yields an empty frame when the export file is
    missing or unreadable, and a full refresh with that frame would truncate
    the existing table.

    Args:
        incremental: If True, use incremental MERGE for raw table. If False, full refresh.
        dry_run: If True, skip Snowflake upload

    Returns:
        df_raw, df_transformed: DataFrames for inspection (the cleaned
        SharePoint frame and the product-level frame)

    Raises:
        Exception: Any failure is logged with its traceback and re-raised.
    """
    logger.info("=" * 80)
    logger.info("STARTING SHAREPOINT EXPORT PIPELINE")
    logger.info("=" * 80)
    logger.info(f"Mode: {'INCREMENTAL' if incremental else 'FULL REFRESH'}")
    logger.info(f"Dry Run: {dry_run}")
    logger.info("=" * 80)

    start_time = datetime.now()

    try:
        # ========================================================================
        # PHASE 1: EXTRACTION
        # ========================================================================
        logger.info("PHASE 1: DATA EXTRACTION")

        df_sharepoint = extract_sharepoint(SHAREPOINT_EXPORT_PATH)
        df_salesforce = extract_salesforce(SALESFORCE_EXPORT_PATH)

        logger.info(f"Extraction complete: {len(df_sharepoint)} SharePoint, {len(df_salesforce)} Salesforce records")

        # ========================================================================
        # PHASE 2: CLEANING
        # ========================================================================
        logger.info("PHASE 2: DATA CLEANING")

        df_cleaned = clean_data(df_sharepoint)

        logger.info(f"Cleaning complete: {len(df_cleaned)} records")

        # ========================================================================
        # PHASE 3: TRANSFORMATION
        # ========================================================================
        logger.info("PHASE 3: DATA TRANSFORMATION")

        df_transformed = transform_products(df_cleaned, df_salesforce)

        logger.info(f"Transformation complete: {len(df_transformed)} product-level records")

        # ========================================================================
        # PHASE 4: LOADING TO SNOWFLAKE
        # ========================================================================
        logger.info("PHASE 4: DATA LOADING")

        if dry_run:
            logger.info("[DRY RUN] Skipping Snowflake upload")
        else:
            conn = get_snowflake_connection()

            try:
                # Load raw SharePoint data (incremental or full)
                if incremental:
                    logger.info(f"Loading raw data to {SOURCE_TABLE} (INCREMENTAL)...")
                    load_incremental(conn, df_cleaned, SOURCE_TABLE, match_key='ID')
                else:
                    logger.info(f"Loading raw data to {SOURCE_TABLE} (FULL REFRESH)...")
                    load_full_refresh(conn, df_cleaned, SOURCE_TABLE)

                # Load Salesforce data (always full refresh - small table).
                # Skipped when the extract is empty so a missing or unreadable
                # export cannot wipe the data already in Snowflake.
                if df_salesforce.empty:
                    logger.warning(
                        f"Salesforce extract is empty; skipping load so existing data in "
                        f"{SALESFORCE_TABLE} is not truncated"
                    )
                else:
                    logger.info(f"Loading Salesforce data to {SALESFORCE_TABLE} (FULL REFRESH)...")
                    load_full_refresh(conn, df_salesforce, SALESFORCE_TABLE)

                # Load transformed data (always full refresh for consistency)
                logger.info(f"Loading transformed data to {TARGET_TABLE} (FULL REFRESH)...")
                load_full_refresh(conn, df_transformed, TARGET_TABLE)

            finally:
                # Close the connection whether or not the loads succeeded
                conn.close()
                logger.info("Snowflake connection closed")

        # ========================================================================
        # SUMMARY
        # ========================================================================
        elapsed = datetime.now() - start_time

        logger.info("=" * 80)
        logger.info("PIPELINE COMPLETED SUCCESSFULLY")
        logger.info(f"Total execution time: {elapsed}")
        logger.info(f"Records processed: {len(df_cleaned)} raw → {len(df_transformed)} transformed")
        logger.info("=" * 80)

        return df_cleaned, df_transformed

    except Exception as e:
        # Top-level boundary: log with traceback, then let the error propagate
        logger.error("=" * 80)
        logger.error("PIPELINE FAILED WITH EXCEPTION")
        logger.error(str(e), exc_info=True)
        logger.error("=" * 80)
        raise


print("✓ Main pipeline function defined")

## 6. Execute Pipeline

**Run this cell to execute the full pipeline**

In [None]:
# Execute the pipeline
# Change parameters as needed:
#   incremental=True  -> Incremental MERGE for raw table
#   incremental=False -> Full refresh for raw table
#   dry_run=True      -> Skip Snowflake upload (testing only)
#
# As written (dry_run=False) this connects to Snowflake and loads all tables.
# df_raw and df_transformed are reused by the inspection cells further down.

df_raw, df_transformed = run_pipeline(incremental=True, dry_run=False)

## 7. Testing & Debugging

Use these cells to test individual components

In [None]:
# Test: Extract SharePoint data
# Reads the CSV only (no Snowflake access); df_test feeds the cleaning test cell.
df_test = extract_sharepoint(SHAREPOINT_EXPORT_PATH)
print(f"Rows: {len(df_test)}")
print(f"Columns: {df_test.columns.tolist()}")
df_test.head()

In [None]:
# Test: Data cleaning
# Requires df_test from the extraction test cell; shows the first ten
# CLIENT_TYPE_DETAIL values after mapping.
df_cleaned_test = clean_data(df_test)
print(f"Client type mapping sample:")
df_cleaned_test[['CLIENT_TYPE_DETAIL']].head(10)

In [None]:
# Test: Product transformation
# Requires df_cleaned_test from the cleaning test cell. An empty DataFrame is
# passed as the Salesforce data, so the enrichment step is skipped.
df_trans_test = transform_products(df_cleaned_test, pd.DataFrame())
print(f"Product records created: {len(df_trans_test)}")
print(f"Columns: {df_trans_test.columns.tolist()}")
df_trans_test.head()

In [None]:
# Check for null values in key columns
# Requires df_transformed from the "Execute Pipeline" cell; prints
# "<null count> / <row count>" for each key column.
print("Null value counts:")
print(f"TITLE: {df_transformed['TITLE'].isnull().sum()} / {len(df_transformed)}")
print(f"CLIENT: {df_transformed['CLIENT'].isnull().sum()} / {len(df_transformed)}")
print(f"OVERALL_STATUS: {df_transformed['OVERALL_STATUS'].isnull().sum()} / {len(df_transformed)}")
print(f"ID: {df_transformed['ID'].isnull().sum()} / {len(df_transformed)}")

In [None]:
# Inspect sample records
# Requires df_transformed from the "Execute Pipeline" cell; shows the first ten rows.
print("Sample transformed records:")
df_transformed[['ID', 'TITLE', 'CLIENT', 'PRODUCT', 'OVERALL_STATUS']].head(10)

In [None]:
# Check data types in final DataFrame
# Requires df_transformed from the "Execute Pipeline" cell.
print("Data types:")
df_transformed.dtypes

In [None]:
# Summary statistics
# Requires df_raw and df_transformed from the "Execute Pipeline" cell.
print(f"\nPipeline Summary:")
print(f"  Raw records: {len(df_raw)}")
print(f"  Transformed records: {len(df_transformed)}")
print(f"  Unique products: {df_transformed['PRODUCT'].nunique()}")
print(f"  Date range: {df_transformed['REQUEST_DATE'].min()} to {df_transformed['REQUEST_DATE'].max()}")