In [7]:
# etl_extract.py
# This script handles the extraction phase of the ETL pipeline.
# It loads raw_data.csv and incremental_data.csv, inspects their structure, and saves raw copies to the data/ directory.

import pandas as pd
import os

# Create data directory if it doesn't exist
os.makedirs('data', exist_ok=True)

 #Raw data (use the content you provided)
raw_data = pd.DataFrame({
    'order_id': [1, 2, 3, 4, 5, 4, 7, 8, 9, 10, ...],  # Truncated, use full data
    'customer_name': ['Diana', 'Eve', 'Charlie', 'Eve', 'Eve', 'Eve', 'Charlie', 'Charlie', 'Charlie', 'Eve', ...],
    'product': ['Tablet', 'Laptop', 'Laptop', 'Laptop', 'Tablet', 'Laptop', 'Monitor', 'Laptop', 'Monitor', 'Monitor', ...],
    'quantity': [None, None, 2, 2, 3, 2, 2, 3, None, 1, ...],
    'unit_price': [500, None, 250, 750, None, 750, 750, None, 750, 500, ...],
    'order_date': ['1/20/2024', '4/29/2024', '1/8/2024', '1/7/2024', '3/7/2024', '1/7/2024', '2/2/2024', '2/17/2024', '3/16/2024', '2/28/2024', ...],
    'region': ['South', 'North', None, 'West', 'South', 'West', 'West', None, 'West', 'North', ...]
})
incremental_data = pd.DataFrame({
    'order_id': [101, 102, 103, 104, 105, 106, 107, 108, 109, 110],
    'customer_name': ['Alice', None, None, None, 'Heidi', None, None, None, 'Grace', 'Heidi'],
    'product': ['Laptop', 'Laptop', 'Laptop', 'Tablet', 'Tablet', 'Laptop', 'Tablet', 'Laptop', 'Laptop', 'Phone'],
    'quantity': [None, 1, 1, None, 2, 2, 1, None, 2, None],
    'unit_price': [900, 300, 600, 300, 600, 600, 600, 600, 600, None],
    'order_date': ['5/9/2024', '5/7/2024', '5/4/2024', '5/26/2024', '5/21/2024', '5/18/2024', '5/13/2024', '5/11/2024', '5/29/2024', '5/24/2024'],
    'region': ['Central', 'Central', 'Central', 'Central', 'North', 'Central', 'Central', None, 'Central', None]
})

os.makedirs('data', exist_ok=True)
raw_data.to_csv('data/raw_data.csv', index=False)
incremental_data.to_csv('data/incremental_data.csv', index=False)
print("CSV files created in data/ folder")

# Load raw_data.csv
raw_data = pd.read_csv('data/raw_data.csv')

# Display first 5 rows
print('Raw Data Head:')
print(raw_data.head())

# Display data info
print('\nRaw Data Info:')
print(raw_data.info())

# Observations for raw_data.csv
print('\nObservations for raw_data.csv:')
print(f'Missing values:\n{raw_data.isnull().sum()}')
print(f'Duplicates: {raw_data.duplicated().sum()}')
print('Suspicious columns:')
print('- quantity and unit_price have missing values, which may need imputation.')
print('- order_date may need type conversion to datetime.')
print('- customer_name and region have missing values, which may indicate incomplete records.')
print('- Duplicate rows detected, which may need removal.')

# Load incremental_data.csv
incremental_data = pd.read_csv('data/incremental_data.csv')

# Display first 5 rows
print('\nIncremental Data Head:')
print(incremental_data.head())

# Display data info
print('\nIncremental Data Info:')
print(incremental_data.info())

# Observations for incremental_data.csv
print('\nObservations for incremental_data.csv:')
print(f'Missing values:\n{incremental_data.isnull().sum()}')
print(f'Duplicates: {incremental_data.duplicated().sum()}')
print('Suspicious columns:')
print('- customer_name, quantity, unit_price, order_date, and region have missing values.')
print('- region has more missing values, which may require special handling.')
print('- No duplicates detected.')

# Save raw copies to data/ directory
raw_data.to_csv('data/raw_data.csv', index=False)
incremental_data.to_csv('data/incremental_data.csv', index=False)
print('\nRaw data files saved to data/ directory.')

CSV files created in data/ folder
Raw Data Head:
  order_id customer_name product quantity unit_price order_date region
0        1         Diana  Tablet      NaN        500  1/20/2024  South
1        2           Eve  Laptop      NaN        NaN  4/29/2024  North
2        3       Charlie  Laptop        2        250   1/8/2024    NaN
3        4           Eve  Laptop        2        750   1/7/2024   West
4        5           Eve  Tablet        3        NaN   3/7/2024  South

Raw Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   order_id       11 non-null     object
 1   customer_name  11 non-null     object
 2   product        11 non-null     object
 3   quantity       8 non-null      object
 4   unit_price     8 non-null      object
 5   order_date     11 non-null     object
 6   region         9 non-null      object
dtypes: object(7)
me