In [1]:
import os
from dotenv import load_dotenv

# Load environment variables from a .env file, if one is present
load_dotenv()

# Read the data directories from the environment.
# os.getenv returns None when a variable is unset (for example when there is
# no .env file), and os.makedirs(None) raises TypeError. An empty value would
# fail as well. Fall back to the locations shown in this notebook's recorded
# output so the cell still runs on a fresh checkout.
DATA_DIR_RAW = os.getenv('DATA_DIR_RAW') or 'data/raw'
DATA_DIR_PROCESSED = os.getenv('DATA_DIR_PROCESSED') or 'data/processed'

# Create the directories if they do not exist.
# exist_ok=True makes the cell safe to re-run.
os.makedirs(DATA_DIR_RAW, exist_ok=True)
os.makedirs(DATA_DIR_PROCESSED, exist_ok=True)

print(f"Ensured directories exist: {DATA_DIR_RAW} and {DATA_DIR_PROCESSED}")

# Later code can now write into these directories, for example:
# ...
# df.to_csv(os.path.join(DATA_DIR_RAW, 'sample_data.csv'), index=False)

Ensured directories exist: data/raw and data/processed


In [1]:
# Part 1: Save in Two Formats
import pandas as pd
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Get paths from environment variables.
# os.getenv returns None when a variable is unset, which would make
# os.path.join below raise TypeError; fall back to the locations shown in
# this notebook's recorded output.
DATA_DIR_RAW = os.getenv('DATA_DIR_RAW') or 'data/raw'
DATA_DIR_PROCESSED = os.getenv('DATA_DIR_PROCESSED') or 'data/processed'

print(f"Raw data directory: {DATA_DIR_RAW}")
print(f"Processed data directory: {DATA_DIR_PROCESSED}")

# Make sure both target directories exist before writing, so this cell does
# not depend on another cell (or an earlier session) having created them.
os.makedirs(DATA_DIR_RAW, exist_ok=True)
os.makedirs(DATA_DIR_PROCESSED, exist_ok=True)

# Create a sample DataFrame with one int, one string, one float and one
# bool column so the round-trip check can compare several dtypes.
data = {
    'id': [1, 2, 3, 4],
    'name': ['Alice', 'Bob', 'Charlie', 'David'],
    'value': [10.5, 20.2, 30.7, 40.1],
    'is_active': [True, False, True, False]
}
df = pd.DataFrame(data)

# Save to CSV in the raw directory (index=False: do not write the row index)
csv_path = os.path.join(DATA_DIR_RAW, 'sample_data.csv')
df.to_csv(csv_path, index=False)
print(f"DataFrame saved to CSV at: {csv_path}")

# Save to Parquet in the processed directory
# You may need to run: pip install pyarrow
parquet_path = os.path.join(DATA_DIR_PROCESSED, 'sample_data.parquet')
df.to_parquet(parquet_path, index=False)
print(f"DataFrame saved to Parquet at: {parquet_path}")

Raw data directory: data/raw
Processed data directory: data/processed
DataFrame saved to CSV at: data/raw\sample_data.csv
DataFrame saved to Parquet at: data/processed\sample_data.parquet


In [3]:
# Part 2: Reload and Validate

# Reload both files from the paths set in Part 1 (csv_path, parquet_path) so
# the reloaded copies can be compared with the in-memory DataFrame `df`.
df_csv = pd.read_csv(csv_path)
df_parquet = pd.read_parquet(parquet_path)

# Banner that separates the validation report from any earlier output
print("\n--- Validation ---")

# Define a validation function
def validate_dataframes(original_df, reloaded_df, format_name):
    """
    Compare an original DataFrame with a reloaded copy and print a report.

    Two properties are checked: the shapes are equal, and the per-column
    dtypes are equal (same column labels in the same order, each with the
    same dtype). Cell values are not compared.

    Parameters
    ----------
    original_df : pandas.DataFrame
        The frame that was written to disk.
    reloaded_df : pandas.DataFrame
        The frame read back from disk.
    format_name : str
        Label used in the printed report (for example "CSV").

    Returns
    -------
    None
        The result is reported on stdout only.
    """
    print(f"\nValidating {format_name} format...")
    # Check shapes
    shapes_match = original_df.shape == reloaded_df.shape
    print(f"Shapes match: {shapes_match}")

    # Check dtypes.
    # Series.equals is used instead of `==`: an element-wise comparison of two
    # dtype Series raises ValueError when their column labels differ, which
    # would crash the validator exactly when it should report a mismatch.
    # Series.equals returns False in that case.
    dtypes_match = bool(original_df.dtypes.equals(reloaded_df.dtypes))
    # Note: CSV stores no type information, so dtypes are re-inferred on load
    # and may differ from the original; the report below shows both sides.
    print(f"Dtypes match: {dtypes_match}")
    print("Original dtypes:\n", original_df.dtypes)
    print("Reloaded dtypes:\n", reloaded_df.dtypes)

    if shapes_match and dtypes_match:
        print(f"✅ Validation successful for {format_name}!")
    else:
        print(f"❌ Validation failed for {format_name}.")

# Run the same check against each reloaded copy: CSV first, then Parquet
for format_label, reloaded_frame in (("CSV", df_csv), ("Parquet", df_parquet)):
    validate_dataframes(df, reloaded_frame, format_label)


--- Validation ---

Validating CSV format...
Shapes match: True
Dtypes match: True
Original dtypes:
 id             int64
name          object
value        float64
is_active       bool
dtype: object
Reloaded dtypes:
 id             int64
name          object
value        float64
is_active       bool
dtype: object
✅ Validation successful for CSV!

Validating Parquet format...
Shapes match: True
Dtypes match: True
Original dtypes:
 id             int64
name          object
value        float64
is_active       bool
dtype: object
Reloaded dtypes:
 id             int64
name          object
value        float64
is_active       bool
dtype: object
✅ Validation successful for Parquet!


In [5]:
# Part 3: Refactor to Utilities

import os
import pandas as pd
import warnings

def write_df(df: pd.DataFrame, filepath: str):
    """Write a DataFrame to ``filepath``, choosing the format from its suffix.

    Supported suffixes are ``.csv`` and ``.parquet`` (matched
    case-insensitively). The row index is not written. The parent directory
    is created when it does not exist yet.

    Args:
        df: The DataFrame to write.
        filepath: Destination path, as a ``str`` or any ``os.PathLike``
            (for example ``pathlib.Path``).

    Raises:
        ValueError: If the suffix is not supported. This is checked before
            anything is created on disk.

    Note:
        If ``DataFrame.to_parquet`` raises ``ImportError``, a warning is
        emitted and no file is written.
    """
    # Accept pathlib.Path and other PathLike objects as well as str.
    filepath = os.fspath(filepath)
    lowered = filepath.lower()
    is_csv = lowered.endswith('.csv')
    is_parquet = lowered.endswith('.parquet')

    # Validate the format first so an unsupported path leaves no stray
    # directory behind.
    if not (is_csv or is_parquet):
        raise ValueError(f"Unsupported file format: {filepath}")

    # Create the directory if it doesn't exist. `directory` is '' for a bare
    # file name, in which case there is nothing to create.
    directory = os.path.dirname(filepath)
    if directory and not os.path.exists(directory):
        # exist_ok avoids a failure if the directory appears between the
        # existence check and this call.
        os.makedirs(directory, exist_ok=True)
        print(f"Created directory: {directory}")

    if is_csv:
        df.to_csv(filepath, index=False)
        print(f"DataFrame successfully written to CSV: {filepath}")
    else:
        try:
            df.to_parquet(filepath, index=False)
            print(f"DataFrame successfully written to Parquet: {filepath}")
        except ImportError:
            # Deliberate best effort: warn instead of failing the caller.
            warnings.warn("Parquet engine (e.g., 'pyarrow') not found. Please install it to write Parquet files.")

def read_df(filepath: str) -> pd.DataFrame:
    """Read a DataFrame from ``filepath``, choosing the reader from its suffix.

    Supported suffixes are ``.csv`` and ``.parquet`` (matched
    case-insensitively).

    Args:
        filepath: Source path, as a ``str`` or any ``os.PathLike``
            (for example ``pathlib.Path``).

    Returns:
        The DataFrame produced by ``pd.read_csv`` or ``pd.read_parquet``.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist. This is checked
            before the suffix.
        ValueError: If the file exists but its suffix is not supported.
    """
    # Accept pathlib.Path and other PathLike objects as well as str.
    filepath = os.fspath(filepath)
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"File not found: {filepath}")

    # Compare against a lower-cased copy so '.CSV' / '.Parquet' are accepted.
    lowered = filepath.lower()
    if lowered.endswith('.csv'):
        return pd.read_csv(filepath)
    if lowered.endswith('.parquet'):
        return pd.read_parquet(filepath)
    raise ValueError(f"Unsupported file format: {filepath}")

# Demonstrate the helpers on a small two-column frame
new_df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})

# Build each target path once and reuse it for the write and the read
new_parquet_path = os.path.join(DATA_DIR_PROCESSED, 'new_data.parquet')
new_csv_path = os.path.join(DATA_DIR_RAW, 'new_data.csv')

# Write with the utility function (Parquet first, then CSV)
write_df(new_df, new_parquet_path)
write_df(new_df, new_csv_path)

# Read both files back with the utility function
reloaded_parquet_df = read_df(new_parquet_path)
reloaded_csv_df = read_df(new_csv_path)

print("\nReloaded DataFrame from Parquet:\n", reloaded_parquet_df)
print("\nReloaded DataFrame from CSV:\n", reloaded_csv_df)

DataFrame successfully written to Parquet: data/processed\new_data.parquet
DataFrame successfully written to CSV: data/raw\new_data.csv

Reloaded DataFrame from Parquet:
    a  b
0  1  3
1  2  4

Reloaded DataFrame from CSV:
    a  b
0  1  3
1  2  4
