In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

In [16]:
# Folder holding the raw CSV files, and the folder the Markdown reports are written to.
# NOTE(review): these are machine-specific absolute paths; adjust them (or derive
# them from a project-relative base directory) before running on another machine.
input_folder = Path(r'C:\Users\USER\OneDrive\Documents\DSC_Project\20250517_資料集\工程訂單')
output_folder = Path(r'C:\Users\USER\OneDrive\Documents\DSC_Project\20250517_資料集\工程訂單\工程訂單_report')
# parents=True: create any missing intermediate folders instead of raising
# FileNotFoundError; exist_ok=True keeps the cell safe to re-run.
output_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# Generate one Markdown EDA report (formatted for Obsidian) per CSV file.
# For every listed file that exists in `input_folder` the loop:
#   1. groups the columns into numerical / categorical / datetime,
#   2. computes the percentage of null values per column,
#   3. renders summary tables via `to_markdown`,
#   4. writes `<csv stem>_EDA_Report.md` into `output_folder`.
csv_files = ['main.csv', 'customizations.csv', 'services.csv']

for filename in csv_files:
    csv_path = input_folder / filename
    if not csv_path.exists():
        # A missing input is reported and skipped; the remaining files still run.
        print(f"File not found: {csv_path}")
        continue
    df = pd.read_csv(csv_path)
    csv_name = csv_path.stem

    num_cols = df.select_dtypes(include='number').columns.tolist()
    cat_cols = df.select_dtypes(include='object').columns.tolist()

    # Datetime detection: an object column is treated as datetime if ANY value
    # parses as a datetime. The parsed series is cached so that each column is
    # parsed only once (it is reused in the "Datetime Features" section below).
    # NOTE(review): "any value parses" is a permissive heuristic; a mostly-text
    # column with a single date-like entry will be classified as datetime.
    parsed_datetimes = {}
    for col in df.columns:
        if df[col].dtype == 'object':
            parsed = pd.to_datetime(df[col], format=None, errors='coerce')
            if parsed.notnull().any():
                parsed_datetimes[col] = parsed
    # dict preserves insertion order, so this keeps the original column order.
    datetime_cols = list(parsed_datetimes)

    # Columns recognised as datetime are no longer reported as categorical.
    cat_cols = [c for c in cat_cols if c not in datetime_cols]

    # Share of null values per column, in percent, rounded to two decimals.
    null_pct = (df.isnull().mean() * 100).round(2)

    # Compose the report (formatted for Obsidian). Entries are joined with
    # "\n" when written, so an entry ending in "\n" yields a blank line.
    report_lines = []
    report_lines.append(f"# EDA Report for `{csv_name}`\n")
    report_lines.append("---\n")

    # 1. Feature Data Types
    report_lines.append("## 1. Feature Data Types\n")
    report_lines.append(f"**Numerical:** {', '.join(num_cols) if num_cols else 'None'}  \n")
    report_lines.append(f"**Categorical:** {', '.join(cat_cols) if cat_cols else 'None'}  \n")
    report_lines.append(f"**Datetime:** {', '.join(datetime_cols) if datetime_cols else 'None'}\n")
    report_lines.append("\n---\n")

    # 2. Null Percentage of Each Feature
    report_lines.append("## 2. Null Percentage of Each Feature\n")
    report_lines.append(null_pct.to_frame('Null %').to_markdown(index=True))
    report_lines.append("\n---\n")

    # 3. Summary Statistics
    report_lines.append("## 3. Summary Statistics\n")

    # Numerical Features
    if num_cols:
        report_lines.append("### Numerical Features\n")
        report_lines.append(df[num_cols].describe().to_markdown(index=True))
        report_lines.append("\n")
    else:
        report_lines.append("No numerical features found.\n")

    # Categorical Features
    if cat_cols:
        report_lines.append("### Categorical Features\n")
        for col in cat_cols:
            # value_counts() sorts by frequency, so head(10) is the top 10.
            vc = df[col].value_counts().head(10)
            report_lines.append(f"**{col} (Top 10 categories):**\n")
            report_lines.append(vc.to_frame('Count').to_markdown(index=True))
            report_lines.append("\n")
    else:
        report_lines.append("No categorical features found.\n")

    # Datetime Features
    if datetime_cols:
        report_lines.append("### Datetime Features\n")
        for col in datetime_cols:
            parsed = parsed_datetimes[col]
            valid_dates = parsed.dropna()
            report_lines.append(f"**{col}**  ")
            report_lines.append(f"- Non-null count: {parsed.notnull().sum()}")
            if valid_dates.empty:
                report_lines.append("- No valid datetime values found.")
            elif pd.api.types.is_datetime64_any_dtype(valid_dates):
                # Drop timezone information (if any) so min/max are reported
                # as naive timestamps.
                if valid_dates.dt.tz is not None:
                    valid_dates = valid_dates.dt.tz_localize(None)
                report_lines.append(f"- Earliest: {valid_dates.min()}")
                report_lines.append(f"- Latest: {valid_dates.max()}")
            else:
                # The parse did not produce a datetime64 series (object dtype),
                # so the .dt accessor is unavailable.
                report_lines.append("- Could not parse as datetimelike.")
            # Blank line so the next column's header is not absorbed into the
            # preceding Markdown list item.
            report_lines.append("")
    else:
        # This branch belongs to `if datetime_cols:` (not to the for loop), so
        # the message is written only when no datetime column was detected.
        report_lines.append("No datetime features found.\n")

    report_lines.append("---\n")

    # Save Markdown report
    md_path = output_folder / f"{csv_name}_EDA_Report.md"
    with open(md_path, "w", encoding="utf-8") as f:
        f.write("\n".join(report_lines))

  parsed = pd.to_datetime(df[col], format=None, errors='coerce')
  parsed = pd.to_datetime(df[col], format=None, errors='coerce')
  parsed = pd.to_datetime(df[col], format=None, errors='coerce')
  parsed = pd.to_datetime(df[col], format=None, errors='coerce')
  parsed = pd.to_datetime(df[col], format=None, errors='coerce')
  parsed = pd.to_datetime(df[col], format=None, errors='coerce')
  parsed = pd.to_datetime(df[col], format=None, errors='coerce')
  parsed = pd.to_datetime(df[col], format=None, errors='coerce')
  parsed = pd.to_datetime(df[col], format=None, errors='coerce')
  parsed = pd.to_datetime(df[col], format=None, errors='coerce')
  parsed = pd.to_datetime(df[col], format=None, errors='coerce')
  parsed = pd.to_datetime(df[col], format=None, errors='coerce')
  parsed = pd.to_datetime(df[col], format=None, errors='coerce')
  parsed = pd.to_datetime(df[col], format=None, errors='coerce')
  parsed = pd.to_datetime(df[col], format=None, errors='coerce')
  parsed = pd.to_datetime

In [18]:
# Tell the reader which folder the generated Markdown reports were written to.
print("EDA reports saved to: " + str(output_folder))

EDA reports saved to: C:\Users\USER\OneDrive\Documents\DSC_Project\20250517_資料集\工程訂單\工程訂單_report
