In [9]:
import os
import subprocess
import pandas as pd
from pathlib import Path
import re
from datetime import datetime

class TiffProcessingPipeline:
    """
    Pipeline to process multiple TIFF files through R script and collect results.

    For every TIFF the pipeline writes a temporary copy of the R script with
    the input raster and classified-output paths substituted, runs it with
    Rscript, stores the console output in ``r_output.log`` and collects the
    statistics the script produced. One result dict per file is accumulated
    in ``self.results``; ``generate_summary`` writes them out as CSV.
    """

    # Columns kept from the per-class stats CSV written by R, in output order.
    CLASS_STATS_COLUMNS = ("class_name", "pixel_count", "area_sqm", "percentage")

    def __init__(self, r_script_path="DataMiningProjectCode.R", output_dir="pipeline_results", rscript_path=None, timeout=300):
        """
        Args:
            r_script_path: Path of the R script used as the template.
            output_dir: Directory receiving per-file sub-directories and the
                summary CSVs; created if it does not exist.
            rscript_path: Rscript executable; located with ``find_rscript``
                when not given.
            timeout: Maximum number of seconds a single Rscript run may take.
        """
        self.r_script_path = r_script_path
        self.output_dir = output_dir
        self.timeout = timeout
        self.results = []
        os.makedirs(self.output_dir, exist_ok=True)

        # Find Rscript if not given
        self.rscript_path = rscript_path or self.find_rscript()

    def find_rscript(self):
        """
        Locate the Rscript executable.

        Search order: the known default Windows install paths, then the
        system PATH, then the newest ``R-*`` install under
        ``C:\\Program Files\\R``. Falls back to the bare command name
        ``"Rscript"`` when nothing is found.
        """
        import shutil

        known_paths = [
            r"C:\Program Files\R\R-4.4.2\bin\Rscript.exe",
            r"C:\Program Files\R\R-4.4.1\bin\Rscript.exe",
            r"C:\Program Files\R\R-4.3.3\bin\Rscript.exe",
        ]
        for candidate in known_paths:
            if os.path.exists(candidate):
                return candidate

        on_path = shutil.which("Rscript")
        if on_path:
            return on_path

        def version_key(exe):
            # exe is <root>/R-<version>/bin/Rscript.exe; compare numerically
            # so that e.g. 4.10.0 sorts after 4.4.2.
            return tuple(int(part) for part in re.findall(r"\d+", exe.parent.parent.name))

        installs = sorted(
            Path(r"C:\Program Files\R").glob("R-*/bin/Rscript.exe"),
            key=version_key,
            reverse=True,
        )
        if installs:
            return str(installs[0])
        return "Rscript"

    def create_temp_r_script(self, tiff_file, output_subdir):
        """
        Write a per-file copy of the R script and return its path.

        The ``image <- rast("....tif")`` statement is pointed at *tiff_file*,
        the ``writeRaster(classified, "....tif"`` target is redirected into
        *output_subdir*, and a ``write.csv`` of ``area_per_class`` is appended.
        A warning is printed when either statement is not found, because the
        script would then keep using its own hard-coded path.
        """
        with open(self.r_script_path, "r") as f:
            script = f.read()

        # R accepts forward slashes on every platform; backslashes would be
        # read as escape sequences inside an R string literal.
        tiff_file_r = str(tiff_file).replace("\\", "/")
        output_subdir_r = str(output_subdir).replace("\\", "/")
        base_name = Path(tiff_file).stem

        input_stmt = f'image <- rast("{tiff_file_r}")'
        output_stmt = f'writeRaster(classified, "{output_subdir_r}/{base_name}_classified.tif"'

        # Callable replacements insert the text verbatim; a plain replacement
        # string would have backslashes / group references in a file name
        # interpreted by the regex engine.
        script, input_hits = re.subn(
            r'image <- rast\(".*?\.tif"\)',
            lambda _match: input_stmt,
            script
        )
        if input_hits == 0:
            print(f"Warning: no input raster statement found in {self.r_script_path}; "
                  f"the script will read its own hard-coded image")

        script, output_hits = re.subn(
            r'writeRaster\(classified, ".*?\.tif"',
            lambda _match: output_stmt,
            script
        )
        if output_hits == 0:
            print(f"Warning: no writeRaster(classified, ...) statement found in {self.r_script_path}; "
                  f"the classified raster path was not redirected")

        # Add: Export class stats from R
        script += f'''
# Export area_per_class summary to CSV ----
write.csv(area_per_class, "{output_subdir_r}/{base_name}_class_stats.csv", row.names = FALSE)
cat("Class stats written to {output_subdir_r}/{base_name}_class_stats.csv\\n")
'''

        temp_script = os.path.join(output_subdir, "temp_script.R")
        with open(temp_script, "w") as f:
            f.write(script)
        return temp_script

    def parse_class_distribution(self, output_subdir, tiff_file):
        """
        Parse per-class pixel stats CSV written by R.

        Returns:
            A DataFrame with the columns in ``CLASS_STATS_COLUMNS`` plus
            ``filename`` (the TIFF's stem), or ``None`` when the CSV does
            not exist.

        Raises:
            KeyError: If the CSV lacks one of the required columns.
        """
        base_name = Path(tiff_file).stem
        csv_path = Path(output_subdir) / f"{base_name}_class_stats.csv"
        if not csv_path.exists():
            print(f"No class stats CSV found for {base_name}")
            return None

        df = pd.read_csv(csv_path)
        # The R side may name the pixel column either "count" or "pixel_count".
        if "count" in df.columns and "pixel_count" not in df.columns:
            df = df.rename(columns={"count": "pixel_count"})

        missing = [col for col in self.CLASS_STATS_COLUMNS if col not in df.columns]
        if missing:
            print(f"Unexpected columns in {csv_path}: {df.columns}")
            raise KeyError(f"{csv_path} is missing required columns: {missing}")

        # Explicit copy so that adding a column below does not write into a
        # slice of the frame that was read from disk.
        df = df[list(self.CLASS_STATS_COLUMNS)].copy()
        df["filename"] = base_name
        return df

    def parse_r_output(self, output_text, tiff_file, output_subdir):
        """
        Parse R script console output and extract statistics.

        Returns a dict with ``filename`` and ``status``; ``total_cloud_percentage``
        is added when a ``TOTAL CLOUD COVER:`` line is present and
        ``class_distribution`` (list of row dicts) when the class stats CSV
        exists. Any failure while parsing is recorded in ``status`` as
        ``parsing_error: ...`` instead of being raised.
        """
        result = {
            'filename': Path(tiff_file).name,
            'status': 'success'
        }
        try:
            # Extract cloud cover stats. The number pattern stops before a
            # trailing sentence period so "12.5." still parses as 12.5.
            total_cloud_pct_match = re.search(
                r'TOTAL CLOUD COVER:\s*(\d+(?:\.\d*)?|\.\d+)', output_text
            )
            if total_cloud_pct_match:
                result['total_cloud_percentage'] = float(total_cloud_pct_match.group(1))

            # Include class distribution parsing
            class_df = self.parse_class_distribution(output_subdir, tiff_file)
            if class_df is not None:
                result['class_distribution'] = class_df.to_dict(orient='records')

        except Exception as e:
            # Deliberately broad: one unreadable file must not stop the batch.
            result['status'] = f'parsing_error: {str(e)}'
        return result

    def process_tiff(self, tiff_file):
        """
        Run the R script on a single TIFF and return its result dict.

        The temporary script is always removed. A timeout or a failure to
        launch Rscript is reported as a result with ``status == 'error'``
        rather than raised, so a batch run continues with the next file.
        """
        print(f"\nProcessing: {tiff_file}")
        file_name = Path(tiff_file).name
        base_name = Path(tiff_file).stem
        output_subdir = os.path.join(self.output_dir, base_name)
        os.makedirs(output_subdir, exist_ok=True)

        temp_script = self.create_temp_r_script(tiff_file, output_subdir)
        try:
            result = subprocess.run(
                [self.rscript_path, temp_script],
                capture_output=True,
                text=True,
                timeout=self.timeout
            )
        except (subprocess.TimeoutExpired, OSError) as exc:
            print(f"✗ Error on {file_name}")
            return {
                'filename': file_name,
                'status': 'error',
                'error_message': str(exc)
            }
        finally:
            # The script is only needed while Rscript runs.
            if os.path.exists(temp_script):
                os.remove(temp_script)

        output_log = os.path.join(output_subdir, "r_output.log")
        with open(output_log, "w") as f:
            f.write(result.stdout + "\n" + result.stderr)

        if result.returncode == 0:
            stats = self.parse_r_output(result.stdout, tiff_file, output_subdir)
            print(f"✓ Success - {file_name}")
        else:
            stats = {
                'filename': file_name,
                'status': 'error',
                'error_message': result.stderr
            }
            print(f"✗ Error on {file_name}")

        return stats

    def process_directory(self, input_dir, pattern="*.tif"):
        """
        Process every file in *input_dir* matching *pattern*, then write the
        summary. Files are handled in sorted order so runs are reproducible;
        results are appended to ``self.results``.
        """
        files = sorted(Path(input_dir).glob(pattern))
        print(f"Found {len(files)} TIFFs in {input_dir}")
        for f in files:
            res = self.process_tiff(str(f))
            self.results.append(res)
        self.generate_summary()

    def generate_summary(self):
        """
        Write ``summary_<timestamp>.csv`` with one row per processed file and,
        when any class distributions were collected, a combined
        ``class_breakdown_<timestamp>.csv`` with one row per file and class.
        """
        df = pd.DataFrame(self.results)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        csv_path = os.path.join(self.output_dir, f"summary_{timestamp}.csv")
        df.to_csv(csv_path, index=False)
        print(f"\nSummary saved to: {csv_path}")

        # Optionally combine all class distributions into one table
        class_tables = [
            row
            for r in self.results
            for row in (r.get('class_distribution') or [])
        ]
        if class_tables:
            class_df = pd.DataFrame(class_tables)
            class_csv = os.path.join(self.output_dir, f"class_breakdown_{timestamp}.csv")
            class_df.to_csv(class_csv, index=False)
            print(f"Class breakdown saved to: {class_csv}")

        print("\nDone.")

# Example usage: run the pipeline over every TIFF in the project folder.
if __name__ == "__main__":
    # The R template and the input rasters live in the same directory.
    template_script = r"C:\Users\Marko\WIT\DATA3010\DataMiningProjectCode.R"
    tiff_folder = r"C:\Users\Marko\WIT\DATA3010"

    pipeline = TiffProcessingPipeline(r_script_path=template_script, output_dir="pipeline_results")
    pipeline.process_directory(tiff_folder, pattern="*.tif")


Found 8 TIFFs in C:\Users\Marko\WIT\DATA3010

Processing: C:\Users\Marko\WIT\DATA3010\boston_bay_final.tif
✓ Success - boston_bay_final.tif

Processing: C:\Users\Marko\WIT\DATA3010\LC08_CU_031006_20250528_20250721_02.tif
✓ Success - LC08_CU_031006_20250528_20250721_02.tif

Processing: C:\Users\Marko\WIT\DATA3010\LC08_CU_031006_20250613_20250630_02.tif
✓ Success - LC08_CU_031006_20250613_20250630_02.tif

Processing: C:\Users\Marko\WIT\DATA3010\LC08_CU_031006_20250816_20250825_02.tif
✓ Success - LC08_CU_031006_20250816_20250825_02.tif

Processing: C:\Users\Marko\WIT\DATA3010\LC09_CU_031006_20250605_20250609_02.tif
✓ Success - LC09_CU_031006_20250605_20250609_02.tif

Processing: C:\Users\Marko\WIT\DATA3010\LC09_CU_031006_20250707_20250712_02.tif
✓ Success - LC09_CU_031006_20250707_20250712_02.tif

Processing: C:\Users\Marko\WIT\DATA3010\LC09_CU_031006_20250723_20250729_02.tif
✓ Success - LC09_CU_031006_20250723_20250729_02.tif

Processing: C:\Users\Marko\WIT\DATA3010\LC09_CU_031006_202509