# In [None]:
# ===================================================================
# AI-POWERED DATA CLEANING PIPELINE FOR PLACEMENT EMAILS
# Using: spaCy Transformer + NER + Pattern Matching
# Much Better Accuracy Than Rule-Based Approach
# ===================================================================

from __future__ import print_function
import os, json, re, time, logging, unicodedata
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
from typing import List, Dict, Set, Any, Optional
import threading
import queue

# ===================================================================
# 1. SETUP AND IMPORTS
# ===================================================================
# Root logging configuration plus the module-wide logger used by every stage.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("AI_Cleaning")

# spaCy is optional at import time: record whether it could be imported so
# that model loading can bail out cleanly instead of raising later.
try:
    import spacy
    from spacy.matcher import Matcher, PhraseMatcher
    from spacy.tokens import Span
except ImportError:
    SPACY_AVAILABLE = False
    logger.warning("spaCy not available - install: pip install spacy")
else:
    SPACY_AVAILABLE = True
    logger.info("spaCy available")

logger.info("AI-Powered Data Cleaning Pipeline")

# ===================================================================
# 2. LOAD AI MODEL
# ===================================================================

def load_nlp_model():
    """Load the most accurate spaCy English pipeline that is installed.

    Candidates are tried from most to least accurate (``en_core_web_trf``,
    ``en_core_web_lg``, ``en_core_web_md``, ``en_core_web_sm``) and the first
    one that loads is returned.

    Returns:
        The loaded spaCy pipeline, or ``None`` when spaCy could not be
        imported or none of the candidate models could be loaded.
    """
    if not SPACY_AVAILABLE:
        logger.error("spaCy not installed!")
        return None

    # (package name, label used in log messages, accuracy tag)
    candidates = (
        ("en_core_web_trf", "Transformer", "BEST"),
        ("en_core_web_lg", "Large", "GOOD"),
        ("en_core_web_md", "Medium", "MODERATE"),
        ("en_core_web_sm", "Small", "BASIC"),
    )

    for package, label, accuracy in candidates:
        logger.info(f"Loading spaCy {label} model ({package})...")
        try:
            nlp = spacy.load(package)
        except Exception as exc:
            # Best-effort fallback to the next candidate. Catching Exception
            # (not a bare except) lets KeyboardInterrupt/SystemExit propagate,
            # and the reason for the failure is logged instead of discarded.
            logger.warning(f"Could not load {package}: {str(exc)[:100]}")
            continue
        logger.info(f"Loaded {label} model - {accuracy} ACCURACY")
        return nlp

    logger.error("No spaCy model found! Install with:")
    logger.error("python -m spacy download en_core_web_trf  # Best")
    logger.error("python -m spacy download en_core_web_lg   # Good")
    logger.error("python -m spacy download en_core_web_sm   # Basic")
    return None

# Load model globally: this runs once, when the module is imported/executed.
# NLP_MODEL is None when load_nlp_model() could not load any model;
# main_ai_pipeline checks for that before processing emails.
NLP_MODEL = load_nlp_model()

# ===================================================================
# 3. ENHANCED KNOWLEDGE BASE
# ===================================================================

# Lookup vocabularies keyed by entity type ("skills", "positions",
# "locations", "degrees"). Every entry is a lowercase string and each value is
# a set, so a literal listed twice (e.g. "swift" and "kotlin", which appear
# under both Programming Languages and Mobile) is stored only once.
KNOWLEDGE_BASE = {
    # Technical Skills (Comprehensive)
    "skills": {
        # Programming Languages
        "python", "java", "javascript", "typescript", "c++", "c#", "c", "ruby", 
        "go", "golang", "rust", "swift", "kotlin", "scala", "r", "php", "perl",
        
        # Web Frontend
        "react", "reactjs", "angular", "vue", "vuejs", "html", "css", "sass",
        "bootstrap", "tailwind", "jquery", "webpack", "nextjs", "gatsby",
        
        # Web Backend
        "node.js", "nodejs", "express", "django", "flask", "fastapi", "spring",
        "spring boot", "asp.net", "laravel", "rails", "ruby on rails",
        
        # Databases
        "sql", "mysql", "postgresql", "oracle", "mongodb", "redis", "cassandra",
        "dynamodb", "elasticsearch", "neo4j", "sqlite", "mariadb", "couchdb",
        
        # Cloud & DevOps
        "aws", "azure", "gcp", "google cloud", "docker", "kubernetes", "k8s",
        "jenkins", "gitlab ci", "github actions", "terraform", "ansible", 
        "chef", "puppet", "circleci", "travis ci",
        
        # Data Science & AI
        "machine learning", "ml", "deep learning", "ai", "artificial intelligence",
        "tensorflow", "pytorch", "keras", "scikit-learn", "pandas", "numpy",
        "data science", "nlp", "computer vision", "cv", "neural networks",
        "transformers", "bert", "gpt", "llm",
        
        # Mobile ("swift" and "kotlin" repeat the language entries above)
        "android", "ios", "react native", "flutter", "xamarin", "swift", "kotlin",
        
        # Tools & Platforms
        "git", "github", "gitlab", "bitbucket", "jira", "confluence", "slack",
        "postman", "swagger", "figma", "sketch", "tableau", "power bi",
        
        # Testing
        "selenium", "pytest", "junit", "jest", "cypress", "testing", "qa",
        "test automation", "unit testing", "integration testing",
        
        # Big Data
        "hadoop", "spark", "pyspark", "kafka", "airflow", "hive", "pig",
        "flink", "storm", "hbase",
        
        # Other
        "api", "rest", "restful", "graphql", "microservices", "agile", "scrum",
        "blockchain", "web3", "solidity", "etl", "ci/cd", "linux", "unix",
        "bash", "powershell", "sap", "salesforce", "excel", "vba"
    },
    
    # Job Positions (full titles first, then generic role and seniority words)
    "positions": {
        "software engineer", "software developer", "data scientist", "data analyst",
        "data engineer", "machine learning engineer", "ml engineer", "ai engineer",
        "full stack developer", "fullstack developer", "frontend developer",
        "front end developer", "backend developer", "back end developer",
        "devops engineer", "cloud engineer", "cloud architect", "solutions architect",
        "system administrator", "sysadmin", "network engineer", "database administrator",
        "dba", "business analyst", "product manager", "project manager",
        "qa engineer", "test engineer", "sdet", "automation engineer",
        "security engineer", "cyber security analyst", "penetration tester",
        "ui/ux designer", "ui designer", "ux designer", "graphic designer",
        "technical lead", "tech lead", "team lead", "engineering manager",
        "scrum master", "product owner", "consultant", "analyst", "developer",
        "engineer", "architect", "specialist", "coordinator", "associate",
        "intern", "trainee", "graduate trainee", "fresher", "junior", "senior"
    },
    
    # Locations (India + Global)
    "locations": {
        # Metro Cities
        "bangalore", "bengaluru", "mumbai", "delhi", "new delhi", "ncr",
        "hyderabad", "pune", "chennai", "kolkata", "calcutta",
        
        # Tier 2 Cities
        "ahmedabad", "gurgaon", "gurugram", "noida", "greater noida",
        "chandigarh", "jaipur", "kochi", "cochin", "thiruvananthapuram",
        "bhubaneswar", "indore", "coimbatore", "surat", "nagpur", "lucknow",
        "vadodara", "visakhapatnam", "vizag", "mysore", "mysuru",
        
        # Remote (work-arrangement keywords rather than place names)
        "remote", "work from home", "wfh", "hybrid", "anywhere", "pan india"
    },
    
    # Degrees (abbreviations with and without dots, plus spelled-out names)
    "degrees": {
        "b.tech", "btech", "be", "b.e", "bachelor of technology",
        "bachelor of engineering", "m.tech", "mtech", "me", "m.e",
        "master of technology", "master of engineering", "bca", "mca",
        "bachelor of computer applications", "master of computer applications",
        "bsc", "b.sc", "bachelor of science", "msc", "m.sc", "master of science",
        "phd", "doctorate", "diploma", "mba", "pgdm"
    }
}

# Company patterns for regex
# Case-sensitive company-name patterns (applied without re.IGNORECASE).
COMPANY_PATTERNS = [
    # One to four capitalised words followed by a legal or industry suffix,
    # e.g. "Infosys Technologies", "Acme Data Pvt Ltd".
    r'\b([A-Z][A-Za-z]+(?:\s+[A-Z][A-Za-z]+){0,3})\s+(?:Pvt\.?\s*Ltd\.?|Private\s+Limited|Limited|Ltd\.?|Inc\.?|Incorporated|Corp\.?|Corporation|LLC|Technologies|Systems|Solutions|Software|Consulting|Services|Group|Enterprises|Industries)\b',
    # A capitalised word of 3+ letters (optionally one more) followed by a
    # short suffix such as "Tech" or "Labs".
    r'\b([A-Z][A-Za-z]{2,}(?:\s+[A-Z][A-Za-z]+)?)\s+(?:Tech|IT|Labs|Studio|Agency)\b'
]

# Salary patterns (applied with re.IGNORECASE by the extractor).
SALARY_PATTERNS = [
    # "6.5 LPA", "4 lakhs per annum", "3 l.p.a."
    r'(\d+(?:\.\d+)?)\s*(?:lpa|lakhs?\s+per\s+annum|l\.?p\.?a\.?)',
    # "25k per month", "30 thousand pm", "20k/month"
    r'(\d+)\s*(?:k|thousand)\s*(?:per\s+month|pm|/month)',
    # "CTC: Rs. 7.5 lakhs", "package 12 lpa"
    r'(?:ctc|package|salary)\s*:?\s*(?:rs\.?\s*)?(\d+(?:\.\d+)?)\s*(?:lpa|lakhs?)',
    # "stipend: Rs. 15,000 per month". Any number of comma groups is accepted
    # so Indian-style grouping such as "1,50,000" is captured whole.
    r'(?:stipend|compensation)\s*:?\s*(?:rs\.?\s*)?(\d+(?:,\d+)*)\s*(?:per\s+month)?'
]

# Experience patterns (applied with re.IGNORECASE by the extractor).
EXPERIENCE_PATTERNS = [
    # Ranges: "2 to 4 years of experience", "3-5 years experience"
    r'(\d+)\s*(?:\+|to|-)\s*(\d+)\s*years?\s+(?:of\s+)?experience',
    # Single value: "2 years of experience"
    r'(\d+)\s*years?\s+(?:of\s+)?experience',
    # "fresher" / "freshers"
    r'\b(freshers?)\b',
    r'\b(entry\s+level)\b',
    # Ranges starting at zero without the word "experience": "0-2 years"
    r'\b0\s*(?:\+|to|-)\s*(\d+)\s*years?\b'
]

# ===================================================================
# 4. AGGRESSIVE TEXT CLEANING
# ===================================================================

# Regex for cleaning
URL_RE = re.compile(r'https?://\S+|www\.\S+')
# The TLD class is letters only (no "|" inside the character class).
EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
PHONE_RE = re.compile(r'[\+]?[\d][\d\s\-\(\)]{7,}[\d]')
HTML_RE = re.compile(r'<[^>]+>')
NON_PRINTABLE = re.compile(r'[^\x20-\x7E\n\r]+')

# Boilerplate removed by clean_text_ai, applied in order with
# re.IGNORECASE | re.DOTALL. Keyword patterns start at a word boundary so they
# cannot fire inside a longer word ("regarding", "are:", "hardware:").
# NOTE(review): because of DOTALL, every trailing ".*" deletes through the END
# of the text (for a forwarded mail that includes the forwarded body) -
# confirm this truncation is intended.
REMOVE_PATTERNS = [
    # Signatures. The qualified sign-offs come before the bare "regards" so
    # the whole phrase is removed; a bare singular "regard" is not treated as
    # a sign-off because it occurs in ordinary prose ("in this regard").
    r'\bthanks?\s+(?:and\s+)?regards?\b.*',
    r'\bwarm\s+regards?\b.*',
    r'\b(?:best\s+regards?|regards)\b,?.*',
    r'\bsincerely\b,?.*',

    # Email artifacts
    r'------+\s*forwarded\s+message\s*------+.*',
    r'\bfrom:.*?(?:sent:|to:|subject:).*',
    r'----+\s*original\s+message\s*----+.*',
    r'\bon\s+\w+,\s+\w+\s+\d+,\s+\d+.*?wrote:.*',
    r'\b(?:fwd?|fw|re):.*?\n',

    # Disclaimers
    r'\bthis\s+(?:e-?mail|message).*?confidential.*',
    r'\bdisclaimer:.*',
    r'\bconfidentiality\s+notice:.*',
    r'\b(?:please\s+)?do\s+not\s+reply.*',
    r'\bthis\s+is\s+an?\s+automated.*',
    r'\bunsubscribe.*',
    r'\bsent\s+from\s+my.*',
    r'\bget\s+outlook\s+for.*'
]

def clean_text_ai(text: str) -> str:
    """Strip markup, contact details and email boilerplate from raw email text.

    Steps, in order: NFKC-normalise; replace HTML tags, URLs, email addresses
    and phone-like digit runs with a space; delete every REMOVE_PATTERNS
    match; replace characters outside printable ASCII (newlines excepted)
    with a space; collapse all whitespace to single spaces.

    Args:
        text: Raw email text. Anything falsy or not a ``str`` yields ``""``.

    Returns:
        The cleaned text on a single line, or ``""`` when five or fewer
        characters remain after cleaning.
    """
    if not text or not isinstance(text, str):
        return ""

    text = unicodedata.normalize("NFKC", text)

    # Markup and contact details carry no semantic content for extraction.
    for junk in (HTML_RE, URL_RE, EMAIL_RE, PHONE_RE):
        text = junk.sub(" ", text)

    for pattern in REMOVE_PATTERNS:
        text = re.sub(pattern, "", text, flags=re.IGNORECASE | re.DOTALL)

    text = NON_PRINTABLE.sub(" ", text)

    # Collapsing every whitespace run (newlines included) leaves one line, so
    # the short-fragment guard below applies to the result as a whole.
    text = re.sub(r'\s+', " ", text).strip()
    return text if len(text) > 5 else ""

# ===================================================================
# 5. AI-POWERED ENTITY EXTRACTION
# ===================================================================

# Degree abbreviations that read as ordinary English words once lowercased
# ("will be", "contact me"); they are only accepted when written in capitals.
_AMBIGUOUS_DEGREES = frozenset({"be", "me"})


def _contains_term(term: str, haystack: str) -> bool:
    """Return True if *term* occurs in *haystack* as a whole word or phrase."""
    return re.search(r'(?<!\w)' + re.escape(term) + r'(?!\w)', haystack) is not None


def extract_with_ai(text: str, nlp_model) -> Dict[str, Any]:
    """Extract companies, skills, positions, locations, pay and degree hints.

    Combines spaCy output (ORG/GPE entities, tokens, noun chunks) with the
    module's regex patterns and KNOWLEDGE_BASE vocabularies. Only the first
    5000 characters are passed to spaCy; the regex and vocabulary searches run
    over the full text.

    Args:
        text: Cleaned email text.
        nlp_model: Loaded spaCy pipeline. When it is falsy, or ``text`` is
            empty, nothing is extracted.

    Returns:
        A dict with the keys ``companies``, ``skills``, ``positions``,
        ``locations``, ``salary_info``, ``experience_required`` and
        ``degrees_required``. Every value is a sorted, de-duplicated list on
        every code path. If extraction raises part-way, the error is logged
        and whatever was collected up to that point is returned.
    """
    companies, skills, positions = set(), set(), set()
    locations, degrees = set(), set()
    salary_info, experience = set(), set()

    if nlp_model and text:
        try:
            doc = nlp_model(text[:5000])  # Limit for performance
            text_lower = text.lower()
            skill_vocab = KNOWLEDGE_BASE['skills']
            position_vocab = KNOWLEDGE_BASE['positions']
            location_vocab = KNOWLEDGE_BASE['locations']

            # 1. Named entities: ORG -> companies, GPE -> known locations.
            for ent in doc.ents:
                if ent.label_ == "ORG":
                    org = ent.text.strip()
                    if len(org) > 2 and org.lower() not in ('we', 'our', 'the', 'a', 'an'):
                        companies.add(org)
                elif ent.label_ == "GPE":
                    place = ent.text.lower().strip()
                    if place in location_vocab:
                        locations.add(place)

            # 2. Regex company names (case-sensitive, so use the original text).
            for pattern in COMPANY_PATTERNS:
                for match in re.finditer(pattern, text):
                    company = match.group(0).strip()
                    if len(company) > 3:
                        companies.add(company)

            # 3. Single tokens: skills (surface form, else lemma) and positions.
            for token in doc:
                token_lower = token.text.lower()
                if token_lower in skill_vocab:
                    skills.add(token_lower)
                elif token.lemma_.lower() in skill_vocab:
                    skills.add(token.lemma_.lower())
                if token_lower in position_vocab:
                    positions.add(token_lower)

            # 4. Noun chunks pick up multi-word skills and job titles.
            for chunk in doc.noun_chunks:
                chunk_lower = chunk.text.lower()
                if chunk_lower in skill_vocab:
                    skills.add(chunk_lower)
                if chunk_lower in position_vocab:
                    positions.add(chunk_lower)

            # 5. Direct location lookup. Whole-word matching keeps short
            #    entries such as "ncr" from matching inside "increase".
            for place in location_vocab:
                if _contains_term(place, text_lower):
                    locations.add(place)

            # 6. Salary mentions.
            for pattern in SALARY_PATTERNS:
                for match in re.finditer(pattern, text, re.IGNORECASE):
                    salary_info.add(match.group(0).strip())

            # 7. Experience mentions.
            for pattern in EXPERIENCE_PATTERNS:
                for match in re.finditer(pattern, text, re.IGNORECASE):
                    experience.add(match.group(0).strip())

            # 8. Degrees. "be"/"me" must appear in capitals in the original
            #    text; every other degree is matched case-insensitively.
            for degree in KNOWLEDGE_BASE['degrees']:
                if degree in _AMBIGUOUS_DEGREES:
                    hit = _contains_term(degree.upper(), text)
                else:
                    hit = _contains_term(degree, text_lower)
                if hit:
                    degrees.add(degree)

        except Exception as e:
            logger.error(f"AI extraction error: {str(e)[:100]}")

    return {
        'companies': sorted(companies),
        'skills': sorted(skills),
        'positions': sorted(positions),
        'locations': sorted(locations),
        'salary_info': sorted(salary_info),
        'experience_required': sorted(experience),
        'degrees_required': sorted(degrees)
    }

# ===================================================================
# 6. MAIN PROCESSING FUNCTION
# ===================================================================

def process_email_ai(text: str, nlp_model) -> Dict[str, Any]:
    """Clean one email and, if a model is supplied, extract its entities.

    Args:
        text: Raw email text.
        nlp_model: spaCy pipeline handed to extract_with_ai; when falsy the
            extraction step is skipped.

    Returns:
        A record holding the cleaned text, the extracted entity lists, word
        and character counts, and ``processing_status``: ``'success'``,
        ``'empty_after_cleaning'``, ``'no_ai_model'`` or ``'error: ...'``.
    """
    record = dict(
        cleaned_text='',
        companies=[],
        skills=[],
        positions=[],
        locations=[],
        salary_info=[],
        experience_required=[],
        degrees_required=[],
        word_count=0,
        char_count=0,
        processing_status='success',
    )

    try:
        body = clean_text_ai(text)
        usable = bool(body) and len(body) >= 10

        if not usable:
            # Nothing meaningful survived cleaning; skip the model entirely.
            record['processing_status'] = 'empty_after_cleaning'
        else:
            record['cleaned_text'] = body
            record['word_count'] = len(body.split())
            record['char_count'] = len(body)

            if nlp_model:
                record.update(extract_with_ai(body, nlp_model))
            else:
                record['processing_status'] = 'no_ai_model'
    except Exception as err:
        record['processing_status'] = f'error: {str(err)[:100]}'
        logger.error(f"Processing error: {str(err)}")

    return record

# ===================================================================
# 7. BATCH PROCESSING
# ===================================================================

def process_batch_ai(texts: List[str], nlp_model, batch_size: int = 20) -> List[Dict]:
    """Run process_email_ai over *texts* in batches, logging progress and totals.

    Args:
        texts: Raw email texts, one per email. May be empty.
        nlp_model: spaCy pipeline passed through to process_email_ai.
        batch_size: Number of emails per logged batch; must be at least 1.

    Returns:
        One result dict per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    total = len(texts)
    total_batches = (total + batch_size - 1) // batch_size
    banner = '=' * 70

    logger.info(f"\n{banner}")
    logger.info("AI-POWERED DATA CLEANING")
    logger.info(banner)
    logger.info(f"Total Emails: {total}")
    logger.info(f"Batches: {total_batches} (size: {batch_size})")
    logger.info(f"Model: {nlp_model.meta['name'] if nlp_model else 'None'}")
    logger.info(f"{banner}\n")

    all_results = []
    stats = {
        'total': 0,
        'with_companies': 0,
        'with_skills': 0,
        'with_positions': 0,
        'with_locations': 0,
        'with_salary': 0,
        'empty': 0
    }
    # Counter name -> result field whose non-emptiness increments it.
    counted_fields = (
        ('with_companies', 'companies'),
        ('with_skills', 'skills'),
        ('with_positions', 'positions'),
        ('with_locations', 'locations'),
        ('with_salary', 'salary_info'),
    )

    start_time = time.time()

    for current_batch, offset in enumerate(range(0, total, batch_size), start=1):
        batch_texts = texts[offset:offset + batch_size]
        logger.info(f"Batch {current_batch}/{total_batches} ({len(batch_texts)} emails)...")

        batch_start = time.time()
        for text in batch_texts:
            result = process_email_ai(text, nlp_model)
            all_results.append(result)

            stats['total'] += 1
            for counter, field in counted_fields:
                if result[field]:
                    stats[counter] += 1
            if result['processing_status'] == 'empty_after_cleaning':
                stats['empty'] += 1
        batch_time = time.time() - batch_start

        logger.info(f"Batch {current_batch}/{total_batches} | Time: {batch_time:.2f}s | "
                   f"Companies: {stats['with_companies']} | Skills: {stats['with_skills']}")

        # NOTE(review): the reason for this short pause between batches is not
        # evident from this file - confirm it is still needed.
        time.sleep(0.05)

    total_time = time.time() - start_time

    def pct(count: int) -> float:
        """Share of processed emails as a percentage; 0.0 when none were processed."""
        return count / stats['total'] * 100 if stats['total'] else 0.0

    # Guard against a zero elapsed time (empty input, coarse clocks).
    rate = total / total_time if total_time > 0 else 0.0

    # Final summary
    logger.info(f"\n{banner}")
    logger.info("AI PROCESSING COMPLETE")
    logger.info(banner)
    logger.info(f"Total Time: {total_time:.2f}s ({rate:.1f} emails/sec)")
    logger.info(f"Total Processed: {stats['total']}")
    logger.info(f"With Companies: {stats['with_companies']} ({pct(stats['with_companies']):.1f}%)")
    logger.info(f"With Skills: {stats['with_skills']} ({pct(stats['with_skills']):.1f}%)")
    logger.info(f"With Positions: {stats['with_positions']} ({pct(stats['with_positions']):.1f}%)")
    logger.info(f"With Locations: {stats['with_locations']} ({pct(stats['with_locations']):.1f}%)")
    logger.info(f"With Salary: {stats['with_salary']} ({pct(stats['with_salary']):.1f}%)")
    logger.info(f"Empty After Clean: {stats['empty']}")
    logger.info(f"{banner}\n")

    return all_results

# ===================================================================
# 8. MAIN PIPELINE
# ===================================================================

def main_ai_pipeline(csv_path: str, output_path: str = "ai_cleaned_emails.csv"):
    """Clean and enrich a CSV of emails, then write the result to disk.

    Reads ``csv_path``, joins the ``Subject``, ``Preview`` and ``Body``
    columns into ``combined_text``, processes every row with the module-level
    ``NLP_MODEL``, adds the cleaned-text, entity and count columns, saves the
    frame to ``output_path`` and logs summary insights.

    Args:
        csv_path: Path of the input CSV.
        output_path: Path the enriched CSV is written to.

    Returns:
        The enriched DataFrame, or ``None`` when no NLP model is loaded.

    Raises:
        KeyError: If the CSV lacks any of the Subject/Preview/Body columns.
    """
    logger.info(f"Loading dataset: {csv_path}")
    df = pd.read_csv(csv_path)

    text_columns = ['Subject', 'Preview', 'Body']
    missing = [col for col in text_columns if col not in df.columns]
    if missing:
        raise KeyError(f"Input CSV is missing required column(s): {missing}")

    # Combine text columns. Missing values are blanked BEFORE the string cast;
    # casting first would turn NaN into the literal text "nan".
    df['combined_text'] = (
        df['Subject'].fillna('').astype(str) + ' ' +
        df['Preview'].fillna('').astype(str) + ' ' +
        df['Body'].fillna('').astype(str)
    )

    logger.info(f"Loaded {len(df)} emails\n")

    # Check if model is available
    if not NLP_MODEL:
        logger.error("Cannot proceed without NLP model!")
        logger.error("Install spaCy model with:")
        logger.error("  python -m spacy download en_core_web_trf  # Transformer (Best)")
        logger.error("  python -m spacy download en_core_web_lg   # Large (Good)")
        logger.error("  python -m spacy download en_core_web_sm   # Small (Basic)")
        return None

    # Process all emails
    texts = df['combined_text'].tolist()
    results = process_batch_ai(texts, NLP_MODEL, batch_size=20)

    # Create enriched DataFrame
    logger.info("Creating enriched dataset...")

    df['cleaned_text'] = [r['cleaned_text'] for r in results]
    df['word_count'] = [r['word_count'] for r in results]
    df['char_count'] = [r['char_count'] for r in results]
    df['processing_status'] = [r['processing_status'] for r in results]

    # Entity columns: each list is flattened to a ", "-separated string.
    df['companies_extracted'] = [', '.join(r['companies']) for r in results]
    df['skills_extracted'] = [', '.join(r['skills']) for r in results]
    df['positions_extracted'] = [', '.join(r['positions']) for r in results]
    df['locations_extracted'] = [', '.join(r['locations']) for r in results]
    df['salary_info'] = [', '.join(r['salary_info']) for r in results]
    df['experience_required'] = [', '.join(r['experience_required']) for r in results]
    df['degrees_required'] = [', '.join(r['degrees_required']) for r in results]

    # Count columns
    df['company_count'] = [len(r['companies']) for r in results]
    df['skill_count'] = [len(r['skills']) for r in results]
    df['position_count'] = [len(r['positions']) for r in results]
    df['location_count'] = [len(r['locations']) for r in results]

    # Save
    df.to_csv(output_path, index=False)
    logger.info(f"Saved to: {output_path}")

    # Generate insights
    generate_insights(df)

    return df

# ===================================================================
# 9. INSIGHTS GENERATION
# ===================================================================

def generate_insights(df: pd.DataFrame):
    """Log the most frequent extracted companies, skills, positions and locations.

    Each ``*_extracted`` column is read as comma-separated strings. Cells that
    are not strings (for example NaN after the frame has been reloaded from
    CSV) or that are empty are skipped. A section is logged only when at least
    one item was found for it.

    Args:
        df: Frame containing the ``companies_extracted``,
            ``skills_extracted``, ``positions_extracted`` and
            ``locations_extracted`` columns.
    """
    banner = '=' * 70

    logger.info(f"\n{banner}")
    logger.info("EXTRACTION INSIGHTS")
    logger.info(f"{banner}\n")

    # (column, heading logged above the list, number of entries shown)
    sections = (
        ('companies_extracted', "TOP 20 COMPANIES:", 20),
        ('skills_extracted', "\nTOP 25 SKILLS:", 25),
        ('positions_extracted', "\nTOP 15 POSITIONS:", 15),
        ('locations_extracted', "\nTOP 10 LOCATIONS:", 10),
    )

    for column, heading, top_n in sections:
        counts = Counter(
            item.strip()
            for cell in df[column]
            if isinstance(cell, str) and cell
            for item in cell.split(',')
        )
        if counts:
            logger.info(heading)
            for name, count in counts.most_common(top_n):
                logger.info(f"   {name}: {count} times")

    logger.info(f"\n{banner}\n")

# ===================================================================
# 10. USAGE
# ===================================================================

if __name__ == "__main__":
    # Input/output locations. The defaults are the paths this script was
    # written with; set PLACEMENT_EMAILS_CSV / PLACEMENT_EMAILS_OUTPUT to run
    # it elsewhere without editing the file. Environment variables are used
    # rather than sys.argv because notebook kernels put their own arguments
    # in sys.argv.
    CSV_PATH = os.environ.get(
        "PLACEMENT_EMAILS_CSV",
        r"D:\Projects By Month\November 2025\Placement Mail Analysis System\.venv\Phase_scripts\Phase 1\placement_emails.csv",
    )
    OUTPUT_PATH = os.environ.get("PLACEMENT_EMAILS_OUTPUT", "ai_cleaned_emails.csv")

    df = main_ai_pipeline(CSV_PATH, OUTPUT_PATH)

    if df is not None:
        logger.info("AI-powered pipeline complete!")
        logger.info(f"Check output: {OUTPUT_PATH}")