In [5]:
import pandas as pd
import json
import numpy as np
import re
import spacy
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

class EnhancedJobAnomalyDetector:
    def __init__(self):
        """
        Phase 1.0: 双语料库对比法 + 智能通用词过滤
        """
        # 加载德语NLP模型
        try:
            self.nlp = spacy.load("de_core_news_sm")
            print("✅ 已加载德语模型")
        except OSError:
            try:
                self.nlp = spacy.load("en_core_web_sm")
                print("⚠️ 德语模型未找到，使用英语模型")
            except OSError:
                print("❌ 请安装spacy模型: python -m spacy download de_core_news_sm")
                raise
        
        # 扩展的德英停用词表
        self.stop_words = {
            # 德语停用词
            'und', 'der', 'die', 'das', 'für', 'von', 'mit', 'bei', 'den', 'dem', 'des', 
            'ein', 'eine', 'einen', 'einer', 'sich', 'wir', 'sie', 'ihr', 'ich', 'du',
            'ist', 'sind', 'war', 'waren', 'haben', 'hat', 'wird', 'werden', 'kann',
            'soll', 'sollte', 'muss', 'auf', 'zu', 'nach', 'über', 'unter', 'durch',
            
            # 英语停用词
            'the', 'and', 'for', 'with', 'are', 'you', 'will', 'can', 'have', 'your', 
            'our', 'this', 'that', 'work', 'working', 'job', 'position', 'role',
            'also', 'access', 'employees', 'company', 'team', 'opportunity',
            'during', 'right', 'therefore', 'should', 'needed', 'responsible',
            'something', 'experience', 'skills', 'requirements', 'candidates',
            
            # 招聘领域通用词
            'experience', 'skills', 'requirements', 'candidate', 'candidates',
            'position', 'role', 'job', 'work', 'working', 'opportunity',
            'team', 'company', 'employees', 'department', 'organization',
            'responsibilities', 'tasks', 'duties', 'qualifications',
            'background', 'knowledge', 'ability', 'abilities', 'capable',
            
            # 德语招聘通用词
            'stelle', 'position', 'arbeitsplatz', 'mitarbeiter', 'team',
            'unternehmen', 'firma', 'aufgaben', 'anforderungen', 'qualifikationen',
            'erfahrung', 'kenntnisse', 'fähigkeiten', 'verantwortung',
        }
        
        # 公司名常见后缀/标识符
        self.company_suffixes = {
            'gmbh', 'ag', 'kg', 'ohg', 'gbr', 'ug', 'eg', 'ev', 
            'inc', 'corp', 'ltd', 'llc', 'co', 'company', 'group',
            'holding', 'ventures', 'capital', 'partners', 'solutions',
            'technologies', 'systems', 'services', 'consulting',
            'verlag', 'verlagsgruppe', 'media', 'publishing'
        }
        
        # 噪音模式
        self.noise_patterns = [
            r'\w*_date_?\d*',     # publication_date_10
            r'job_id_?\d*',       # job_id_123  
            r'location_\w+',      # location_munich
            r'\d{4,}',            # 长数字ID
            r'publication_date',   # 明确过滤
            r'legal_entity',      # 法律实体
            r'^.{1,2}$',         # 1-2字符的词
            r'^\d+$',            # 纯数字
            r'^[a-z]$',          # 单字母
            r'http[s]?://.*',    # URL
            r'.*@.*\..*',        # 邮箱
        ]
        
        # 有意义的词性标签
        self.meaningful_pos = {
            'NOUN', 'PROPN',     # 名词、专有名词
            'ADJ',               # 形容词  
            'VERB',              # 动词
            'NUM'                # 数字
        }
        
        # 全局IDF缓存
        self.global_idf_cache = None
        
        # 行业分类缓存
        self.industry_cache = {}
        
    def classify_job_industry(self, company_name, job_title, description):
        """
        职位行业分类 - 用于双语料库对比
        """
        cache_key = f"{company_name}_{job_title}"
        if cache_key in self.industry_cache:
            return self.industry_cache[cache_key]
        
        # 组合所有文本进行分析
        combined_text = f"{company_name} {job_title} {description}".lower()
        
        # 行业关键词映射
        industry_keywords = {
            'consulting': [
                'consulting', 'mckinsey', 'bcg', 'bain', 'strategy', 'consultant',
                'beratung', 'strategieberatung', 'unternehmensberatung'
            ],
            'finance': [
                'bank', 'financial', 'investment', 'finance', 'asset', 'trading',
                'fund', 'capital', 'fintech', 'esg', 'sustainable', 'risk',
                'banking', 'wealth', 'portfolio', 'credit'
            ],
            'tech': [
                'software', 'technology', 'tech', 'ai', 'machine learning',
                'data science', 'developer', 'engineering', 'cloud', 'devops',
                'digital', 'innovation', 'platform', 'algorithm'
            ],
            'law': [
                'law', 'legal', 'attorney', 'lawyer', 'litigation', 'compliance',
                'regulatory', 'recht', 'rechtsanwalt', 'kanzlei', 'jurist'
            ],
            'automotive': [
                'automotive', 'car', 'vehicle', 'bmw', 'mercedes', 'porsche',
                'audi', 'volkswagen', 'mobility', 'transport'
            ],
            'healthcare': [
                'health', 'medical', 'pharmaceutical', 'biotech', 'clinical',
                'patient', 'hospital', 'healthcare', 'medicine'
            ],
            'media': [
                'media', 'publishing', 'journalism', 'content', 'editorial',
                'marketing', 'communication', 'pr', 'öffentlichkeitsarbeit'
            ],
            'real_estate': [
                'real estate', 'property', 'immobilien', 'proptech', 'construction',
                'building', 'development', 'housing'
            ]
        }
        
        # 计算每个行业的匹配分数
        industry_scores = {}
        for industry, keywords in industry_keywords.items():
            score = sum(1 for keyword in keywords if keyword in combined_text)
            if score > 0:
                industry_scores[industry] = score
        
        # 返回得分最高的行业，如果没有明确匹配则返回'general'
        if industry_scores:
            classified_industry = max(industry_scores, key=industry_scores.get)
        else:
            classified_industry = 'general'
        
        self.industry_cache[cache_key] = classified_industry
        return classified_industry
    
    def calculate_global_idf(self, jobs_df):
        """
        计算全局IDF值，用于智能通用词过滤
        """
        if self.global_idf_cache is not None:
            return self.global_idf_cache
        
        print("🔍 计算全局IDF值...")
        
        # 收集所有文本
        all_texts = []
        for _, row in jobs_df.iterrows():
            description = str(row.get('description', ''))
            if description and len(description.strip()) > 50:
                all_texts.append(description)
        
        if len(all_texts) < 5:
            print("⚠️ 文本数量不足，跳过IDF计算")
            return {}
        
        # 使用TF-IDF计算IDF值
        vectorizer = TfidfVectorizer(
            max_features=10000,
            stop_words=list(self.stop_words),
            ngram_range=(1, 1),
            min_df=2,  # 至少在2个文档中出现
            token_pattern=r'\b[a-zA-ZäöüÄÖÜß]{3,}\b'
        )
        
        try:
            vectorizer.fit(all_texts)
            
            # 提取IDF值
            feature_names = vectorizer.get_feature_names_out()
            idf_values = vectorizer.idf_
            
            idf_dict = dict(zip(feature_names, idf_values))
            self.global_idf_cache = idf_dict
            
            print(f"✅ 计算完成，获得 {len(idf_dict)} 个词的IDF值")
            
            # 显示一些低IDF（高频通用词）的例子
            sorted_idf = sorted(idf_dict.items(), key=lambda x: x[1])
            print(f"🔍 最通用的词汇 (低IDF): {[word for word, idf in sorted_idf[:10]]}")
            
            return idf_dict
            
        except Exception as e:
            print(f"❌ IDF计算失败: {e}")
            return {}
    
    def is_generic_word(self, word, idf_threshold=2.5):
        """
        基于IDF值判断是否为通用词
        IDF值越低 = 在越多文档中出现 = 越通用
        """
        if not self.global_idf_cache:
            return False
        
        idf_value = self.global_idf_cache.get(word.lower(), float('inf'))
        return idf_value < idf_threshold
    
    def extract_company_terms(self, company_name):
        """
        强化版公司名词汇提取
        """
        if not company_name or pd.isna(company_name):
            return set()
        
        company_terms = set()
        company_name = str(company_name).lower()
        
        # 1. 使用spaCy进行命名实体识别
        try:
            doc = self.nlp(company_name)
            for ent in doc.ents:
                if ent.label_ in ['ORG', 'PERSON']:
                    words = re.findall(r'\b[a-zA-ZäöüÄÖÜß]{2,}\b', ent.text.lower())
                    company_terms.update(words)
        except:
            pass
        
        # 2. 直接分解公司名
        cleaned_name = re.sub(r'\([^)]*\)', '', company_name)
        cleaned_name = re.sub(r'[^\w\s\-]', ' ', cleaned_name)
        
        words = re.findall(r'\b[a-zA-ZäöüÄÖÜß]{2,}\b', cleaned_name)
        company_terms.update(words)
        
        # 3. 移除常见后缀，但记录主要部分
        filtered_terms = set()
        for word in company_terms:
            if word not in self.company_suffixes:
                filtered_terms.add(word)
            else:
                filtered_terms.add(word)
        
        return filtered_terms
    
    def advanced_text_preprocessing(self, text, company_terms_to_filter=None):
        """
        高级文本预处理
        """
        if not text or pd.isna(text):
            return [], []
        
        if company_terms_to_filter is None:
            company_terms_to_filter = set()
        
        # 1. 基础清洗
        text = str(text)
        
        # 修复常见的拼接问题
        text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
        text = re.sub(r'([a-zA-Z])(\d)', r'\1 \2', text)
        text = re.sub(r'(\d)([a-zA-Z])', r'\1 \2', text)
        
        # 处理标点和特殊字符
        text = re.sub(r'[^\w\s\-/]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        text = text.strip().lower()
        
        # 2. 使用spaCy进行高级处理
        doc = self.nlp(text)
        
        unigrams = []
        bigrams = []
        
        # 过滤tokens
        valid_tokens = []
        for token in doc:
            # 跳过标点、空格、停用词
            if (token.is_punct or token.is_space or 
                token.text.lower() in self.stop_words or
                self.is_noise_pattern(token.text) or
                token.pos_ not in self.meaningful_pos):
                continue
            
            # 使用词形还原
            lemma = token.lemma_.lower().strip()
            
            # 过滤公司名相关词汇和短词
            if lemma in company_terms_to_filter or len(lemma) < 3:
                continue
            
            valid_tokens.append(lemma)
            unigrams.append(lemma)
        
        # 生成bigrams
        for i in range(len(valid_tokens) - 1):
            bigram = f"{valid_tokens[i]} {valid_tokens[i+1]}"
            
            # 检查bigram是否包含公司名词汇
            bigram_contains_company = any(
                company_term in bigram for company_term in company_terms_to_filter
            )
            
            if not bigram_contains_company:
                bigrams.append(bigram)
        
        return unigrams, bigrams
    
    def is_noise_pattern(self, word):
        """检查是否为噪音模式"""
        for pattern in self.noise_patterns:
            if re.match(pattern, word, re.IGNORECASE):
                return True
        return False
    
    def detect_anomalies_dual_corpus(self, target_job, specialist_corpus, general_corpus, 
                                   job_title="", company_name="", industry=""):
        """
        *** 核心改进：双语料库对比法 ***
        
        Args:
            target_job: 目标职位描述
            specialist_corpus: 同行业职位描述列表 (找"特色")
            general_corpus: 其他行业职位描述列表 (找"行业基因")
            job_title: 职位标题
            company_name: 公司名
            industry: 行业分类
        """
        if not target_job:
            return {"specialist_anomalies": [], "industry_markers": [], "metadata": {}}
        
        # 提取公司名相关词汇用于过滤
        company_terms = self.extract_company_terms(company_name)
        
        # 预处理目标职位
        target_unigrams, target_bigrams = self.advanced_text_preprocessing(
            target_job, company_terms
        )
        
        # === 对比A：与"同行"比，找"特色" ===
        specialist_unigrams = []
        specialist_bigrams = []
        
        for job in specialist_corpus:
            if job and not pd.isna(job):
                uni, bi = self.advanced_text_preprocessing(job)
                specialist_unigrams.extend(uni)
                specialist_bigrams.extend(bi)
        
        # === 对比B：与"外行"比，找"行业基因" ===
        general_unigrams = []
        general_bigrams = []
        
        for job in general_corpus:
            if job and not pd.isna(job):
                uni, bi = self.advanced_text_preprocessing(job)
                general_unigrams.extend(uni)
                general_bigrams.extend(bi)
        
        # 计算特色异常词（与同行对比）
        specialist_anomalies = self.calculate_anomalies_enhanced(
            target_unigrams, target_bigrams,
            specialist_unigrams, specialist_bigrams,
            job_title, company_name, comparison_type="specialist"
        )
        
        # 计算行业标志词（与外行对比）
        industry_markers = self.calculate_anomalies_enhanced(
            target_unigrams, target_bigrams,
            general_unigrams, general_bigrams,
            job_title, company_name, comparison_type="industry"
        )
        
        return {
            "specialist_anomalies": specialist_anomalies,
            "industry_markers": industry_markers,
            "metadata": {
                "industry": industry,
                "specialist_corpus_size": len(specialist_corpus),
                "general_corpus_size": len(general_corpus),
                "target_unigrams_count": len(target_unigrams),
                "target_bigrams_count": len(target_bigrams),
                "company_terms_filtered": len(company_terms)
            }
        }
    
    def calculate_anomalies_enhanced(self, target_unigrams, target_bigrams, 
                                   corpus_unigrams, corpus_bigrams,
                                   job_title, company_name, comparison_type):
        """
        增强版异常计算，支持双语料库对比
        """
        results = []
        
        # 处理unigrams
        for word_type, target_words, corpus_words in [
            ("unigrams", target_unigrams, corpus_unigrams),
            ("bigrams", target_bigrams, corpus_bigrams)
        ]:
            
            target_freq = Counter(target_words)
            corpus_freq = Counter(corpus_words)
            
            target_total = len(target_words)
            corpus_total = len(corpus_words)
            
            if target_total == 0 or corpus_total == 0:
                continue
            
            for word, count in target_freq.items():
                target_ratio = count / target_total
                corpus_count = corpus_freq.get(word, 0)
                
                # 调整参数以适应双语料库对比
                min_corpus_freq = 1 if word_type == "bigrams" else 1  # 降低门槛
                if corpus_count < min_corpus_freq:
                    corpus_ratio = 0.0001
                else:
                    corpus_ratio = corpus_count / corpus_total
                
                anomaly_ratio = target_ratio / corpus_ratio if corpus_ratio > 0 else 999
                
                # 根据对比类型调整阈值
                if comparison_type == "specialist":
                    # 与同行对比：找特色，阈值可以低一些
                    min_target_freq = 0.008 if word_type == "bigrams" else 0.004
                    min_anomaly_ratio = 1.2 if word_type == "bigrams" else 1.5
                else:  # industry markers
                    # 与外行对比：找行业标志，阈值要高一些
                    min_target_freq = 0.01 if word_type == "bigrams" else 0.006
                    min_anomaly_ratio = 2.0 if word_type == "bigrams" else 3.0
                
                if (target_ratio >= min_target_freq and 
                    anomaly_ratio >= min_anomaly_ratio and
                    count >= 1):
                    
                    quality_score = self.calculate_quality_score_enhanced(
                        word, job_title, word_type, comparison_type
                    )
                    
                    if quality_score > 0:
                        results.append({
                            'word': str(word),
                            'anomaly_ratio': float(round(anomaly_ratio, 2)),
                            'target_frequency': float(round(target_ratio * 100, 3)),
                            'corpus_frequency': float(round(corpus_ratio * 100, 4)),
                            'target_count': int(count),
                            'corpus_count': int(corpus_count),
                            'quality_score': float(quality_score),
                            'comparison_type': str(comparison_type),
                            'word_type': str(word_type)
                        })
        
        # 排序并返回前8个
        results.sort(key=lambda x: (x['quality_score'], x['anomaly_ratio']), reverse=True)
        return results[:8]
    
    def calculate_quality_score_enhanced(self, word, job_title, word_type, comparison_type):
        """
        增强版质量评分，集成IDF过滤
        """
        score = 1
        
        # *** 关键改进：基于IDF的通用词重罚 ***
        if self.is_generic_word(word):
            score -= 3  # 重罚通用词
            print(f"🚫 通用词惩罚: {word} (IDF过低)")
        
        # 1. 职位标题相关性加分
        if job_title and word.lower() in job_title.lower():
            score += 2
        
        # 2. 技术/技能词汇加分
        tech_indicators = [
            'python', 'java', 'javascript', 'react', 'vue', 'angular', 'node',
            'aws', 'azure', 'docker', 'kubernetes', 'git', 'sql', 'nosql',
            'machine', 'learning', 'ai', 'data', 'analytics', 'science',
            'devops', 'agile', 'scrum', 'api', 'rest', 'graphql',
            'testing', 'automation', 'ci/cd', 'jenkins', 'terraform'
        ]
        
        if any(tech in word.lower() for tech in tech_indicators):
            score += 2
        
        # 3. 行业专业词汇加分  
        industry_terms = [
            'esg', 'sustainable', 'proptech', 'fintech', 'blockchain', 'cryptocurrency',
            'healthcare', 'medical', 'pharmaceutical', 'biotech',
            'automotive', 'manufacturing', 'logistics', 'supply',
            'consulting', 'strategy', 'framework', 'case study',
            'legal', 'compliance', 'regulatory', 'litigation',
            'öffentlichkeitsarbeit', 'kommunikation', 'leadership communications',
            'international', 'global', 'cross border'
        ]
        
        if any(term in word.lower() for term in industry_terms):
            score += 2
        
        # 4. 对比类型特殊加分
        if comparison_type == "industry":
            # 行业标志词更有价值
            score += 1
        
        # 5. Bigrams一般比unigrams更有价值
        if word_type == "bigrams":
            score += 1
        
        # 6. 包含数字的词汇
        if re.search(r'\d', word):
            score += 0.5
        
        # 7. *** 新增：明确的通用词黑名单重罚 ***
        explicit_generic = [
            'digital', 'student', 'kreativ', 'innovative', 'modern',
            'new', 'current', 'future', 'excellent', 'strong', 'good',
            'various', 'different', 'multiple', 'general', 'basic'
        ]
        
        if any(generic in word.lower() for generic in explicit_generic):
            score -= 2
        
        return max(0, score)

def run_dual_corpus_detection(csv_file_path, output_json_path, max_jobs=20):
    """
    运行双语料库异常检测
    """
    print("🚀 开始双语料库对比法异常检测...")
    
    # 加载数据
    jobs_df = load_and_process_jobs(csv_file_path)
    if jobs_df.empty:
        return {}
    
    if len(jobs_df) > max_jobs:
        jobs_df = jobs_df.tail(max_jobs)
        print(f"⚠️ 限制处理前 {max_jobs} 个职位以便快速验证")
    
    detector = EnhancedJobAnomalyDetector()
    
    # *** 关键步骤1：计算全局IDF值 ***
    detector.calculate_global_idf(jobs_df)
    
    # *** 关键步骤2：对所有职位进行行业分类 ***
    print("🏷️ 进行行业分类...")
    jobs_df['industry'] = jobs_df.apply(
        lambda row: detector.classify_job_industry(
            row.get('company_name', ''),
            row.get('job_title', ''),
            row.get('description', '')
        ), axis=1
    )
    
    # 显示行业分布
    industry_counts = jobs_df['industry'].value_counts()
    print(f"📊 行业分布: {dict(industry_counts)}")
    
    results = []
    
    for idx, row in jobs_df.iterrows():
        try:
            job_id = row.get('id', f'job_{idx}')
            company = row.get('company_name', 'Unknown')
            title = row.get('job_title', 'Unknown')
            description = row.get('description', '')
            industry = row.get('industry', 'general')
            
            if not description or len(str(description).strip()) < 100:
                continue
            
            # *** 关键步骤3：构建双语料库 ***
            # 同行语料库：相同行业的其他职位
            specialist_corpus = jobs_df[
                (jobs_df.index != idx) & 
                (jobs_df['industry'] == industry)
            ]['description'].tolist()
            
            # 外行语料库：不同行业的职位
            general_corpus = jobs_df[
                (jobs_df.index != idx) & 
                (jobs_df['industry'] != industry)
            ]['description'].tolist()
            
            if len(specialist_corpus) < 2 or len(general_corpus) < 2:
                print(f"⚠️ 跳过 {company}: 语料库不足 (同行:{len(specialist_corpus)}, 外行:{len(general_corpus)})")
                continue
            
            # *** 关键步骤4：执行双语料库对比 ***
            anomalies = detector.detect_anomalies_dual_corpus(
                description, 
                specialist_corpus,
                general_corpus,
                job_title=title,
                company_name=company,
                industry=industry
            )
            
            # 统计结果
            specialist_count = len(anomalies['specialist_anomalies'])
            industry_count = len(anomalies['industry_markers'])
            total_anomalies = specialist_count + industry_count
            
            # 计算高质量异常词数量
            high_quality_count = sum(1 for item in 
                                   anomalies['specialist_anomalies'] + anomalies['industry_markers']
                                   if item.get('quality_score', 0) >= 2)
            
            job_result = {
                'job_id': str(job_id),
                'company_name': str(company),
                'job_title': str(title),
                'industry': str(industry),
                'description': str(description),
                'dual_corpus_anomalies': anomalies,
                'quality_metrics': {
                    'specialist_anomalies_count': int(specialist_count),
                    'industry_markers_count': int(industry_count),
                    'total_anomalies': int(total_anomalies),
                    'high_quality_anomalies': int(high_quality_count),
                    'quality_ratio': float(round(high_quality_count / total_anomalies, 2)) if total_anomalies > 0 else 0.0
                },
                'dual_corpus_metadata': {
                    'industry': str(anomalies['metadata'].get('industry', '')),
                    'specialist_corpus_size': int(anomalies['metadata'].get('specialist_corpus_size', 0)),
                    'general_corpus_size': int(anomalies['metadata'].get('general_corpus_size', 0)),
                    'target_unigrams_count': int(anomalies['metadata'].get('target_unigrams_count', 0)),
                    'target_bigrams_count': int(anomalies['metadata'].get('target_bigrams_count', 0)),
                    'company_terms_filtered': int(anomalies['metadata'].get('company_terms_filtered', 0))
                }
            }
            
            results.append(job_result)
            print(f"✅ {company} ({industry}) | 特色词:{specialist_count} | 行业词:{industry_count} | 高质量:{high_quality_count}")
            
        except Exception as e:
            print(f"❌ 处理职位 {idx} 时出错: {str(e)}")
            continue
    
    # 生成最终报告
    if results:
        avg_quality_ratio = np.mean([r['quality_metrics']['quality_ratio'] for r in results])
        total_high_quality = sum(r['quality_metrics']['high_quality_anomalies'] for r in results)
        
        output_data = {
            'dual_corpus_summary': {
                'version': 'v2.0 - 双语料库对比法',
                'processing_timestamp': str(pd.Timestamp.now().isoformat()),
                'total_jobs_processed': int(len(results)),
                'average_quality_ratio': float(round(avg_quality_ratio, 3)),
                'total_high_quality_anomalies': int(total_high_quality),
                'improvements_implemented': [
                    "双语料库对比法 (同行 vs 外行)",
                    "基于IDF的智能通用词过滤",
                    "行业自动分类系统",
                    "增强质量评分算法",
                    "上下文感知异常检测"
                ],
                'industry_distribution': {str(k): int(v) for k, v in dict(pd.Series([r['industry'] for r in results]).value_counts()).items()}
            },
            'jobs': results
        }
        
        with open(output_json_path, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, ensure_ascii=False, indent=2)
        
        print(f"\n🎉 双语料库对比法检测完成!")
        print(f"📊 结果统计:")
        print(f"   - 处理职位数: {len(results)}")
        print(f"   - 平均质量比例: {avg_quality_ratio:.1%}")
        print(f"   - 高质量异常词总数: {total_high_quality}")
        print(f"   - 行业分布: {dict(pd.Series([r['industry'] for r in results]).value_counts())}")
        print(f"📁 结果已保存至: {output_json_path}")
        
        return output_data
    
    else:
        print("❌ 没有成功处理任何职位")
        return {}

def load_and_process_jobs(csv_file_path):
    """加载并预处理职位数据"""
    try:
        df = pd.read_csv(csv_file_path)
        print(f"✅ 成功加载 {len(df)} 条职位数据")
    except Exception as e:
        print(f"❌ 读取CSV文件失败: {e}")
        return pd.DataFrame()
    
    # 技术职位识别逻辑
    tech_keywords_high = [
        'software', 'developer', 'programming', 'python', 'java', 'javascript',
        'react', 'vue', 'angular', 'node.js', 'machine learning', 'data science',
        'ai ', 'ml ', 'devops', 'cloud', 'aws', 'azure', 'docker', 'kubernetes'
    ]
    
    tech_keywords_medium = [
        'technical', 'technology', 'it ', 'computer', 'digital', 'web',
        'mobile', 'app', 'api', 'database', 'sql', 'analytics', 'algorithm'
    ]
    
    def calculate_tech_score(row):
        title = str(row.get('job_title', '')).lower()
        desc = str(row.get('description', '')).lower()
        combined = title + ' ' + desc
        
        high_score = sum(2 for keyword in tech_keywords_high if keyword in combined)
        medium_score = sum(1 for keyword in tech_keywords_medium if keyword in combined)
        
        return high_score + medium_score
    
    df['tech_score'] = df.apply(calculate_tech_score, axis=1)
    tech_jobs = df[df['tech_score'] >= 2].copy()
    
    print(f"✅ 筛选出 {len(tech_jobs)} 个技术相关职位")
    return tech_jobs

def analyze_detection_quality(results, known_important_terms=None):
    """
    分析检测质量，模拟HR评估
    """
    if not results or not results.get('jobs'):
        return {}
    
    print("\n🔍 质量分析报告:")
    
    quality_analysis = {
        'jobs_analyzed': len(results['jobs']),
        'quality_breakdown': {},
        'common_patterns': {
            'high_quality_terms': [],
            'questionable_terms': [],
            'missed_opportunities': []
        }
    }
    
    all_specialist_terms = []
    all_industry_terms = []
    
    for job in results['jobs']:
        company = job['company_name']
        industry = job['industry'] 
        
        specialist_anomalies = job['dual_corpus_anomalies']['specialist_anomalies']
        industry_markers = job['dual_corpus_anomalies']['industry_markers']
        
        high_quality_specialist = [item for item in specialist_anomalies if item['quality_score'] >= 2]
        high_quality_industry = [item for item in industry_markers if item['quality_score'] >= 2]
        
        all_specialist_terms.extend([item['word'] for item in high_quality_specialist])
        all_industry_terms.extend([item['word'] for item in high_quality_industry])
        
        print(f"\n📋 {company} ({industry}):")
        print(f"  🎯 特色词汇: {[item['word'] for item in high_quality_specialist[:3]]}")
        print(f"  🏷️ 行业标志: {[item['word'] for item in high_quality_industry[:3]]}")
    
    # 统计最常见的高质量术语
    specialist_counter = Counter(all_specialist_terms)
    industry_counter = Counter(all_industry_terms)
    
    quality_analysis['common_patterns']['high_quality_specialist'] = specialist_counter.most_common(10)
    quality_analysis['common_patterns']['high_quality_industry'] = industry_counter.most_common(10)
    
    print(f"\n📊 最常检测到的特色术语: {specialist_counter.most_common(5)}")
    print(f"📊 最常检测到的行业术语: {industry_counter.most_common(5)}")
    
    return quality_analysis

# 使用示例和测试函数
if __name__ == "__main__":
    csv_file = "/Users/wenjiaqi/Downloads/job_analyzer_service/sample_data/job_listings_rows.csv"  # 请替换为实际路径
    output_file = "dual_corpus_enhanced_results.json"
    
    # 运行双语料库检测
    results = run_dual_corpus_detection(csv_file, output_file, max_jobs=10)
    
    # 质量分析
    if results:
        quality_report = analyze_detection_quality(results)
        
        # 显示改进效果预览
        print(f"\n=== 双语料库对比法效果预览 ===")
        sample = results['jobs'][0] if results['jobs'] else None
        
        if sample:
            print(f"🏢 公司: {sample['company_name']}")
            print(f"📋 职位: {sample['job_title']}")
            print(f"🏷️ 行业: {sample['industry']}")
            print(f"📊 质量指标: {sample['quality_metrics']}")
            
            specialist_anomalies = sample['dual_corpus_anomalies']['specialist_anomalies']
            industry_markers = sample['dual_corpus_anomalies']['industry_markers']
            
            if specialist_anomalies:
                print(f"\n🎯 与同行对比发现的特色词汇:")
                for item in specialist_anomalies[:3]:
                    print(f"  • {item['word']} (异常比例: {item['anomaly_ratio']}x, 质量分: {item['quality_score']})")
            
            if industry_markers:
                print(f"\n🏷️ 与外行对比发现的行业标志:")
                for item in industry_markers[:3]:
                    print(f"  • {item['word']} (异常比例: {item['anomaly_ratio']}x, 质量分: {item['quality_score']})")
        
        print(f"\n✅ 双语料库对比法实现完成!")
        print(f"🚀 核心改进:")
        print(f"   ✓ 同行对比找特色 (specialist_anomalies)")
        print(f"   ✓ 外行对比找行业基因 (industry_markers)")
        print(f"   ✓ 基于IDF的智能通用词过滤")
        print(f"   ✓ 行业自动分类系统")
        print(f"   ✓ 上下文感知的质量评分")
    
    print("\n🎯 下一步建议:")
    print("1. 在10个HR评估的职位上测试新算法")
    print("2. 对比新旧结果，验证'ESG'、'international'等关键词的检测效果")
    print("3. 调整IDF阈值和质量评分权重")
    print("4. 准备高质量结果输入到LLM进行语义分析")

🚀 开始双语料库对比法异常检测...
✅ 成功加载 217 条职位数据
✅ 筛选出 156 个技术相关职位
⚠️ 限制处理前 10 个职位以便快速验证
✅ 已加载德语模型
🔍 计算全局IDF值...
✅ 计算完成，获得 344 个词的IDF值
🔍 最通用的词汇 (低IDF): ['uns', 'deine', 'einem', 'oder', 'als', 'bereich', 'dein', 'dich', 'dir', 'teams']
🏷️ 进行行业分类...
📊 行业分布: {'tech': np.int64(6), 'media': np.int64(3), 'finance': np.int64(1)}
🚫 通用词惩罚: teil (IDF过低)
🚫 通用词惩罚: teamarbeit (IDF过低)
🚫 通用词惩罚: gelegenheit (IDF过低)
🚫 通用词惩罚: group (IDF过低)
🚫 通用词惩罚: bieten (IDF过低)
🚫 通用词惩罚: bereich (IDF过低)
🚫 通用词惩罚: entwicklung (IDF过低)
🚫 通用词惩罚: fokus (IDF过低)
🚫 通用词惩罚: kund (IDF过低)
🚫 通用词惩罚: unterstützen (IDF过低)
🚫 通用词惩罚: bearbeitung (IDF过低)
🚫 通用词惩罚: unterstützung (IDF过低)
🚫 通用词惩罚: human (IDF过低)
🚫 通用词惩罚: aufbereitung (IDF过低)
🚫 通用词惩罚: auswertung (IDF过低)
🚫 通用词惩罚: tools (IDF过低)
🚫 通用词惩罚: runden (IDF过低)
🚫 通用词惩罚: studiengang (IDF过低)
🚫 通用词惩罚: verfügst (IDF过低)
🚫 通用词惩罚: arbeitsweise (IDF过低)
🚫 通用词惩罚: teamfähigkeit (IDF过低)
🚫 通用词惩罚: münchen (IDF过低)
🚫 通用词惩罚: spaß (IDF过低)
🚫 通用词惩罚: lernen (IDF过低)
🚫 通用词惩罚: bewirb (IDF过低)
🚫 通用词惩罚: mentoring (IDF过低)
🚫 通用词惩罚