In [None]:
# EXEMPLO PRÁTICO PARA GOOGLE COLAB
# Execute este código célula por célula

# 1. INSTALAR DEPEND√äNCIAS
!pip install pandas colorama difflib

# 2. IMPORTAR E CONFIGURAR
import difflib
import io
import re
import unicodedata
from difflib import SequenceMatcher
from typing import List, Dict, Tuple

import pandas as pd
from IPython.display import Markdown, display

class ContentValidator:
    """Measure how much of a set of original answers survives in a generated text.

    Original answers are loaded from the ``answer`` column of a CSV file and
    compared against one generated text. Comparison is done on normalized
    text (lower-cased, accents and punctuation removed) using key words,
    two-word key phrases and ``difflib`` similarity. The result is rendered
    as a Markdown report.
    """

    # Minimum difflib ratio for a generated concept to count as a fuzzy match
    # of an original concept that is not literally present.
    _FUZZY_CUTOFF = 0.8

    # Common (already unaccented) Portuguese words that carry no content.
    # Built once here instead of on every extract_key_concepts() call.
    _STOPWORDS = frozenset({
        'a', 'e', 'o', 'de', 'da', 'do', 'em', 'para', 'com', 'por', 'que', 'se', 'na', 'no',
        'um', 'uma', 'os', 'as', 'dos', 'das', 'nos', 'nas', 'ao', 'aos', 'pela', 'pelo',
        'mas', 'ou', 'quando', 'onde', 'como', 'muito', 'mais', 'menos', 'ate', 'seu', 'sua',
        'seus', 'suas', 'meu', 'minha', 'meus', 'minhas', 'era', 'foi', 'ser', 'ter', 'estar',
        'tinha', 'tem', 'eram', 'sendo', 'sido', 'ja', 'sempre', 'nunca',
        'tambem', 'ainda', 'depois', 'antes', 'entao', 'isso', 'essa', 'este', 'esta',
        'aquele', 'aquela', 'dele', 'dela', 'deles', 'delas', 'me', 'te', 'lhe', 'vos',
    })

    def __init__(self):
        # Maps "item_<csv row index>" -> {'answer': <answer text>}.
        self.original_content = {}
        # The generated text the answers are checked against.
        self.generated_content = ""
        # The three attributes below are initialised but not used by any
        # method of this class.
        self.missing_content = []
        self.added_content = []
        self.coverage_report = {}

    def load_original_data(self, csv_path: str):
        """Load the original answers from the ``answer`` column of a CSV file.

        Any previously loaded answers are discarded first, so the validator
        can be reused without mixing items from different files. Rows whose
        answer is missing or blank are skipped. Values are stored as ``str``
        so that numeric columns work with the text-based methods.

        Args:
            csv_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV.

        Raises:
            KeyError: If the CSV has no ``answer`` column.
        """
        df = pd.read_csv(csv_path)
        if 'answer' not in df.columns:
            raise KeyError("CSV must contain an 'answer' column")

        self.original_content = {}
        for idx, value in df['answer'].dropna().items():
            text = str(value)
            if not text.strip():
                continue
            self.original_content[f"item_{idx}"] = {'answer': text}

    def load_generated_content(self, generated_text: str):
        """Store the generated text that the answers will be checked against."""
        self.generated_content = generated_text

    def normalize_text(self, text: str) -> str:
        """Return ``text`` lower-cased, without accents or punctuation.

        Accents are removed through Unicode decomposition (NFKD) followed by
        dropping the combining marks, which covers every accented letter
        rather than a hand-written table. Punctuation becomes a space and
        runs of whitespace collapse to a single space.
        """
        decomposed = unicodedata.normalize('NFKD', text.lower())
        text = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

        text = re.sub(r'[^\w\s]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def extract_key_concepts(self, text: str) -> List[str]:
        """Extract key words and two-word key phrases from ``text``.

        Key words are the normalized words with at least 3 characters that
        are not stopwords. Key phrases are pairs of adjacent key words whose
        joined length exceeds 8 characters.

        Returns:
            The key words followed by the key phrases, in text order
            (duplicates are kept).
        """
        words = self.normalize_text(text).split()
        key_words = [
            word for word in words
            if len(word) >= 3 and word not in self._STOPWORDS
        ]

        key_phrases = []
        for first, second in zip(key_words, key_words[1:]):
            phrase = f"{first} {second}"
            if len(phrase) > 8:
                key_phrases.append(phrase)

        return key_words + key_phrases

    def calculate_similarity(self, text1: str, text2: str) -> float:
        """Return a similarity score in [0, 1] for two texts.

        The score is 30% the ``SequenceMatcher`` ratio of the normalized
        texts and 70% the Jaccard index of their key-concept sets (0 when
        either set is empty).
        """
        norm1 = self.normalize_text(text1)
        norm2 = self.normalize_text(text2)
        basic_similarity = SequenceMatcher(None, norm1, norm2).ratio()

        concepts1 = set(self.extract_key_concepts(text1))
        concepts2 = set(self.extract_key_concepts(text2))

        if not concepts1 or not concepts2:
            concept_similarity = 0.0
        else:
            intersection = len(concepts1 & concepts2)
            union = len(concepts1 | concepts2)
            concept_similarity = intersection / union

        # Concept overlap is weighted more heavily than raw character similarity.
        return (basic_similarity * 0.3) + (concept_similarity * 0.7)

    def check_content_presence(self, original_answer: str, generated_text: str) -> Dict:
        """Check how many key concepts of an answer appear in the generated text.

        A concept counts as present when it occurs literally (substring of
        the normalized generated text) or when some key concept of the
        generated text is a close match (``difflib`` ratio of at least
        ``_FUZZY_CUTOFF``).

        The previous fallback scored every generated word with
        ``calculate_similarity``; for non-identical inputs that composite
        score cannot exceed roughly 0.53, so its ``> 0.8`` test never
        succeeded while still costing concepts x words expensive calls.

        Returns:
            Dict with ``total_concepts``, ``present_concepts`` (a count),
            ``coverage_percent``, ``missing_concepts`` (first 5 only) and
            ``similarity_score`` (0-100).
        """
        original_concepts = self.extract_key_concepts(original_answer)
        normalized_generated = self.normalize_text(generated_text)
        # Candidates for fuzzy matching, computed once per call.
        generated_concepts = sorted(set(self.extract_key_concepts(generated_text)))

        verdicts = {}  # concept -> bool, so repeated concepts are scored once
        present_count = 0
        missing_concepts = []

        for concept in original_concepts:
            found = verdicts.get(concept)
            if found is None:
                found = concept in normalized_generated or bool(
                    difflib.get_close_matches(
                        concept, generated_concepts, n=1, cutoff=self._FUZZY_CUTOFF
                    )
                )
                verdicts[concept] = found

            if found:
                present_count += 1
            else:
                missing_concepts.append(concept)

        total_concepts = len(original_concepts)
        coverage_percent = (present_count / total_concepts) * 100 if total_concepts > 0 else 0

        return {
            'total_concepts': total_concepts,
            'present_concepts': present_count,
            'coverage_percent': coverage_percent,
            'missing_concepts': missing_concepts[:5],  # only the first 5
            'similarity_score': self.calculate_similarity(original_answer, generated_text) * 100
        }

    def compare_character_count(self) -> Dict:
        """Compare the character counts of the original answers and the generated text.

        Returns:
            Dict with ``original_chars``, ``generated_chars``,
            ``expansion_ratio`` (generated / original), ``difference_percent``
            and a ``status`` label. Ratio and difference are 0 when there are
            no original characters.
        """
        original_chars = sum(len(item['answer']) for item in self.original_content.values())
        generated_chars = len(self.generated_content)

        if original_chars > 0:
            expansion_ratio = generated_chars / original_chars
            difference_percent = ((generated_chars - original_chars) / original_chars) * 100
        else:
            expansion_ratio = 0
            difference_percent = 0

        return {
            'original_chars': original_chars,
            'generated_chars': generated_chars,
            'expansion_ratio': expansion_ratio,
            'difference_percent': difference_percent,
            'status': self._get_expansion_status(expansion_ratio)
        }

    def _get_expansion_status(self, ratio: float) -> str:
        """Map an expansion ratio to its status label."""
        # (exclusive upper bound, label) pairs in ascending order.
        tiers = (
            (0.5, "‚ùå MUITO COMPRIMIDO"),
            (0.8, "‚ö†Ô∏è COMPRIMIDO"),
            (1.2, "‚úÖ SIMILAR"),
            (2.0, "‚úÖ EXPANDIDO"),
            (3.0, "‚ö†Ô∏è MUITO EXPANDIDO"),
        )
        for upper_bound, label in tiers:
            if ratio < upper_bound:
                return label
        return "‚ùå EXTREMAMENTE EXPANDIDO"

    def check_content_coverage(self) -> Dict:
        """Analyse every original item against the generated text.

        An item counts as covered when its concept coverage exceeds 40% or
        its similarity score exceeds 30.

        Returns:
            Dict with ``total_original_items``, ``covered_items``,
            ``missing_items``, ``coverage_percentage``, per-item ``details``
            and ``character_analysis``. ``coverage_percentage`` is 0 when no
            items are loaded (previously a ZeroDivisionError).
        """
        total_items = len(self.original_content)
        coverage_report = {
            'total_original_items': total_items,
            'covered_items': 0,
            'missing_items': [],
            'coverage_percentage': 0,
            'details': [],
            'character_analysis': self.compare_character_count()
        }

        for key, item in self.original_content.items():
            presence_analysis = self.check_content_presence(item['answer'], self.generated_content)

            item_report = {
                'key': key,
                'answer': item['answer'],
                'coverage_percent': presence_analysis['coverage_percent'],
                'similarity_score': presence_analysis['similarity_score'],
                'present_concepts': presence_analysis['present_concepts'],
                'total_concepts': presence_analysis['total_concepts'],
                'missing_concepts': presence_analysis['missing_concepts']
            }
            coverage_report['details'].append(item_report)

            if presence_analysis['coverage_percent'] > 40 or presence_analysis['similarity_score'] > 30:
                coverage_report['covered_items'] += 1
            else:
                coverage_report['missing_items'].append(item_report)

        if total_items:
            coverage_report['coverage_percentage'] = (
                coverage_report['covered_items'] / total_items
            ) * 100

        return coverage_report

    @staticmethod
    def _coverage_tier(item: Dict) -> str:
        """Classify an item report as ``'well'``, ``'partial'`` or ``'low'``.

        The tiers are exhaustive. Items with coverage below 20% but a
        similarity score in [30, 40) used to match no report section and
        were silently dropped; they are now reported as partial.
        """
        coverage = item['coverage_percent']
        similarity = item['similarity_score']
        if coverage >= 60 or similarity >= 40:
            return 'well'
        if coverage >= 20 or similarity >= 30:
            return 'partial'
        return 'low'

    @staticmethod
    def _item_header(marker: str, item: Dict) -> List[str]:
        """Return the heading, metric lines and opening diff fence for one item."""
        return [
            f"### {marker} {item['key']}",
            f"**Cobertura de conceitos:** {item['coverage_percent']:.1f}% ({item['present_concepts']}/{item['total_concepts']})",
            f"**Similaridade:** {item['similarity_score']:.1f}%",
            "```diff",
        ]

    def generate_diff_report(self) -> str:
        """Build the Markdown validation report with GitHub-style diff blocks.

        Returns:
            The report as one string: overall summary, character analysis,
            overall status, per-item sections grouped by coverage tier,
            recommendations and expansion analysis.
        """
        coverage = self.check_content_coverage()
        char_analysis = coverage['character_analysis']

        report = []
        report.append("# üìä RELAT√ìRIO DE VALIDA√á√ÉO DE CONTE√öDO - VERS√ÉO MELHORADA")
        report.append("=" * 70)
        report.append("")

        # Overall summary
        report.append("## üìà RESUMO GERAL")
        report.append(f"- **Total de itens originais:** {coverage['total_original_items']}")
        report.append(f"- **Itens cobertos:** {coverage['covered_items']}")
        report.append(f"- **Cobertura geral:** {coverage['coverage_percentage']:.1f}%")
        report.append("")

        # Character analysis
        report.append("## üìè AN√ÅLISE DE CARACTERES")
        report.append(f"- **Caracteres originais:** {char_analysis['original_chars']:,}")
        report.append(f"- **Caracteres gerados:** {char_analysis['generated_chars']:,}")
        report.append(f"- **Raz√£o de expans√£o:** {char_analysis['expansion_ratio']:.2f}x")
        report.append(f"- **Diferen√ßa percentual:** {char_analysis['difference_percent']:+.1f}%")
        report.append(f"- **Status:** {char_analysis['status']}")
        report.append("")

        # Overall coverage status
        if coverage['coverage_percentage'] >= 80:
            status = "‚úÖ EXCELENTE"
        elif coverage['coverage_percentage'] >= 60:
            status = "‚úÖ BOM"
        elif coverage['coverage_percentage'] >= 40:
            status = "‚ö†Ô∏è REGULAR"
        else:
            status = "‚ùå INSUFICIENTE"

        report.append(f"**Status Geral:** {status}")
        report.append("")

        # Group the items by tier, keeping their original order.
        tiers = {'well': [], 'partial': [], 'low': []}
        for item in coverage['details']:
            tiers[self._coverage_tier(item)].append(item)

        # Well-covered items
        if tiers['well']:
            report.append("## ‚úÖ CONTE√öDO BEM COBERTO")
            report.append("")
            for item in tiers['well']:
                report.extend(self._item_header("+", item))
                report.append("+ ‚úÖ Conte√∫do bem integrado no cap√≠tulo")
                report.append("```")
                report.append("")

        # Partially covered items
        if tiers['partial']:
            report.append("## ‚ö†Ô∏è CONTE√öDO PARCIALMENTE COBERTO")
            report.append("")
            for item in tiers['partial']:
                report.extend(self._item_header("~", item))
                report.append("~ ‚ö†Ô∏è Parte do conte√∫do pode estar ausente ou muito reformulado")
                if item['missing_concepts']:
                    report.append("- Conceitos possivelmente ausentes:")
                    report.extend(f"- {concept}" for concept in item['missing_concepts'])
                report.append("```")
                report.append("")

        # Low-coverage items
        if tiers['low']:
            report.append("## ‚ùå CONTE√öDO COM BAIXA COBERTURA")
            report.append("")
            for item in tiers['low']:
                report.extend(self._item_header("-", item))
                report.append("- ‚ùå Conte√∫do significativamente ausente")
                report.append("- Resposta original:")
                answer = item['answer']
                # Long answers are truncated to a 300-character preview.
                answer_preview = answer[:300] + "..." if len(answer) > 300 else answer
                report.append(f"- {answer_preview}")
                report.append("```")
                report.append("")

        # Recommendations
        report.append("## üîç RECOMENDA√á√ïES")
        report.append("")

        if coverage['coverage_percentage'] < 60:
            report.append("```diff")
            report.append("- ‚ö†Ô∏è ATEN√á√ÉO: Cobertura abaixo do ideal")
            report.append("+ Revisar itens com baixa cobertura")
            report.append("+ Verificar se informa√ß√µes importantes foram omitidas")
            report.append("+ Considerar incluir mais detalhes dos itens ausentes")
            report.append("```")
        else:
            report.append("```diff")
            report.append("+ ‚úÖ Cobertura satisfat√≥ria")
            report.append("+ Conte√∫do bem integrado e reformulado")
            report.append("+ Cap√≠tulo pronto para revis√£o final")
            report.append("```")

        # Expansion analysis
        report.append("")
        report.append("## üìä AN√ÅLISE DE EXPANS√ÉO")
        report.append("")

        if char_analysis['expansion_ratio'] < 0.8:
            report.append("```diff")
            report.append("- ‚ö†Ô∏è Conte√∫do pode estar muito comprimido")
            report.append("+ Considerar adicionar mais detalhes e contexto")
            report.append("```")
        elif char_analysis['expansion_ratio'] > 2.5:
            report.append("```diff")
            report.append("- ‚ö†Ô∏è Conte√∫do pode estar muito expandido")
            report.append("+ Verificar se h√° informa√ß√µes desnecess√°rias")
            report.append("```")
        else:
            report.append("```diff")
            report.append("+ ‚úÖ Expans√£o adequada para um cap√≠tulo narrativo")
            report.append("```")

        report.append("")
        report.append("---")
        report.append("*Relat√≥rio gerado automaticamente pelo ContentValidator v2.0*")
        report.append("*L√≥gica melhorada para an√°lise de conte√∫do reformulado*")

        return "\n".join(report)

    def run_validation(self, csv_path: str, generated_text: str) -> str:
        """Load both inputs and return the Markdown validation report.

        Args:
            csv_path: CSV file with the original answers (``answer`` column).
            generated_text: The generated text to validate.

        Returns:
            The report produced by ``generate_diff_report``.
        """
        print("üîÑ Carregando dados originais...")
        self.load_original_data(csv_path)

        print("üîÑ Carregando conte√∫do gerado...")
        self.load_generated_content(generated_text)

        print("üîÑ Analisando cobertura com l√≥gica melhorada...")

        print("üîÑ Gerando relat√≥rio...")
        report = self.generate_diff_report()

        return report

# 3. PREPARE THE EXAMPLE DATA
# Placeholder path: replace it with the CSV file whose 'answer' column holds
# the original answers (it is passed to run_validation as csv_path).
csv_data = "/caminho_para_csv_com_answers"

# 4. DEFINE THE GENERATED CONTENT (paste your chapter here)
generated_text = """
*CONTE√öDO GERADO*
"""

# 5. RUN THE VALIDATION
validator = ContentValidator()

# Loads both inputs and builds the Markdown report.
print("üöÄ Iniciando valida√ß√£o com l√≥gica melhorada...")
report = validator.run_validation(csv_data, generated_text)

# 6. SHOW THE RESULT
# Render the report string as Markdown in the notebook output.
display(Markdown(report))

# 7. FUN√á√ÉO PARA AN√ÅLISE DETALHADA
def detailed_analysis(validator):
    """An√°lise detalhada item por item"""
    print("\n" + "="*60)
    print("AN√ÅLISE DETALHADA ITEM POR ITEM")
    print("="*60)

    coverage = validator.check_content_coverage()

    for i, item in enumerate(coverage['details'], 1):
        print(f"\n{i}. {item['key']}")
        print(f"   Cobertura de conceitos: {item['coverage_percent']:.1f}%")
        print(f"   Similaridade geral: {item['similarity_score']:.1f}%")
        print(f"   Conceitos presentes: {item['present_concepts']}/{item['total_concepts']}")

        if item['coverage_percent'] >= 60 or item['similarity_score'] >= 40:
            print("   Status: ‚úÖ BEM COBERTO")
        elif item['coverage_percent'] >= 20 or item['similarity_score'] >= 20:
            print("   Status: ‚ö†Ô∏è PARCIALMENTE COBERTO")
        else:
            print("   Status: ‚ùå BAIXA COBERTURA")

        if item['missing_concepts']:
            print("   Conceitos ausentes:")
            for concept in item['missing_concepts']:
                print(f"   - {concept}")

# Run the detailed per-item analysis on the validator populated above.
detailed_analysis(validator)

# 8. KEY-CONCEPT REPORT
def concept_analysis_report(validator):
    """Print, for each original item, its extracted concepts and coverage.

    For every entry of ``validator.original_content`` this prints the first
    200 characters of the answer, the first 10 extracted concepts, and the
    coverage and similarity obtained against ``validator.generated_content``.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print("RELAT√ìRIO DE CONCEITOS-CHAVE")
    print(rule)

    for item_key, entry in validator.original_content.items():
        answer = entry['answer']

        print(f"\nüìù {item_key}:")
        preview = answer[:200]
        print(f"Texto original: {preview}...")

        extracted = validator.extract_key_concepts(answer)
        first_ten = extracted[:10]  # only the first 10 concepts are shown
        print(f"Conceitos extra√≠dos: {first_ten}")

        result = validator.check_content_presence(answer, validator.generated_content)
        coverage_pct = result['coverage_percent']
        similarity = result['similarity_score']
        print(f"Cobertura: {coverage_pct:.1f}%")
        print(f"Similaridade: {similarity:.1f}%")

# Run the key-concept report on the validator populated above.
concept_analysis_report(validator)