In [None]:


!pip install gdown PyPDF2 spacy
!python -m spacy download pt_core_news_sm

import gdown
import json
import PyPDF2
import spacy
from collections import Counter

# Load spaCy's small Portuguese pipeline (downloaded above via `spacy download`)
nlp = spacy.load("pt_core_news_sm")

# IDs of the files to process; each one is interpolated into a
# Google Drive download URL by the processing loop further down.
file_ids = [
    "1XschuJb1UDyozZpW-xZei7otOdOdpAmF",
    "1-RFpN7QFepPAUecHXUqPAMhfffBztM6x",
    "1wth9iGEtAzPcBH68KY9N5Y0QMDJ5n3B8",
    "11MXRfsuBsDaTyqirMCMaUYrmJca9x0eI",
    "19LRt4ZV5of8Dqv8sw39NbduIUYQLrDSL",
    "1P3Jou4cRrUMvI6IEMx2uubplguKmthm6",
    "1BALTGIPAmfYvMqxe0Vajnf_6KRa_Q9vW",
    "17DSZA17USbsvqE-kmDb8R0v4dyJaqgZb",
    "1AFPqaYgGu_PZakm5R8_S1pBkbnhu7HJ1",
    "11zJcFbm-UrtSBDSOCGyt4UTnipRmykB4",
    "1Wn84a43rimFMilx5tz4UJKrTmxwHlzR2",
    "1CJ_9iHaQgTymhl-q1FuVmUjB0Fokj5Vi",
    "1uc2gBvbu8rwCpCoDOvGjmt5WWOu1Z7nb",
    "1HwspF-Xgeu1oV-59HiXCYuaeaNaf21yU",
    "1Hm7DOXhFFBMIc74IvOtZqU9RKT7qNcHK",
    "1xWxy-ki2ipwDLYD4I0fzWtT9x9NNo3Zh",
    "1piBg1AidZjDnsPl_4thMxUELEklyo2t4",
    "18ge6TWOoFS8sVLWII8klOwliU7HA6NUA",
    "1kU7-hrzRkHW_y6X1rmS-VZV882GhYb86",
    "1TlpV42qfBf-lO76iymx4OQRFE03nV0qJ",
    "1dgG2Qf4MkSoD5hwx6SwPGEzvHjA1Z55_",
    "1lIXTptTKJKQm8sjmX1SEzRW-3hufpNS_",
    "1NHPtt3BcxOgzKNo6K7YcECfsr1WWg8_J",
    "1Pl_PRb5JiPcQHvEpU41UJTIDawm5wIPo",
    "1NG-lhZBV3nf7iozy4-eI4rhjPcZGHZsq",
    "1H_7zQVV12wU-c0H-HCyK-SsMWaPgUVKP",
    "1X-LOU4xJ3t3maaTUtl7hLJ4Uzi-kw6kT",
    "1gWGDD-PORqATDIW_kpXTCknXsrLm1hVi",
    "15cc_iQgVPkAx8foLe__Xtp_RYRnB6aSr",
    "1zLZxSxO1J7XpztujVAH4wuRClPSQyBvK"
]

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, concatenated.

    The file is opened in binary mode and read through ``PyPDF2.PdfReader``.
    Page texts are joined in page order with no separator between them.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pages = PyPDF2.PdfReader(pdf_file).pages
        # Extraction must finish while the file handle is still open.
        return "".join(page.extract_text() for page in pages)

def process_text(text):
    """Run the module-level spaCy pipeline ``nlp`` over ``text`` and summarise it.

    Returns a dict with two sections:

    * ``"estatisticas"`` -- sentence and token counts, average tokens per
      sentence (0 when there are no sentences), the ten most and ten least
      frequent token strings, and counts for a fixed set of POS tags.
    * ``"analise_linguistica"`` -- parallel lists of token texts, lemmas and
      POS tags, plus one dependency record (token, relation, head text) per
      token.
    """
    parsed = nlp(text)

    # Basic size metrics.
    sentence_count = sum(1 for _ in parsed.sents)
    token_count = len(parsed)

    # Only these POS tags are tallied; any other tag is ignored.
    pos_counts = dict.fromkeys(
        ('NOUN', 'VERB', 'ADP', 'ADJ', 'ADV',
         'PRON', 'DET', 'CCONJ', 'NUM', 'PROPN'),
        0,
    )

    # One pass over the document collects every per-token view at once.
    surface_forms = []
    lemma_forms = []
    pos_labels = []
    dependency_records = []
    for tok in parsed:
        surface_forms.append(tok.text)
        lemma_forms.append(tok.lemma_)
        pos_labels.append(tok.pos_)
        dependency_records.append({
            "token": tok.text,
            "relacao": tok.dep_,
            "governante": tok.head.text
        })
        if tok.pos_ in pos_counts:
            pos_counts[tok.pos_] += 1

    # Frequency ranking of the raw token strings.
    frequency = Counter(surface_forms)
    most_frequent = frequency.most_common(10)
    least_frequent = frequency.most_common()[-10:]

    if sentence_count > 0:
        tokens_per_sentence = token_count / sentence_count
    else:
        tokens_per_sentence = 0

    def as_frequency_rows(pairs):
        # Convert (token, count) pairs into the JSON-friendly row format.
        return [{"token": tok_text, "frequencia": count} for tok_text, count in pairs]

    statistics = {
        "num_sentencas": sentence_count,
        "num_tokens": token_count,
        "tokens_por_sentenca": tokens_per_sentence,
        "top10_tokens": as_frequency_rows(most_frequent),
        "down10_tokens": as_frequency_rows(least_frequent),
        "classes_gramaticais": pos_counts,
        "num_substantivos": pos_counts['NOUN'],
        "num_verbos": pos_counts['VERB'],
        "num_preposicoes": pos_counts['ADP']
    }
    linguistic_analysis = {
        "tokens": surface_forms,
        "lemmas": lemma_forms,
        "pos_tags": pos_labels,
        "dependencias": dependency_records
    }
    return {
        "estatisticas": statistics,
        "analise_linguistica": linguistic_analysis
    }

# Main dictionary that accumulates every result:
#   documentos_processados    - one summary entry per successfully processed file
#   estatisticas_consolidadas - corpus-wide totals, filled in after the loop
#   documentos_com_erro       - id and error message for each file that failed
resultado_final = {
    "documentos_processados": [],
    "estatisticas_consolidadas": {},
    "documentos_com_erro": []
}

# Process every file: download the PDF, extract its text, analyse it and
# record a per-document summary. A failure in any step is recorded under
# "documentos_com_erro" and the loop moves on to the next file.
#
# Token frequencies are accumulated across all successfully processed
# documents so the consolidated top 10 reflects real corpus-wide counts
# rather than how many per-document top-10 lists a token appears in.
frequencia_global = Counter()

for i, file_id in enumerate(file_ids):
    try:
        print(f"Processando documento {i+1}/{len(file_ids)}...")

        # Download the PDF file
        pdf_path = f"/content/doc_{i+1}.pdf"
        baixado = gdown.download(f"https://drive.google.com/uc?id={file_id}", pdf_path, quiet=True)
        # Some gdown versions signal a failed download by returning None
        # instead of raising. Without this check a stale file left at
        # pdf_path by an earlier run would be processed silently.
        if baixado is None:
            raise RuntimeError("Falha no download do arquivo")

        # Extract and process the text
        text = extract_text_from_pdf(pdf_path)
        resultados = process_text(text)
        analise = resultados["analise_linguistica"]

        # Only documents that were fully processed contribute to the
        # corpus-wide token frequencies.
        frequencia_global.update(analise["tokens"])

        # Store the statistics plus the sizes of the linguistic analysis;
        # the full token/lemma/dependency lists are not kept.
        resultado_final["documentos_processados"].append({
            "id": file_id,
            "nome_arquivo": f"doc_{i+1}.pdf",
            "estatisticas": resultados["estatisticas"],
            "analise_linguistica": {
                "num_tokens": len(analise["tokens"]),
                "num_lemas": len(analise["lemmas"]),
                "num_dependencias": len(analise["dependencias"])
            }
        })

    except Exception as e:
        # Deliberate best-effort boundary: one bad document must not stop
        # the whole batch.
        print(f"Erro no documento {i+1}: {str(e)}")
        resultado_final["documentos_com_erro"].append({
            "id": file_id,
            "erro": str(e)
        })

# Compute the consolidated statistics (skipped when nothing was processed,
# which also avoids dividing by zero in the averages).
if resultado_final["documentos_processados"]:
    docs_validos = resultado_final["documentos_processados"]
    total_documentos = len(docs_validos)
    total_sentencas = sum(d["estatisticas"]["num_sentencas"] for d in docs_validos)
    total_tokens = sum(d["estatisticas"]["num_tokens"] for d in docs_validos)

    estatisticas = {
        "total_documentos": total_documentos,
        "total_sentencas": total_sentencas,
        "total_tokens": total_tokens,
        "media_sentencas_por_doc": total_sentencas/total_documentos,
        "media_tokens_por_doc": total_tokens/total_documentos,
        "total_substantivos": sum(d["estatisticas"]["num_substantivos"] for d in docs_validos),
        "total_verbos": sum(d["estatisticas"]["num_verbos"] for d in docs_validos),
        "total_preposicoes": sum(d["estatisticas"]["num_preposicoes"] for d in docs_validos),
        # (token, count) pairs, most frequent first, over the whole corpus.
        "top10_tokens_geral": frequencia_global.most_common(10)
    }

    resultado_final["estatisticas_consolidadas"] = estatisticas

# Write the complete result to a single UTF-8 JSON file, keeping non-ASCII
# characters readable (ensure_ascii=False).
caminho_saida = 'corpus_completo.json'
with open(caminho_saida, 'w', encoding='utf-8') as saida:
    json.dump(resultado_final, saida, ensure_ascii=False, indent=4)

# Final summary for the notebook output.
qtd_processados = len(resultado_final['documentos_processados'])
qtd_com_erro = len(resultado_final['documentos_com_erro'])
print("\nProcessamento concluído! Arquivo 'corpus_completo.json' gerado com sucesso.")
print(f"Documentos processados: {qtd_processados}")
print(f"Documentos com erro: {qtd_com_erro}")



Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 232.6/232.6 kB 4.1 MB/s eta 0:00:00
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting pt-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.7.0/pt_core_news_sm-3.7.0-py3-none-any.whl (13.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13.0/13.0 MB 16.0 MB/s eta 0:00:00
Installing collected packages: pt-core-news-sm
Successfully installed pt-core-news-sm-3.7.0
✔ Download and installation successful
You can now load the package via spacy.load('pt_core_news_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's depende