In [37]:
import os
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
import spacy
import fitz  # PyMuPDF

# Load the Spanish spaCy pipeline ("es_core_news_sm").
nlp = spacy.load("es_core_news_sm")

# Original PDF manuals.
pdfs_dir = "../pdfs"
pdf_files = [f"{pdfs_dir}/{name}" for name in os.listdir(pdfs_dir) if name.endswith(".pdf")]

# Text chunks (one .txt file per manual).
chunks_dir = "../chunks"
chunks_files = [f"{chunks_dir}/{name}" for name in os.listdir(chunks_dir) if name.endswith(".txt")]

# Processed text chunks (one .txt file per manual).
processed_chunks_dir = "../processed_chunks"
processed_chunks_files = [
    f"{processed_chunks_dir}/{name}"
    for name in os.listdir(processed_chunks_dir)
    if name.endswith(".txt")
]

# Show what was discovered in each directory, one listing per line.
for listing in (pdf_files, chunks_files, processed_chunks_files):
    print(listing)

['../pdfs/MANUAL IDENTIFICACION Y EVALUACION DE RIESGOS IER_ESP (1).pdf', '../pdfs/MANUAL AUDITORIAS_ESP (2).pdf', '../pdfs/MANUAL PLANES DE EMERGENCIA_ESP (2).pdf', '../pdfs/MANUAL DE AUSENTISMO_ESP (2).pdf', '../pdfs/MANUAL ESTRUCTURA ORGANIZATIVA_ESP (2).pdf', '../pdfs/MANUAL INFORMACIÓN DOCUMENTADA_ESP (2).pdf']
['../chunks/estructura organizativa.txt', '../chunks/riesgos.txt', '../chunks/ausentismo.txt', '../chunks/información.txt', '../chunks/emergencia.txt', '../chunks/auditorias.txt']
['../processed_chunks/estructura organizativa.txt', '../processed_chunks/riesgos.txt', '../processed_chunks/ausentismo.txt', '../processed_chunks/información.txt', '../processed_chunks/emergencia.txt', '../processed_chunks/auditorias.txt']


In [38]:
# One row per manual: (display label, PDF file name, chunk file name).
# The same chunk file name is used in both ../chunks and ../processed_chunks.
_manual_files = [
    ("Auditorias", "MANUAL AUDITORIAS_ESP (2).pdf", "auditorias.txt"),
    ("Ausentismo", "MANUAL DE AUSENTISMO_ESP (2).pdf", "ausentismo.txt"),
    ("Estructura Organizativa", "MANUAL ESTRUCTURA ORGANIZATIVA_ESP (2).pdf", "estructura organizativa.txt"),
    ("Riesgos", "MANUAL IDENTIFICACION Y EVALUACION DE RIESGOS IER_ESP (1).pdf", "riesgos.txt"),
    ("Información", "MANUAL INFORMACIÓN DOCUMENTADA_ESP (2).pdf", "información.txt"),
    ("Emergencia", "MANUAL PLANES DE EMERGENCIA_ESP (2).pdf", "emergencia.txt"),
]

# Mapping from manual label to the three file paths that describe it.
analysis_data = {
    label: {
        "pdf_path": "../pdfs/" + pdf_name,
        "chunks_path": "../chunks/" + chunk_name,
        "processed_chunks_path": "../processed_chunks/" + chunk_name,
    }
    for label, pdf_name, chunk_name in _manual_files
}

# Basic Statistics
- Number of pages
- Total word count
- Unique word count
- Average words per page

## These values are calculated after the following clean-up tasks:
- Remove extra spaces and newlines.
- Remove numbers preceded by a `+` (phone numbers in footers).
- Remove sentences with fewer than 10 characters.
- Remove sentences containing text in the format `xx.xxx.xx.xx` (headers indicating when the document was written).
- Remove sentences with the format `Imagen x` (images, which will not be used to create embeddings).
- Remove sentences with more than 5 dots in a row (index entries).
- Remove sentences that start with a number and have fewer than 30 characters (titles).
- Remove sentences that start with `Manual de usuario:` or `Manual de usuario` (titles).

## In summary, the clean-up step removes:
- Titles.
- Images and their corresponding captions.
- Headers.
- Footers.
- Index entries.
- Stray whitespace.

In [39]:
def get_number_of_pages(pdf_path):
    """Return the number of pages in the PDF at ``pdf_path``.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        The page count, i.e. ``len()`` of the opened document.
    """
    doc = fitz.open(pdf_path)
    try:
        return len(doc)
    finally:
        # Release the underlying file handle; this function is called once per
        # manual and the document was previously left open.
        doc.close()

def extract_words_from_chunk(chunk_file):
    """Read a chunk file and return its whole content as one string.

    Args:
        chunk_file: Path of the text file to read.

    Returns:
        The complete text of the file (not yet split into words).
    """
    # Decode explicitly instead of relying on the platform default encoding,
    # which would garble accented Spanish characters on non-UTF-8 systems.
    # NOTE(review): assumes the chunk files were written as UTF-8 - confirm
    # against whatever produced them.
    with open(chunk_file, "r", encoding="utf-8") as f:
        return f.read()

def analyze_pdf(words, num_pages):
    """Compute basic word statistics for a text.

    Args:
        words: The text to analyse, as a single string.
        num_pages: Number of pages the text is spread over.

    Returns:
        A tuple ``(total_word_count, unique_word_count, average_words_per_page)``.
        The average is ``0`` when ``num_pages`` is not positive.
    """
    # Turn newlines, carriage returns and tabs into plain spaces so that a
    # single split on ' ' sees every separator.
    flattened = words.translate(str.maketrans({"\n": " ", "\r": " ", "\t": " "}))
    # Consecutive separators yield empty pieces; drop them.
    tokens = [piece for piece in flattened.split(" ") if piece]

    total = len(tokens)
    distinct = len(set(tokens))  # case-sensitive, punctuation kept as-is
    if num_pages > 0:
        per_page = total / num_pages
    else:
        per_page = 0
    return total, distinct, per_page

# Add the basic statistics of each manual to its analysis_data entry, in place.
for key, values in analysis_data.items():
    pdf_file, chunks_file, processed_chunks_file = (
        values["pdf_path"],
        values["chunks_path"],
        values["processed_chunks_path"],
    )
    # The page count comes from the PDF; the word statistics come from the
    # text of the chunk file.
    num_pages = get_number_of_pages(pdf_file)
    words = extract_words_from_chunk(chunks_file)
    total_word_count, unique_word_count, average_words_per_page = analyze_pdf(words, num_pages)
    values.update(
        num_pages=num_pages,
        total_word_count=total_word_count,
        unique_word_count=unique_word_count,
        average_words_per_page=average_words_per_page,
        key=key,
    )

print(analysis_data)

{'Auditorias': {'pdf_path': '../pdfs/MANUAL AUDITORIAS_ESP (2).pdf', 'chunks_path': '../chunks/auditorias.txt', 'processed_chunks_path': '../processed_chunks/auditorias.txt', 'num_pages': 38, 'total_word_count': 2650, 'unique_word_count': 627, 'average_words_per_page': 69.73684210526316, 'key': 'Auditorias'}, 'Ausentismo': {'pdf_path': '../pdfs/MANUAL DE AUSENTISMO_ESP (2).pdf', 'chunks_path': '../chunks/ausentismo.txt', 'processed_chunks_path': '../processed_chunks/ausentismo.txt', 'num_pages': 34, 'total_word_count': 1747, 'unique_word_count': 429, 'average_words_per_page': 51.38235294117647, 'key': 'Ausentismo'}, 'Estructura Organizativa': {'pdf_path': '../pdfs/MANUAL ESTRUCTURA ORGANIZATIVA_ESP (2).pdf', 'chunks_path': '../chunks/estructura organizativa.txt', 'processed_chunks_path': '../processed_chunks/estructura organizativa.txt', 'num_pages': 92, 'total_word_count': 7513, 'unique_word_count': 1030, 'average_words_per_page': 81.66304347826087, 'key': 'Estructura Organizativa'}, 

In [40]:
import pandas as pd
import plotly.graph_objs as go
import plotly.express as px


# One row per manual, one column per path / statistic.
df = pd.DataFrame.from_dict(analysis_data, orient='index')

# (column to plot, chart title) for each bar chart.
plot_configs = [
    ('num_pages', 'Numero de paginas'),
    ('total_word_count', 'Palabras totales'),
    ('unique_word_count', 'Palabras unicas'),
    ('average_words_per_page', 'Promedio de palabras por pagina')
]

# Colour palette (seven colours).
colors = ['#425B7F', '#9CBED2', '#E0FBFB', '#EE6D4E', '#2D323C', '#ED1C24', '#5AA9FF']
labels = df.index

# One colour per document, cycling through the palette so that having more
# documents than palette entries neither raises IndexError in the legend loop
# nor passes a colour list shorter than the number of bars.
bar_colors = [colors[i % len(colors)] for i in range(len(df))]

# Build and show one bar chart per statistic.
for y, title in plot_configs:
    fig = go.Figure()

    trace = go.Bar(x=labels, y=df[y], marker_color=bar_colors, name=title, text=df[y], textposition='auto')
    fig.add_trace(trace)

    # Update layout for each plot
    fig.update_layout(
        title=title,
        showlegend=True,
        colorway=colors
    )

    # Add custom legend for documents: an empty scatter trace per document
    # contributes only a coloured legend entry.
    for i, label in enumerate(labels):
        fig.add_trace(go.Scatter(
            x=[None], y=[None], mode='markers',
            marker=dict(size=10, color=bar_colors[i]),
            legendgroup=label,
            showlegend=True,
            name=label
        ))

    # Show the plot
    fig.show()
