In [1]:
import os
import docx2txt
import tiktoken
import csv

def extract_text_from_docx(file_path):
    """
    Read the text content of a DOCX file via ``docx2txt``.

    Failures are reported on stdout rather than raised, so a single
    unreadable file does not abort a batch run.

    Args:
        file_path (str): The path to the DOCX file.

    Returns:
        str: Whatever ``docx2txt.process`` returns for the file, or None
             if any exception was raised while reading it.
    """
    try:
        extracted = docx2txt.process(file_path)
    except Exception as exc:
        # Best-effort: log and signal failure with None instead of raising.
        print(f"Error reading DOCX file '{file_path}': {exc}")
        extracted = None
    return extracted

def break_down_docx_publication(file_path, chunk_size=1000):
    """
    Extracts text from a DOCX publication and breaks it down into chunks.

    The text is tokenized with the ``cl100k_base`` encoding and split into
    consecutive, non-overlapping windows of at most ``chunk_size`` tokens;
    each window is decoded back to text.

    Args:
        file_path (str): The path to the DOCX file.
        chunk_size (int): The maximum number of tokens per chunk. Must be
            a positive integer.

    Returns:
        list: A list of dictionaries, where each dictionary represents a chunk
              and contains 'source', 'page_number', 'chunk_number', and 'text'.
              Empty if the file could not be read or contains no tokens.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # Validate up front: a step of 0 makes range() fail with an unrelated
    # message, and a negative step would silently yield no chunks at all.
    if chunk_size <= 0:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size!r}"
        )

    text = extract_text_from_docx(file_path)
    if text is None:
        return []

    file_name = os.path.basename(file_path)
    tokenizer = tiktoken.get_encoding("cl100k_base")
    # Documents are arbitrary text: by default tiktoken raises ValueError
    # when the input contains special-token text such as "<|endoftext|>".
    # disallowed_special=() encodes such text as ordinary characters.
    tokens = tokenizer.encode(text, disallowed_special=())

    # DOCX files don't inherently have page numbers in the same way as PDFs,
    # so every chunk is reported as page 1.
    page_number = 1

    # NOTE(review): a window boundary may fall inside a multi-token
    # character; decode() then emits replacement characters at the chunk
    # edges -- confirm this is acceptable for downstream use.
    return [
        {
            'source': file_name,
            'page_number': page_number,
            'chunk_number': chunk_number,
            'text': tokenizer.decode(tokens[start:start + chunk_size]),
        }
        for chunk_number, start in enumerate(
            range(0, len(tokens), chunk_size), start=1
        )
    ]

def save_chunks_to_csv(all_chunks, output_csv_file="docx_chunks.csv"):
    """
    Write chunk dictionaries to a UTF-8 CSV file, one row per chunk.

    The column order is taken from the keys of the first chunk. Any error
    while opening or writing the file is printed, not raised.

    Args:
        all_chunks (list): A list of dictionaries, where each dictionary
                           represents a chunk.
        output_csv_file (str): The name of the CSV file to save to.
    """
    if all_chunks:
        # The first chunk defines the header row.
        header = list(all_chunks[0].keys())
        try:
            # newline='' lets the csv module control line endings itself.
            with open(output_csv_file, 'w', newline='', encoding='utf-8') as handle:
                dict_writer = csv.DictWriter(handle, fieldnames=header)
                dict_writer.writeheader()
                dict_writer.writerows(all_chunks)
            print(f"Chunks successfully saved to '{output_csv_file}'")
        except Exception as err:
            print(f"Error saving to CSV file '{output_csv_file}': {err}")
    else:
        print("No chunks to save to CSV.")

if __name__ == '__main__':
    # Script entry point: chunk every .docx directly inside folder_path
    # (no recursion into sub-folders) and write all chunks to a single CSV.
    folder_path = r"C:\Users\salsubhi1\PycharmProjects\Enigmatic Research\Dr.X Files"
    output_csv_filename = "docx_chunks_text.csv"  # Name for the output CSV file
    all_chunks = []

    if not os.path.isdir(folder_path):
        # Covers both a missing path and a path that is not a directory.
        print(f"Error: Folder not found at '{folder_path}'")
    else:
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)

            if not os.path.isfile(file_path):
                print(f"Skipping: {filename} (not a file)")
                continue
            if not filename.lower().endswith('.docx'):
                print(f"Skipping: {filename} (not a DOCX file)")
                continue

            print(f"Processing DOCX file: {filename}")
            # Adjust chunk size as needed
            chunks = break_down_docx_publication(file_path, chunk_size=500)
            all_chunks.extend(chunks)

        if not all_chunks:
            print("No DOCX files found to process.")
        else:
            save_chunks_to_csv(all_chunks, output_csv_filename)

Skipping: .ipynb_checkpoints (not a file)
Processing DOCX file: Dataset summaries and citations.docx
Skipping: Loan amortisation schedule1.xlsx (not a DOCX file)
Skipping: Loan analysis.xlsx (not a DOCX file)
Processing DOCX file: M.Sc. Applied Psychology.docx
Skipping: new-approaches-and-procedures-for-cancer-treatment.pdf (not a DOCX file)
Skipping: Ocean_ecogeochemistry_A_review.pdf (not a DOCX file)
Skipping: party budget1.xlsx (not a DOCX file)
Processing DOCX file: Stats.docx
Skipping: The-Alchemist.pdf (not a DOCX file)
Skipping: The_Plan_of_the_Giza_Pyramids.pdf (not a DOCX file)
Chunks successfully saved to 'docx_chunks_text.csv'
