In [5]:
import zipfile
import os
import glob
from collections import defaultdict
from lxml import etree
import matplotlib.pyplot as plt

In [6]:
def analyze_pptx_files(directory):
    """Collect line-based statistics from every ``*.pptx`` file in ``directory``.

    For each archive three kinds of numbers are gathered:

    * the line count of ``ppt/presentation.xml``,
    * the line count of every ``ppt/slides/slide*.xml`` part,
    * the source line (``lxml``'s ``sourceline``) of every ``sp`` (shape)
      element found in those slide parts.

    The three collections are then printed (first 10 values each) and handed
    to ``analyze_and_plot_distribution``, which prints summary statistics and
    saves one histogram PNG per collection in the current working directory.

    Archives without ``ppt/presentation.xml`` are reported and skipped
    entirely (their slides are not analysed). Archives that cannot be opened
    as ZIP files and slides that cannot be parsed are reported and skipped
    instead of aborting the whole run.

    NOTE(review): line-based metrics are only meaningful if the XML parts are
    pretty-printed; if each part is a declaration line followed by a single
    long line, every value collapses to the same small number -- confirm this
    is the metric you want.

    Args:
        directory: Path of the folder that directly contains the ``.pptx``
            files (not searched recursively).

    Returns:
        None. Results are printed and written to PNG files.
    """
    # Sorted so the processing order (and the "sample" printout) is reproducible;
    # glob gives no ordering guarantee.
    pptx_files = sorted(glob.glob(os.path.join(directory, '*.pptx')))
    shape_line_numbers = []
    slide_line_numbers = []
    presentation_line_numbers = []

    # The parser configuration is loop-invariant, so build it once.
    parser = etree.XMLParser(recover=True, huge_tree=True)

    for pptx_file in pptx_files:
        try:
            archive = zipfile.ZipFile(pptx_file, 'r')
        except zipfile.BadZipFile:
            # A single corrupt file must not abort the whole dataset scan.
            print(f"Skipping unreadable PPTX archive: {pptx_file}")
            continue

        with archive as z:
            # Analyze presentation.xml
            try:
                presentation_bytes = z.read('ppt/presentation.xml')
            except KeyError:
                print(f"Presentation XML not found in {pptx_file}")
                continue
            # Counting b'\n' on the raw bytes equals counting '\n' on the
            # decoded text for UTF-8, without failing on undecodable input.
            presentation_line_numbers.append(presentation_bytes.count(b'\n') + 1)

            # Analyze slides
            slide_files = [
                name for name in z.namelist()
                if name.startswith('ppt/slides/slide')
            ]
            for slide_file in slide_files:
                slide_bytes = z.read(slide_file)
                slide_line_numbers.append(slide_bytes.count(b'\n') + 1)

                # Parse the raw bytes directly; the parser honours the
                # encoding declared in the XML prolog.
                try:
                    root = etree.fromstring(slide_bytes, parser=parser)
                except etree.XMLSyntaxError:
                    root = None
                if root is None:
                    # recover=True can still yield no tree for hopeless input.
                    print(f"Could not parse {slide_file} in {pptx_file}")
                    continue

                for elem in root.iter():
                    tag = elem.tag
                    # Comments and processing instructions are yielded too,
                    # and their .tag is not a string.
                    if not isinstance(tag, str):
                        continue
                    # '}sp' matches a namespaced element whose local name is
                    # exactly 'sp' (shape element).
                    if tag.endswith('}sp') and elem.sourceline is not None:
                        shape_line_numbers.append(elem.sourceline)

    # Debug: Print sample data
    print("\nSample presentation line numbers:", presentation_line_numbers[:10])
    print("Sample slide line numbers:", slide_line_numbers[:10])
    print("Sample shape line numbers:", shape_line_numbers[:10])

    # Analyze distributions and plot graphs
    print("\nAnalyzing distributions and generating graphs...")
    analyze_and_plot_distribution(presentation_line_numbers, "Presentation XML Line Numbers", "presentation_line_numbers.png")
    analyze_and_plot_distribution(slide_line_numbers, "Slide XML Line Numbers", "slide_line_numbers.png")
    analyze_and_plot_distribution(shape_line_numbers, "Shape Element Line Numbers", "shape_line_numbers.png")
    print("Graphs have been saved as PNG files.")


In [7]:

def analyze_and_plot_distribution(line_numbers, title, filename):
    """Print summary statistics for ``line_numbers`` and save a histogram.

    Prints the count, minimum, maximum and mean of the values, then writes a
    30-bin histogram to ``filename``. The input list is left untouched (it is
    not sorted in place), so callers can keep using it in its original order.

    Args:
        line_numbers: List of numeric values to summarise. If it is empty a
            message is printed and nothing is plotted.
        title: Human-readable label used in the printed header and in the
            figure title.
        filename: Path of the image file the histogram is saved to.

    Returns:
        None.
    """
    if not line_numbers:
        print(f"No data to analyze for {title}.")
        return

    total = len(line_numbers)
    # min()/max() avoid sorting, and therefore avoid mutating the caller's list.
    min_line = min(line_numbers)
    max_line = max(line_numbers)
    average_line = sum(line_numbers) / total

    print(f"\n{title}:")
    print(f"Total count: {total}")
    print(f"Minimum line number: {min_line}")
    print(f"Maximum line number: {max_line}")
    print(f"Average line number: {average_line:.2f}")

    # Plotting the distribution as a histogram
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.hist(line_numbers, bins=30, edgecolor='black', alpha=0.7)
        ax.set_title(f"Distribution of {title}")
        ax.set_xlabel('Line Number')
        ax.set_ylabel('Frequency')
        ax.grid(axis='y', alpha=0.75)
        fig.tight_layout()
        fig.savefig(filename)
    finally:
        # Always release the figure, even if saving fails, so repeated calls
        # do not accumulate open figures in the kernel.
        plt.close(fig)
    print(f"Graph saved as {filename}")

In [8]:
# Location of the .pptx dataset. Set the PPTX_DATASET_DIR environment variable
# to run this notebook on another machine; the original local path is kept as
# the default so existing runs behave the same.
dir_path = os.environ.get(
    "PPTX_DATASET_DIR",
    "/Users/tyrionhuu/projects/research_projects/PPTBench/dataset/pptx",
)
analyze_pptx_files(dir_path)


Sample presentation line numbers: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Sample slide line numbers: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
Sample shape line numbers: [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]

Analyzing distributions and generating graphs...

Presentation XML Line Numbers:
Total count: 958
Minimum line number: 2
Maximum line number: 2
Average line number: 2.00
Graph saved as presentation_line_numbers.png

Slide XML Line Numbers:
Total count: 22899
Minimum line number: 2
Maximum line number: 2
Average line number: 2.00
Graph saved as slide_line_numbers.png

Shape Element Line Numbers:
Total count: 182585
Minimum line number: 2
Maximum line number: 2
Average line number: 2.00
Graph saved as shape_line_numbers.png
Graphs have been saved as PNG files.
