In [1]:
import PIL.Image
import PIL.ImageDraw
import PIL.ImageFont
from wand.image import Image
from wand.color import Color
import os
import uuid
import nbformat
import cv2

# Default value for both the width and the height parameters of the preview functions.
DEFAULT_IMAGE_SIZE = 400
# Threshold the running line counter of create_ipynb_preview is compared against.
MAX_LINES_PER_IMAGE = 20

In [2]:
def generate_unique_filename(directory, extension):
    """Return a path inside ``directory`` that does not exist yet.

    The file name is a random UUID4 followed by ``.`` and ``extension``.
    Nothing is created on disk: the candidate path is only checked with
    ``os.path.exists`` and a new UUID is drawn while the path is taken.
    """
    candidate = os.path.join(directory, f"{uuid.uuid4()}.{extension}")
    while os.path.exists(candidate):
        candidate = os.path.join(directory, f"{uuid.uuid4()}.{extension}")
    return candidate

def is_text_file(file_path, block_size=512):
    """Report whether the beginning of ``file_path`` decodes as UTF-8 text.

    Up to ``block_size`` characters are read in text mode. ``False`` is
    returned when decoding fails or when the file cannot be opened or read.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            handle.read(block_size)
    except (UnicodeDecodeError, IOError):
        return False
    else:
        return True

In [3]:
# Module-level font read by the text-drawing functions of this notebook.
# The D2Coding TrueType collection at size 12 is tried first.
try:
    font = PIL.ImageFont.truetype("D2Coding-Ver1.3.2-20180524-all.ttc", 12)
except IOError:
    # truetype() raised IOError (e.g. the font file is not available):
    # fall back to Pillow's default font.
    font = PIL.ImageFont.load_default()

In [4]:
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE

# Shape positions are expressed in EMU; 12700 EMU make up one point.
def draw_pptx_text(shape, line_index, text, width, height, draw, font, font_size=20):
    """Draw one line of text at the position of ``shape``.

    ``shape.left`` and ``shape.top`` are converted from EMU to points and the
    result is used directly as pixel coordinates. ``line_index`` moves the
    text down by ``font_size`` pixels per line. ``width`` and ``height`` are
    accepted but not used.
    """
    emu_per_point = 12700
    x = int(shape.left / emu_per_point)
    # TODO: how to get line height
    y = int(shape.top / emu_per_point) + line_index * font_size
    draw.text((x, y), text, fill=(0, 0, 0), font=font)

def create_pptx_preview(input_file, output_file, width=DEFAULT_IMAGE_SIZE, height=DEFAULT_IMAGE_SIZE, font_size=20):
    """Render the text of a presentation's first slide onto a white image.

    Args:
        input_file: Path of the .pptx file to open.
        output_file: Path the preview image is saved to.
        width: Preview width in pixels.
        height: Preview height in pixels.
        font_size: Vertical distance, in pixels, between consecutive
            paragraphs of the same shape.

    Only the first slide is used. A presentation without slides yields a
    blank white image instead of an IndexError.
    """
    presentation = Presentation(input_file)

    image = PIL.Image.new('RGB', (width, height), color=(255, 255, 255))
    draw = PIL.ImageDraw.Draw(image)

    # An empty deck has no first slide to index, so it contributes no shapes.
    shapes = presentation.slides[0].shapes if len(presentation.slides) else ()

    for shape in shapes:
        # Every shape with a text frame is handled here, auto shapes included,
        # so no separate AUTO_SHAPE branch is needed.
        if shape.shape_type == MSO_SHAPE_TYPE.TEXT_BOX or shape.has_text_frame:
            for line_index, paragraph in enumerate(shape.text_frame.paragraphs):
                # Forward font_size so the caller's value controls line spacing.
                draw_pptx_text(shape, line_index, paragraph.text, width, height, draw, font, font_size)

    image.save(output_file)

In [5]:
from docx import Document
import textwrap

def create_docx_preview(input_file, output_file, width=DEFAULT_IMAGE_SIZE, height=DEFAULT_IMAGE_SIZE, font_size=20, text_wrap=80):
    """Render the paragraphs of a .docx document as text on a white image.

    Args:
        input_file: Path of the .docx file to open.
        output_file: Path the preview image is saved to.
        width: Preview width in pixels.
        height: Preview height in pixels.
        font_size: Used as the line height in pixels (the glyphs themselves
            are drawn with the module-level ``font``).
        text_wrap: Maximum number of characters per wrapped line.

    Empty paragraphs are skipped; one blank line separates paragraphs.
    Rendering stops once the drawing position has passed the bottom edge.
    """
    doc = Document(input_file)

    image = PIL.Image.new('RGB', (width, height), color=(255, 255, 255))
    draw = PIL.ImageDraw.Draw(image)

    # TODO: How to get line height from font_size
    line_height = font_size
    y = 0

    for paragraph in doc.paragraphs:
        # With a positive line height y only grows, and text anchored at or
        # below the bottom edge is clipped away, so the rest can be skipped.
        if y >= height:
            break

        text = paragraph.text
        if not text:
            continue

        # Wrap by character count so long paragraphs span several rows.
        for line in textwrap.wrap(text, width=text_wrap):
            if y >= height:
                break
            draw.text((0, y), line, fill=(0, 0, 0), font=font)
            y += line_height

        # Blank row between paragraphs.
        y += line_height

    # Save the image
    image.save(output_file)

In [6]:
import openpyxl

def create_excel_preview(input_file, output_file, width=DEFAULT_IMAGE_SIZE, height=DEFAULT_IMAGE_SIZE, start_cell='A1', end_cell='Z20'):
    """Draw a grid of cell values from the workbook's active sheet.

    The cells between ``start_cell`` and ``end_cell`` are laid out on a white
    ``width`` x ``height`` image as fixed 100 x 30 pixel boxes, each outlined
    in black with the cell value (empty string for ``None``) written inside.
    The image is saved to ``output_file``.
    """
    box_width, box_height = 100, 30
    black = (0, 0, 0)

    workbook = openpyxl.load_workbook(input_file)
    rows = workbook.active[start_cell:end_cell]

    canvas = PIL.Image.new('RGB', (width, height), color=(255, 255, 255))
    pen = PIL.ImageDraw.Draw(canvas)

    for row_number, row in enumerate(rows):
        top = row_number * box_height
        for column_number, cell in enumerate(row):
            left = column_number * box_width
            label = "" if cell.value is None else str(cell.value)
            pen.rectangle([left, top, left + box_width, top + box_height], outline=black)
            # Small inset so the text does not touch the box border.
            pen.text((left + 5, top + 5), label, font=font, fill=black)

    canvas.save(output_file)

In [7]:
def create_image_preview(input_file, output_file, width=DEFAULT_IMAGE_SIZE, height=DEFAULT_IMAGE_SIZE):
    """Open ``input_file`` with a ``[WIDTHxHEIGHT]`` suffix and save it to ``output_file``."""
    # The bracketed suffix is part of the filename passed to wand; presumably
    # ImageMagick interprets it as a resize-on-read geometry - TODO confirm.
    sized_source = f"{input_file}[{width}x{height}]"
    with Image(filename=sized_source) as preview:
        preview.save(filename=output_file)

def create_pdf_preview(input_file, output_file, width=DEFAULT_IMAGE_SIZE, height=DEFAULT_IMAGE_SIZE):
    """Rasterise page index 0 of a PDF and save it as a preview image.

    Args:
        input_file: Path of the PDF file.
        output_file: Path the preview image is saved to.
        width: Maximum preview width in pixels.
        height: Maximum preview height in pixels.

    The page is read at resolution 300, flattened onto a white background,
    and resized with the ``>`` geometry flag, which only shrinks images
    larger than the ``width`` x ``height`` box.
    """
    # TODO: Fail to find a way to extract first page and size transform in a single Image call
    with Image(filename=f'{input_file}[0]', resolution=300) as pdf:
        with Image(pdf.sequence[0]) as single_page:
            single_page.format = 'png'
            # Set the background explicitly: removing the alpha channel
            # blends transparent areas against this colour.
            single_page.background_color = Color('white')
            single_page.alpha_channel = 'remove'
            single_page.transform(resize=f'{width}x{height}>')
            single_page.save(filename=output_file)

def image_to_text(text, width=DEFAULT_IMAGE_SIZE, height=DEFAULT_IMAGE_SIZE, font_size=20):
    """Render ``text`` line by line onto a white RGB image and return it.

    Args:
        text: Text to draw; it is split on ``'\\n'``.
        width: Image width in pixels.
        height: Image height in pixels.
        font_size: Accepted for interface compatibility; not used. Glyphs are
            drawn with the module-level ``font``.

    Returns:
        The PIL image. Drawing starts 10 pixels from the top-left corner and
        stops at the first line that would not fit above the bottom edge.
    """
    image = PIL.Image.new('RGB', (width, height), color=(255, 255, 255))
    draw = PIL.ImageDraw.Draw(image)

    margin = 10
    # Minimum row height: bottom edge of a sample with an ascender and a
    # descender, measured from the drawing origin. Advancing by ink height
    # alone makes blank lines collapse and lets neighbouring rows overlap.
    base_line_height = draw.textbbox((0, 0), "Ag", font=font)[3]

    y_text = margin
    for line in text.split('\n'):
        # Bottom of this line's ink relative to where draw.text anchors it.
        line_bottom = draw.textbbox((0, 0), line, font=font)[3]
        line_height = max(base_line_height, line_bottom)

        if y_text + line_height > height:
            break

        draw.text((margin, y_text), line, font=font, fill=(0, 0, 0))
        y_text += line_height
    return image

def create_text_preview(input_file, output_file, width=DEFAULT_IMAGE_SIZE, height=DEFAULT_IMAGE_SIZE, font_size=20):
    """Render the beginning of a UTF-8 text file as a preview image.

    Args:
        input_file: Path of the text file.
        output_file: Path the preview image is saved to.
        width: Preview width in pixels.
        height: Preview height in pixels.
        font_size: Forwarded to ``image_to_text``.

    Raises:
        UnicodeDecodeError: If the part of the file that is read is not valid UTF-8.

    Reading stops after ``height`` non-blank lines instead of loading the
    whole file. This assumes every non-blank line is at least one pixel
    tall, so later lines could not fit on the image anyway. A single very
    long line is still read in full.
    """
    head_lines = []
    visible_lines = 0
    with open(input_file, 'r', encoding='utf-8') as file:
        for line in file:
            head_lines.append(line)
            # Blank lines are kept but not counted: they may take no vertical space.
            if line.strip():
                visible_lines += 1
                if visible_lines >= height:
                    break
    image_to_text("".join(head_lines), width, height, font_size).save(output_file)

def create_ipynb_preview(input_file, output_file, width=DEFAULT_IMAGE_SIZE, height=DEFAULT_IMAGE_SIZE, font_size=20):
    """Render the leading cells of a notebook as a text preview image.

    The notebook is read as nbformat version 4. Code cells, their textual
    outputs and markdown cells are each appended under a ``###`` heading.
    Every appended piece adds 2 to a running counter, and collection stops
    once the counter exceeds ``MAX_LINES_PER_IMAGE``. The collected text is
    drawn by ``image_to_text`` and saved to ``output_file``.
    """
    with open(input_file, 'r', encoding='utf-8') as file:
        notebook = nbformat.read(file, as_version=4)

    pieces = []
    counted_lines = 0
    for cell in notebook.cells:
        kind = cell.cell_type
        if kind == 'markdown':
            pieces.append(f"### Markdown Cell\n{cell.source}\n")
            counted_lines += 2
        elif kind == 'code':
            pieces.append(f"### Code Cell\n{cell.source}\n")
            counted_lines += 2

            for output in (cell.outputs if 'outputs' in cell else ()):
                # Stream-style text first, otherwise the plain-text representation.
                if 'text' in output:
                    pieces.append(f"### Output\n{output['text']}\n")
                    counted_lines += 2
                elif 'data' in output and 'text/plain' in output['data']:
                    pieces.append(f"### Output\n{output['data']['text/plain']}\n")
                    counted_lines += 2
                if counted_lines > MAX_LINES_PER_IMAGE:
                    break
        # Checked after every cell, whatever its type.
        if counted_lines > MAX_LINES_PER_IMAGE:
            break

    image_to_text("".join(pieces), width, height, font_size).save(output_file)

def create_video_preview(input_file, output_file, width=DEFAULT_IMAGE_SIZE, height=DEFAULT_IMAGE_SIZE):
    """Save the first frame of a video, resized to ``width`` x ``height``.

    Args:
        input_file: Path of the video file.
        output_file: Path the preview image is saved to.
        width: Preview width in pixels.
        height: Preview height in pixels.

    Raises:
        ValueError: If no frame could be read from the video.
        OSError: If OpenCV reports that the preview image was not written.
    """
    video = cv2.VideoCapture(input_file)
    try:
        success, frame = video.read()
    finally:
        # Release the capture on every path, including the failure below.
        video.release()

    if not success:
        raise ValueError(f"Fail to extract frame from video: {input_file}")

    resized_frame = cv2.resize(frame, (width, height))
    # imwrite signals failure through its return value rather than an exception.
    if not cv2.imwrite(output_file, resized_frame):
        raise OSError(f"Fail to write preview image: {output_file}")

def create_preview(input_file, output_file=None, width=DEFAULT_IMAGE_SIZE, height=DEFAULT_IMAGE_SIZE):
    """Create a preview image for ``input_file`` and return its path.

    Args:
        input_file: Path of the file to preview. The handler is chosen from
            its lower-cased extension.
        output_file: Destination path. When empty, a unique ``.jpg`` name
            inside ``./output`` is generated (the directory is created if
            it does not exist).
        width: Preview width in pixels.
        height: Preview height in pixels.

    Returns:
        The path of the written preview image.

    Raises:
        ValueError: If the extension is not recognised and the file does not
            look like UTF-8 text.
    """
    if not output_file:
        output_dir = "./output"
        # generate_unique_filename only picks a name; the directory must
        # exist for the save that follows.
        os.makedirs(output_dir, exist_ok=True)
        output_file = generate_unique_filename(output_dir, "jpg")

    # splitext inspects the last path component only, so dots in directory
    # names or a leading "./" are not mistaken for an extension.
    file_extension = os.path.splitext(input_file)[1].lstrip('.').lower()

    if file_extension in ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'svg']:
        create_image_preview(input_file, output_file, width, height)
    elif file_extension == 'pdf':
        create_pdf_preview(input_file, output_file, width, height)
    elif file_extension in ['txt', 'md']:
        create_text_preview(input_file, output_file, width, height)
    elif file_extension in ['ipynb']:
        create_ipynb_preview(input_file, output_file, width, height)
    elif file_extension in ['mov', 'mp4']:
        create_video_preview(input_file, output_file, width, height)
    elif file_extension in ['xlsx']:
        create_excel_preview(input_file, output_file, width, height)
    elif file_extension in ['docx']:
        create_docx_preview(input_file, output_file, width, height)
    elif file_extension in ['pptx']:
        create_pptx_preview(input_file, output_file, width, height)
    elif is_text_file(input_file):
        # Unknown or missing extension: fall back to a text preview when the
        # content decodes as UTF-8.
        create_text_preview(input_file, output_file, width, height)
    else:
        raise ValueError(f"Unsupported file type: {file_extension}")
    return output_file


In [8]:
# Sample inputs; the path of each generated preview is printed in order.
sample_files = [
    './data/[숙제]네이버 영화_ 클레멘타인 영화평점_댓글 가져오기.ipynb',
    './data/2304.12244.pdf',
    './data/changelog.txt',
    './data/KTC_User_Guide-24.03.pdf',
    './data/LICENSE',
    './data/output.jsonl',
    './data/prof.jpg',
    './data/pyproject.toml',
    './data/README.md',
    './data/SVG_Logo.svg',
    './data/unicode.txt',
    './data/ts_480.mov',
    './data/ts.mp4',
    './data/utils.py',
    './data/환자표본자료 변수설명서(2020년).xlsx',
    './data/Symbola.docx',
    './data/SAP_MES연동.pptx',
]
for sample_file in sample_files:
    print(create_preview(sample_file))


./output/3ca27a6d-46bd-4844-9239-85a91b22dfe9.jpg
./output/02cd61ef-f9a5-45bb-86e7-04f754ce6d02.jpg
./output/ed12c966-98e6-4b8a-abed-784622612b02.jpg
./output/18064b4d-69a8-45fc-a508-15911e36ca32.jpg
./output/cf485196-5995-4144-ab02-a7a1bbe4d3ce.jpg
./output/03ee0eb3-7b60-409a-a25c-cf0b7912c098.jpg
./output/c1454570-b952-4154-9d9f-5c939095ec6b.jpg
./output/09c81d37-4abe-47e4-a64d-961623a6e9c7.jpg
./output/acbfaadf-7664-4000-ad0b-ec73cabbfefe.jpg
./output/6cf37eb5-90b3-4bfc-9a80-d7b03b0b02f4.jpg
./output/7d4554dc-d665-4c75-a567-ea0b558114d1.jpg
./output/e0f54c4a-2972-4419-9439-65fa48ef9043.jpg
./output/556b4529-4f15-48e0-98c6-fad35eeba07b.jpg
./output/081b0741-ba79-4685-8814-12338a90684a.jpg
./output/90c2c502-7be8-4116-8846-c020c8324cdf.jpg
./output/fdc18115-c4ed-41ec-953f-655355ec01ae.jpg
./output/6ea7e1a2-351b-4030-837a-b550fb9284d9.jpg
