In [None]:
# XML generated under conditions 1-5

import re
from xml.etree.ElementTree import Element, SubElement, tostring, ElementTree

def convert_txt_to_xml(txt_file, xml_file):
    """Convert a plain-text screenplay into a flat XML file of speeches.

    A stripped line is treated as a speaker cue when it is entirely
    upper-case and does not start with ``INT.``, ``EXT.`` or ``CONTINUED``.
    Every other non-blank line that follows a cue is collected as that
    speaker's dialogue.  Each cue that collected at least one dialogue line
    becomes one ``<SPEECH>`` child of the ``<PLAY>`` root, holding a
    ``<SPEAKER>``, a ``<RECEIVER>`` and a ``<LINE>`` element.

    The receiver is the cue seen immediately before the speaker's cue (even
    if that earlier cue produced no speech); for the very first cue it is
    ``None``, which serialises as an empty ``<RECEIVER />``.

    Args:
        txt_file: Path of the screenplay text file; read as UTF-8.
        xml_file: Path of the XML file to write (UTF-8, with declaration).
    """
    excluded_prefixes = ("INT.", "EXT.", "CONTINUED")

    def is_speaker(line):
        # Upper-case lines are cues unless they are scene/continuation markers.
        return line.isupper() and not line.startswith(excluded_prefixes)

    def append_speech(speaker, receiver, dialogue):
        # Single place that builds a SPEECH element, shared by the loop and
        # the final flush.
        speech_element = SubElement(play, 'SPEECH')
        SubElement(speech_element, 'SPEAKER').text = speaker
        SubElement(speech_element, 'RECEIVER').text = receiver
        SubElement(speech_element, 'LINE').text = '\n'.join(dialogue)

    play = Element('PLAY')
    current_speaker = None
    previous_speaker = None
    speech_lines = []

    # Explicit encoding: the output is written as UTF-8, so the input must not
    # depend on the platform's default locale encoding.
    with open(txt_file, 'r', encoding='utf-8') as file:
        for line in file:
            stripped_line = line.strip()
            if not stripped_line:
                continue

            if is_speaker(stripped_line):
                # A new cue closes the speech collected for the previous one.
                if current_speaker is not None and speech_lines:
                    append_speech(current_speaker, previous_speaker, speech_lines)
                    speech_lines = []
                previous_speaker = current_speaker
                current_speaker = stripped_line
            elif current_speaker:
                # NOTE: INT./EXT./CONTINUED lines that appear after a cue land
                # here too and are kept as part of the dialogue text.
                speech_lines.append(stripped_line)

    # Flush the speech that was still open when the file ended.
    if current_speaker and speech_lines:
        append_speech(current_speaker, previous_speaker, speech_lines)

    tree = ElementTree(play)
    tree.write(xml_file, encoding='utf-8', xml_declaration=True)

# Example usage: parse 'avengers.txt' and write the resulting XML to 'output.xml'.
# This call executes as soon as the cell/module is run.
convert_txt_to_xml('avengers.txt', 'output.xml')


In [None]:
# XML generated under conditions 1-6

import re

def convert_txt_to_xml(txt_file, xml_file):
    """Convert a plain-text screenplay into a ``<PLAY>`` XML document.

    Parsing rules (applied to each stripped line):

    * A line starting with ``CONTINUED``, ``INT.`` or ``EXT.`` is dropped and
      every following line is ignored until the next speaker cue.
    * A line consisting only of ``A-Z`` and whitespace is a speaker cue.  It
      closes the speech of the previous cue and opens a new one.
    * Any other line is appended to the current speech (blank lines included,
      as empty fragments).

    Each recorded speech is written as ``<SPEECH>`` with ``<SPEAKER>``,
    ``<RECEIVER>`` and ``<LINE>`` children.  The receiver is the cue that
    preceded the speaker's cue, or ``"UNKNOWN"`` when there was none.

    Args:
        txt_file: Path of the screenplay text file; read as UTF-8.
        xml_file: Path of the XML file to write as UTF-8.
    """
    # Function-scope import so the block does not depend on other cells.
    from xml.sax.saxutils import escape

    speaker_pattern = re.compile(r'^[A-Z\s]+$')
    scene_markers = ("CONTINUED", "INT.", "EXT.")

    with open(txt_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    speeches = []
    current_speaker = None
    previous_speaker = None
    current_line = []
    ignore_until_next_speaker = False

    def add_speech(receiver):
        # Record the open speech, if there is a speaker with collected text.
        if current_speaker and current_line:
            speeches.append({
                "speaker": current_speaker,
                # The receiver is only None before a second cue has been
                # seen, i.e. for the first recorded speech.
                "receiver": receiver if receiver is not None else "UNKNOWN",
                "line": ' '.join(current_line).strip(),
            })

    for raw_line in lines:
        line = raw_line.strip()

        if line.startswith(scene_markers):
            ignore_until_next_speaker = True
            continue

        is_speaker = speaker_pattern.match(line) is not None

        if ignore_until_next_speaker:
            if not is_speaker:
                continue
            ignore_until_next_speaker = False

        if is_speaker:
            add_speech(previous_speaker)
            previous_speaker = current_speaker
            current_speaker = line
            current_line = []
        else:
            current_line.append(line)

    # Flush the speech that was still open when the file ended.
    add_speech(previous_speaker)

    # The declaration carries no encoding attribute, which means UTF-8, so the
    # file is written as UTF-8 explicitly.  All text content is escaped so
    # that '&', '<' and '>' in the screenplay cannot break the markup.
    with open(xml_file, 'w', encoding='utf-8') as f:
        f.write('<?xml version="1.0"?>\n')
        f.write('<!DOCTYPE PLAY SYSTEM "play.dtd">\n')
        f.write('<PLAY>\n')
        f.write('\t<TITLE></TITLE>\n')
        for speech in speeches:
            f.write('\t<SPEECH>\n')
            f.write(f'\t\t<SPEAKER>{escape(speech["speaker"])}</SPEAKER>\n')
            f.write(f'\t\t<RECEIVER>{escape(speech["receiver"])}</RECEIVER>\n')
            f.write(f'\t\t<LINE>{escape(speech["line"])}</LINE>\n')
            f.write('\t</SPEECH>\n')
        f.write('</PLAY>\n')

# Example usage: parse 'avengers_endgame.txt' and write 'avengers_endgame.xml'.
# This call executes as soon as the cell/module is run.
convert_txt_to_xml('avengers_endgame.txt', 'avengers_endgame.xml')

In [None]:
# XML generated from the PDF file

!pip install pymupdf
import fitz  # PyMuPDF
import xml.etree.ElementTree as ET
import re

def convert_pdf_to_xml(pdf_path):
    """Parse a screenplay PDF into an ElementTree rooted at ``<movie>``.

    The text of every page is scanned line by line (stripped):

    * A line starting with ``INT.`` or ``EXT.`` (case-insensitive) opens a
      new ``<scene>`` whose first ``<description>`` is that heading.
    * An upper-case line that does not start with ``INT``/``EXT`` and does
      not contain ``CONTINUED`` is a speaker cue.
    * While a speaker is active, every other line is buffered as dialogue.
    * Otherwise, non-empty lines inside a scene become ``<description>``
      elements.

    Buffered dialogue is emitted as ``<speech>`` (``<speaker>``, ``<line>``,
    ``<receiver>``) when the next cue or scene heading arrives, or at the end
    of the document.  The receiver is the speaker of the previously emitted
    speech, or the speaker itself when there is none yet.

    Dialogue that occurs before the first scene heading has no scene to
    attach to and is skipped, like descriptions outside a scene.

    Args:
        pdf_path: Path of the PDF file, opened with PyMuPDF (``fitz``).

    Returns:
        An ``xml.etree.ElementTree.ElementTree`` whose root is ``<movie>``.
    """
    scene_heading = re.compile(r"^(INT\.|EXT\.)", re.IGNORECASE)

    root = ET.Element("movie")
    current_scene = None
    last_speaker = None
    current_speaker = None
    buffer = []

    def add_dialogue(scene, speaker, line, receiver):
        dialogue_elem = ET.SubElement(scene, "speech")
        speaker_elem = ET.SubElement(dialogue_elem, "speaker")
        speaker_elem.text = speaker
        line_elem = ET.SubElement(dialogue_elem, "line")
        line_elem.text = line
        receiver_elem = ET.SubElement(dialogue_elem, "receiver")
        receiver_elem.text = receiver

    def flush_dialogue():
        # Emit the buffered speech of the active speaker into the current
        # scene.  Without a scene there is no parent element to attach to, so
        # the speech is skipped instead of failing inside ET.SubElement.
        nonlocal last_speaker
        if current_speaker is None or not buffer or current_scene is None:
            return
        add_dialogue(
            current_scene,
            current_speaker,
            ' '.join(buffer),
            last_speaker if last_speaker else current_speaker,
        )
        last_speaker = current_speaker

    # Open the PDF as a context manager so the document is always closed.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text = page.get_text("text")
            for line in text.split('\n'):
                line = line.strip()

                # Scene heading: close the previous scene and start a new one.
                if scene_heading.match(line):
                    # Emit the pending speech first; otherwise the last
                    # speech of every scene would be dropped when the
                    # speaker state is reset below.
                    flush_dialogue()
                    if current_scene is not None:
                        root.append(current_scene)
                    current_scene = ET.Element("scene")
                    description = ET.SubElement(current_scene, "description")
                    description.text = line
                    current_speaker = None
                    buffer = []

                # Speaker cue (CONTINUED markers are excluded).
                elif line.isupper() and not line.startswith(("INT", "EXT")) and "CONTINUED" not in line:
                    flush_dialogue()
                    current_speaker = line
                    buffer = []

                # Dialogue: keep collecting lines for the active speaker.
                elif current_speaker:
                    buffer.append(line)

                # Anything else inside a scene is descriptive text.
                elif current_scene is not None and line:
                    description = ET.SubElement(current_scene, "description")
                    description.text = line

    # Emit the speech that was still open at the end of the document.
    flush_dialogue()

    if current_scene is not None:
        root.append(current_scene)

    return ET.ElementTree(root)

def main():
    """Convert the hard-coded screenplay PDF to XML and report the output path."""
    # Input PDF and output XML locations.
    source_pdf = "/content/the-avengers-2012.pdf"
    output_file_path = "/content/output.xml"

    # Parse the PDF and serialise the resulting tree in one step.
    convert_pdf_to_xml(source_pdf).write(
        output_file_path,
        encoding='utf-8',
        xml_declaration=True,
    )

    print(f"XML 파일이 {output_file_path}에 저장되었습니다.")

# Invoke main() only when this code runs as the top-level program, not on import.
if __name__ == "__main__":
    main()


Collecting pymupdf
  Downloading PyMuPDF-1.24.5-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.3 (from pymupdf)
  Downloading PyMuPDFb-1.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.24.3 pymupdf-1.24.5
XML 파일이 /content/output.xml에 저장되었습니다.
