In [7]:
import unittest

In [36]:
def calculate_speeds(srt_content):
    blocks = srt_content.strip().split('\n\n')
    speeds = []
    for block in blocks:
        lines = block.split('\n')
        if len(lines) < 3: continue  # Skip malformed blocks
        
        # Parse start and end times
        start, end = lines[1].split(' --> ')
        start_time = datetime.strptime(start, '%H:%M:%S,%f')
        end_time = datetime.strptime(end, '%H:%M:%S,%f')
        duration = (end_time - start_time).total_seconds()
        
        # Calculate speed: characters per second
        text = '\n'.join(lines[2:])
        speed = len(text.replace('\n', '')) / duration if duration > 0 else 0
        speeds.append((text, speed))
    
    return speeds

def smart_join_paragraphs(text_blocks, average_speed):
    output_text = ""
    for i, (block, speed) in enumerate(text_blocks):
        if i > 0:
            # Determine if a line break is needed due to speed being 20% slower
            line_break_needed = speed < average_speed * 0.8
            # Check if a space is needed between the previous and current block, respecting CJK
            if not is_cjk(output_text[-1]) and not is_cjk(block[0]):
                output_text += " " if not line_break_needed else "\n"
            elif line_break_needed:
                output_text += "\n"
        output_text += block
    return output_text

def srt_to_text(srt_content):
    speeds = calculate_speeds(srt_content)
    if not speeds:
        return ""
    
    # Calculate average speed
    average_speed = sum(speed for _, speed in speeds) / len(speeds) if speeds else 0
    
    # Now, smart_join_paragraphs also takes into account the speeds for line breaks
    return smart_join_paragraphs(speeds, average_speed).strip()


In [37]:
class TestSrtToText(unittest.TestCase):
    def test_empty_srt(self):
        self.assertEqual(srt_to_text(""), "")

    def test_single_record_srt(self):
        srt_content = """1
00:00:01,000 --> 00:00:03,000
Hello, world!"""
        expected_text = "Hello, world!"
        self.assertEqual(srt_to_text(srt_content), expected_text)

    def test_two_record_srt(self):
        srt_content = """1
00:00:01,000 --> 00:00:03,000
Hello, world!

2
00:00:04,000 --> 00:00:06,000
This is a test."""
        expected_text = "Hello, world! This is a test."
        self.assertEqual(srt_to_text(srt_content), expected_text)
    
    def test_numeric_content_srt(self):
        srt_content = """1
00:00:01,000 --> 00:00:03,000
12345"""
        expected_text = "12345"
        self.assertEqual(srt_to_text(srt_content), expected_text)
    def test_subtitle_with_line_breaks(self):
        srt_content = """1
00:00:01,000 --> 00:00:03,000
First line
Second line"""
        expected_text = "First line\nSecond line"
        self.assertEqual(srt_to_text(srt_content), expected_text)
    def test_cjk_handling(self):
        srt_content = """1
00:00:01,000 --> 00:00:03,000
これは日本語です

2
00:00:04,000 --> 00:00:06,000
This is English."""
        expected_text = "これは日本語です This is English."
        self.assertEqual(srt_to_text(srt_content), expected_text)

    def test_cjk_joint_handling(self):
        srt_content = """1
00:00:01,000 --> 00:00:03,000
结束了

2
00:00:04,000 --> 00:00:06,000
开始了"""
        # No space should be added between CJK characters across subtitle entries
        expected_text = "结束了开始了"
        self.assertEqual(srt_to_text(srt_content), expected_text)
    def test_add_line_break_for_slower_speech(self):
        srt_content = """1
00:00:01,000 --> 00:00:02,000
Fast speech here.

2
00:00:03,000 --> 00:00:07,000
This is significantly slower speech.

3
00:00:08,000 --> 00:00:09,000
Fast again."""
        # Assuming 'Fast speech here.' and 'Fast again.' have a higher SPS than the average and
        # 'This is significantly slower speech.' has a lower SPS than the average,
        # a line break should be added after the slower speech subtitle.
        expected_text = "Fast speech here. This is significantly slower speech.\nFast again."
        self.assertEqual(srt_to_text(srt_content), expected_text)

In [38]:
unittest.main(argv=['first-arg-is-ignored'], exit=False)

F.......
FAIL: test_add_line_break_for_slower_speech (__main__.TestSrtToText)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_85905/3145460868.py", line 74, in test_add_line_break_for_slower_speech
    self.assertEqual(srt_to_text(srt_content), expected_text)
AssertionError: 'Fast speech here.\nThis is significantly slower speech. Fast again.' != 'Fast speech here. This is significantly slower speech.\nFast again.'
- Fast speech here.
- This is significantly slower speech. Fast again.+ Fast speech here. This is significantly slower speech.
+ Fast again.

----------------------------------------------------------------------
Ran 8 tests in 0.019s

FAILED (failures=1)


<unittest.main.TestProgram at 0x106e9e290>