In [34]:
def reconstruct_text(text_elem):
    """
    Rebuild the plain text of one <text> element from its token lines.

    Every <paragraph> descendant of ``text_elem`` is scanned, including text
    nested inside child elements such as <sentence> or <ne>. A line counts as
    a token line when it is non-blank and has at least nine tab-separated
    fields; only its first field (the word form) is kept.

    Words of a paragraph are joined with a single space. No original-spacing
    information is consulted, so punctuation ends up separated from the
    preceding word by a space. Paragraphs that yield no text are dropped and
    the remaining ones are joined with a newline.

    Returns:
        str: the reconstructed text, or '' when no token lines were found.
    """
    paragraphs = []
    for para in text_elem.findall('.//paragraph'):
        words = []
        # itertext() walks the text and tail of every nested element.
        for chunk in para.itertext():
            for raw in chunk.splitlines():
                fields = raw.split('\t')
                # Blank lines and lines with fewer than 9 fields are not tokens.
                if raw.strip() and len(fields) >= 9:
                    words.append(fields[0])

        joined = ' '.join(words).strip()
        if joined:
            paragraphs.append(joined)

    return '\n'.join(paragraphs)


In [35]:
import csv
from lxml import etree

def process_text_block(text_block_str):
    """
    Turn the source of one <text>...</text> block into a flat record.

    The block is wrapped in a synthetic <root> element, parsed with lxml,
    and the first <text> child is read.

    Args:
        text_block_str: the raw markup of one <text> element, as a str.

    Returns:
        dict with the keys 'id', 'msg_type', 'thread_id', 'comment_id',
        'datetime', 'title' (attribute values of the <text> element, None
        when an attribute is absent) and 'text' (the output of
        ``reconstruct_text``); or None when no <text> child is found.
    """
    # Attributes copied verbatim from the <text> tag:
    #   id       - unique text identifier
    #   msg_type - 'thread_start' or 'comment'
    attribute_names = (
        'id',
        'msg_type',
        'thread_id',
        'comment_id',
        'datetime',
        'title',
    )

    document = etree.fromstring(
        ("<root>\n" + text_block_str + "\n</root>").encode('utf-8')
    )
    node = document.find('text')
    if node is None:
        return None

    record = {name: node.get(name) for name in attribute_names}
    record['text'] = reconstruct_text(node)
    return record


In [41]:
def export_vrt_to_csv_streaming(vrt_path, csv_path, total_line=None):
    """
    Convert a VRT file to CSV, one row per <text> block, reading line by line.

    A block opens on a line whose left-stripped content starts with '<text'
    and closes on a later line whose left-stripped content starts with
    '</text'. The opening line itself is never tested for the closing tag.
    Each complete block is passed to ``process_text_block``; a non-None
    result is written as one CSV row. Lines outside a block are ignored,
    and a block still open at end of file is discarded.

    Args:
        vrt_path: path of the input VRT file (read as UTF-8).
        csv_path: path of the output CSV file (overwritten, UTF-8).
        total_line: optional total handed to tqdm for the progress bar;
            it does not affect how many lines are read.
    """
    from tqdm import tqdm

    # Single source of truth for both the header row and the row order.
    columns = [
        'id',
        'msg_type',
        'thread_id',
        'comment_id',
        'datetime',
        'title',
        'text',
    ]

    with open(vrt_path, 'r', encoding='utf-8') as fin, \
         open(csv_path, 'w', newline='', encoding='utf-8') as fout:

        writer = csv.writer(fout)
        writer.writerow(columns)

        # None while outside a <text> element, otherwise the lines so far.
        pending = None

        for line in tqdm(fin, total=total_line):
            tag_view = line.lstrip()

            if pending is None:
                # Outside a block: only an opening tag is of interest;
                # everything else (headers, comments, etc.) is skipped.
                if tag_view.startswith('<text'):
                    pending = [line]
                continue

            pending.append(line)
            if not tag_view.startswith('</text'):
                continue

            # Closing tag reached: parse the block and emit its row.
            record = process_text_block(''.join(pending))
            pending = None
            if record is not None:
                writer.writerow([record[name] for name in columns])

In [40]:
# Export s24_2021.vrt to a CSV file in the same directory.
# NOTE(review): the recorded progress bar below shows a known total, but this
# call passes no total_line (it defaults to None) - the saved output appears to
# come from a different invocation; re-run on a fresh kernel to confirm.
vrt_path = "../external_data/suomi24-2021-2023-vrt/data/s24_2021.vrt"
csv_path = "../external_data/suomi24-2021-2023-vrt/data/s24_2021.csv"
export_vrt_to_csv_streaming(vrt_path=vrt_path, csv_path=csv_path)
                

100%|██████████| 222218367/222218367 [12:04<00:00, 306598.18it/s]


In [42]:
from datasets import load_dataset

# Load the exported CSV as a single Hugging Face dataset split.
suomi24_2021 = load_dataset(
    "csv",
    data_files="../external_data/suomi24-2021-2023-vrt/data/s24_2021.csv",
    split="train",
)

Generating train split: 4762970 examples [00:25, 187776.06 examples/s]


In [48]:
# Spot-check one reconstructed message.
# Index the row first, then the column: this reads only row 4 instead of
# going through the whole "text" column to pick a single value.
suomi24_2021[4]["text"]

'K supermarketissa kassa ei voi itse päättää , kun pitää palauttaa väärin perityt eurot takaisin , kassa soittaa kauppiaalle ja kysyy mitä tehdään , kun itse ei saa päättä , siinä menee aikaa jostoinenkin . Lidlissä käyn pari kertaa viikossa , ei valittamista , kivat myyjät ja hyvät tuotteet .'