In [1]:
import csv
import random
import re

# File locations: the raw text file to parse and the CSV file to write
input_file = 'dataset.txt'
output_file = 'dataset.csv'

In [2]:
# Load the entire raw file into memory as one string
with open(input_file, 'r', encoding='utf-8') as f:
    data_text = f.read()

# A record is written as ("text", label) with label being high, medium or low.
# Group 1 captures the quoted text (no embedded double quotes), group 2 the label.
pattern = r'\("([^"]+)",\s*(high|medium|low)\)'

# findall returns one (text, label) tuple per record found in the file
matches = re.findall(pattern, data_text)

# One dict per record, with surrounding whitespace trimmed from both fields
parsed_data = [
    {'text': text.strip(), 'label': label.strip()}
    for text, label in matches
]

In [3]:
# Shuffle the documents in place with a dedicated, seeded generator so that a
# Restart Kernel -> Run All produces the same row order every time (on the same
# Python version) and the global `random` state is left untouched.
# Note: re-running only this cell shuffles the already-shuffled list again.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(parsed_data)

# Write to CSV; newline='' lets the csv module control line endings itself
with open(output_file, 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['text', 'label'])
    writer.writeheader()
    writer.writerows(parsed_data)

print(f"✓ Converted {len(parsed_data)} documents to CSV format")
print(f"✓ Input file: {input_file}")
print(f"✓ Output file: {output_file}")
print("✓ Documents have been randomly shuffled")

# Show label distribution: count how many documents carry each label
label_counts = {}
for row in parsed_data:
    label = row['label']
    label_counts[label] = label_counts.get(label, 0) + 1

print("\nLabel distribution:")
for label, count in sorted(label_counts.items()):
    print(f"  {label}: {count}")

# Preview the first three shuffled rows, showing at most 60 characters of text
print("\nFirst 3 rows (preview):")
for i, row in enumerate(parsed_data[:3], 1):
    print(f"{i}. Label: {row['label']}, Text: {row['text'][:60]}...")

✓ Converted 705 documents to CSV format
✓ Input file: dataset.txt
✓ Output file: dataset.csv
✓ Documents have been randomly shuffled

Label distribution:
  high: 239
  low: 219
  medium: 247

First 3 rows (preview):
1. Label: high, Text: शरीरबाट निरन्तर रगत बगेको रिपोर्ट गरिएको छ...
2. Label: medium, Text: थोरै कम पिसाब र कम उमंगे...
3. Label: medium, Text: अलि कम तौल र बराबर वृद्धिमा कमी...
