In [1]:
import csv
import sys
import os

def truncate_csv(input_file, output_file, max_length=255):
    """Copy a CSV file, shortening data cells that are longer than ``max_length``.

    The first row is treated as the header and written unchanged. In each
    following row, a cell longer than ``max_length`` characters is cut so the
    result never exceeds ``max_length``; it ends with ``'...'`` whenever the
    limit leaves room for that marker. Progress and a summary are printed.

    Args:
        input_file: Path of the CSV file to read (opened as UTF-8).
        output_file: Path of the CSV file to write (opened as UTF-8 in ``'w'``
            mode, so an existing file is replaced). It is opened while
            ``input_file`` is still being read, so use a different path.
        max_length: Maximum number of characters kept per data cell.
            Negative values are rejected.

    Returns:
        None. A missing or empty input file, a negative ``max_length`` and
        any exception raised while processing are reported with ``print``
        instead of being raised to the caller.
    """
    # Track statistics
    truncation_count = 0
    processed_rows = 0
    marker = '...'

    print(f"Reading from: {input_file}")
    print(f"Writing to: {output_file}")

    # Check if input file exists
    if not os.path.exists(input_file):
        print(f"Error: Input file '{input_file}' does not exist!")
        return

    try:
        # Validated inside the try so a non-numeric limit is reported like
        # any other processing error.
        if max_length < 0:
            print(f"Error: max_length must not be negative (got {max_length})")
            return

        with open(input_file, 'r', newline='', encoding='utf-8') as infile:
            reader = csv.reader(infile)

            # An empty file has no header row; report it before the output
            # file is created.
            header = next(reader, None)
            if header is None:
                print(f"Error: Input file '{input_file}' is empty!")
                return

            with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
                writer = csv.writer(outfile)

                # The header is copied as-is; only data cells are length-limited.
                writer.writerow(header)

                for row in reader:
                    processed_rows += 1
                    new_row = []

                    for value in row:
                        if len(value) > max_length:
                            truncation_count += 1
                            if max_length >= len(marker):
                                value = value[:max_length - len(marker)] + marker
                            else:
                                # No room for the marker: a plain cut keeps the
                                # result within the limit (a negative slice
                                # index here would make it longer instead).
                                value = value[:max_length]
                        new_row.append(value)

                    writer.writerow(new_row)

        print("CSV processing complete:")
        print(f"- Processed {processed_rows} rows")
        print(f"- Truncated {truncation_count} values that exceeded {max_length} characters")
        print(f"- Output saved to {output_file}")

    except Exception as e:
        print(f"Error processing CSV: {e}")

# Input CSV to read and output CSV to write for the truncation pass.
input_file, output_file = "mdr_trials_cleaned.csv", "cdr_trials_cleaned.csv"

truncate_csv(input_file=input_file, output_file=output_file)

Reading from: mdr_trials_cleaned.csv
Writing to: cdr_trials_cleaned.csv
CSV processing complete:
- Processed 56 rows
- Truncated 43 values that exceeded 255 characters
- Output saved to cdr_trials_cleaned.csv


In [6]:
import csv
import sys
import os
import re

def clean_for_supabase(input_file, output_file, max_length=255):
    """Copy a CSV file while sanitising and length-limiting every data cell.

    The first row is treated as the header and written unchanged. Every cell
    of the following rows goes through the nested ``clean_string`` helper,
    and a summary of how many cells were affected is printed. Judging by the
    function name the output is meant for a Supabase import; the exact
    requirements of that import are not visible here.

    Args:
        input_file: Path of the CSV file to read (opened as UTF-8).
        output_file: Path of the CSV file to write (opened as UTF-8 in ``'w'``
            mode, so an existing file is replaced). It is opened while
            ``input_file`` is still being read, so use a different path.
        max_length: Maximum number of characters kept per data cell.
            Negative values are rejected.

    Returns:
        None. A missing or empty input file, a negative ``max_length`` and
        any exception raised while processing are reported with ``print``
        instead of being raised to the caller.
    """
    # Track statistics
    truncation_count = 0
    cleaning_count = 0
    comma_count = 0
    processed_rows = 0
    marker = '...'

    print(f"Reading from: {input_file}")
    print(f"Writing to: {output_file}")

    # Check if input file exists
    if not os.path.exists(input_file):
        print(f"Error: Input file '{input_file}' does not exist!")
        return

    def clean_string(value):
        """Sanitise one cell.

        Returns a 4-tuple ``(value, was_truncated, was_cleaned, had_comma)``.
        ``was_cleaned`` is True when control characters were removed, an
        unbalanced double quote was rewritten, or a backslash-escaped quote
        was unescaped; comma replacement is reported separately.
        """
        if not isinstance(value, str):
            # Same tuple shape as the normal path so the caller can unpack it.
            return value, False, False, False

        # Commas become spaces.
        had_comma = ',' in value
        if had_comma:
            value = value.replace(',', ' ')

        before_cleaning = value

        # Drop ASCII control characters and DEL. This range includes tab,
        # CR and LF, so line breaks inside a cell are removed here and no
        # separate line-ending normalisation is needed afterwards.
        value = re.sub(r'[\x00-\x1F\x7F]', '', value)

        # An odd number of double quotes is unbalanced: turn them all into
        # single quotes.
        if value.count('"') % 2 != 0:
            value = value.replace('"', "'")

        # Remove backslashes before quotes which can cause escape issues
        value = value.replace('\\"', '"').replace("\\'", "'")

        was_cleaned = value != before_cleaning

        # Truncate if needed, never exceeding max_length.
        was_truncated = len(value) > max_length
        if was_truncated:
            if max_length >= len(marker):
                value = value[:max_length - len(marker)] + marker
            else:
                # No room for the marker: a plain cut keeps the result
                # within the limit.
                value = value[:max_length]

        return value, was_truncated, was_cleaned, had_comma

    try:
        # Validated inside the try so a non-numeric limit is reported like
        # any other processing error.
        if max_length < 0:
            print(f"Error: max_length must not be negative (got {max_length})")
            return

        with open(input_file, 'r', newline='', encoding='utf-8') as infile:
            reader = csv.reader(infile)

            # An empty file has no header row; report it before the output
            # file is created.
            header = next(reader, None)
            if header is None:
                print(f"Error: Input file '{input_file}' is empty!")
                return

            with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
                writer = csv.writer(outfile)

                # The header is copied as-is; only data cells are cleaned.
                writer.writerow(header)

                for row in reader:
                    processed_rows += 1
                    new_row = []

                    for value in row:
                        clean_value, was_truncated, was_cleaned, had_comma = clean_string(value)
                        if was_truncated:
                            truncation_count += 1
                        if was_cleaned:
                            cleaning_count += 1
                        if had_comma:
                            comma_count += 1
                        new_row.append(clean_value)

                    writer.writerow(new_row)

        print("CSV processing complete:")
        print(f"- Processed {processed_rows} rows")
        print(f"- Removed commas from {comma_count} values")
        print(f"- Truncated {truncation_count} values that exceeded {max_length} characters")
        print(f"- Cleaned {cleaning_count} values with problematic characters")
        print(f"- Output saved to {output_file}")

    except Exception as e:
        print(f"Error processing CSV: {e}")
        print(f"Exception details: {type(e).__name__}: {str(e)}")

# File names for the clean-up pass: source CSV and destination CSV.
input_file, output_file = "mdr_trials_cleaned.csv", "cdr_trials_cleaned.csv"

clean_for_supabase(input_file=input_file, output_file=output_file)

Reading from: mdr_trials_cleaned.csv
Writing to: cdr_trials_cleaned.csv
CSV processing complete:
- Processed 56 rows
- Removed commas from 218 values
- Truncated 43 values that exceeded 255 characters
- Cleaned 0 values with problematic characters
- Output saved to cdr_trials_cleaned.csv
