In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

# Set display options for better output
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Set matplotlib style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")


In [13]:

df = pd.read_json("data/sharegpt-portuguese.json")

df = df.drop(columns=["lang"])
source = []
text = []
ids = []
# Rename 'gpt' labels to 'llm' for consistency
source = ['llm' if label == 'gpt' else label for label in source]

df = pd.DataFrame({'label': source, 'text': text})

In [14]:
df_1 = pd.read_csv("data/imdb-reviews-pt-br.csv")
df_1 = df_1.drop(columns=["id", "text_en", "sentiment"])    
df_1 = df_1.rename(columns={"text_pt": "text"})
df_1['label'] = ['llm' for i in range(len(df_1))]


In [15]:
df_2 = pd.read_csv("data/boolq.csv")
label= []
text = []
for i, row in df_2.iterrows():
    label.append('human')
    text.append(row['passage'])
df_2 = pd.DataFrame({'label': label, 'text': text})


In [16]:
df_3 = pd.read_csv("data/validation_bool.csv")
label= []
text = []
for i, row in df_3.iterrows():
    label.append('human')
    text.append(row['passage'])
df_3 = pd.DataFrame({'label': label, 'text': text})


In [17]:
def process_paragraphs_to_text(data_dict):
    """
    Process data in the format {'paragraphs': [['text1'], ['text2'], ...]} 
    into a single text block.
    """
    if 'paragraphs' not in data_dict:
        raise ValueError("Data must contain 'paragraphs' key")
    
    # Extract all text from paragraphs and join them
    all_text = []
    for paragraph in data_dict['paragraphs']:
        # Each paragraph is a list, join its elements if there are multiple
        if isinstance(paragraph, list):
            paragraph_text = ' '.join(paragraph)
        else:
            paragraph_text = str(paragraph)
        all_text.append(paragraph_text)
    
    # Join all paragraphs with double newlines for readability
    return '\n'.join(all_text)

In [None]:
import glob
from tqdm import tqdm
import pandas as pd
import pyarrow.parquet as pq

# Get all parquet files in the data folder
parquet_files = glob.glob("data/brwac/*.parquet")

# Initialize lists outside the loop to avoid memory fragmentation
text = []
label = []

for parquet_file in tqdm(parquet_files, desc="Processing parquet files"):
    # Read parquet file metadata to get row count
    parquet_file_obj = pq.ParquetFile(parquet_file)
    
    # Process in batches using pyarrow
    batch_size = 100000  # Adjust based on your available memory
    for batch in parquet_file_obj.iter_batches(batch_size=batch_size):
        # Convert batch to pandas DataFrame
        df_batch = batch.to_pandas()
        
        # Process batch by batch instead of row by row
        for i, row in df_batch.iterrows():
            processed_text = process_paragraphs_to_text(row['text'])
            text.append(processed_text)
            label.append('human')
        
        # Clear batch from memory after processing
        del df_batch, batch

# Create DataFrame only once at the end
df_4 = pd.DataFrame({'label': label, 'text': text})

print(f"Total labels: {len(label)}")
print(f"Total texts: {len(text)}")

Processing parquet files:  86%|████████▌ | 18/21 [07:35<01:16, 25.51s/it]

In [None]:
labels = []
text = []
parquet_files = glob.glob("data/canarim/*.parquet")
for parquet_file in parquet_files:
    df_temp = pd.read_parquet(parquet_file)
    for i, row in df_temp.iterrows():
        labels.append('llm')
        text.append(row['output'])
df_5 = pd.DataFrame({'label': labels, 'text': text})

In [None]:
# Join all 5 DataFrames, 
df_combined = pd.concat([df, df_1, df_2, df_3, df_4, df_5], ignore_index=True)

# Describe the label distribution
label_counts = df_combined['label'].value_counts()
print("Label distribution:")
print(label_counts)
print(f"\nTotal samples: {len(df_combined)}")
print(f"Human samples: {label_counts.get('human', 0)} ({label_counts.get('human', 0)/len(df_combined)*100:.2f}%)")
print(f"LLM samples: {label_counts.get('llm', 0)} ({label_counts.get('llm', 0)/len(df_combined)*100:.2f}%)")

In [None]:

# Describe the label distribution
label_counts = df_combined['label'].value_counts()
print("Label distribution:")
print(label_counts)
print(f"\nTotal samples: {len(df_combined)}")
print(f"Human samples: {label_counts.get('human', 0)} ({label_counts.get('human', 0)/len(df_combined)*100:.2f}%)")
print(f"LLM samples: {label_counts.get('llm', 0)} ({label_counts.get('llm', 0)/len(df_combined)*100:.2f}%)")

In [None]:
def hybrid_balance(df, target_ratio=0.3):
    """
    Hybrid approach: downsample majority and upsample minority for better balance.
    """
    
    print(f"Original dataset size: {len(df)} rows")
    print("Original label distribution:")
    print(df['label'].value_counts())
    
    # Separate human and LLM samples
    human_samples = df[df['label'] == 'human']
    llm_samples = df[df['label'] == 'llm']
    
    # Target size for each class
    target_size = int((len(human_samples) + len(llm_samples)) * target_ratio)
    
    # Downsample human data
    human_balanced = human_samples.sample(n=target_size)
    
    # Upsample LLM data
    llm_upsampled = llm_samples.sample(n=target_size, replace=True, random_state=42)
    
    # Combine datasets
    df_balanced = pd.concat([human_balanced, llm_upsampled], ignore_index=True)
    
    # Shuffle the final dataset
    df_balanced = df_balanced.sample(frac=1).reset_index(drop=True)
    
    print(f"\nBalanced dataset size: {len(df_balanced)} rows")
    print("Balanced label distribution:")
    print(df_balanced['label'].value_counts())
    
    # Save balanced dataset
    df_balanced.to_csv("data/balanced.csv", index=False)
    print(f"\nBalanced dataset saved to data/balanced.csv")

# Usage
hybrid_balance(df_combined, target_ratio=0.3)

In [None]:
df_combined.to_csv("combined.csv", index=False)

In [2]:
df_combined = pd.read_csv("combined.csv")
#df_balanced = pd.read_csv("balanced.csv")

In [3]:
def filter_and_chunk_text(df, min_length=200, max_length=10000, chunk_overlap=0):
    """
    Filter out texts smaller than min_length and split texts longer than max_length into chunks.
    
    Parameters:
    - df: DataFrame with 'text' and 'label' columns
    - min_length: minimum character length to keep (default: 100)
    - max_length: maximum character length before chunking (default: 10000)
    - chunk_overlap: number of characters to overlap between chunks (default: 200)
    
    Returns:
    - DataFrame with filtered and chunked texts
    """
    
    print(f"Original dataset size: {len(df)} rows")
    
    # Filter out texts that are too short
    df_filtered = df[df['text'].str.len() >= min_length].copy()
    removed_short = len(df) - len(df_filtered)
    print(f"Removed {removed_short} entries shorter than {min_length} characters")
    
    # Separate texts that need chunking from those that don't
    df_normal = df_filtered[df_filtered['text'].str.len() <= max_length].copy()
    df_to_chunk = df_filtered[df_filtered['text'].str.len() > max_length].copy()
    
    print(f"Texts within normal range: {len(df_normal)}")
    print(f"Texts requiring chunking: {len(df_to_chunk)}")
    
    # Process chunked texts
    chunked_rows = []
    total_chunks = 0
    
    for idx, row in df_to_chunk.iterrows():
        text = row['text']
        label = row['label']
        
        # Create chunks with overlap
        chunks = create_text_chunks(text, max_length, chunk_overlap)
        total_chunks += len(chunks)
        
        for chunk in chunks:
            chunked_rows.append({
                'text': chunk,
                'label': label,
                'original_length': len(text),
                'chunk_id': f"{idx}_{len(chunked_rows)}"
            })
    
    # Create DataFrame from chunked data
    if chunked_rows:
        df_chunked = pd.DataFrame(chunked_rows)
        # Combine normal and chunked data
        df_final = pd.concat([
            df_normal.assign(original_length=df_normal['text'].str.len(), chunk_id=''),
            df_chunked
        ], ignore_index=True)
    else:
        df_final = df_normal.assign(original_length=df_normal['text'].str.len(), chunk_id='')
    
    print(f"Final dataset size: {len(df_final)} rows")
    print(f"Total chunks created from long texts: {total_chunks}")
    
    return df_final

def create_text_chunks(text, max_length, overlap):
    """
    Split text into chunks with specified overlap.
    
    Parameters:
    - text: input text to chunk
    - max_length: maximum length of each chunk
    - overlap: number of characters to overlap between chunks
    
    Returns:
    - List of text chunks
    """
    if len(text) <= max_length:
        return [text]
    
    chunks = []
    start = 0
    
    while start < len(text):
        # Calculate end position
        end = start + max_length
        
        # If this is not the last chunk, try to break at a good point
        if end < len(text):
            # Look for a good breaking point (sentence end, paragraph, or space)
            break_points = ['. ', '.\n', '\n\n', ' ']
            best_break = end
            
            for break_char in break_points:
                # Look backward from the end position for a good break
                last_break = text.rfind(break_char, start, end)
                if last_break > start + max_length // 2:  # Don't break too early
                    best_break = last_break + len(break_char)
                    break
            
            chunk = text[start:best_break].strip()
        else:
            # Last chunk - take the rest
            chunk = text[start:].strip()
        
        if chunk:  # Only add non-empty chunks
            chunks.append(chunk)
        
        # Move start position with overlap
        if end >= len(text):
            break
        start = max(start + max_length - overlap, best_break - overlap)
        
        # Ensure we're making progress
        if start <= 0:
            start = max_length - overlap
    
    return chunks


In [None]:
# Apply filtering and chunking to the combined dataset
print("Applying filtering and chunking to df_combined...")
df_processed = filter_and_chunk_text(df_combined, min_length=100, max_length=10000, chunk_overlap=0)

# Display statistics
print("\n" + "="*50)
print("PROCESSING RESULTS")
print("="*50)

# Length distribution analysis
length_stats = df_processed['text'].str.len().describe()
print(f"\nText length statistics after processing:")
print(length_stats)

# Label distribution
label_counts = df_processed['label'].value_counts()
print(f"\nLabel distribution after processing:")
print(label_counts)
print(f"Human samples: {label_counts.get('human', 0)} ({label_counts.get('human', 0)/len(df_processed)*100:.2f}%)")
print(f"LLM samples: {label_counts.get('llm', 0)} ({label_counts.get('llm', 0)/len(df_processed)*100:.2f}%)")

# Chunked entries analysis
chunked_entries = df_processed[df_processed['chunk_id'] != '']
print(f"\nChunked entries: {len(chunked_entries)}")
if len(chunked_entries) > 0:
    print(f"Average original length of chunked texts: {chunked_entries['original_length'].mean():.0f} chars")
    print(f"Max original length: {chunked_entries['original_length'].max():.0f} chars")
    print(f"Unique original texts that were chunked: {chunked_entries['original_length'].nunique()}")

# Save processed dataset
df_processed.to_csv("processed_filtered_chunked.csv", index=False)
print(f"\nProcessed dataset saved to data/processed_filtered_chunked.csv")


Applying filtering and chunking to df_combined...
Original dataset size: 2331317 rows
Removed 171510 entries shorter than 100 characters
Texts within normal range: 1992995
Texts requiring chunking: 166812


In [None]:
# Additional analysis functions for the filtered and chunked data

def analyze_chunking_results(df):
    """
    Provide detailed analysis of the chunking results.
    """
    print("DETAILED CHUNKING ANALYSIS")
    print("="*50)
    
    # Basic statistics
    total_rows = len(df)
    chunked_rows = len(df[df['chunk_id'] != ''])
    original_rows = len(df[df['chunk_id'] == ''])
    
    print(f"Total rows: {total_rows}")
    print(f"Original (non-chunked) rows: {original_rows}")
    print(f"Chunked rows: {chunked_rows}")
    print(f"Chunking ratio: {chunked_rows/total_rows*100:.2f}%")
    
    # Length distribution
    print(f"\nLength distribution:")
    print(f"Min length: {df['text'].str.len().min()}")
    print(f"Max length: {df['text'].str.len().max()}")
    print(f"Mean length: {df['text'].str.len().mean():.1f}")
    print(f"Median length: {df['text'].str.len().median():.1f}")
    
    # Chunked vs original comparison by label
    print(f"\nBreakdown by label:")
    for label in df['label'].unique():
        label_data = df[df['label'] == label]
        label_chunked = len(label_data[label_data['chunk_id'] != ''])
        label_original = len(label_data[label_data['chunk_id'] == ''])
        print(f"  {label.upper()}:")
        print(f"    Original entries: {label_original}")
        print(f"    Chunked entries: {label_chunked}")
        print(f"    Chunking ratio: {label_chunked/(label_chunked + label_original)*100:.2f}%")

def create_balanced_chunked_dataset(df, target_ratio=0.3):
    """
    Create a balanced dataset from the filtered and chunked data.
    """
    print("CREATING BALANCED DATASET FROM CHUNKED DATA")
    print("="*50)
    
    # Separate by label
    human_data = df[df['label'] == 'human']
    llm_data = df[df['label'] == 'llm']
    
    print(f"Human entries: {len(human_data)}")
    print(f"LLM entries: {len(llm_data)}")
    
    # Calculate target sizes
    total_target = int((len(human_data) + len(llm_data)) * target_ratio)
    
    # Sample from each class
    human_sampled = human_data.sample(n=min(total_target, len(human_data)), random_state=42)
    llm_sampled = llm_data.sample(n=min(total_target, len(llm_data)), replace=len(llm_data) < total_target, random_state=42)
    
    # Combine and shuffle
    balanced_df = pd.concat([human_sampled, llm_sampled], ignore_index=True)
    balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)
    
    print(f"\nBalanced dataset created:")
    print(f"Total size: {len(balanced_df)}")
    print(balanced_df['label'].value_counts())
    
    # Save balanced dataset
    balanced_df.to_csv("    .csv", index=False)
    print(f"\nBalanced dataset saved to data/balanced_filtered_chunked.csv")
    
    return balanced_df

# You can call these functions after running the filtering and chunking
# analyze_chunking_results(df_processed)
# df_balanced_chunked = create_balanced_chunked_dataset(df_processed, target_ratio=0.3)


In [2]:
import os
import gc
from tqdm import tqdm

def filter_and_chunk_text_batch(input_file="combined.csv", output_file="processed_filtered_chunked.csv", 
                                min_length=200, max_length=10000, chunk_overlap=0, 
                                batch_size=50000, intermediate_save_every=5):
    """
    Memory-efficient batch processing version of filter_and_chunk_text.
    Processes data in batches and saves intermediate results.
    
    Parameters:
    - input_file: path to input CSV file
    - output_file: path to output CSV file
    - min_length: minimum character length to keep (default: 200)
    - max_length: maximum character length before chunking (default: 10000)
    - chunk_overlap: number of characters to overlap between chunks (default: 0)
    - batch_size: number of rows to process at once (default: 50000)
    - intermediate_save_every: save intermediate results every N batches (default: 5)
    
    Returns:
    - None (saves results to file)
    """
    
    print(f"Starting batch processing of {input_file}")
    print(f"Batch size: {batch_size}")
    print(f"Intermediate saves every {intermediate_save_every} batches")
    
    # Initialize counters
    total_processed = 0
    total_removed_short = 0
    total_normal = 0
    total_chunked = 0
    total_chunks_created = 0
    batch_number = 0
    
    # Create intermediate directory if it doesn't exist
    intermediate_dir = "intermediate_results"
    os.makedirs(intermediate_dir, exist_ok=True)
    
    # Read file in batches
    chunk_iter = pd.read_csv(input_file, chunksize=batch_size)
    
    processed_batches = []
    
    for batch_df in tqdm(chunk_iter, desc="Processing batches"):
        batch_number += 1
        print(f"\nProcessing batch {batch_number} with {len(batch_df)} rows")
        
        # Process current batch
        batch_result = process_single_batch(batch_df, min_length, max_length, chunk_overlap)
        
        # Update counters
        batch_stats = batch_result['stats']
        total_processed += batch_stats['original_size']
        total_removed_short += batch_stats['removed_short']
        total_normal += batch_stats['normal_texts']
        total_chunked += batch_stats['chunked_texts']
        total_chunks_created += batch_stats['chunks_created']
        
        # Store processed batch
        processed_batches.append(batch_result['data'])
        
        # Intermediate save
        if batch_number % intermediate_save_every == 0:
            print(f"Saving intermediate results after batch {batch_number}...")
            intermediate_file = os.path.join(intermediate_dir, f"processed_batch_{batch_number}.csv")
            
            # Combine all processed batches so far
            combined_batch = pd.concat(processed_batches, ignore_index=True)
            combined_batch.to_csv(intermediate_file, index=False)
            
            print(f"Intermediate results saved to {intermediate_file}")
            print(f"Rows processed so far: {len(combined_batch)}")
            
            # Clear processed batches from memory and force garbage collection
            del combined_batch
            processed_batches = []
            gc.collect()
    
    # Final save - combine any remaining batches
    if processed_batches:
        print("\nSaving final batch...")
        final_batch = pd.concat(processed_batches, ignore_index=True)
        
        # If there are intermediate files, combine them with the final batch
        intermediate_files = [f for f in os.listdir(intermediate_dir) if f.startswith("processed_batch_")]
        
        if intermediate_files:
            print("Combining with intermediate results...")
            all_dfs = [final_batch]
            
            for int_file in sorted(intermediate_files):
                int_path = os.path.join(intermediate_dir, int_file)
                int_df = pd.read_csv(int_path)
                all_dfs.append(int_df)
            
            final_result = pd.concat(all_dfs, ignore_index=True)
        else:
            final_result = final_batch
        
        # Save final result
        final_result.to_csv(output_file, index=False)
        
        # Clean up
        del final_batch, final_result
        if intermediate_files:
            for int_file in intermediate_files:
                os.remove(os.path.join(intermediate_dir, int_file))
    else:
        # Only intermediate files exist
        intermediate_files = [f for f in os.listdir(intermediate_dir) if f.startswith("processed_batch_")]
        if intermediate_files:
            print("Combining intermediate results...")
            all_dfs = []
            
            for int_file in sorted(intermediate_files):
                int_path = os.path.join(intermediate_dir, int_file)
                int_df = pd.read_csv(int_path)
                all_dfs.append(int_df)
            
            final_result = pd.concat(all_dfs, ignore_index=True)
            final_result.to_csv(output_file, index=False)
            
            # Clean up
            for int_file in intermediate_files:
                os.remove(os.path.join(intermediate_dir, int_file))
    
    # Remove intermediate directory if empty
    try:
        os.rmdir(intermediate_dir)
    except:
        pass
    
    # Print final statistics
    print("\n" + "="*60)
    print("BATCH PROCESSING COMPLETED")
    print("="*60)
    print(f"Total rows processed: {total_processed}")
    print(f"Removed short texts: {total_removed_short}")
    print(f"Normal texts: {total_normal}")
    print(f"Texts requiring chunking: {total_chunked}")
    print(f"Total chunks created: {total_chunks_created}")
    print(f"Final dataset saved to: {output_file}")
    
    # Force garbage collection
    gc.collect()


def process_single_batch(batch_df, min_length, max_length, chunk_overlap):
    """
    Process a single batch of data with filtering and chunking.
    
    Parameters:
    - batch_df: DataFrame batch to process
    - min_length, max_length, chunk_overlap: processing parameters
    
    Returns:
    - Dictionary with processed data and statistics
    """
    original_size = len(batch_df)
    
    # Filter out texts that are too short
    batch_filtered = batch_df[batch_df['text'].str.len() >= min_length].copy()
    removed_short = original_size - len(batch_filtered)
    
    # Separate texts that need chunking from those that don't
    batch_normal = batch_filtered[batch_filtered['text'].str.len() <= max_length].copy()
    batch_to_chunk = batch_filtered[batch_filtered['text'].str.len() > max_length].copy()
    
    # Process chunked texts
    chunked_rows = []
    chunks_created = 0
    
    for idx, row in batch_to_chunk.iterrows():
        text = row['text']
        label = row['label']
        
        # Create chunks with overlap
        chunks = create_text_chunks_efficient(text, max_length, chunk_overlap)
        chunks_created += len(chunks)
        
        for i, chunk in enumerate(chunks):
            chunked_rows.append({
                'text': chunk,
                'label': label,
                'original_length': len(text),
                'chunk_id': f"{idx}_{i}"
            })
    
    # Create DataFrame from chunked data
    if chunked_rows:
        batch_chunked = pd.DataFrame(chunked_rows)
        # Combine normal and chunked data
        batch_result = pd.concat([
            batch_normal.assign(original_length=batch_normal['text'].str.len(), chunk_id=''),
            batch_chunked
        ], ignore_index=True)
    else:
        batch_result = batch_normal.assign(original_length=batch_normal['text'].str.len(), chunk_id='')
    
    # Statistics for this batch
    stats = {
        'original_size': original_size,
        'removed_short': removed_short,
        'normal_texts': len(batch_normal),
        'chunked_texts': len(batch_to_chunk),
        'chunks_created': chunks_created,
        'final_size': len(batch_result)
    }
    
    return {
        'data': batch_result,
        'stats': stats
    }


In [3]:
def create_text_chunks_efficient(text, max_length, overlap):
    """
    Memory-efficient version of create_text_chunks.
    Uses generators and minimal memory allocation.
    
    Parameters:
    - text: input text to chunk
    - max_length: maximum length of each chunk
    - overlap: number of characters to overlap between chunks
    
    Returns:
    - List of text chunks
    """
    if len(text) <= max_length:
        return [text]
    
    chunks = []
    start = 0
    text_length = len(text)
    
    # Pre-define break points for efficiency
    break_points = ['. ', '.\n', '\n\n', ' ']
    half_max = max_length // 2
    
    while start < text_length:
        # Calculate end position
        end = min(start + max_length, text_length)
        
        # If this is not the last chunk, try to break at a good point
        if end < text_length:
            best_break = end
            
            # Use more efficient searching
            for break_char in break_points:
                break_char_len = len(break_char)
                # Look backward from the end position for a good break
                search_start = max(start + half_max, start)
                last_break = text.rfind(break_char, search_start, end)
                
                if last_break != -1:  # Found a good break point
                    best_break = last_break + break_char_len
                    break
            
            # Extract chunk efficiently
            chunk = text[start:best_break].strip()
            next_start = max(start + max_length - overlap, best_break - overlap)
        else:
            # Last chunk - take the rest
            chunk = text[start:].strip()
            next_start = text_length  # This will end the loop
        
        # Only add non-empty chunks
        if chunk:
            chunks.append(chunk)
        
        # Update start position and ensure progress
        if next_start <= start:
            next_start = start + max_length - overlap
        start = next_start
    
    return chunks


def estimate_memory_usage(input_file="combined.csv", batch_size=50000):
    """
    Estimate memory usage and provide recommendations for batch processing.
    
    Parameters:
    - input_file: path to input CSV file
    - batch_size: proposed batch size
    
    Returns:
    - Dictionary with memory estimates and recommendations
    """
    print("Estimating memory usage...")
    
    # Read a small sample to estimate row size
    sample_df = pd.read_csv(input_file, nrows=1000)
    
    # Calculate average row size in memory
    memory_usage = sample_df.memory_usage(deep=True).sum()
    avg_row_size = memory_usage / len(sample_df)
    
    # Estimate total file size
    total_rows = sum(1 for _ in open(input_file)) - 1  # Subtract header
    estimated_total_memory = avg_row_size * total_rows
    
    # Estimate batch memory usage
    batch_memory = avg_row_size * batch_size
    
    # Calculate chunking overhead (estimated 2x increase due to chunking)
    chunking_overhead = 2.0
    estimated_batch_memory_with_overhead = batch_memory * chunking_overhead
    
    print(f"\nMEMORY USAGE ESTIMATES")
    print("="*40)
    print(f"Total rows in file: {total_rows:,}")
    print(f"Average row size: {avg_row_size:.2f} bytes")
    print(f"Estimated total file memory: {estimated_total_memory / (1024**2):.1f} MB")
    print(f"Batch size: {batch_size:,} rows")
    print(f"Estimated batch memory: {batch_memory / (1024**2):.1f} MB")
    print(f"Est. batch memory with chunking: {estimated_batch_memory_with_overhead / (1024**2):.1f} MB")
    
    # Recommendations
    recommendations = []
    if estimated_batch_memory_with_overhead > 2 * 1024**3:  # > 2GB
        recommendations.append("Consider reducing batch_size to 25000 or less")
    if estimated_batch_memory_with_overhead > 8 * 1024**3:  # > 8GB
        recommendations.append("WARNING: Batch size may cause out-of-memory errors")
    
    if recommendations:
        print(f"\nRECOMMENDATIONS:")
        for rec in recommendations:
            print(f"- {rec}")
    else:
        print(f"\nBatch size looks good for memory usage!")
    
    return {
        'total_rows': total_rows,
        'avg_row_size': avg_row_size,
        'estimated_total_memory_mb': estimated_total_memory / (1024**2),
        'estimated_batch_memory_mb': estimated_batch_memory_with_overhead / (1024**2),
        'recommendations': recommendations
    }


In [4]:
# Test the memory-efficient batch processing
# First, let's estimate memory usage
print("Estimating memory usage for combined.csv...")
memory_info = estimate_memory_usage("combined.csv", batch_size=50000)

# Run the batch processing with appropriate parameters
print("\n" + "="*60)
print("Starting memory-efficient batch processing...")
print("="*60)

# Adjust batch size based on memory estimates if needed
recommended_batch_size = 50000
if memory_info['estimated_batch_memory_mb'] > 1000:  # > 1GB
    recommended_batch_size = 25000
    print(f"Reducing batch size to {recommended_batch_size} due to memory constraints")

# Run the batch processing
filter_and_chunk_text_batch(
    input_file="combined.csv",
    output_file="processed_filtered_chunked_batch.csv",
    min_length=100,
    max_length=10000,
    chunk_overlap=0,
    batch_size=recommended_batch_size,
    intermediate_save_every=3  # Save every 3 batches to avoid losing progress
)


Estimating memory usage for combined.csv...
Estimating memory usage...

MEMORY USAGE ESTIMATES
Total rows in file: 71,822,334
Average row size: 1520.40 bytes
Estimated total file memory: 104139.8 MB
Batch size: 50,000 rows
Estimated batch memory: 72.5 MB
Est. batch memory with chunking: 145.0 MB

Batch size looks good for memory usage!

Starting memory-efficient batch processing...
Starting batch processing of combined.csv
Batch size: 50000
Intermediate saves every 3 batches


Processing batches: 1it [00:00,  2.00it/s]


Processing batch 1 with 50000 rows

Processing batch 2 with 50000 rows


Processing batches: 2it [00:02,  1.35s/it]


Processing batch 3 with 50000 rows
Saving intermediate results after batch 3...


Processing batches: 3it [00:09,  3.85s/it]

Intermediate results saved to intermediate_results/processed_batch_3.csv
Rows processed so far: 168970

Processing batch 4 with 50000 rows


Processing batches: 4it [00:10,  2.93s/it]


Processing batch 5 with 50000 rows


Processing batches: 5it [00:12,  2.40s/it]


Processing batch 6 with 50000 rows
Saving intermediate results after batch 6...


Processing batches: 6it [00:19,  4.16s/it]

Intermediate results saved to intermediate_results/processed_batch_6.csv
Rows processed so far: 173131

Processing batch 7 with 50000 rows


Processing batches: 8it [00:22,  2.64s/it]


Processing batch 8 with 50000 rows

Processing batch 9 with 50000 rows
Saving intermediate results after batch 9...


Processing batches: 9it [00:30,  4.27s/it]

Intermediate results saved to intermediate_results/processed_batch_9.csv
Rows processed so far: 174971

Processing batch 10 with 50000 rows


Processing batches: 10it [00:32,  3.55s/it]


Processing batch 11 with 50000 rows


Processing batches: 11it [00:34,  3.03s/it]


Processing batch 12 with 50000 rows
Saving intermediate results after batch 12...


Processing batches: 12it [00:42,  4.72s/it]

Intermediate results saved to intermediate_results/processed_batch_12.csv
Rows processed so far: 180324

Processing batch 13 with 50000 rows


Processing batches: 13it [00:44,  3.78s/it]


Processing batch 14 with 50000 rows


Processing batches: 14it [00:45,  3.12s/it]


Processing batch 15 with 50000 rows
Saving intermediate results after batch 15...


Processing batches: 15it [00:54,  4.68s/it]

Intermediate results saved to intermediate_results/processed_batch_15.csv
Rows processed so far: 178472

Processing batch 16 with 50000 rows


Processing batches: 16it [00:56,  3.92s/it]


Processing batch 17 with 50000 rows


Processing batches: 17it [00:58,  3.22s/it]


Processing batch 18 with 50000 rows
Saving intermediate results after batch 18...


Processing batches: 18it [01:06,  4.67s/it]

Intermediate results saved to intermediate_results/processed_batch_18.csv
Rows processed so far: 179664

Processing batch 19 with 50000 rows


Processing batches: 19it [01:07,  3.78s/it]


Processing batch 20 with 50000 rows


Processing batches: 20it [01:09,  3.12s/it]


Processing batch 21 with 50000 rows
Saving intermediate results after batch 21...


Processing batches: 21it [01:16,  4.38s/it]

Intermediate results saved to intermediate_results/processed_batch_21.csv
Rows processed so far: 173778

Processing batch 22 with 50000 rows


Processing batches: 22it [01:18,  3.51s/it]


Processing batch 23 with 50000 rows


Processing batches: 23it [01:19,  2.99s/it]


Processing batch 24 with 50000 rows
Saving intermediate results after batch 24...


Processing batches: 24it [01:28,  4.52s/it]

Intermediate results saved to intermediate_results/processed_batch_24.csv
Rows processed so far: 179550

Processing batch 25 with 50000 rows


Processing batches: 25it [01:30,  3.98s/it]


Processing batch 26 with 50000 rows


Processing batches: 26it [01:33,  3.47s/it]


Processing batch 27 with 50000 rows
Saving intermediate results after batch 27...


Processing batches: 27it [01:43,  5.54s/it]

Intermediate results saved to intermediate_results/processed_batch_27.csv
Rows processed so far: 198815

Processing batch 28 with 50000 rows


Processing batches: 28it [01:45,  4.57s/it]


Processing batch 29 with 50000 rows


Processing batches: 29it [01:47,  3.76s/it]


Processing batch 30 with 50000 rows
Saving intermediate results after batch 30...


Processing batches: 30it [01:56,  5.17s/it]

Intermediate results saved to intermediate_results/processed_batch_30.csv
Rows processed so far: 183798

Processing batch 31 with 50000 rows


Processing batches: 31it [01:57,  4.07s/it]


Processing batch 32 with 50000 rows


Processing batches: 32it [01:59,  3.31s/it]


Processing batch 33 with 50000 rows
Saving intermediate results after batch 33...


Processing batches: 33it [02:06,  4.47s/it]

Intermediate results saved to intermediate_results/processed_batch_33.csv
Rows processed so far: 173071

Processing batch 34 with 50000 rows


Processing batches: 34it [02:07,  3.57s/it]


Processing batch 35 with 50000 rows


Processing batches: 35it [02:09,  3.08s/it]


Processing batch 36 with 50000 rows
Saving intermediate results after batch 36...


Processing batches: 36it [02:18,  4.84s/it]

Intermediate results saved to intermediate_results/processed_batch_36.csv
Rows processed so far: 183387

Processing batch 37 with 50000 rows


Processing batches: 37it [02:20,  3.95s/it]


Processing batch 38 with 50000 rows


Processing batches: 38it [02:22,  3.39s/it]


Processing batch 39 with 50000 rows
Saving intermediate results after batch 39...


Processing batches: 39it [02:32,  5.26s/it]

Intermediate results saved to intermediate_results/processed_batch_39.csv
Rows processed so far: 187793

Processing batch 40 with 50000 rows


Processing batches: 41it [02:34,  3.21s/it]


Processing batch 41 with 50000 rows

Processing batch 42 with 50000 rows
Saving intermediate results after batch 42...


Processing batches: 42it [02:39,  3.49s/it]

Intermediate results saved to intermediate_results/processed_batch_42.csv
Rows processed so far: 142089

Processing batch 43 with 50000 rows


Processing batches: 43it [02:39,  2.49s/it]


Processing batch 44 with 50000 rows

Processing batch 45 with 50000 rows
Saving intermediate results after batch 45...


Processing batches: 47it [02:39,  3.40s/it]

Intermediate results saved to intermediate_results/processed_batch_45.csv
Rows processed so far: 66266

Processing batch 46 with 50000 rows

Processing batch 47 with 31317 rows

Saving final batch...
Combining with intermediate results...






BATCH PROCESSING COMPLETED
Total rows processed: 2331317
Removed short texts: 171510
Normal texts: 1992995
Texts requiring chunking: 166812
Total chunks created: 567293
Final dataset saved to: processed_filtered_chunked_batch.csv


In [None]:
def analyze_processed_results(processed_file="processed_filtered_chunked_batch.csv"):
    """
    Analyze the results of batch processing without loading the entire dataset into memory.
    
    Parameters:
    - processed_file: path to the processed CSV file
    
    Returns:
    - Dictionary with analysis results
    """
    print(f"Analyzing processed results from {processed_file}...")
    
    # Read file in small chunks to analyze without loading everything
    chunk_size = 10000
    total_rows = 0
    label_counts = {'human': 0, 'llm': 0}
    chunk_counts = 0
    length_stats = []
    
    chunk_iter = pd.read_csv(processed_file, chunksize=chunk_size)
    
    for chunk in tqdm(chunk_iter, desc="Analyzing chunks"):
        total_rows += len(chunk)
        
        # Count labels
        chunk_label_counts = chunk['label'].value_counts()
        for label, count in chunk_label_counts.items():
            if label in label_counts:
                label_counts[label] += count
        
        # Count chunked entries
        chunk_counts += len(chunk[chunk['chunk_id'] != ''])
        
        # Sample length statistics
        if len(length_stats) < 50000:  # Collect sample for statistics
            length_stats.extend(chunk['text'].str.len().tolist())
    
    # Calculate statistics
    length_stats = pd.Series(length_stats[:50000])  # Limit for memory
    
    print(f"\nPROCESSED DATA ANALYSIS")
    print("="*40)
    print(f"Total rows: {total_rows:,}")
    print(f"Chunked entries: {chunk_counts:,} ({chunk_counts/total_rows*100:.2f}%)")
    print(f"\nLabel distribution:")
    for label, count in label_counts.items():
        print(f"  {label}: {count:,} ({count/total_rows*100:.2f}%)")
    
    print(f"\nText length statistics (sample):")
    print(f"  Min: {length_stats.min()}")
    print(f"  Max: {length_stats.max()}")
    print(f"  Mean: {length_stats.mean():.1f}")
    print(f"  Median: {length_stats.median():.1f}")
    
    return {
        'total_rows': total_rows,
        'label_counts': label_counts,
        'chunk_counts': chunk_counts,
        'length_stats': {
            'min': length_stats.min(),
            'max': length_stats.max(),
            'mean': length_stats.mean(),
            'median': length_stats.median()
        }
    }


def create_balanced_dataset_batch(input_file="processed_filtered_chunked_batch.csv", 
                                 output_file="balanced_processed.csv",
                                 target_ratio=0.3, batch_size=25000):
    """
    Create a balanced dataset from the processed file using batch processing.
    
    Parameters:
    - input_file: path to processed CSV file
    - output_file: path for balanced output
    - target_ratio: target ratio for balancing
    - batch_size: batch size for processing
    
    Returns:
    - None (saves balanced dataset to file)
    """
    print(f"Creating balanced dataset from {input_file}...")
    
    # First pass: count labels
    label_counts = {'human': 0, 'llm': 0}
    chunk_iter = pd.read_csv(input_file, chunksize=batch_size)
    
    for chunk in tqdm(chunk_iter, desc="Counting labels"):
        chunk_label_counts = chunk['label'].value_counts()
        for label, count in chunk_label_counts.items():
            if label in label_counts:
                label_counts[label] += count
    
    print(f"Label counts: {label_counts}")
    
    # Calculate target sizes
    total_target = int((label_counts['human'] + label_counts['llm']) * target_ratio)
    target_per_label = total_target // 2
    
    print(f"Target size per label: {target_per_label:,}")
    
    # Second pass: sample data
    human_sampled = []
    llm_sampled = []
    human_count = 0
    llm_count = 0
    
    chunk_iter = pd.read_csv(input_file, chunksize=batch_size)
    
    for chunk in tqdm(chunk_iter, desc="Sampling data"):
        # Sample humans
        human_chunk = chunk[chunk['label'] == 'human']
        if len(human_chunk) > 0 and human_count < target_per_label:
            needed = min(len(human_chunk), target_per_label - human_count)
            sampled = human_chunk.sample(n=needed, random_state=42)
            human_sampled.append(sampled)
            human_count += len(sampled)
        
        # Sample LLMs (with replacement if needed)
        llm_chunk = chunk[chunk['label'] == 'llm']
        if len(llm_chunk) > 0 and llm_count < target_per_label:
            needed = min(len(llm_chunk), target_per_label - llm_count)
            # Use replacement if we need more samples than available
            replace = needed > len(llm_chunk)
            sampled = llm_chunk.sample(n=needed, replace=replace, random_state=42)
            llm_sampled.append(sampled)
            llm_count += len(sampled)
        
        # Stop if we have enough samples
        if human_count >= target_per_label and llm_count >= target_per_label:
            break
    
    # Combine and save
    print("Combining and saving balanced dataset...")
    all_human = pd.concat(human_sampled, ignore_index=True)
    all_llm = pd.concat(llm_sampled, ignore_index=True)
    balanced_df = pd.concat([all_human, all_llm], ignore_index=True)
    balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)
    
    balanced_df.to_csv(output_file, index=False)
    
    print(f"\nBalanced dataset created:")
    print(f"Total size: {len(balanced_df):,}")
    print(balanced_df['label'].value_counts())
    print(f"Saved to: {output_file}")


# Example usage:
# After running the batch processing, you can analyze results and create balanced dataset:
# results = analyze_processed_results("processed_filtered_chunked_batch.csv")
# create_balanced_dataset_batch("processed_filtered_chunked_batch.csv", "balanced_processed.csv", target_ratio=0.3)
