In [1]:
import pandas as pd
import numpy as np

# Pin NumPy's global RNG; the pandas calls below are additionally seeded
# explicitly through random_state.
np.random.seed(42)

# Read the complete dataset from disk.
data = pd.read_csv('all_data.csv')

# Number of rows to draw for each value of the 'relation' column.
target_counts = {'support': 460, 'attack': 30, 'no-relation': 610}

# One sampled DataFrame per relation type is collected here.
sampled_data = []

for relation_type, count in target_counts.items():
    # Keep only the rows whose 'relation' equals the current type.
    matches_type = data['relation'] == relation_type
    relation_data = data[matches_type]

    if len(relation_data) >= count:
        # Enough rows: draw exactly `count` of them with a fixed seed.
        sampled_relation = relation_data.sample(n=count, random_state=42)
    else:
        # Too few rows: warn and fall back to every available row.
        print(f"Warning: Only {len(relation_data)} rows available for '{relation_type}', but {count} requested")
        sampled_relation = relation_data

    sampled_data.append(sampled_relation)

# Stack the per-type samples, shuffle the row order with a fixed seed, and
# renumber the index from zero.
combined = pd.concat(sampled_data, ignore_index=True)
shuffled = combined.sample(frac=1, random_state=42)
final_data = shuffled.reset_index(drop=True)

# Write the result without the index column.
final_data.to_csv('sampled_data.csv', index=False)

# Report the label distribution before and after sampling.
print("Original data distribution:", data['relation'].value_counts(), sep="\n")
print("\nNew sampled data distribution:", final_data['relation'].value_counts(), sep="\n")
print(f"\nTotal rows in new dataset: {len(final_data)}")
print("New CSV file 'sampled_data.csv' has been created successfully!")


Original data distribution:
relation
no-relation    2925
support        1846
attack          118
Name: count, dtype: int64

New sampled data distribution:
relation
no-relation    610
support        460
attack          30
Name: count, dtype: int64

Total rows in new dataset: 1100
New CSV file 'sampled_data.csv' has been created successfully!
