In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Seed NumPy's global generator so every run produces the same metadata
np.random.seed(42)

# Vocabulary for each categorical column
disease_states = ['Healthy', 'Diseased', 'Remission']
treatment_types = ['Control', 'Treatment A', 'Treatment B', 'Placebo']
tissue_types = ['Blood', 'Liver', 'Kidney', 'Brain', 'Lung']
sequencing_protocols = ['RNA-Seq', 'WGS', 'WES', 'ChIP-Seq']
extraction_methods = ['Column Purification', 'Phenol-Chloroform', 'SPRI Beads', 'Trizol']

# Zero-padded, 1-based sample identifiers: SAMPLE_0001 ... SAMPLE_0050
sample_count = 50
sample_ids = ["SAMPLE_{:04d}".format(n + 1) for n in range(sample_count)]

# One random collection date per sample, 0-89 days after the start date.
# The offsets are drawn one at a time, in sample order, so the seeded
# sequence of draws is consumed exactly once per sample.
start_date = datetime(2023, 1, 1)
collection_dates = [
    start_date + timedelta(days=day_offset)
    for day_offset in (np.random.randint(0, 90) for _ in range(sample_count))
]

# Generate metadata
# NOTE: the np.random calls below run in dict order; under the fixed seed,
# reordering them changes the generated values.
data = {
    'sample_id': sample_ids,
    'collection_date': collection_dates,
    'disease_state': np.random.choice(disease_states, size=sample_count, p=[0.4, 0.4, 0.2]),
    'treatment': np.random.choice(treatment_types, size=sample_count),
    'tissue_type': np.random.choice(tissue_types, size=sample_count),
    'sequencing_protocol': np.random.choice(sequencing_protocols, size=sample_count),
    'extraction_method': np.random.choice(extraction_methods, size=sample_count),
    'quality_score': np.round(np.random.uniform(5.0, 10.0, size=sample_count), 1),
    'concentration_ng_ul': np.round(np.random.uniform(10.0, 200.0, size=sample_count), 2),
    'patient_age': np.random.randint(18, 85, size=sample_count),
    'patient_sex': np.random.choice(['M', 'F'], size=sample_count),
    # One note string per sample. A single shared string here would collect
    # every sample's notes and be broadcast unchanged to all rows.
    'notes': [''] * sample_count
}

# Add some correlation between disease state and other variables
for i, state in enumerate(data['disease_state']):
    if state == 'Diseased':
        # Diseased samples tend to have lower quality scores (floored at 5.0).
        # Re-round to 1 decimal so the subtraction cannot leave float noise.
        data['quality_score'][i] = max(5.0, round(data['quality_score'][i] - 1.5, 1))
        data['notes'][i] += "Advanced stage; "
    elif state == 'Remission':
        data['notes'][i] += "Post-treatment follow-up; "

    # Add some NA values
    if np.random.random() < 0.1:  # 10% chance
        data['concentration_ng_ul'][i] = np.nan
        data['notes'][i] += "Concentration measurement failed; "

# Assemble the generated columns into a single table
df = pd.DataFrame(data)

# Derived columns
# quality_flag: True where the quality score is below 7.0
df['quality_flag'] = df['quality_score'].lt(7.0)

# age_group: bucket patient_age using the bin edges 0 / 35 / 65 / 100
age_bin_edges = [0, 35, 65, 100]
age_bin_labels = ['Young', 'Middle-aged', 'Elderly']
df['age_group'] = pd.cut(df['patient_age'], bins=age_bin_edges, labels=age_bin_labels)

# Save to CSV in the current directory.
# The path is held in one variable so the confirmation message below always
# names the file that was actually written.
output_path = 'output.csv'
df.to_csv(output_path, index=False)

print(f"Sample metadata CSV file has been created: {output_path}")
print("\nFirst 5 rows of the data:")
print(df.head())

Sample metadata CSV file has been created: sample_metadata.csv

First 5 rows of the data:
     sample_id collection_date disease_state    treatment tissue_type  \
0  SAMPLE_0001      2023-02-21      Diseased  Treatment A      Kidney   
1  SAMPLE_0002      2023-01-15       Healthy      Placebo      Kidney   
2  SAMPLE_0003      2023-03-13       Healthy      Control       Brain   
3  SAMPLE_0004      2023-03-02       Healthy  Treatment B       Liver   
4  SAMPLE_0005      2023-01-21       Healthy      Placebo       Liver   

  sequencing_protocol extraction_method  quality_score  concentration_ng_ul  \
0                 WGS        SPRI Beads            8.0               193.66   
1            ChIP-Seq        SPRI Beads            9.4               193.09   
2                 WGS        SPRI Beads            8.9               172.07   
3                 WES            Trizol            8.2                65.95   
4             RNA-Seq            Trizol            5.4                83.17 