In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
def generate_sample_data(n_samples=1000, n_ids=100, n_files=3):
    # Ensure the output directory exists
    os.makedirs('data/input', exist_ok=True)

    for file_num in range(n_files):
        # Generate IDs
        ids = np.random.randint(1, n_ids + 1, n_samples)

        # Generate timestamps
        timestamps = pd.date_range(start='2020-01-01', periods=n_samples, freq='H')
        timestamps = timestamps.astype(int) // 10**9  # Convert to Unix timestamp

        # Generate multiple value columns
        value1 = np.random.randn(n_samples)
        value2 = np.random.randn(n_samples)
        value3 = np.random.randn(n_samples)

        # Generate categorical column
        categories = ['A', 'B', 'C', 'D']
        category = np.random.choice(categories, n_samples)

        # Create DataFrame
        df = pd.DataFrame({
            'id': ids,
            'timestamp': timestamps,
            'value1': value1,
            'value2': value2,
            'value3': value3,
            'category': category
        })

        # Add some missing values
        df.loc[np.random.choice(df.index, size=n_samples//10), 'value1'] = np.nan
        df.loc[np.random.choice(df.index, size=n_samples//10), 'value2'] = np.nan

        # Add some correlated columns
        df['correlated_value1'] = df['value1'] * 2 + np.random.randn(n_samples) * 0.1
        df['correlated_value2'] = df['value2'] * -1.5 + np.random.randn(n_samples) * 0.1

        # For the last file, add an extra column to test flexibility
        if file_num == n_files - 1:
            df['extra_column'] = np.random.randn(n_samples)

        # Save to CSV
        filename = f'data/input/sample_data_{file_num + 1}.csv'
        df.to_csv(filename, index=False)
        print(f"Generated {filename}")

In [3]:
generate_sample_data(n_samples=1000, n_ids=100, n_files=3)

Generated data/input/sample_data_1.csv
Generated data/input/sample_data_2.csv
Generated data/input/sample_data_3.csv


  timestamps = pd.date_range(start='2020-01-01', periods=n_samples, freq='H')
  timestamps = pd.date_range(start='2020-01-01', periods=n_samples, freq='H')
  timestamps = pd.date_range(start='2020-01-01', periods=n_samples, freq='H')
