In [56]:
import pandas as pd
import sys
import os
import numpy as np
from sklearn.datasets import load_iris, load_diabetes
from sklearn.datasets import fetch_openml




In [40]:
import pandas as pd
import numpy as np

def extract_summary_info(df):
    """
    Extract summary information from original DataFrame,
    including numeric mean/std and categorical label probabilities.

    Returns a dict:
    {
        'col_name': {
            'type': 'numeric' or 'categorical',
            'mean': float,           # numeric only
            'std': float,            # numeric only
            'labels': [str, ...],    # categorical only
            'probs': [float, ...]    # categorical only, sum to 1
        },
        ...
    }
    """

    summary = {}

    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            # Numeric column summary
            mean = df[col].mean()
            std = df[col].std()
            summary[col] = {
                'type': 'numeric',
                'mean': mean,
                'std': std if std > 0 else 1e-6  # avoid zero std for sampling
            }
        else:
            # Categorical column summary: get normalized value counts
            counts = df[col].value_counts(normalize=True, dropna=False)
            labels = counts.index.tolist()
            probs = counts.values.tolist()
            summary[col] = {
                'type': 'categorical',
                'labels': labels,
                'probs': probs
            }

    return summary

def generate_synthetic_from_summary(summary_info, n):
    """
    Generate synthetic data from extracted summary info.

    Parameters:
    - summary_info: dict produced by extract_summary_info
    - n: number of rows to generate

    Returns:
    - synthetic_df: pandas DataFrame with synthetic data
    """

    data = {}

    for col, info in summary_info.items():
        if info['type'] == 'numeric':
            mean = info['mean']
            std = info['std']
            # Sample from normal distribution
            samples = np.random.normal(loc=mean, scale=std, size=n)
            data[col] = samples
        elif info['type'] == 'categorical':
            labels = info['labels']
            probs = info['probs']
            samples = np.random.choice(labels, size=n, p=probs)
            data[col] = samples
        else:
            data[col] = [np.nan] * n

    synthetic_df = pd.DataFrame(data)
    return synthetic_df






In [65]:
import sklearn
print("Scikit-learn version:", sklearn.__version__)

Scikit-learn version: 1.7.1


In [70]:
from sklearn.datasets import make_multilabel_classification
import numpy as np

# Define parameters for the synthetic dataset
n_samples = 100  # Number of samples (data points)
n_features = 5  # Total number of features
n_classes = 10  # Number of possible classes/labels
n_labels = 4  # Average number of labels per sample

# Generate the dataset (remove n_informative, n_redundant, n_repeated)
X, Y = make_multilabel_classification(
    n_samples=n_samples,
    n_features=n_features,
    n_classes=n_classes,
    n_labels=n_labels,
    random_state=42  # Seed for reproducibility
)

print("Shape of X (features):", X.shape)
print("Shape of Y (targets):", Y.shape)

print("\nFirst 5 rows of X (features):\n", X[:5])
print("\nFirst 5 rows of Y (targets):\n", Y[:5])

Shape of X (features): (100, 5)
Shape of Y (targets): (100, 10)

First 5 rows of X (features):
 [[12.  9. 10. 12. 13.]
 [13.  8. 11. 12.  6.]
 [16.  8. 10. 12.  4.]
 [ 9.  9.  7.  9. 18.]
 [13.  8. 11.  8. 10.]]

First 5 rows of Y (targets):
 [[0 1 0 0 0 1 0 0 1 1]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 1 1 1 0 0 0 1 1 1]
 [1 1 1 0 0 0 0 0 1 0]]


In [71]:


df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(n_features)])
df['target'] = [f'class_{i}' for i in range(n_classes)] * (n_samples // n_classes) + ['class_0'] * (n_samples % n_classes)
df['target'] = df['target'].astype('category')
# Add some categorical features
for i in range(5 ):# Adding 5 categorical features):
    df[f'cat_feature_{i}'] = np.random.choice(['A', 'B', 'C','D','E'], size=n_samples)
df_original = df.describe(include='all')


summary_info = extract_summary_info(df_original)
print("Extracted summary information:")
for col, info in summary_info.items():
    print(f"{col}: {info}")

synthetic_df = generate_synthetic_from_summary(summary_info, n=20)
print("\nSynthetic dataset generated:")
print(synthetic_df)

Extracted summary information:
feature_0: {'type': 'numeric', 'mean': np.float64(21.826031135758775), 'std': np.float64(32.31977684677651)}
feature_1: {'type': 'numeric', 'mean': np.float64(19.496691215824345), 'std': np.float64(32.9567902305616)}
feature_2: {'type': 'numeric', 'mean': np.float64(20.85270647844083), 'std': np.float64(32.65031264275359)}
feature_3: {'type': 'numeric', 'mean': np.float64(19.819411370962296), 'std': np.float64(32.865869891108325)}
feature_4: {'type': 'numeric', 'mean': np.float64(23.04553115226291), 'std': np.float64(32.067311742050975)}
target: {'type': 'categorical', 'labels': [nan, 10, np.int64(100), 'class_0'], 'probs': [0.6363636363636364, 0.18181818181818182, 0.09090909090909091, 0.09090909090909091]}
cat_feature_0: {'type': 'categorical', 'labels': [nan, np.int64(100), 5, 'D', np.int64(23)], 'probs': [0.6363636363636364, 0.09090909090909091, 0.09090909090909091, 0.09090909090909091, 0.09090909090909091]}
cat_feature_1: {'type': 'categorical', 'labe