In [None]:
import pandas as pd
import numpy as np

def create_sample(n=1000, adjustments=None):
    """
    Create a sample of participants with specified variable distributions.

    Every column is first drawn uniformly at random from the global
    ``np.random`` state, then each variable named in ``adjustments`` is
    overridden according to its target.

    Parameters:
    n (int): Number of participants in the sample.
    adjustments (dict): Optional mapping of column name to target. Names that
        are not columns of the sample are ignored. Supported targets:

        * float in [0, 1] -- two-category variables only: the share of rows
          assigned to the alphabetically first category (e.g. 'female' for
          'sex', 'no' for the yes/no items); all other rows receive the
          second category. Use the dict form to name categories explicitly.
        * dict -- maps each category to its probability; the probabilities
          must sum to 1 and the column is redrawn from that distribution.
        * any other int or float -- numeric variables only ('age', 'lr1'):
          the column is rescaled multiplicatively so that its mean equals
          the target (the resulting column holds floats).

    Returns:
    pd.DataFrame: DataFrame containing the sample with adjusted distributions.

    Raises:
    ValueError: If a percentage target is given for a variable that does not
        have exactly two categories, if a dict target does not sum to 1, if a
        mean target is given for a categorical variable, or if the target has
        an unsupported type.
    """

    # Column specification, in generation order. A list holds the categories
    # of a categorical variable; a (low, high) tuple holds the half-open
    # integer range handed to np.random.randint.
    columns = {
        'sex': ['male', 'female'],
        'age': (18, 80),
        'educ': ['primary', 'secondary', 'tertiary'],
        'income': ['low', 'middle', 'high'],
        'religion': ['none', 'christian', 'muslim', 'other'],
        'sg1': ['agree', 'neutral', 'disagree'],
        'sg9': ['yes', 'no'],
        'sc1': ['yes', 'no'],
        'sc7a': ['yes', 'no'],
        'sc7b': ['yes', 'no'],
        'pi1': ['low', 'medium', 'high'],
        'lr1': (1, 11),
        'pm3': ['support', 'oppose', 'neutral'],
        'vp1': ['option1', 'option2', 'option3'],
        'pid1': ['party1', 'party2', 'party3'],
        'trust1': ['low', 'medium', 'high'],
    }

    # Initialize the DataFrame with random data
    data = {
        name: (
            np.random.randint(spec[0], spec[1], size=n)
            if isinstance(spec, tuple)
            else np.random.choice(spec, size=n)
        )
        for name, spec in columns.items()
    }
    df = pd.DataFrame(data)

    if not adjustments:
        return df

    # Adjust variables based on the provided adjustments
    for variable, target in adjustments.items():
        if variable not in columns:
            continue

        spec = columns[variable]
        is_categorical = isinstance(spec, list)

        if isinstance(target, float) and 0 <= target <= 1:
            # Adjust binary variable to match target percentage. The
            # categories come from the specification rather than from the
            # drawn data, so the outcome does not depend on which value
            # happened to be sampled first (or at all, for small n).
            if not (is_categorical and len(spec) == 2):
                raise ValueError(f"Cannot adjust {variable} to a percentage because it has more than 2 unique values.")
            first, second = sorted(spec)
            # round() instead of truncation: 0.29 * 100 is 28.999... in floats.
            n_first = int(round(target * n))
            ordered = np.where(np.arange(n) < n_first, first, second)
            # Shuffle so the row position carries no information about the value.
            df[variable] = np.random.permutation(ordered)
        elif isinstance(target, dict):
            # Adjust categorical variable to match target distribution
            total = sum(target.values())
            # Tolerant comparison: e.g. ten shares of 0.1 sum to 0.9999999999999999.
            if abs(total - 1) > 1e-8:
                raise ValueError("The sum of the target percentages must be 1.")
            df[variable] = np.random.choice(
                list(target.keys()),
                size=n,
                p=list(target.values())
            )
        elif isinstance(target, (int, float)):
            # Adjust numerical variable to match target mean
            if is_categorical:
                raise ValueError(f"Cannot adjust {variable} to a mean because it is not numeric.")
            df[variable] = df[variable] * (target / df[variable].mean())
        else:
            raise ValueError(f"Invalid target type for {variable}.")

    return df

# Example usage: request a 1000-row sample with four adjusted variables.
adjustments = dict(
    # 80% of rows fall in one of the two sex categories; which category is
    # decided inside create_sample.
    sex=0.8,
    # Target mean age of 35.
    age=35,
    # Target shares of the education levels.
    educ=dict(primary=0.3, secondary=0.5, tertiary=0.2),
    # Target shares of the income levels.
    income=dict(low=0.4, middle=0.4, high=0.2),
)

sample_df = create_sample(adjustments=adjustments, n=1000)

# Preview the first rows of the generated sample.
print(sample_df.head())

# Report the relative frequency of each sex category in the sample.
print(sample_df.loc[:, 'sex'].value_counts(normalize=True))
