In [11]:
import numpy as np
import pandas as pd
# Synthetic Poisson count data: the first columns (X*) carry non-zero
# coefficients (fixed, then uniform-drawn, then normal-drawn); the remaining
# columns (N*) are dummy regressors that never enter the linear predictor.
# Legacy global seeding: every value below depends on the ORDER of the
# np.random calls, so do not reorder them.
np.random.seed(42)

# Parameters
num_samples = 1000  # Number of data points
num_variables = 9  # Total number of independent variables (coefficient-bearing + dummy)
random_coef_uniform_range = (-5, 5)  # Range for uniform random coefficients
random_coef_normal_mean = 0         # Mean for normal random coefficients
random_coef_normal_std = 2          # Standard deviation for normal random coefficients
fixed_coefs = [2, -3]               # Fixed coefficients for some variables
noise_std = 1.0                     # Standard deviation of noise
max_linear_predictor = 10           # Upper cap on the linear predictor before exponentiation

# Step 1: Define how many variables get each kind of coefficient
num_fixed = len(fixed_coefs)
num_uniform = 2  # Number of variables with coefficients drawn from a uniform distribution
num_normal = 1   # Number of variables with coefficients drawn from a normal distribution
# Derived (not hard-coded) so it always matches the length of `coefficients`.
num_sig_variables = num_fixed + num_uniform + num_normal
num_dummy = num_variables - num_sig_variables  # Number of dummy variables
if num_dummy < 0:
    raise ValueError(
        f"num_variables ({num_variables}) is smaller than the number of "
        f"coefficient-bearing variables ({num_sig_variables})"
    )


def feature_column_names(n_sig, n_dummy):
    """Return the regressor column labels.

    The first ``n_sig`` columns are named ``X1..X{n_sig}``; the following
    ``n_dummy`` columns are named ``N{n_sig+1}..N{n_sig+n_dummy}``.

    Built by concatenation rather than by slicing with ``[:-n_dummy]``,
    because ``[:-0]`` is an empty slice and would drop every name when there
    are no dummy columns.
    """
    sig_names = [f"X{i + 1}" for i in range(n_sig)]
    dummy_names = [f"N{i + 1}" for i in range(n_sig, n_sig + n_dummy)]
    return sig_names + dummy_names


# Generate random coefficients (uniform draw first, then normal draw)
random_coefs_uniform = np.random.uniform(
    random_coef_uniform_range[0], random_coef_uniform_range[1], num_uniform
)
random_coefs_normal = np.random.normal(random_coef_normal_mean, random_coef_normal_std, num_normal)

# Combine fixed, uniform, and normal coefficients (this order defines X1..X{num_sig_variables})
coefficients = np.array(list(fixed_coefs) + list(random_coefs_uniform) + list(random_coefs_normal))

# Step 2: Generate independent variables (X), all uniform on [0, 1)
X = np.random.uniform(0, 1, (num_samples, num_variables))

# Step 3: Compute the linear combination using only the coefficient-bearing columns
linear_combination = np.dot(X[:, :num_sig_variables], coefficients)

# Step 4: Add noise to the linear predictor (i.e. before exponentiation)
noise = np.random.normal(0, noise_std, num_samples)
y_linear = linear_combination + noise

# Step 5: Model Poisson outcomes
# Cap the linear predictor from above only, to prevent extremely large λ = exp(y_linear)
y_linear_capped = np.clip(y_linear, None, max_linear_predictor)
y_poisson = np.random.poisson(np.exp(y_linear_capped))

# Step 6: Create a mapping of X variables to coefficient types
variable_types = ["Fixed"] * num_fixed + ["Uniform"] * num_uniform + ["Normal"] * num_normal
variable_mapping = pd.DataFrame({
    "Variable": [f"X{i+1}" for i in range(num_sig_variables)],
    "Coefficient_Type": variable_types,
    "Coefficient_Value": coefficients
})

# Combine into a DataFrame; only the Poisson count is exported as the target (y_linear is not)
data = pd.DataFrame(X, columns=feature_column_names(num_sig_variables, num_dummy))
data["Y"] = y_poisson

# Display the coefficient mapping and the first few rows of the dataset
print("Variable Mapping:")
print(variable_mapping)
print("\nSynthetic Data (First Few Rows):")
print(data.head())

# Save both tables to CSV in the current working directory
data.to_csv("d_rp.csv", index=False)
variable_mapping.to_csv("variable_mapping.csv", index=False)

Variable Mapping:
  Variable Coefficient_Type  Coefficient_Value
0       X1            Fixed           2.000000
1       X2            Fixed          -3.000000
2       X3          Uniform          -1.254599
3       X4          Uniform           4.507143
4       X5           Normal           1.295377

Synthetic Data (First Few Rows):
         X1        X2        X3        X4        X5        N6        N7  \
0  0.156019  0.155995  0.058084  0.866176  0.601115  0.708073  0.020584   
1  0.212339  0.181825  0.183405  0.304242  0.524756  0.431945  0.291229   
2  0.292145  0.366362  0.456070  0.785176  0.199674  0.514234  0.592415   
3  0.170524  0.065052  0.948886  0.965632  0.808397  0.304614  0.097672   
4  0.122038  0.495177  0.034389  0.909320  0.258780  0.662522  0.311711   

         N8        N9    Y  
0  0.969910  0.832443  343  
1  0.611853  0.139494    1  
2  0.046450  0.607545   51  
3  0.684233  0.440152  169  
4  0.520068  0.546710    5  


In [1]:
import numpy as np
import pandas as pd

# Heterogeneity-in-means Poisson simulation.
# All draws come from NumPy's legacy global stream, so the order of the
# np.random calls below is part of the result and must not change.

# Step 1: Seed the global generator for reproducibility
np.random.seed(42)

# Step 2: Sample size
n = 1000

# Step 3: Explanatory variables (one normal, one uniform, one exponential)
X1 = np.random.normal(loc=0, scale=1, size=n)
X2 = np.random.uniform(low=0, high=1, size=n)
X3 = np.random.exponential(scale=1, size=n)

# Step 4: Heterogeneity variables
Z1 = np.random.normal(loc=1, scale=0.5, size=n)
# Z2 is generated and exported, but it does not enter any parameter mean below.
Z2 = np.random.normal(loc=-1, scale=0.3, size=n)

# Step 5: Means and standard deviations of the parameters.
# beta_0 and beta_1 have constant means; beta_2 and beta_3 have means that
# shift with Z1, so those two means are length-n arrays.
beta_0_mean = 1
beta_1_mean = 0.5
beta_2_mean = -0.3 + 0.1 * Z1
beta_3_mean = 0.8 + 0.3 * Z1

# A standard deviation of 0 makes the parameter fixed at its mean.
beta_0_sd, beta_1_sd, beta_2_sd, beta_3_sd = 0, 0, 0.25, 0.35

# Step 6: Draw the parameters, strictly in the order beta_0 .. beta_3.
# beta_0 and beta_1 come out as scalars; beta_2 and beta_3 are per-observation arrays.
beta_0, beta_1, beta_2, beta_3 = (
    np.random.normal(center, spread)
    for center, spread in (
        (beta_0_mean, beta_0_sd),
        (beta_1_mean, beta_1_sd),
        (beta_2_mean, beta_2_sd),
        (beta_3_mean, beta_3_sd),
    )
)

# Step 7: Linear predictor, capped from above at 10 so exp() stays moderate
mean = np.minimum(beta_0 + beta_1 * X1 + beta_2 * X2 + beta_3 * X3, 10)
lambda_ = np.exp(mean)

# Step 8: Poisson outcome
Y = np.random.poisson(lam=lambda_)

# Step 9: Main dataset (outcome first, then regressors, then heterogeneity variables)
data = pd.DataFrame(dict(Y=Y, X1=X1, X2=X2, X3=X3, Z1=Z1, Z2=Z2))
data.to_csv("d_hm.csv", index=False)

# Step 10: Per-observation parameter values; the scalar betas are broadcast to every row
coefficients = pd.DataFrame({"observation": np.arange(1, n + 1)}).assign(
    beta_0=beta_0,
    beta_1=beta_1,
    beta_2=beta_2,
    beta_3=beta_3,
)
# NOTE(review): "coedd" looks like a typo for "coeff" — confirm nothing reads
# this file name before renaming it.
coefficients.to_csv("d_hm_coedd.csv", index=False)

# Step 11: Standard deviations of the parameters, one row per parameter
std_devs = pd.DataFrame({
    "parameter": ["beta_0", "beta_1", "beta_2", "beta_3"],
    "std_dev": [beta_0_sd, beta_1_sd, beta_2_sd, beta_3_sd],
})
std_devs.to_csv("random_parameter_standard_deviations.csv", index=False)

# Show the head of each table
print("Main Dataset:", data.head(), sep="\n")
print("\nCoefficients Dataset:", coefficients.head(), sep="\n")
print("\nStandard Deviations Dataset:", std_devs, sep="\n")

Main Dataset:
    Y        X1        X2        X3        Z1        Z2
0   6  0.496714  0.167483  0.247268  0.696650 -1.292014
1   4 -0.138264  0.104568  0.037413  1.105642 -0.697860
2   5  0.647689  0.636430  0.114318  1.600039 -0.899157
3  10  1.523030  0.706476  0.413791  0.754049 -1.068243
4   8 -0.234153  0.031586  1.622451  0.061724 -1.247181

Coefficients Dataset:
   observation  beta_0  beta_1    beta_2    beta_3
0            1     1.0     0.5 -0.124890  1.048683
1            2     1.0     0.5  0.138112  1.292340
2            3     1.0     0.5 -0.193986  1.214141
3            4     1.0     0.5  0.009774  0.989959
4            5     1.0     0.5  0.114939  0.743661

Standard Deviations Dataset:
  parameter  std_dev
0    beta_0     0.00
1    beta_1     0.00
2    beta_2     0.25
3    beta_3     0.35
