In [1]:
import pandas as pd, numpy as np
from scipy.special import expit as invlogit 

In [3]:
### Data generation for the height and sprint time collider example ###


# Population parameters. They are named once so that the simulation and the
# z-scoring below always use the same mean and standard deviation.
n_samples = 100
HEIGHT_MEAN, HEIGHT_SD = 170, 10   # cm
SPRINT_MEAN, SPRINT_SD = 14, 1.5   # seconds over 100 m

# Simulate height and sprint time from independent normal distributions
height = np.random.normal(loc=HEIGHT_MEAN, scale=HEIGHT_SD, size=n_samples).round(0)  # Height in cm
sprint = np.random.normal(loc=SPRINT_MEAN, scale=SPRINT_SD, size=n_samples).round(1)  # 100m time

# z-scores; the sprint score is negated so that a larger sz means a faster runner
hz, sz = (height - HEIGHT_MEAN) / HEIGHT_SD, -(sprint - SPRINT_MEAN) / SPRINT_SD

# Each preference below is a Bernoulli draw: a uniform number is compared with a
# logistic (inverse-logit) probability built from the z-scores.

# Likes basketball: driven equally by height and speed
bp = np.random.uniform(size=n_samples) < invlogit(3 * (hz + sz + 0.3))

# Likes dodgeball: driven by speed and, half as strongly, by height
dp = np.random.uniform(size=n_samples) < invlogit(sz + 0.5 * hz)

# Likes soccer: driven by speed only
sp = np.random.uniform(size=n_samples) < invlogit(2 * (sz - 0.5))

# Combine into a dataframe
df = pd.DataFrame({'height':height,'sprint':sprint,'basketball':bp,'dodgeball':dp,'soccer':sp})

print(df.head())

#df.to_csv('gym_class_big.csv')

   height  sprint  basketball  dodgeball  soccer
0   179.0    15.7        True      False   False
1   181.0    13.2        True       True   False
2   166.0    14.2        True       True    True
3   173.0    13.7        True       True   False
4   169.0    12.9        True       True   False


In [5]:
### Data generation for the political discrimination simulation ###

from scipy.special import expit as invlogit  # expit is the inverse logit function

# Constants
N = 1000  # Number of individuals to simulate
# Baseline probability of perceived discrimination; the per-party value from
# DISCRIMINATION_ADJUSTMENT is added to it in simulate_discrimination().
BASELINE_DISCRIMINATION = 0.15
# Party labels. Their order defines the positional meaning of BASELINE_PROBS.
PARTIES = ["Isamaa", "EKRE", "Reform", "Keskerakond", "SDE", "E200", "Parempoolsed"]
# Baseline party-choice weights, aligned index-by-index with PARTIES.
# NOTE: the raw values sum to 1.02, not 1; get_gender_adjusted_probs() divides
# by the sum before the weights are used for sampling.
BASELINE_PROBS = np.array([0.31, 0.175, 0.165, 0.14, 0.115, 0.05, 0.065])

# Conservative identity probabilities by party - the proportion of a given
# party's members who self-identify as conservatives. Each value is used as the
# success probability of a single Bernoulli draw in simulate_conservative().
CONSERVATIVE_PROBS = {
    "Isamaa": 0.9,
    "EKRE": 0.95,
    "Reform": 0.2,
    "Keskerakond": 0.8,
    "SDE": 0.08,
    "E200": 0.05,
    "Parempoolsed": 0.3
}

# Ethnicity probabilities by party - the proportion of non-Estonians among the
# voters of a given party. Each value is the probability of drawing "other" in
# simulate_ethnicity(); "estonian" is drawn with the complementary probability.
ETHNICITY_PROBS = {
    'Keskerakond': 0.61,
    'EKRE': 0.12,
    'SDE': 0.08,
    'E200': 0.02,
    'Parempoolsed': 0.04,
    'Reform': 0.06,
    'Isamaa': 0.07
}

# Discrimination adjustments by party - the amount added to
# BASELINE_DISCRIMINATION (on the probability scale, not the logit scale) to get
# the probability of perceived discrimination for a supporter of each party.
DISCRIMINATION_ADJUSTMENT = {
    "Isamaa": 0.10,
    "EKRE": 0.60,
    "Reform": 0.05,
    "Keskerakond": 0.50,
    "SDE": 0.0,
    "E200": 0.0,
    "Parempoolsed": 0.0
}

def simulate_gender(n):
    """Draw ``n`` gender labels ("male" or "female"), each equally likely."""
    gender_labels = ["male", "female"]
    # No ``p`` argument is passed, so the two labels are sampled uniformly.
    drawn = np.random.choice(gender_labels, size=n)
    return drawn

def simulate_age(n):
    """Draw ``n`` integer ages from a triangular distribution on 18-85 with mode 47."""
    youngest, most_common, oldest = 18, 47, 85
    continuous_ages = np.random.triangular(youngest, most_common, oldest, size=n)
    # Casting to int drops the fractional part of every draw.
    return continuous_ages.astype(int)

def get_gender_adjusted_probs(gender):
    """Return normalized party-choice probabilities for one individual.

    Starts from a copy of BASELINE_PROBS, bumps one party's weight depending on
    ``gender`` and rescales the result so that it sums to 1.

    Parameters
    ----------
    gender : str
        "male" raises the EKRE weight by 0.07. Any other value (in practice
        "female") raises the SDE weight by 0.04.

    Returns
    -------
    numpy.ndarray
        Probabilities aligned index-by-index with PARTIES, summing to 1.
    """
    probs = BASELINE_PROBS.copy()  # copy so the module-level baseline is never mutated
    # Look the party up by name rather than by a hard-coded position, so the
    # adjustment keeps hitting the right party if PARTIES is ever reordered.
    if gender == "male":
        probs[PARTIES.index("EKRE")] += 0.07  # Increase EKRE probability for males
    else:
        probs[PARTIES.index("SDE")] += 0.04  # Increase SDE probability for females
    return probs / probs.sum()  # Normalize probabilities

def assign_party(genders):
    """Draw one party per individual using that individual's gender-specific probabilities.

    Returns a list with one entry per element of ``genders``, in the same order.
    """
    assigned = []
    for gender in genders:
        party_probs = get_gender_adjusted_probs(gender)
        assigned.append(np.random.choice(PARTIES, p=party_probs))
    return assigned

def simulate_conservative(party_assignments):
    """Draw a 0/1 conservative-identity flag for each individual from their party's probability.

    Returns a numpy array with one entry per element of ``party_assignments``.
    """
    flags = []
    for party in party_assignments:
        p_conservative = CONSERVATIVE_PROBS[party]
        # A binomial with a single trial is a Bernoulli draw.
        flags.append(np.random.binomial(1, p_conservative))
    return np.array(flags)

def simulate_ethnicity(party_assignments):
    """Draw an ethnicity label ("estonian" or "other") for each individual from their party's probability.

    Returns a list with one entry per element of ``party_assignments``.
    """
    ethnicities = []
    for party in party_assignments:
        p_other = ETHNICITY_PROBS[party]
        label = np.random.choice(["estonian", "other"], p=[1 - p_other, p_other])
        ethnicities.append(label)
    return ethnicities

def simulate_discrimination(party_assignments, ethnicity, adjust_for_ethnicity=False,
                            ethnicity_logit_shift=0.5):
    """Simulate perceived discrimination based on party and optionally ethnicity.

    Take care with adjusting by ethnicity, as this creates a collider :)

    Parameters
    ----------
    party_assignments : sequence of str
        Party of each individual; every value must be a key of
        DISCRIMINATION_ADJUSTMENT.
    ethnicity : sequence of str
        Ethnicity label of each individual. Only consulted when
        ``adjust_for_ethnicity`` is True.
    adjust_for_ethnicity : bool, default False
        If True, individuals whose ethnicity is "other" get their probability
        shifted upwards on the log-odds scale.
    ethnicity_logit_shift : float, default 0.5
        Size of that log-odds shift.

    Returns
    -------
    numpy.ndarray
        One 0/1 draw per individual.

    Raises
    ------
    ValueError
        If ``adjust_for_ethnicity`` is True and the two input sequences differ
        in length.
    """
    # Baseline discrimination probabilities with party adjustments
    discrimination_probs = np.array(
        [BASELINE_DISCRIMINATION + DISCRIMINATION_ADJUSTMENT[p] for p in party_assignments],
        dtype=float,
    )

    # Add additional bias for non-Estonians if adjust_for_ethnicity is True
    if adjust_for_ethnicity:
        if len(ethnicity) != len(discrimination_probs):
            raise ValueError(
                "party_assignments and ethnicity must have the same length"
            )
        # Build the mask element by element so it is a boolean array of the
        # right length even when the input is empty.
        is_other = np.array([eth == "other" for eth in ethnicity], dtype=bool)
        # Move to log-odds, add the shift, and map back with the inverse logit
        # so the adjusted value is still a valid probability.
        odds = discrimination_probs[is_other] / (1 - discrimination_probs[is_other])
        discrimination_probs[is_other] = invlogit(np.log(odds) + ethnicity_logit_shift)

    return np.random.binomial(1, discrimination_probs)

def create_data(n, adjust_for_ethnicity=False):
    """Simulate ``n`` individuals and return all their variables as one DataFrame.

    Columns, in order: gender, age, ethnicity, party, conservative, discrimination.
    ``adjust_for_ethnicity`` is passed through to simulate_discrimination().
    """
    # The draws happen in this fixed order; party must exist before the
    # variables that are conditioned on it.
    gender_col = simulate_gender(n)
    age_col = simulate_age(n)
    party_col = assign_party(gender_col)
    conservative_col = simulate_conservative(party_col)
    ethnicity_col = simulate_ethnicity(party_col)
    discrimination_col = simulate_discrimination(party_col, ethnicity_col, adjust_for_ethnicity)

    # Assemble the columns in their presentation order.
    columns = {}
    columns["gender"] = gender_col
    columns["age"] = age_col
    columns["ethnicity"] = ethnicity_col
    columns["party"] = party_col
    columns["conservative"] = conservative_col
    columns["discrimination"] = discrimination_col
    return pd.DataFrame(columns)

# Simulate data and create the DataFrame. The ethnicity adjustment is OFF in this
# call (adjust_for_ethnicity=False); pass True to add the extra log-odds shift
# for individuals whose ethnicity is "other".
data = create_data(N, adjust_for_ethnicity=False)
print(data.head())


   gender  age ethnicity        party  conservative  discrimination
0  female   66  estonian         E200             0               0
1    male   40     other  Keskerakond             1               1
2    male   52  estonian       Isamaa             1               0
3    male   54  estonian       Isamaa             1               0
4    male   50  estonian       Reform             0               0
