In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Load the survey sample from disk
df0 = pd.read_csv('Data.csv')

# Count the sampled rows in every (Sex, Age category, Highest education level)
# cell; grouping on three columns makes each dictionary key a 3-tuple.
strata_columns = ["Sex", "Age_category", "Highest_education_level"]
rows_per_stratum = df0.groupby(strata_columns).size()
freq_of_all_combos = rows_per_stratum.to_dict()


def synthesize(sample_freqs, sample_size, pop_size):
    """
    Synthesize a population whose strata follow the sample's proportions.

    Every combination in ``sample_freqs`` receives ``freq / sample_size`` of
    the ``pop_size`` rows. Shares are computed with exact integer arithmetic
    and the rows lost to rounding down are handed out by the largest-remainder
    method, so when the frequencies add up to ``sample_size`` every row of the
    result is populated. If the frequencies add up to less than
    ``sample_size``, the uncovered rows are left as NaN.

    Rows are assigned to combinations at random using NumPy's global random
    state (seed it with ``np.random.seed`` for reproducible output).

    :param sample_freqs: Dictionary mapping a (sex, age group, education level)
        tuple to the number of times it occurs in the sample
    :param sample_size: Size of the sample population (the denominator used to
        turn frequencies into proportions)
    :param pop_size: Size of the synthesized population
    :return: DataFrame indexed 0..pop_size-1 with the columns "Sex",
        "Age_Groups" and "Education_Levels"
    :raises ValueError: if the frequencies request more rows than ``pop_size``
    :raises ZeroDivisionError: if ``sample_size`` is 0 and frequencies are given
    """
    columns = ["Sex", "Age_Groups", "Education_Levels"]
    combos = list(sample_freqs)

    # Multiply before dividing: (freq / sample_size) * pop_size rounds twice in
    # floating point, so an exact share such as 29/100*100 could truncate to 28.
    quotas = [divmod(sample_freqs[combo] * pop_size, sample_size) for combo in combos]
    counts = [int(whole) for whole, _ in quotas]

    if sum(counts) > pop_size:
        raise ValueError(
            "sample_freqs request more rows than pop_size; "
            "check that sample_size is not smaller than the summed frequencies"
        )

    # Largest-remainder allocation: the rows lost by rounding each share down
    # go, one each, to the combinations with the biggest fractional parts.
    total_freq = sum(sample_freqs.values())
    target = min(pop_size, int(total_freq * pop_size // sample_size))
    shortfall = max(target - sum(counts), 0)
    by_remainder = sorted(
        range(len(combos)), key=lambda i: quotas[i][1], reverse=True
    )
    for i in by_remainder[:shortfall]:
        counts[i] += 1

    # One lookup row per combination, repeated according to its allocation.
    combo_table = pd.DataFrame(combos, columns=columns)
    row_combo = np.repeat(
        np.arange(len(combos)), np.asarray(counts, dtype=np.intp)
    )

    # A single shuffle of the row labels replaces re-sampling the remaining
    # empty rows on every iteration; the first len(row_combo) labels are used.
    slots = np.random.permutation(pop_size)[: row_combo.size]

    population = combo_table.iloc[row_combo]
    population.index = slots

    # Restore the 0..pop_size-1 row order; rows that received no combination
    # (only possible when the frequencies do not cover sample_size) become NaN.
    return population.reindex(pd.RangeIndex(pop_size))


# Synthesize a population of 50,000 agents.
# The sample size is read off the frequency table (the number of rows that
# were grouped) rather than hard-coded, so the proportions stay correct if
# Data.csv changes.
sample_total = sum(freq_of_all_combos.values())
final_pop = synthesize(freq_of_all_combos, sample_total, 50000)


def count_agents(column, code):
    """
    Count the synthesized agents whose value in a column equals a category code.

    Comparing values directly yields 0 for a category that is absent from the
    population, where a ``value_counts().loc[code]`` lookup would raise KeyError.

    :param column: Name of the column in ``final_pop`` to inspect
    :param code: Category code to count
    :return: Number of matching rows as a plain int
    """
    return int((final_pop[column] == code).sum())


# Print frequencies for the specified variables
print("Variable                         Description                 Frequency")
print("----------------------------------------------------------------------")

# For SEX
print(f"SEX                             : Male                       {count_agents('Sex', 1)}")
print(f"                                : Female                     {count_agents('Sex', 2)}\n")

# For AGE GROUP
print(f"AGE GROUP                       : Below  22 years            {count_agents('Age_Groups', 1)}")
print(f"                                : 22 - 60 years              {count_agents('Age_Groups', 2)}")
print(f"                                : Above 60 years             {count_agents('Age_Groups', 3)}\n")

# For HIGHEST EDUCATION LEVEL
print(f"HIGHEST EDUCATION LEVEL         : No Formal Education        {count_agents('Education_Levels', 0)}")
print(f"                                : Primary Education          {count_agents('Education_Levels', 1)}")
print(f"                                : Secondary Education        {count_agents('Education_Levels', 2)}")
print(f"                                : Graduation and above       {count_agents('Education_Levels', 3)}")


Variable                         Description                 Frequency
----------------------------------------------------------------------
SEX                             : Male                       25750
                                : Female                     24250

AGE GROUP                       : Below  22 years            17750
                                : 22 - 60 years              29500
                                : Above 60 years             2750

HIGHEST EDUCATION LEVEL         : No Formal Education        8000
                                : Primary Education          6500
                                : Secondary Education        24500
                                : Graduation and above       11000
