# Discovering Opportunities in New York City’s Discovery Program: Students in Highly Competitive Markets

# File "data_generator.ipynb"

In this file, we first define a function that:
1. Generates N_a samples from a Gaussian with mean mean_a and standard deviation std_dev_a for non-disadvantaged students;
2. Generates N_d samples from a Gaussian with mean mean_d and standard deviation std_dev_d for disadvantaged students;
3. For each sample, generates an ID, a complete random preference list, and a lottery number;
4. Stores the sampled values in an Excel file, data.xlsx.

Then the function is called with an input consistent with the data from the 2016-2017 school year, as discussed in Section 3 of the paper.



In [None]:
pip install pandas openpyxl numpy




In [None]:
import numpy as np
import pandas as pd
import random

def generate_excel(
    mean_a: float,
    std_dev_a: float,
    N_a: int,
    mean_d: float,
    std_dev_d: float,
    N_d: int,
    filename: str = "data.xlsx",
) -> None:
    """Generate a synthetic student population and write it to an Excel file.

    One row is produced per student, with the columns ``ID``, ``Score``,
    ``Preference``, ``Disadvantaged`` and ``Lottery``.  The ``N_a``
    non-disadvantaged students (``Disadvantaged == 0``) come first, followed
    by the ``N_d`` disadvantaged students (``Disadvantaged == 1``).

    Args:
        mean_a: Mean of the Gaussian score distribution for
            non-disadvantaged students.
        std_dev_a: Standard deviation of that distribution.
        N_a: Number of non-disadvantaged students to sample.
        mean_d: Mean of the Gaussian score distribution for disadvantaged
            students.
        std_dev_d: Standard deviation of that distribution.
        N_d: Number of disadvantaged students to sample.
        filename: Path of the Excel file to write.

    Raises:
        ValueError: If more students are requested than there are distinct
            6-digit IDs (900,000).
    """
    letters = ['Q', 'T', 'S', 'B', 'L', 'M', 'A', 'R']

    # (score mean, score std dev, number of students, disadvantaged flag)
    groups = (
        (mean_a, std_dev_a, N_a, 0),
        (mean_d, std_dev_d, N_d, 1),
    )
    # range() treats a negative count as empty, so mirror that when totalling.
    total = sum(max(count, 0) for _, _, count, _ in groups)

    # Draw every ID in one go, without replacement.  Independent draws from
    # 900,000 values collide with near certainty at realistic sample sizes,
    # and a duplicated ID makes two students indistinguishable downstream.
    id_pool = range(100000, 1000000)  # all 6-digit IDs
    if total > len(id_pool):
        raise ValueError(
            f"Cannot assign unique 6-digit IDs to {total} students; "
            f"at most {len(id_pool)} are available."
        )
    ids = iter(random.sample(id_pool, total))

    data = []
    for mean, std_dev, count, dis in groups:
        for _ in range(count):
            data.append([
                f"{next(ids)}",  # 6-digit ID, stored as text
                np.random.normal(loc=mean, scale=std_dev),
                ''.join(random.sample(letters, len(letters))),  # Random permutation
                dis,
                random.randint(1, 100000),  # lottery number (ties are possible)
            ])

    df = pd.DataFrame(data, columns=["ID", "Score", "Preference", "Disadvantaged", "Lottery"])
    df.to_excel(filename, index=False)
    # Report the number of rows actually written.
    print(f"Excel file '{filename}' generated with {len(df)} rows.")

if __name__ == "__main__":
    # Each entry pairs the parser for the typed-in value with its prompt, listed
    # in the positional order that generate_excel expects its arguments.
    questions = (
        (float, "Enter the mean of the Gaussian for non-disadvantaged students: "),
        (float, "Enter the standard deviation of the Gaussian for non-disadvantaged students: "),
        (int, "Enter the number of samples for non-disadvantaged students: "),
        (float, "Enter the mean of the Gaussian for disadvantaged students: "),
        (float, "Enter the standard deviation of the Gaussian for disadvantaged students: "),
        (int, "Enter the number of samples for disadvantaged students: "),
    )

    # Ask and parse one question at a time, so a malformed answer stops the
    # script before the next prompt is shown.
    mean_a, std_dev_a, N_a, mean_d, std_dev_d, N_d = [
        parse(input(prompt)) for parse, prompt in questions
    ]

    generate_excel(mean_a, std_dev_a, N_a, mean_d, std_dev_d, N_d)


Enter the mean of the Gaussian for non-disadvantaged students: 408.76
Enter the standard deviation of the Gaussian for non-disadvantaged students: 92.53
Enter the number of samples for non-disadvantaged students: 18723
Enter the mean of the Gaussian for disadvantaged students: 362.40
Enter the standard deviation of the Gaussian for disadvantaged students: 83.13
Enter the number of samples for disadvantaged students: 9132
Excel file 'data.xlsx' generated with 9132 rows.
