# Discovering Opportunities in New York City’s Discovery Program: Students in Highly Competitive Markets

# File "Verify_HCH.ipynb"

In this file, we:
1. Import student data from 'data.xlsx';
2. Initialize school data (name and capacities);
3. Define a function that runs the requested algorithm (one of DISC, MR, JSA, Base) on the data with the requested quota;
4. Use all of the above to verify that the HCH holds for synthetic data consistent with the real data from the school year 2016-2017, as discussed in Section 3 of the paper.


In [None]:
import pandas as pd
pd.options.display.max_rows = 100
import numpy as np
import pickle

In [None]:
# Read the student table from 'data.xlsx' and report the size of each group.
# The group flag is read from the fourth column (position 3):
# 0 is counted as non-disadvantaged, 1 as disadvantaged.

df = pd.read_excel("data.xlsx")

n_dis = df.iloc[:, 3].eq(1).sum()
n_adv = df.iloc[:, 3].eq(0).sum()
n_tot = n_adv + n_dis

print(f"Number non-disadvantaged students: {n_adv}")
print(f"Number disadvantaged students: {n_dis}")
print(f"Total number of students: {n_tot}")




Number non-disadvantaged students: 18723
Number disadvantaged students: 9132
Total number of students: 27855


In [None]:
# Map each one-letter school code to a record holding the school's full name.
# Further fields (e.g. 'capacity') are attached to these records later on.

sch_info = {
    code: {'name': full_name}
    for code, full_name in (
        ('B', 'Bronx High School of Science'),
        ('T', 'Brooklyn Technical High School'),
        ('R', 'Staten Island Technical High School'),
        ('L', 'Brooklyn Latin'),
        ('Q', 'Queens High School for Science at York'),
        ('M', 'High School of Mathematics, Science and Engineering at City College'),
        ('S', 'Stuyvesant High School'),
        ('A', 'High School of American Studies at Lehman College'),
    )
}



In [None]:
# Attach the total number of seats to every school record
# (figures obtained from real-world data)

for code, seats in (
    ('B', 941),
    ('T', 1923),
    ('R', 339),
    ('L', 425),
    ('Q', 179),
    ('M', 199),
    ('S', 926),
    ('A', 146),
):
    sch_info[code]['capacity'] = seats

print(sch_info)

{'B': {'name': 'Bronx High School of Science', 'capacity': 941}, 'T': {'name': 'Brooklyn Technical High School', 'capacity': 1923}, 'R': {'name': 'Staten Island Technical High School', 'capacity': 339}, 'L': {'name': 'Brooklyn Latin', 'capacity': 425}, 'Q': {'name': 'Queens High School for Science at York', 'capacity': 179}, 'M': {'name': 'High School of Mathematics, Science and Engineering at City College', 'capacity': 199}, 'S': {'name': 'Stuyvesant High School', 'capacity': 926}, 'A': {'name': 'High School of American Studies at Lehman College', 'capacity': 146}}


In [None]:
def get_assignment(df, sch_info, MQ, algo):
    """Assign students to schools, one student at a time in priority order.

    Parameters
    ----------
    df : DataFrame with one row per student. The columns read here are
        'ID', 'Score', 'Lottery', 'Preference' (a string of one-letter school
        codes, most preferred first; blanks are ignored) and, for every
        algorithm except 'Base', 'Disadvantaged' (0 = non-disadvantaged,
        anything else is treated as disadvantaged).
    sch_info : dict mapping a one-letter school code to a dict that holds at
        least the key 'capacity'.
    MQ : fraction of the seats that are non-reserved (same for each school).
        Every school reserves ceil(capacity * (1 - MQ)) seats.
    algo : one of 'DISC', 'MR', 'JSA', 'Base'.

    Returns
    -------
    A DataFrame in the index order of ``df`` with the columns 'ID',
    'sch_<algo>_<pct>' and 'rk_<algo>_<pct>', where <pct> is MQ*100 rounded.
    The 'sch' column holds the seat obtained: the school code followed by
    'a' (non-reserved seat) or 'd' (reserved seat), or the bare school code
    for 'Base'; it is ' ' for unassigned students. The 'rk' column holds the
    1-based position of that school in the student's preference list, or
    3 * len(sch_info) for unassigned students.
    If ``algo`` is not recognised, the string
    'check spelling of the Algorithm name' is returned instead.
    """
    if algo not in ('DISC', 'MR', 'JSA', 'Base'):
        return 'check spelling of the Algorithm name'

    # Priority order: highest score first, ties broken by the smaller lottery number.
    ranked = df.sort_values(['Score', 'Lottery'], ascending=[False, True])
    n_students = ranked.shape[0]

    # Seat counters. Except for 'Base', each school is split into a reserved
    # part (suffix 'd') and a non-reserved part (suffix 'a').
    seats = {}
    for code, info in sch_info.items():
        total = info['capacity']
        if algo == 'Base':
            seats[code] = {'capacity': total, 'admit': 0}
            continue
        n_reserved = np.ceil(total * (1 - MQ))
        seats[code + 'd'] = {'capacity': n_reserved, 'admit': 0}
        seats[code + 'a'] = {'capacity': total - n_reserved, 'admit': 0}

    def seat_order(letters, student):
        # Strict ordering over seats for one student, depending on the mechanism
        # (following the discussion in the Electronic Companion EC.4).
        if algo == 'Base':
            return list(letters)
        general = [c + 'a' for c in letters]
        if student['Disadvantaged'] == 0:
            return general
        if algo == 'JSA':
            # joint seat allocation: school by school, non-reserved seat first
            return [c + kind for c in letters for kind in ('a', 'd')]
        reserve = [c + 'd' for c in letters]
        if algo == 'DISC':
            # discovery program: all non-reserved seats before any reserved seat
            return general + reserve
        # minority reserve: all reserved seats before any non-reserved seat
        return reserve + general

    sch_assigned = [' '] * n_students
    rk_assigned = np.full(n_students, len(sch_info) * 3, dtype=int)

    for pos, (_, student) in enumerate(ranked.iterrows()):
        letters = list(student['Preference'].replace(" ", ""))
        # The student takes the first seat in their ordering that is still free.
        # No rank penalty is applied for seats obtained under 'DISC'.
        for seat in seat_order(letters, student):
            slot = seats[seat]
            if slot['admit'] < slot['capacity']:
                slot['admit'] += 1
                sch_assigned[pos] = seat
                rk_assigned[pos] = letters.index(seat[0]) + 1
                break

    # Attach the results and restore the original row order.
    label = '%s_%d' % (algo, np.round(MQ * 100))
    ranked['sch_' + label] = sch_assigned
    ranked['rk_' + label] = rk_assigned
    return ranked.sort_index()[['ID', 'sch_' + label, 'rk_' + label]]

In [None]:
def run_algo(df, alogs, MQs):
    """Run several assignment algorithms and collect their results next to the student data.

    Parameters
    ----------
    df : DataFrame with the student data, passed unchanged to get_assignment.
    alogs : sequence of algorithm names, one per run (the parameter name is a
        misspelling of 'algos', kept so that existing keyword callers keep working).
    MQs : sequence of fractions of non-reserved seats, paired with ``alogs``
        position by position.

    Returns
    -------
    ``df`` with, for each (MQ, algorithm) pair, the 'sch_*' and 'rk_*' columns
    produced by get_assignment appended on the right. The same table is
    written to 'assignment.csv' (without the index) and printed.

    School names and capacities are taken from the notebook-level ``sch_info``.
    """
    # Iterate over the *argument*: the previous version looped over the
    # notebook-level variable `algos` and silently ignored what was passed in.
    runs = [
        # 'ID' is dropped because df already carries it
        get_assignment(df, sch_info, MQ, algo).drop(columns=['ID'])
        for MQ, algo in zip(MQs, alogs)
    ]
    df_assignment = pd.concat([df] + runs, axis=1)
    df_assignment.to_csv('assignment.csv', index=False)
    print(df_assignment)
    return df_assignment

In [None]:
def verify_HCH(dfo, quota):
    """Check the HCH on the output of MR with quota% non-reserved seats.

    Parameters
    ----------
    dfo : family of assignments as output by run_algo; must contain the
        columns 'Disadvantaged' and 'sch_MR_<quota>'.
    quota : percentage (e.g. 80) of non-reserved seats used in the MR run.

    For each school in the notebook-level ``sch_info``, the number of
    disadvantaged students holding a seat at that school (reserved or not) is
    compared with the number of reserved seats,
    ceil(capacity * (1 - quota/100)). If some school admits more disadvantaged
    students than it has reserved seats, the hypothesis is not satisfied.

    Returns
    -------
    1 if the hypothesis fails (after printing the first offending school),
    None otherwise (after printing that the hypothesis is verified).
    """
    # Build the column name with the same '%d' formatting get_assignment uses,
    # so that a float quota such as 70.0 still resolves to 'sch_MR_70'.
    column = 'sch_MR_%d' % np.round(quota)
    is_dis = dfo['Disadvantaged'] == 1

    for sch, info in sch_info.items():
        reserved = int(np.ceil(info['capacity'] * (1 - (quota / 100))))

        # number of disadvantaged students that obtained a seat at this school
        admitted = int((is_dis & dfo[column].isin([sch + 'a', sch + 'd'])).sum())

        if admitted > reserved:
            print(f"The HCH hypothesis is not verified for MR with percentage of reserved seats equal to {100-quota}, since school {info['name']} admits {admitted} disadvantaged students, and has {reserved}  reserved seats")
            return 1
    print("The HCH hypothesis is verified for MR with percentage of reserved seats equal to " + str(100-quota))





In [None]:
# Algorithms to run and, for each run, the fraction of general (non-reserved) seats:
# MR with 70%, 80% and 90% non-reserved seats

algos = ['MR'] * 3
MQs = [0.7, 0.8, 0.9]

# We run the algorithms and check the HCH for each of the selected quotas.
# In particular, with the real-world quota of 20% reserved seats (i.e., MQ=.8),
# the HCH is verified

dfo = run_algo(df, algos, MQs)

verify_HCH(dfo, 70)
verify_HCH(dfo, 80)
verify_HCH(dfo, 90)



           ID       Score Preference  Disadvantaged  Lottery sch_MR_70  \
0      455412  339.779520   MBTLRSQA              0    40233             
1      855007  409.912503   MBSRLQAT              0    35924             
2      700522  428.583336   LAQRMBTS              0    79128             
3      676193  341.829528   BRMLTQAS              0    48872             
4      664419  508.224655   TQSMRLAB              0    54676        Ta   
...       ...         ...        ...            ...      ...       ...   
27850  864196  275.815748   MABQSTLR              1     4630             
27851  321115  502.661467   SBRLAMTQ              1    81247        Sd   
27852  368995  431.895820   QSLBMATR              1    52748             
27853  782857  335.559215   TSBQARLM              1    68980             
27854  690686  388.547900   MALRBTSQ              1    78163             

       rk_MR_70 sch_MR_80  rk_MR_80 sch_MR_90  rk_MR_90  
0            24                  24                  

1