# US News Medical School Rank Analyzer

Begin by going to the US News ranks, clicking on research or primary care ranks, then clicking on the "Table View" tab.

Scroll down to the bottom of the page to allow all schools to populate.

Right click on the page and say "save as html".  Download the files and open up the html document in Notepad++.

Copy and paste the long HTML line involving school ranks, paste it, and create a text file.  Import the text file here.

### Importing the reuqired packages

In [1]:
import re
import pandas as pd
import numpy as np

# 1. Research Ranks

### Reading the txt file and finding all the info for each school within the text

In [2]:
# Opening the txt file and selecting for all the school info
f = open("2021 US News Research Ranks.txt", "r")
f = f.read()
schools = re.findall(pattern='div name=(.*?)Span-sc-19wk4id-0 dJCHhF', string=f)

### Finding the school name, state, city, rank, tuition, and enrollment for each school

In [3]:
# Finding school names for every school
names = []
for s in schools:
    s = re.findall(pattern='^"(.*?)" class', string=s)
    s = s[0]
    names.append(s)
    #print(s)

In [4]:
# Finding the cities and states in which schools are located
states = []
cities = []
for c in schools:
    c = re.findall(pattern='class="Paragraph-sc-1iyax29-0 lehlCB">(.*?)<\/p><p', string=c)
    c = c[0]
    c = c.split(',')
    if len(c) < 2:
        c.append("Unknown")
        c.reverse()
    ci = c[0]
    st = c[1].strip()
    states.append(st)
    cities.append(ci)
    #print(st, ci)

In [5]:
# Finding the school ranks from US News
ranks = []
for r in schools:
    r = re.findall(pattern='NameRank__RankPosition-sc-1yvv32p-0 bgsDIR gjlzZT">(.*?)<\/', string=r)
    if len(r) == 0:
        r = ["#Unranked "]
    r = r[0].replace(" ", "").replace("#", "")
    try:
        r = int(r)
    except:
        pass
    ranks.append(r)
    #print(r)

In [6]:
# Finding both in-state and out-of-state tuition for each school 
tuition1 = []
tuition2 = []
for t in schools:
    t = re.findall(pattern='TuitionItem-sc-19pb4sw-1 ihFPLW hxYZAl">(.*?) <span', string=t)
    if len(t) == 1:
        t.append(np.nan)
    if len(t) == 0:
        t = [np.nan, np.nan]
    tuition1.append(t[0])
    tuition2.append(t[1])
    #print(t)

# In-state tuition (or just regular tuition for private schools)
tuition_1 = []
for t in tuition1:
    try:
        t = t.strip('$').replace(',', '')
        t = float(t)
    except:
        pass
    tuition_1.append(t)
    #print(t)

# Optional out-out-state tuition for public schools
tuition_2 = []
for t in tuition2:
    try:
        t = t.strip('$').replace(',', '')
        t = float(t)
    except:
        pass
    tuition_2.append(t)
    #print(t)

In [7]:
# Finding the total school enrollment for each school
enrollment = []
for e in schools:
    e = re.findall(pattern='class="Span-sc-19wk4id-0 ihFPLW">(.*?)<\/span', string=e)
    e = e[0]
    e = e.replace(' ', '').replace(',', '')
    if e == "N/A":
        e = np.nan
    e = float(e)
    enrollment.append(e)
    #print(e)

### Merging all data into a Dataframe and exporting

In [8]:
# Making a dataframe of all metrics
d = {"School Name": names, "State": states, "City": cities, "Rank": ranks, "In-State Tuition": tuition_1, "Out-of-State Tuition": tuition_2,
    "Enrollment": enrollment}
research_ranks = pd.DataFrame(d)
display(research_ranks)

# Optionally exporting to excel
#research_ranks.to_excel("2021 US News Research Ranks.xlsx", index=False)

Unnamed: 0,School Name,State,City,Rank,In-State Tuition,Out-of-State Tuition,Enrollment
0,Harvard University,MA,Boston,1,64984.0,,700.0
1,New York University (Grossman),NY,New York,2,0.0,,417.0
2,Duke University,NC,Durham,3,61170.0,,502.0
3,Columbia University,NY,New York,4,64868.0,,585.0
4,Stanford University,CA,Stanford,4,62193.0,,484.0
...,...,...,...,...,...,...,...
185,University of Texas--Austin (Dell),TX,Austin,Unranked,,,
186,University of Texas Medical Branch--Galveston,TX,Galveston,Unranked,,,
187,University of Texas--Rio Grande Valley,TX,Edinburg,Unranked,,,
188,University of the Incarnate Word,TX,San Antonio &amp; Alamo Heights,Unranked,,,


# 1. Primary Care Ranks

### Reading the txt file and finding all the info for each school within the text

In [9]:
# Opening the txt file and selecting for all the school info
f = open("2021 US News Primary Care Ranks.txt", "r")
f = f.read()
schools = re.findall(pattern='div name=(.*?)Span-sc-19wk4id-0 dJCHhF', string=f)

In [10]:
schools[16]

'"Baylor College of Medicine" class="Box-w0dun1-0 CYZBT"><h3 size="1" class="Heading__HeadingStyled-sc-1w5xk2o-0-h3 izrRLF Heading-sc-1w5xk2o-1 kQuiLM" spacing="0"><a href="https://www.usnews.com/best-graduate-schools/top-medical-schools/baylor-college-of-medicine-04110">Baylor College of Medicine</a></h3><p size="2" spacing="0" class="Paragraph-sc-1iyax29-0 lehlCB">Houston, TX</p><p size="2" spacing="0" class="Paragraph-sc-1iyax29-0 lehlCB"><a href="https://www.usnews.com/best-graduate-schools/top-medical-schools/primary-care-rankings" class="NameRank__Rank-sc-1yvv32p-1 hYlbKH"><strong size="2" class="Strong-sc-1m7huwa-0 NameRank__RankPosition-sc-1yvv32p-0 bgsDIR gjlzZT">#17 </strong>in <strong size="2" class="Strong-sc-1m7huwa-0 bgsDIR">Best Medical Schools: Primary Care</strong><span size="2" class="Span-sc-19wk4id-0 ljMlPu"> (tie)</span></a></p></div></span></td></tr><tr><td class="TableStacked__Cell-sc-82ags4-1 bDYRDH"><span size="3" class="Span-sc-19wk4id-0 cBeETl"><span size="3"

### Finding the school name, state, city, rank, tuition, and enrollment for each school

In [11]:
# Finding school names for every school
names = []
for s in schools:
    s = re.findall(pattern='^"(.*?)" class', string=s)
    s = s[0]
    names.append(s)
    #print(s)

In [12]:
# Finding the cities and states in which schools are located
states = []
cities = []
for c in schools:
    c = re.findall(pattern='class="Paragraph-sc-1iyax29-0 lehlCB">(.*?)<\/p><p', string=c)
    c = c[0]
    c = c.split(',')
    if len(c) < 2:
        c.append("Unknown")
        c.reverse()
    ci = c[0]
    st = c[1].strip()
    states.append(st)
    cities.append(ci)
    #print(st, ci)

In [13]:
# Finding the school ranks from US News
ranks = []
for r in schools:
    r = re.findall(pattern='NameRank__RankPosition-sc-1yvv32p-0 bgsDIR gjlzZT">(.*?)<\/', string=r)
    if len(r) == 0:
        r = ["#Unranked "]
    r = r[0].replace(" ", "").replace("#", "")
    try:
        r = int(r)
    except:
        pass
    ranks.append(r)
    #print(r)

In [14]:
# Finding both in-state and out-of-state tuition for each school 
tuition1 = []
tuition2 = []
for t in schools:
    t = re.findall(pattern='TuitionItem-sc-19pb4sw-1 ihFPLW hxYZAl">(.*?) <span', string=t)
    if len(t) == 1:
        t.append(np.nan)
    if len(t) == 0:
        t = [np.nan, np.nan]
    tuition1.append(t[0])
    tuition2.append(t[1])
    #print(t)

# In-state tuition (or just regular tuition for private schools)
tuition_1 = []
for t in tuition1:
    try:
        t = t.strip('$').replace(',', '')
        t = float(t)
    except:
        pass
    tuition_1.append(t)
    #print(t)

# Optional out-out-state tuition for public schools
tuition_2 = []
for t in tuition2:
    try:
        t = t.strip('$').replace(',', '')
        t = float(t)
    except:
        pass
    tuition_2.append(t)
    #print(t)

In [15]:
# Finding the total school enrollment for each school
enrollment = []
for e in schools:
    e = re.findall(pattern='class="Span-sc-19wk4id-0 ihFPLW">(.*?)<\/span', string=e)
    e = e[0]
    e = e.replace(' ', '').replace(',', '')
    if e == "N/A":
        e = np.nan
    e = float(e)
    enrollment.append(e)
    #print(e)

### Merging all data into a Dataframe and exporting

In [17]:
# Making a dataframe of all metrics
d = {"School Name": names, "State": states, "City": cities, "Rank": ranks, "In-State Tuition": tuition_1, "Out-of-State Tuition": tuition_2,
    "Enrollment": enrollment}
primary_care_ranks = pd.DataFrame(d)
display(primary_care_ranks)

# Optionally exporting to excel
#primary_care_ranks.to_excel("2021 US News Primary Care Ranks.xlsx", index=False)

Unnamed: 0,School Name,State,City,Rank,In-State Tuition,Out-of-State Tuition,Enrollment
0,University of Washington,WA,Seattle,1,37760.0,69186.0,1123.0
1,University of California--San Francisco,CA,San Francisco,2,36342.0,48587.0,664.0
2,University of North Carolina--Chapel Hill,NC,Chapel Hill,3,32746.0,60140.0,782.0
3,Oregon Health and Science University,OR,Portland,4,44356.0,68184.0,617.0
4,University of Minnesota,MN,Minneapolis,5,40191.0,57678.0,1051.0
...,...,...,...,...,...,...,...
185,University of Texas--Austin (Dell),TX,Austin,Unranked,,,
186,University of Texas Medical Branch--Galveston,TX,Galveston,Unranked,,,
187,University of Texas--Rio Grande Valley,TX,Edinburg,Unranked,,,
188,University of the Incarnate Word,TX,San Antonio &amp; Alamo Heights,Unranked,,,


### Messing around and getting a list of MSTP schools

In [24]:
mstp = pd.read_excel("MSTP Schools.xlsx", sheet_name="2021 MSTP")
mstp = mstp[mstp["MSTP"] == "Yes"]
mstp_schools = np.array(mstp["School Name"])
display(mstp.head())
mstp.shape

Unnamed: 0,School Name,State,City,MSTP
0,University of Washington,WA,Seattle,Yes
1,University of California--San Francisco,CA,San Francisco,Yes
2,University of North Carolina--Chapel Hill,NC,Chapel Hill,Yes
3,Oregon Health and Science University,OR,Portland,Yes
4,University of Minnesota,MN,Minneapolis,Yes


(51, 4)

In [25]:
mstp_schools

array(['University of Washington',
       'University of California--San Francisco',
       'University of North Carolina--Chapel Hill',
       'Oregon Health and Science University', 'University of Minnesota',
       'University of Colorado', 'Harvard University',
       'University of Kansas Medical Center',
       'University of Massachusetts--Worcester',
       'University of California--Los Angeles (Geffen)',
       'University of Wisconsin--Madison', 'University of Rochester',
       'University of Michigan--Ann Arbor', 'University of Maryland',
       'Baylor College of Medicine', 'University of Virginia',
       'University of Alabama--Birmingham', 'Stanford University',
       'University of Iowa (Carver)',
       'University of Pennsylvania (Perelman)', 'Ohio State University',
       'University of California--San Diego',
       'University of Texas Southwestern Medical Center',
       'University of Chicago (Pritzker)', 'University of Pittsburgh',
       'Emory University',