<a href="https://colab.research.google.com/github/wcj365/college-scorecard/blob/master/2-distro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# College Scorecard EDA 
## Part Two - Distribution
How are Colleges Distributed Across the US?

## Import Relevant Python Libraries

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt   

# We will use Altair for data visualization in addition to Pandas and Matplotlib
import altair as alt           

## Load the Data into Pandas Dataframe

In [0]:
# The source file has hundreds of columns; loading all of them would consume
# too much memory. List only the columns this notebook needs so that they can
# be passed to the loader.

COLUMNS_OF_INTEREST = [
    'UNITID',
    'STABBR',
]

In [0]:
# Load the data directly from the web.
# The file is 154 MB in size. Alternatively, we could download and save it to a local folder first.

#url = r'https://ed-public-download.app.cloud.gov/downloads/Most-Recent-Cohorts-All-Data-Elements.csv'   

#df = pd.read_csv(url, usecols = COLUMNS_OF_INTEREST)

In [0]:
# Read the data file from a local folder (this assumes it was downloaded there
# beforehand), loading only the columns listed in COLUMNS_OF_INTEREST.

FOLDER_NAME = "./data/CollegeScorecard_Raw_Data/"
FILE_NAME = "Most-Recent-Cohorts-All-Data-Elements.csv"

# FOLDER_NAME already ends with "/", so the two parts concatenate into the full path.
df = pd.read_csv(f"{FOLDER_NAME}{FILE_NAME}", usecols=COLUMNS_OF_INTEREST)

# Print a summary of the loaded frame (entries, columns, dtypes, memory usage).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7112 entries, 0 to 7111
Data columns (total 2 columns):
UNITID    7112 non-null int64
STABBR    7112 non-null object
dtypes: int64(1), object(1)
memory usage: 111.2+ KB


In [0]:
# Preview 10 randomly chosen rows. A fixed random_state makes the preview
# reproducible, so the same rows are shown on every Restart & Run All.
df.sample(10, random_state=0)

Unnamed: 0,UNITID,STABBR
2945,210809,PA
137,106713,AR
1440,161527,ME
1787,174358,MN
2958,211273,PA
7011,43601201,NY
1117,151467,IN
3563,229267,TX
1182,153588,IA
1412,160658,LA


In [0]:
# Count the rows per state/territory and order the result from most to fewest.
#  - groupby(...).count() uses the group variable (STABBR) as the index, so
#    reset_index() is used to make it a separate column again.
#  - After counting, UNITID is the name of the column holding the subtotal
#    counts, which is why the sort is done on 'UNITID'.

grouped = (
    df[["UNITID", "STABBR"]]
    .groupby("STABBR")
    .count()
    .reset_index()
    .sort_values('UNITID', ascending=False)
)

grouped.head()

Unnamed: 0,STABBR,UNITID
5,CA,716
39,NY,452
50,TX,446
10,FL,412
43,PA,378


### Use Altair bar chart to display all states and territories. Since there are 59 rows, vertical bars will be too crowded, use horizontal bars instead.

In [0]:
# Horizontal bar chart of every row in `grouped`: count on the x axis,
# state abbreviation on the y axis, with sort='-x' ordering the bars by the
# x value in descending order.
bars = (
    alt.Chart(grouped)
    .mark_bar()
    .encode(
        x='UNITID:Q',
        y=alt.Y('STABBR:N', sort='-x'),
        color=alt.Color('STABBR', scale=alt.Scale(scheme='dark2')),
    )
    .properties(width=600, height=800)
)

# Text layer derived from the bar chart; it shows the count next to each bar.
# dx=3 nudges the text to the right so it doesn't overlap with the bar.
text = bars.mark_text(align='left', baseline='middle', dx=3).encode(text='UNITID:Q')

# Layer the labels on top of the bars and display the result.
bars + text

### Use Altair bar chart to display top 10 states that have most colleges - Horizontal Bars

In [0]:
# Horizontal bar chart of the first 10 rows of `grouped`: count on the x axis,
# state abbreviation on the y axis, with sort='-x' ordering the bars by the
# x value in descending order.
bars = (
    alt.Chart(grouped.head(10))
    .mark_bar()
    .encode(
        x='UNITID:Q',
        y=alt.Y('STABBR:N', sort='-x'),
        color=alt.Color('STABBR', scale=alt.Scale(scheme='dark2')),
    )
    .properties(width=600, height=600)
)

# Text layer derived from the bar chart; it shows the count next to each bar.
# dx=3 nudges the text to the right so it doesn't overlap with the bar.
text = bars.mark_text(align='left', baseline='middle', dx=3).encode(text='UNITID:Q')

# Layer the labels on top of the bars and display the result.
bars + text


### Use Altair bar chart to display top 10 states that have most colleges - Vertical Bars

In [0]:
# Vertical bar chart of the first 10 rows of `grouped`: state abbreviation on
# the x axis (ordered by the y value via sort='y'), count on the y axis.
bars = (
    alt.Chart(grouped.head(10))
    .mark_bar()
    .encode(
        y='UNITID:Q',
        x=alt.X('STABBR:N', sort='y'),
        color=alt.Color('STABBR', scale=alt.Scale(scheme='dark2')),
    )
    .properties(width=600, height=600)
)

# Text layer derived from the bar chart; it shows the count for each bar.
# dy=-10 nudges the text up so it doesn't overlap with the bar.
text = bars.mark_text(align='center', baseline='middle', dy=-10).encode(text='UNITID:Q')

# Layer the labels on top of the bars and display the result.
bars + text
