In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import polars as pl
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the merged IPEDS / NSF table.
nsf_ipeds = pl.read_csv("../data/ipeds_nsf.csv")

id_uta = 228769
id_jhu = 162928

# Keep every row flagged as AAU, plus UTA's rows (selected by UnitID so UTA is
# kept whatever its AAU flag says).
aau = nsf_ipeds.filter(pl.col("AAU") | (pl.col("UnitID") == id_uta))

# Institution attributes: the first value seen for each UnitID is kept.
descriptive_columns = [
    "Institution Name",
    "Year",
    "State abbreviation",
    "FIPS state code",
    "Historically Black College or University",
    "Land Grant Institution",
    "Sector of institution",
    "Degree of urbanization (Urban-centric locale)",
    "Institution grants a medical degree",
    "Carnegie Classification 2021: Basic",
    "control",
    "Institutional Classification",
    "Student Access and Earnings Classification",
    "Research Activity Designation",
    "Award Level Focus",
    "Academic Mix",
    "Graduate Academic Program Mix",
    "Size",
    "Campus Setting",
    "Highest Degree Awarded",
    "Community Engagement",
    "Leadership for Public Practice",
]

# Funding sources: averaged over all rows of each UnitID
# (presumably one row per year -- confirm against the CSV).
funding_mean_columns = [
    "Department of Energy",
    "National Aeronautics and Space Administration",
    "Other federal agency",
    "Nonfederal",
    "Department of Defense",
    "National Science Foundation",
    "Department of Agriculture",
    "Department of Health and Human Services",
]

# maintain_order=True keeps the groups in first-appearance order. Without it
# polars may return the groups in a different order on every run, which changes
# the row order of `df` and can change the seeded K-means result downstream.
aau_average = aau.group_by("UnitID", maintain_order=True).agg(
    pl.col(descriptive_columns).first(),
    pl.col(funding_mean_columns).mean(),
)

# Convert to pandas for easier manipulation with sklearn
df = aau_average.to_pandas()

print(f"Number of universities in the dataset: {len(df)}")

In [None]:
# Funding sources available for each university.
funding_columns = [
    "Department of Energy",
    "National Aeronautics and Space Administration",
    "Other federal agency",
    "Department of Defense",
    "National Science Foundation",
    "Department of Health and Human Services",
    "Department of Agriculture",
    "Nonfederal",
]

# Both totals leave out "Nonfederal"; the second one also leaves out
# "Department of Agriculture".
total_funding_cols_w_doa = [name for name in funding_columns if name != "Nonfederal"]
total_funding_cols_wo_doa = [
    name for name in total_funding_cols_w_doa if name != "Department of Agriculture"
]

# Row-wise sums stored as new columns on df.
for total_name, parts in {
    "Total (with Agriculture)": total_funding_cols_w_doa,
    "Total (without Agriculture)": total_funding_cols_wo_doa,
}.items():
    df[total_name] = df[parts].sum(axis=1)

# Every numeric metric used in the analysis: the individual sources plus both totals.
all_metrics = funding_columns + ["Total (with Agriculture)", "Total (without Agriculture)"]



In [None]:
# Split the frame in two: UTA's own row(s) and everything else (the AAU peers).
is_uta_row = df["UnitID"] == id_uta
uta_data = df[is_uta_row]
aau_only = df[~is_uta_row]

In [None]:
# 1. Categorical Variable Comparison
# Categorical attributes compared between UTA and the AAU universities.
categorical_columns = [
    "Historically Black College or University",
    "Land Grant Institution",
    "Sector of institution",
    "Degree of urbanization (Urban-centric locale)",
    "Institution grants a medical degree",
    "control",
    "Institutional Classification",
    "Student Access and Earnings Classification",
    "Research Activity Designation",
    "Award Level Focus",
    "Academic Mix",
    "Graduate Academic Program Mix",
    "Size",
    "Campus Setting",
    "Highest Degree Awarded",
]

# Helper: summarize one categorical column for UTA versus the AAU group
def compare_categorical(uta_data, aau_only, column):
    """Summarize how UTA's value of `column` compares with the AAU group.

    Args:
        uta_data: DataFrame holding UTA's row(s); may be empty.
        aau_only: DataFrame holding the comparison universities.
        column: Name of the categorical column to summarize.

    Returns:
        dict with two keys:
          - "uta_value": the value in UTA's first row, or "N/A" when
            `uta_data` is empty.
          - "aau_distribution": list of {column: value, "Proportion": "xx.x%"}
            records, in the order produced by `value_counts`.
    """
    if uta_data.empty:
        uta_value = "N/A"
    else:
        uta_value = uta_data[column].values[0]

    # Relative frequency of every value in the AAU group, as a two-column frame.
    shares = aau_only[column].value_counts(normalize=True).reset_index()
    shares.columns = [column, "Proportion"]

    # Turn fractions into percentage strings with one decimal place.
    percent = (shares["Proportion"] * 100).round(1)
    shares["Proportion"] = percent.astype(str) + "%"

    return {
        "uta_value": uta_value,
        "aau_distribution": shares.to_dict("records"),
    }

# Build the UTA-vs-AAU summary for every categorical column, keyed by column name.
cat_comparisons = {
    name: compare_categorical(uta_data, aau_only, name)
    for name in categorical_columns
}


In [None]:

# 2. Clustering Analysis
# Select features for clustering
# The numeric clustering features are the funding metrics: each funding source
# plus the two totals. This is an alias of `all_metrics` (same list object, not a copy).
numerical_features = all_metrics

In [None]:
# Build the clustering feature matrix: one-hot encoded categoricals next to
# standardized funding metrics.

# Categorical features that are actually present in the frame.
cat_features = [col for col in categorical_columns if col in df.columns]
# Missing categories become an explicit "Unknown" level.
for col in cat_features:
    df[col] = df[col].fillna("Unknown")

# One-hot encode every categorical feature.
# `columns=` is passed explicitly because, without it, get_dummies only encodes
# object/string/category columns. Any categorical stored as a numeric code (the
# IPEDS data dictionary in this notebook lists integer codes for sector, locale,
# land-grant status, ...) would otherwise pass through as a raw number and be
# treated by K-means as an unscaled magnitude.
# `dtype=float` keeps the combined matrix purely numeric.
encoded_cats = pd.get_dummies(
    df[cat_features],
    columns=cat_features,
    drop_first=False,
    dtype=float,
)

# Numeric features that are actually present in the frame.
num_features = [col for col in numerical_features if col in df.columns]
# Missing numeric values are replaced by the column median.
for col in num_features:
    df[col] = df[col].fillna(df[col].median())

# Standardize numeric features to zero mean / unit variance.
scaler = StandardScaler()
scaled_nums = pd.DataFrame(
    scaler.fit_transform(df[num_features]),
    columns=num_features,
    index=df.index,
)

# Combine processed features (both parts share df's index, so rows stay aligned).
processed_df = pd.concat([scaled_nums, encoded_cats], axis=1)

In [None]:
# Build the table behind the categorical comparison heatmap
def create_categorical_heatmap(df, uta_id, categorical_cols):
    """Tabulate UTA's categorical values against the most common AAU value.

    Args:
        df: DataFrame with a "UnitID" column and the categorical columns.
        uta_id: UnitID identifying UTA's row.
        categorical_cols: Column names to compare; names missing from `df`
            are skipped.

    Returns:
        None when `df` has no row with `uta_id`. Otherwise a DataFrame with one
        row per compared column and the columns "Variable", "UTA Value",
        "AAU Distribution" (most common value among the other rows, with its
        share as a percentage) and "Match" ("Yes"/"No": whether UTA's value
        equals that most common value).
    """
    result_columns = ["Variable", "UTA Value", "AAU Distribution", "Match"]

    # UTA's row(s); nothing to compare if UTA is absent.
    uta_data = df[df["UnitID"] == uta_id]
    if uta_data.empty:
        return None

    # All remaining rows form the comparison group.
    aau_data = df[df["UnitID"] != uta_id]

    # Collect plain records and build the frame once at the end, instead of
    # concatenating onto an (initially empty) frame inside the loop.
    records = []
    for col in categorical_cols:
        if col not in df.columns:
            continue

        uta_value = uta_data[col].values[0]
        aau_counts = aau_data[col].value_counts(normalize=True)

        # Most common value in the comparison group and its share; falls back
        # to "Unknown" / 0 when the group has no non-null values.
        if aau_counts.empty:
            most_common_value, most_common_pct = "Unknown", 0
        else:
            most_common_value = aau_counts.idxmax()
            most_common_pct = aau_counts.max() * 100

        records.append({
            "Variable": col,
            "UTA Value": uta_value,
            "AAU Distribution": f"{most_common_value} ({most_common_pct:.1f}%)",
            "Match": "Yes" if uta_value == most_common_value else "No",
        })

    # Passing `columns` keeps the expected schema even when no column was compared.
    return pd.DataFrame(records, columns=result_columns)

In [None]:
# Perform K-means clustering
# Elbow method: fit K-means for each candidate k and record the inertia.
# Candidate k runs from 2 up to (but not including) min(10, n_rows - 1).
k_range = range(2, min(10, len(processed_df) - 1))
if len(k_range) == 0:
    raise ValueError(
        f"Elbow method needs at least 4 rows to try k >= 2; got {len(processed_df)}"
    )

inertia_values = []
for k in k_range:
    # n_init is pinned: scikit-learn's default changed between releases, so
    # leaving it implicit makes the inertia curve depend on the installed version.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(processed_df)
    inertia_values.append(kmeans.inertia_)

# Plot elbow method
fig_elbow = px.line(
    x=list(k_range), y=inertia_values,
    title="Elbow Method for Optimal k",
    labels={"x": "Number of Clusters (k)", "y": "Inertia"},
)
# Dashed vertical guide at k=4.
# NOTE(review): the clustering cell uses k_optimal = 3 -- confirm which k this
# guide line should mark.
fig_elbow.add_shape(
    type="line",
    line={"dash": "dash", "color": "gray"},
    x0=4, y0=min(inertia_values), x1=4, y1=max(inertia_values),
)


In [None]:
# Number of clusters used for the final model.
# NOTE(review): the elbow plot draws its guide line at k=4 but k=3 is used
# here -- confirm which value is intended.
k_optimal = 3
# n_init is pinned: scikit-learn's default changed between releases, so leaving
# it implicit makes the cluster assignment depend on the installed version.
kmeans = KMeans(n_clusters=k_optimal, n_init=10, random_state=42)
df["Cluster"] = kmeans.fit_predict(processed_df)

# Cluster that UTA was assigned to (None when UTA is not in the data).
uta_cluster = df.loc[df["UnitID"] == id_uta, "Cluster"].values[0] if not uta_data.empty else None

# Project the feature matrix onto its first two principal components for plotting.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(processed_df)
df["PCA1"] = pca_result[:, 0]
df["PCA2"] = pca_result[:, 1]

# Cluster ids are labels, not magnitudes. Plotting them as strings makes Plotly
# Express use discrete colors and a legend instead of a continuous color bar.
# The cast is done on a plotting copy so df["Cluster"] stays an integer column.
pca_plot_df = df.assign(Cluster=df["Cluster"].astype(str))

# Create scatter plot with PCA
fig_pca = px.scatter(
    pca_plot_df, x="PCA1", y="PCA2", color="Cluster",
    hover_data=["Institution Name"],
    title="University Clusters based on PCA",
    labels={"PCA1": "Principal Component 1", "PCA2": "Principal Component 2"},
    category_orders={"Cluster": [str(label) for label in range(k_optimal)]},
    color_discrete_sequence=px.colors.qualitative.Set1,
)

# Highlight UTA in the plot if it exists
if not uta_data.empty:
    uta_point = df[df["UnitID"] == id_uta]
    fig_pca.add_trace(
        go.Scatter(
            x=uta_point["PCA1"],
            y=uta_point["PCA2"],
            mode="markers",
            marker={
                "color": "black",
                "size": 15,
                "line": {"width": 2, "color": "black"},
            },
            name="UTA",
            hoverinfo="text",
            text=uta_point["Institution Name"],
        ),
    )

fig_pca.show()


In [None]:
import numpy as np

# Table of UTA's categorical values vs. the most common AAU value
# (None when UTA is not in the data).
cat_heatmap_df = create_categorical_heatmap(df, id_uta, categorical_columns)

# Only build and show the figure when the table exists.
if cat_heatmap_df is not None:
    # One heatmap column, one row per variable: z must be 2-D with shape
    # (n_variables, 1). A 1-D z is read by Plotly as columnar x/y/z data and is
    # truncated to the length of x (a single cell here).
    match_flags = np.where(cat_heatmap_df["Match"] == "Yes", 1, 0).reshape(-1, 1)

    fig_cat_heatmap = go.Figure(data=go.Heatmap(
        z=match_flags,
        x=["Match with AAU Mode"],
        y=cat_heatmap_df["Variable"],
        colorscale=[[0, "red"], [1, "green"]],
        # Fix the color range so 0 is always red and 1 always green, even when
        # every variable matches (or none does).
        zmin=0,
        zmax=1,
        showscale=False,
    ))

    # Label every cell with UTA's value and the most common AAU value.
    # Positions on a categorical axis are given as 0-based category indices.
    for row_pos, (_, row) in enumerate(cat_heatmap_df.iterrows()):
        fig_cat_heatmap.add_annotation(
            x=0,
            y=row_pos,
            text=f"UTA: {row['UTA Value']}<br>AAU: {row['AAU Distribution']}",
            showarrow=False,
            font={"color": "white", "size": 10},
        )

    fig_cat_heatmap.update_layout(
        title="UTA vs AAU: Categorical Variables Comparison",
        height=max(400, len(cat_heatmap_df) * 30),  # Adjust height based on number of variables
        margin={"l": 200},  # Add left margin for variable names
    )

    # Shown inside the guard: the figure does not exist when the table is None.
    fig_cat_heatmap.show()

## Carnegie Methodology

The Student Access and Earnings Classification is based on the calculation of two measures:

- Access: To measure access, the classification evaluates whether institutions are enrolling a student population that is representative of the locations they serve. To do this, we used the enrollment of undergraduate students by Pell grant status and underrepresented race/ethnicity. Those data were contextualized based on the location that students are from using IPEDS migration data.

- Earnings: To measure economic outcomes, the classification compares median post-attendance earnings as reported by the College Scorecard to earnings of people in their area ages 22-40 who hold a high school diploma or higher. Recognizing that student data is variable based on geographic location and race/ethnicity, earnings data was analyzed based on the geographical and racial/ethnic composition of the student body.


**Award Level Focus**: The focus and mix of award levels at an institution; generally, this is where they award the most degrees. There are six categories:  
    
1. Associate: Institutions that primarily award associate degrees.

2. Associate/Baccalaureate: Institutions that primarily award associate degrees but that also award a sufficient number of bachelor’s and/or graduate degrees.

3. Baccalaureate: Institutions that primarily award bachelor’s degrees and that do not have a significant graduate program.

4. Undergraduate/Graduate-Master’s: Institutions that have both a graduate and undergraduate presence, with an emphasis on the master’s degree, and that do not have a significant doctoral program.

5. Undergraduate/Graduate-Doctorate: Institutions that have both a graduate and undergraduate presence, with a sufficient focus on the doctorate.

6. Graduate-focused: Institutions that are mostly focused on graduate studies.


**Academic Program Mix**: The fields of study in which institutions award 50% or more of their degrees. For most institutions, the classifications use data on undergraduate degrees only. For graduate-focused institutions, the classifications use data on graduate degrees only.

1. Special Focus: Generally, the majority of degrees are awarded in a single academic area or field of study. (11 categories)
2. Professions–focused: The majority of degrees are awarded in fields that are classified as pre-professional or career-aligned.
3. Mixed: Fewer than 50% of degrees are awarded in any one focus area.

**Size**: The size of an institution, as measured by its total 12-month headcount, including full- and part-time students, and including undergraduate and graduate students.

1. Very Small: Fewer than 500 students total
2. Small: Between 500 and 4,000 students total
3. Medium: Between 4,000 and 20,000 students total
4. Large: Between 20,000 students and 40,000 students total
5. Very Large: At least 40,000 students total


In [None]:
# Print, for every categorical variable, UTA's value followed by the share of
# each value among the AAU universities.
print("Categorical Variables Comparison:")
for variable, summary in cat_comparisons.items():
    report = [
        f"\n{variable}:",
        f"UTA: {summary['uta_value']}",
        "AAU Distribution:",
    ]
    report.extend(
        f"  {entry[variable]}: {entry['Proportion']}"
        for entry in summary["aau_distribution"]
    )
    print("\n".join(report))

### IPEDS Data Dictionary

Sector of institution (HD2022)	1	Public, 4-year or above  
Sector of institution (HD2022)	2	Private not-for-profit, 4-year or above

Land Grant Institution (HD2023)	1	Land Grant Institution  
Land Grant Institution (HD2023)	2	Not a Land Grant Institution

Historically Black College or University (HD2022)	1	Yes  
Historically Black College or University (HD2022)	2	No

Institution grants a medical degree (HD2022)	1	Yes  
Institution grants a medical degree (HD2022)	2	No  
Institution grants a medical degree (HD2022)	-2	Not applicable

Degree of urbanization (Urban-centric locale) (HD2022)	11	City: Large  
Degree of urbanization (Urban-centric locale) (HD2022)	12	City: Midsize  
Degree of urbanization (Urban-centric locale) (HD2022)	13	City: Small  
Degree of urbanization (Urban-centric locale) (HD2022)	21	Suburb: Large  
Degree of urbanization (Urban-centric locale) (HD2022)	22	Suburb: Midsize  
Degree of urbanization (Urban-centric locale) (HD2022)	23	Suburb: Small  
Degree of urbanization (Urban-centric locale) (HD2022)	31	Town: Fringe  
Degree of urbanization (Urban-centric locale) (HD2022)	32	Town: Distant  
Degree of urbanization (Urban-centric locale) (HD2022)	33	Town: Remote  
Degree of urbanization (Urban-centric locale) (HD2022)	41	Rural: Fringe  
Degree of urbanization (Urban-centric locale) (HD2022)	42	Rural: Distant  




In [None]:
# Text summary of the clustering: UTA's cluster, the size of every cluster, and
# a few universities that share UTA's cluster.
print("\nClustering Results:")
if uta_data.empty:
    print("UTA not found in the dataset")
else:
    print(f"UTA belongs to Cluster {uta_cluster}")

    # Number of universities per cluster, ordered by cluster id.
    cluster_counts = df["Cluster"].value_counts().sort_index()
    print("\nCluster Sizes:")
    for cluster, count in cluster_counts.items():
        print(f"Cluster {cluster}: {count} universities")

    # First five members of UTA's cluster when ordered by ascending PCA1.
    print(f"\nTop Universities in Cluster {uta_cluster}:")
    cluster_unis = df[df["Cluster"] == uta_cluster].sort_values("PCA1")
    for institution in cluster_unis.head()["Institution Name"]:
        print(f"  {institution}")



In [None]:
# Show figures
# Render the elbow plot (inertia for each candidate k) built in the K-means cell.
fig_elbow.show()


In [None]:
# Render the PCA cluster scatter plot. The cell that builds fig_pca already
# calls .show(), so this displays the same figure a second time.
fig_pca.show()


In [None]:
# Sunburst helper: two-level breakdown of universities by a pair of categorical columns
def create_categorical_sunburst(df, cat_col1, cat_col2):
    """Build a sunburst figure of row counts for two categorical columns.

    Args:
        df: DataFrame containing both columns.
        cat_col1: Column used for the inner ring.
        cat_col2: Column used for the outer ring.

    Returns:
        The Plotly Express sunburst figure; sector sizes are the number of rows
        in each (cat_col1, cat_col2) combination.
    """
    hierarchy = [cat_col1, cat_col2]

    # Number of rows for every observed combination of the two columns.
    pair_counts = df.groupby(hierarchy).size().reset_index(name="count")

    return px.sunburst(
        pair_counts,
        path=hierarchy,
        values="count",
        title=f"Distribution of {cat_col1} and {cat_col2} in Universities",
    )

# Draw one sunburst per selected pair of categorical variables.
if len(df) > 1:  # skipped when the frame has at most one row
    # (inner ring, outer ring) pairs to plot
    sunburst_combinations = [
        ("control", "Size"),
        ("Research Activity Designation", "Institution grants a medical degree"),
        ("Land Grant Institution", "Institutional Classification"),
    ]

    for inner_col, outer_col in sunburst_combinations:
        # Skip pairs whose columns are not both present in the frame.
        if inner_col not in df.columns or outer_col not in df.columns:
            continue
        fig_sunburst = create_categorical_sunburst(df, inner_col, outer_col)
        fig_sunburst.show()