# Salaries by College Major — Analysis Notebook (Schema: Start, Mid, P10, P90)

In [None]:
import pandas as pd
# Render floats in pandas output with thousands separators and two decimals.
pd.set_option('display.float_format', '{:,.2f}'.format)

## Load the dataset

In [None]:
# Read the salary table from the CSV file (path is relative to the working directory).
df = pd.read_csv('salaries_by_college_major.csv')
# Echo the column names so the schema is visible, then preview the first rows.
print("Columns:", df.columns.tolist())
df.head()

## Quick data check

In [None]:
# Report the table's (rows, columns) shape, then the number of missing values in each column.
print(f"Shape: {df.shape}")
print("\nMissing values per column:\n", df.isnull().sum())

## Clean and ensure numeric types
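The salary columns appear to be numeric already; we coerce them anyway so that any value that cannot be parsed as a number becomes NaN, and then drop the rows with missing values.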

In [None]:
# Salary columns that must be numeric for the comparisons and plots below.
cols_money = [
    'Starting Median Salary',
    'Mid-Career Median Salary',
    'Mid-Career 10th Percentile Salary',
    'Mid-Career 90th Percentile Salary',
]
for c in cols_money:
    if not pd.api.types.is_numeric_dtype(df[c]):
        # Currency-formatted text such as "$46,000.00" would otherwise be
        # coerced to NaN and its row silently dropped below; strip the dollar
        # sign, thousands separators and surrounding whitespace first so those
        # values survive. Columns that are already numeric are left untouched.
        df[c] = (df[c].astype(str)
                      .str.replace(r'[$,]', '', regex=True)
                      .str.strip())
    # Anything that still cannot be parsed as a number becomes NaN.
    df[c] = pd.to_numeric(df[c], errors='coerce')

# Keep only rows that have a major name and all four salary figures; the
# explicit .copy() keeps clean_df a separate object from df.
clean_df = df.dropna(subset=['Undergraduate Major'] + cols_money).copy()
clean_df.tail()

## Key questions

In [None]:
# Which major has the highest starting median salary?
# idxmax() gives the index label of the row holding the maximum.
idx_start_max = clean_df['Starting Median Salary'].idxmax()
clean_df[['Undergraduate Major', 'Starting Median Salary']].loc[idx_start_max]

In [None]:
# Which major has the highest mid-career median salary?
# idxmax() gives the index label of the row holding the maximum.
idx_mid_max = clean_df['Mid-Career Median Salary'].idxmax()
clean_df[['Undergraduate Major', 'Mid-Career Median Salary']].loc[idx_mid_max]

In [None]:
# Which major has the lowest starting median salary?
# idxmin() gives the index label of the row holding the minimum.
idx_start_min = clean_df['Starting Median Salary'].idxmin()
clean_df[['Undergraduate Major', 'Starting Median Salary']].loc[idx_start_min]

In [None]:
# Which major has the lowest mid-career median salary?
# idxmin() gives the index label of the row holding the minimum.
idx_mid_min = clean_df['Mid-Career Median Salary'].idxmin()
clean_df[['Undergraduate Major', 'Mid-Career Median Salary']].loc[idx_mid_min]

## Salary spread (risk vs reward)
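Spread = mid-career 90th percentile salary − mid-career 10th percentile salary. A wider spread means mid-career outcomes for that major vary more.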

In [None]:
# Add a Spread column: mid-career 90th percentile salary minus the 10th percentile salary.
clean_df = clean_df.assign(
    Spread=lambda frame: frame['Mid-Career 90th Percentile Salary'].sub(
        frame['Mid-Career 10th Percentile Salary']
    )
)
# Preview the new column next to the major names.
clean_df.loc[:, ['Undergraduate Major', 'Spread']].head()

## Lowest spread majors (lower variance)

In [None]:
# Order majors from the narrowest to the widest spread and show the first ten.
low_risk = clean_df.sort_values(by='Spread')
low_risk.head(10)[['Undergraduate Major', 'Spread']]

## Highest potential majors (by 90th percentile)

In [None]:
# Order majors by mid-career 90th percentile salary, highest first, and show the first ten.
highest_potential = clean_df.sort_values(
    by='Mid-Career 90th Percentile Salary',
    ascending=False,
)
highest_potential.head(10)[['Undergraduate Major', 'Mid-Career 90th Percentile Salary']]

## Highest spread majors (higher variance)

In [None]:
# Order majors from the widest to the narrowest spread and show the first ten.
high_risk = clean_df.sort_values(by='Spread', ascending=False)
high_risk.head(10)[['Undergraduate Major', 'Spread']]

## Group averages
Mean of each salary column and of the spread for every Group, sorted by mid-career median salary (highest first).

In [None]:
# Per-Group mean of every salary column plus the spread; as_index=False keeps
# 'Group' as an ordinary column in the result.
group_summary = clean_df.groupby('Group', as_index=False)[[*cols_money, 'Spread']].mean(numeric_only=True)
# Put the group with the highest mid-career median salary first.
group_summary = group_summary.sort_values(by='Mid-Career Median Salary', ascending=False)
group_summary

## Visualizations

In [None]:
import matplotlib.pyplot as plt

### Top 10 Majors by Starting Median Salary

In [None]:
# The ten majors with the highest starting median salary, largest first.
top_start = clean_df.sort_values(by='Starting Median Salary', ascending=False).iloc[:10]

plt.figure(figsize=(10, 6))
plt.barh(y=top_start['Undergraduate Major'], width=top_start['Starting Median Salary'])
# Flip the y-axis so the first (highest-paid) major appears at the top of the chart.
plt.gca().invert_yaxis()
plt.title("Top 10 Majors by Starting Median Salary")
plt.xlabel("Starting Median Salary ($)")
plt.show()

### Distribution of Salary Spread (P90 − P10)

In [None]:
# Histogram of the spread across majors, using 20 bins with outlined bars.
plt.figure(figsize=(10, 6))
plt.hist(x=clean_df['Spread'].dropna(), bins=20, edgecolor='black')
plt.title("Distribution of Salary Spread")
plt.ylabel("Number of Majors")
plt.xlabel("Salary Spread ($)")
plt.show()

## (Optional) Save cleaned dataset

In [None]:
# clean_df.to_csv('salaries_by_college_major_clean.csv', index=False)