## Longitudinal Education Outcomes (LEO) analysis on graduate earnings

### Analysis performed on 2021/22 data on the industry of graduate employment, down to the 5-digit Standard Industrial Classification (SIC) code level, available [here](https://content.explore-education-statistics.service.gov.uk/api/releases/20a83553-df0f-4fc1-e6d5-08dc0dccb030/files/018ba8f3-c614-4abb-aa1b-88244edd0c86) on GOV.UK

In [1]:
# import required libraries

import pandas as pd
import numpy as np

In [2]:
# read the 5-digit SIC industry workbook into a Pandas DataFrame

df = pd.read_excel(io="input/industry_tables_data_5_digit_SIC.xlsx")

In [None]:
# preview the first 5 rows of the raw data

df.head(n=5)

In [None]:
# summarise every column (numeric and non-numeric alike)

df.describe(include="all")

In [5]:
# discard the columns that hold a single value throughout

df = df.drop(
    columns=[
        "time_period",
        "time_identifier",
        "geographic_level",
        "country_code",
        "country_name",
        "group_number_3dig",
        "SIC_5dig",
    ]
)

In [None]:
# tally the values in the YAG column (missing values are excluded)
# YAG = Years after graduation

df["YAG"].value_counts(dropna=True)

In [7]:
# strip the ' YAG' suffix from every row, then parse what is left as a number

df["YAG"] = pd.to_numeric(
    df["YAG"].str.replace(" YAG", "")
)

# give the column a descriptive name for the rest of the analysis

df = df.rename(columns={"YAG": "years_after_graduation"})

In [None]:
# tally the values in the SECTIONNAME column (missing values are excluded)
# SECTIONNAME = Industry section (Standard Industrial Classification, SIC section name)

df["SECTIONNAME"].value_counts(dropna=True)

In [9]:
# discard the rows whose industry section is 'Not known'

df = df.drop(index=df.loc[df["SECTIONNAME"] == "Not known"].index)

# switch the column to the snake_case naming used for analysis

df = df.rename(columns={"SECTIONNAME": "section_name"})

In [None]:
# tally the values in the group_name column (missing values are excluded)
# group_name = Industry group name

df["group_name"].value_counts(dropna=True)

In [None]:
# tally the values in the sic_detailed column (missing values are excluded)
# sic_detailed = Industry class/sub-class name

df["sic_detailed"].value_counts(dropna=True)

In [None]:
# tally the values in the qualification_TR column (missing values are excluded)
# qualification_TR = Qualification level

df["qualification_TR"].value_counts(dropna=True)

In [13]:
# Qualification levels:
#   Level 6 = degree or equivalent
#   Level 7 = masters or equivalent
#   Level 8 = doctorate or equivalent

# only level 6 qualifications are analysed, as they are the most common qualification choice for higher education
# link: https://explore-education-statistics.service.gov.uk/find-statistics/progression-to-higher-education-or-training

df = df.drop(index=df.loc[df["qualification_TR"] != "First degree"].index)

# qualification_TR now holds a single value, so the column can go

df = df.drop(columns=["qualification_TR"])

In [None]:
# tally the values in the sex column (missing values are excluded)
# sex = Graduate sex

df["sex"].value_counts(dropna=True)

In [None]:
# tally the values in the subject_name column (missing values are excluded)
# subject_name = Subject studied

df["subject_name"].value_counts(dropna=True)

In [None]:
# tally the values in the ethnicity_major column (missing values are excluded)
# ethnicity_major = Ethnicity broad

df["ethnicity_major"].value_counts(dropna=True)

In [None]:
# tally the values in the prior_attainment column (missing values are excluded)
# prior_attainment = Prior attainment level

df["prior_attainment"].value_counts(dropna=True)

In [None]:
# tally the values in the FSM column (missing values are excluded)
# FSM = Free school meal eligibility status

df["FSM"].value_counts(dropna=True)

In [None]:
# tally the values in the current_region column (missing values are excluded)
# current_region = Current region

df["current_region"].value_counts(dropna=True)

In [None]:
# tally the values in the count column (missing values are excluded)
# count = Number of graduates

df["count"].value_counts(dropna=True)

In [21]:
# give the count column a descriptive name for the rest of the analysis

df = df.rename(columns={"count": "number_of_graduates"})

In [None]:
# tally the values in the earnings_median column (missing values are excluded)
# earnings_median = Median earnings

df["earnings_median"].value_counts(dropna=True)

In [23]:
# replace 'c' values with NumPy not a number (NaN)
# c = data has been suppressed due to small numbers
# NOTE: np.nan is used because the np.NaN alias was removed in NumPy 2.0

df["earnings_median"] = df["earnings_median"].replace("c", np.nan)

In [None]:
# count the rows for each combination of demographic variables present in the data

df.groupby(
    ["sex", "ethnicity_major", "prior_attainment", "FSM", "current_region"]
).size().reset_index(name="count")

In [25]:
# combinations of demographic variables cannot be explored with this dataset
# the analysis therefore concentrates on male versus female graduates, to dig further into the well-understood gap between male and female earnings

df = df.drop(index=df.loc[df["sex"] == "Female + male"].index)

In [26]:
# discard the remaining demographic columns, which are no longer needed

df = df.drop(
    columns=[
        "ethnicity_major",
        "prior_attainment",
        "FSM",
        "current_region",
    ]
)

In [None]:
# count the rows for each combination of sex and subject_name

df.groupby(["sex", "subject_name"]).size().reset_index(name="count")

In [28]:
# build a dataframe of the per-subject rows, leaving out the 'Total' rows

df_sex_subjects = df.drop(index=df.loc[df["subject_name"] == "Total"].index)

In [None]:
# sanity-check the first 5 rows of the per-subject data after the transformations

df_sex_subjects.head(n=5)

In [65]:
# build a dataframe holding only the 'Total' rows across subjects

df_sex = df.drop(index=df.loc[df["subject_name"] != "Total"].index)

In [66]:
# subject_name is no longer needed in the totals dataframe, so discard it

df_sex = df_sex.drop(columns=["subject_name"])

In [None]:
# sanity-check the first 5 rows of the totals data after the transformations

df_sex.head(n=5)

In [68]:
# remove rows where there is no earnings data
# the check is limited to earnings_median so that a missing value in an unrelated column cannot silently discard a row that does have earnings

df_sex.dropna(subset=["earnings_median"], inplace=True)

In [69]:
# record, on every row, how many rows share that row's sic_detailed value
# only the SICs that appear 8 times (each year for each sex) are to be kept

df_sex["count_sics"] = (
    df_sex.groupby(by="sic_detailed")["sic_detailed"]
    .transform("count")
)

In [72]:
# discard the SICs that do not appear exactly 8 times

df_sex = df_sex.drop(index=df_sex.loc[df_sex["count_sics"] != 8].index)

# the helper count_sics column has served its purpose, so discard it

df_sex = df_sex.drop(columns=["count_sics"])

In [73]:
# write both prepared dataframes to CSV, leaving out the index

df_sex.to_csv(
    path_or_buf="output/graduate_outcomes_industry_sex_salary.csv",
    index=False,
)
df_sex_subjects.to_csv(
    path_or_buf="output/graduate_outcomes_industry_sex_salary_subject.csv",
    index=False,
)