In [None]:
import pandas as pd
import klib as kl

import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
data = pd.read_csv('data/WIOA_GA_2022.csv')
data.head()

In [None]:
field_lookups = pd.read_csv('reference_data/reference_table_lookups.csv').iloc[:,:6]
field_lookups_dict = field_lookups[['DATA ELEMENT NO.', 'DATA ELEMENT NAME']].drop_duplicates()
field_lookups_dict = dict(zip(field_lookups_dict['DATA ELEMENT NO.'], field_lookups_dict['DATA ELEMENT NAME']))

In [None]:
field_lookups.head(5)

In [None]:

df_cols = [col.replace('PIRL','') for col in  data.columns]
columns = [field_lookups_dict.get(col, col) for col in df_cols]
data.columns = columns

data = kl.clean_column_names(data)

data.head(5)

In [None]:
data.shape

In [None]:
# data.to_csv('data/with_column_names.csv')

### Null Value Checks

probably need to be handled. Likely with zero fill. 

In [None]:
# change setting
pd.set_option('display.max_rows', None)  # Show all rows


### columns of interest


foster_care_youth_status_at_program_entry_wioa	
homeless_participant,_homeless_children_and_youths,_or_runaway_youth_at_program_entry_wioa	
ex_offender_status_at_program_entry_wioa	
low_income_status_at_program_entry_wioa	
english_language_learner_at_program_entry_wioa	
basic_skills_deficient_low_levels_of_literacy_at_program_entry	
cultural_barriers_at_program_entry_wioa	
single_parent_at_program_entry_wioa	
displaced_homemaker_at_program_entry_wioa

recipient_of_incumbent_worker_training	
rapid_response	
adult_education_wioa	
job_corps_wioa	
veterans_programs	
vocational_education	
vocational_rehabilitation_wioa	
wagner_peyser_employment_service_wioa	
employment_and_training_services_related_to_snap
registered_apprenticeship_program	
national_dislocated_worker_grants_dwg

date_of_program_entry_wioa	
date_of_program_exit_wioa

received_training_wioa
type_of_work_experience

most_recent_date_of_self_service_activities
most_recent_date_accessed_information_only_activities
date_of_most_recent_reportable_individual_contact
most_recent_date_received_basic_career_services_staff_assisted
most_recent_date_received_basic_career_services_self_service_information_only
date_of_first_basic_career_service_staff_assisted
date_of_first_basic_career_service_self_service_information_only

individual_with_a_disability_wioa	
category_of_disability
employment_status_at_program_entry_wioa
long_term_unemployed_at_program_entry_wioa
occupational_code_of_most_recent_employment_prior_to_participation_if_available
industry_code_of_employment_1st_quarter_prior_to_participation	
industry_code_of_employment_2nd_quarter_prior_to_participation	
industry_code_of_employment_3rd_quarter_prior_to_participation
highest_educational_level_completed_at_program_entry_wioa
school_status_at_program_entry_wioa
eligible_training_provider_cip_code_wioa
occupational_skills_training_code_hash_1

type_of_employment_match_1st_quarter_after_exit_quarter_wioa
type_of_employment_match_2nd_quarter_after_exit_quarter_wioa
type_of_employment_match_3rd_quarter_after_exit_quarter_wioa
type_of_employment_match_4th_quarter_after_exit_quarter_wioa
industry_code_of_employment_1st_quarter_after_exit_quarter	
industry_code_of_employment_2nd_quarter_after_exit_quarter	
industry_code_of_employment_3rd_quarter_after_exit_quarter	
industry_code_of_employment_4th_quarter_after_exit_quarter

retention_with_the_same_employer_in_the_2nd_quarter_and_the_4th_quarter_wioa

wages_3rd_quarter_prior_to_participation_quarter	
wages_2nd_quarter_prior_to_participation_quarter	
wages_1st_quarter_prior_to_participation_quarter	
wages_1st_quarter_after_exit_quarter_wioa	
wages_2nd_quarter_after_exit_quarter_wioa	
wages_3rd_quarter_after_exit_quarter_wioa	
wages_4th_quarter_after_exit_quarter_wioa

type_of_recognized_credential_wioa
date_attained_recognized_credential_wioa

wioa_median_earnings_quarter_2

age_at_participation



In [None]:
# Demographic & Background Information
demographics = [
    "foster_care_youth_status_at_program_entry_wioa",
    "homeless_participant,_homeless_children_and_youths,_or_runaway_youth_at_program_entry_wioa",
    "ex_offender_status_at_program_entry_wioa",
    "low_income_status_at_program_entry_wioa",
    "english_language_learner_at_program_entry_wioa",
    "basic_skills_deficient_low_levels_of_literacy_at_program_entry",
    "cultural_barriers_at_program_entry_wioa",
    "single_parent_at_program_entry_wioa",
    "displaced_homemaker_at_program_entry_wioa",
    "individual_with_a_disability_wioa",
    "category_of_disability"
]

# Education & Training Information
education_training = [
    "highest_educational_level_completed_at_program_entry_wioa",
    "school_status_at_program_entry_wioa",
    "adult_education_wioa",
    "job_corps_wioa",
    "vocational_education",
    "vocational_rehabilitation_wioa",
    "eligible_training_provider_cip_code_wioa",
    "occupational_skills_training_code_hash_1",
    "received_training_wioa",
    "type_of_work_experience",
    "type_of_recognized_credential_wioa",
    "date_attained_recognized_credential_wioa"
]

# Employment & Industry Information (Pre-Participation)
employment_pre = [
    "employment_status_at_program_entry_wioa",
    "long_term_unemployed_at_program_entry_wioa",
    "occupational_code_of_most_recent_employment_prior_to_participation_if_available",
    "industry_code_of_employment_1st_quarter_prior_to_participation",
    "industry_code_of_employment_2nd_quarter_prior_to_participation",
    "industry_code_of_employment_3rd_quarter_prior_to_participation"
]

# Employment & Industry Information (Post-Exit)
employment_post = [
    "type_of_employment_match_1st_quarter_after_exit_quarter_wioa",
    "type_of_employment_match_2nd_quarter_after_exit_quarter_wioa",
    "type_of_employment_match_3rd_quarter_after_exit_quarter_wioa",
    "type_of_employment_match_4th_quarter_after_exit_quarter_wioa",
    "industry_code_of_employment_1st_quarter_after_exit_quarter",
    "industry_code_of_employment_2nd_quarter_after_exit_quarter",
    "industry_code_of_employment_3rd_quarter_after_exit_quarter",
    "industry_code_of_employment_4th_quarter_after_exit_quarter",
    "retention_with_the_same_employer_in_the_2nd_quarter_and_the_4th_quarter_wioa"
]

# Wages & Earnings
wages_earnings = [
    "wages_3rd_quarter_prior_to_participation_quarter",
    "wages_2nd_quarter_prior_to_participation_quarter",
    "wages_1st_quarter_prior_to_participation_quarter",
    "wages_1st_quarter_after_exit_quarter_wioa",
    "wages_2nd_quarter_after_exit_quarter_wioa",
    "wages_3rd_quarter_after_exit_quarter_wioa",
    "wages_4th_quarter_after_exit_quarter_wioa",
    "wioa_median_earnings_quarter_2"
]

# Program Participation & Services Received
program_participation = [
    "recipient_of_incumbent_worker_training",
    "rapid_response",
    "wagner_peyser_employment_service_wioa",
    "employment_and_training_services_related_to_snap",
    "registered_apprenticeship_program",
    "national_dislocated_worker_grants_dwg",
    "veterans_programs"
]

# Program Entry & Exit Dates
program_dates = [
    "date_of_program_entry_wioa",
    "date_of_program_exit_wioa"
]

# Career Services & Participation History
career_services = [
    "most_recent_date_of_self_service_activities",
    "most_recent_date_accessed_information_only_activities",
    "date_of_most_recent_reportable_individual_contact",
    "most_recent_date_received_basic_career_services_staff_assisted",
    "most_recent_date_received_basic_career_services_self_service_information_only",
    "date_of_first_basic_career_service_staff_assisted",
    "date_of_first_basic_career_service_self_service_information_only"
]

# Miscellaneous
miscellaneous = ["age_at_participation"]


In [None]:
print(data.shape)
all_columns = demographics + education_training + employment_pre + employment_post + wages_earnings + program_participation + program_dates + career_services + miscellaneous
data = data[all_columns]
print(data.shape)

In [None]:
naics_lookup = pd.read_csv('reference_data/2022_NAICS_Structure.csv')
naics_lookup = naics_lookup.iloc[2:, 1:3]
naics_lookup_dict = dict(zip(naics_lookup['Unnamed: 1'].astype(str), naics_lookup['Unnamed: 2']))

cip_codes = pd.read_csv('reference_data/CipCode2010.csv')
cip_codes['CIPCode'] = cip_codes['CIPCode'].astype(str).replace('.','')
cip_codes_dict = dict(zip(cip_codes['CIPCode'], cip_codes['CIPTitle']))

occupational_codes = pd.read_csv('reference_data/occupation_code_ref.csv')
occupational_codes['O*NET-SOC Code'] = occupational_codes['O*NET-SOC Code'].astype(str).replace(r'[.,-]','', regex=True)
occupational_codes_dict = dict(zip(occupational_codes['O*NET-SOC Code'], occupational_codes['Title']))


In [None]:

for col in all_columns:
    data[col] = data[col].fillna('0').astype(int).astype(str)

    # 9 means the data is not applicable to the specified row. so we can replace with 0.
    if len(set(data[col].unique()) - (set(['0','1','9']))) == 0:
        data[col] = data[col].str.replace('9', '0')
        data[col] = pd.to_numeric(data[col], errors='coerce')  # Convert non-numeric to NaN
        data[col] = data[col].fillna(0).astype(int)
    
    elif len(set(data[col].unique()) - (set(['0','1']))) == 0:
        data[col] = pd.to_numeric(data[col], errors='coerce')
        
    elif 'date' in col:
        print(data[col].unique())
        data[col] = pd.to_datetime(data[col], format='%Y%m%d', errors='coerce')
        
    elif 'wages' in col or 'earnings' in col:
        data[col] = pd.to_numeric(data[col], errors='coerce')
    
    elif 'industry_code' in col:
        data[col] = data[col].map(naics_lookup_dict)

    elif 'age' in col:
        data[col] = pd.to_numeric(data[col], errors='coerce')
        
    # 2 is the same as 1, just 'verified with addtl info' and 9 is the same as 0
    elif len(set(data[col].unique()) - (set(['0','1','2','9']))) == 0 or len(set(data[col].unique()) - (set(['0','1','2']))) == 0 or len(set(data[col].unique()) - (set(['0','1','4']))) == 0:
        data[col] = data[col].str.replace('9', '0')
        data[col] = data[col].str.replace('2', '1')
        data[col] = data[col].str.replace('4', '1') # data entry error here
        data[col] = pd.to_numeric(data[col], errors='coerce')
   
    elif col == 'type_of_work_experience':
        work_experience_dict = {
            '1': "summer employment/internship",
            '2': "non-summer internship/employment",
            '3': "pre-apprenticeship program.",
            '4': "job shadowing.",
            '5': "on-the-job training",
            '6': "transitional job",
            '7': "other work experience.",
            '0': "no work experience.",
            "": "This data element does not apply to the participant."
        }
        data[col] = data[col].map(work_experience_dict)

    elif col == 'type_of_recognized_credential_wioa':
        education_credentials_dict = {
            "1": "Secondary School Diploma or Equivalency",
            "2": "AA or AS Diploma/Degree",
            "3": "BA or BS Diploma/Degree",
            "4": "Occupational Licensure",
            "5": "Occupational Certificate",
            "6": "Occupational Certification",
            "7": "Other Recognized Diploma, Degree, or Certificate",
            "0": "No Recognized Credential"
        }
        data[col] = data[col].map(education_credentials_dict)

    elif col == 'employment_status_at_program_entry_wioa':
        employment_status_dict = {
            "1": "Employed",
            "2": "Employed, but Received Notice of Termination of Employment or Military Separation is Pending",
            "3": "Not in Labor Force",
            "0": "Unemployed"
        }
        data[col] = data[col].map(employment_status_dict)

    elif col == 'category_of_disability':
        disability_status_dict = {
            "1": "Physical/Chronic Health Condition",
            "2": "Physical/Mobility Impairment",
            "3": "Mental or Psychiatric Disability",
            "4": "Vision-related Disability",
            "5": "Hearing-related Disability",
            "6": "Learning Disability",
            "7": "Cognitive/Intellectual Disability",
            "9": "Participant did not disclose type of disability",
            "0": "No Disability"
        }

        data[col] = data[col].map(disability_status_dict)

    elif col == 'highest_educational_level_completed_at_program_entry_wioa':
        educational_attainment_dict = {
            "1": "Attained secondary school diploma",
            "2": "Attained a secondary school equivalency",
            "3": "Received a certificate of attendance/completion from an Individualized Education Program (IEP)",
            "4": "Completed one or more years of postsecondary education",
            "5": "Attained a postsecondary technical or vocational certificate (non-degree)",
            "6": "Attained an Associate's degree",
            "7": "Attained a Bachelor's degree",
            "8": "Attained a degree beyond a Bachelor's degree",
            "0": "No educational level completed"
        }

        data[col] = data[col].map(educational_attainment_dict)

    elif col == 'school_status_at_program_entry_wioa':
        school_attendance_status_dict = {
            "1": "In-school, secondary school or less",
            "2": "In-school, Alternative School",
            "3": "In-school, Postsecondary school",
            "4": "Not attending school or Secondary School Dropout",
            "5": "Not attending school; secondary school graduate or has a recognized equivalent",
            "6": "Not attending school; within age of compulsory school attendance"
        }

        data[col] = data[col].map(school_attendance_status_dict)

    elif col == 'eligible_training_provider_cip_code_wioa':
        
        data[col] = data[col].map(cip_codes_dict)

    elif col == 'occupational_code_of_most_recent_employment_prior_to_participation_if_available' or col == 'occupational_skills_training_code_hash_1':

        data[col] = data[col].map(occupational_codes_dict)



#### AB column dive in

In [None]:
columns_for_ab = [
    # Job Placement Rate
    "type_of_employment_match_1st_quarter_after_exit_quarter_wioa",
    "type_of_employment_match_2nd_quarter_after_exit_quarter_wioa",
    "type_of_employment_match_3rd_quarter_after_exit_quarter_wioa",
    "type_of_employment_match_4th_quarter_after_exit_quarter_wioa",
    "retention_with_the_same_employer_in_the_2nd_quarter_and_the_4th_quarter_wioa",
    "industry_code_of_employment_1st_quarter_after_exit_quarter",
    "industry_code_of_employment_2nd_quarter_after_exit_quarter",
    "industry_code_of_employment_3rd_quarter_after_exit_quarter",
    "industry_code_of_employment_4th_quarter_after_exit_quarter",
    "date_of_program_exit_wioa"
]

### consolidating industries into latest industry worked

In [None]:
# Select the relevant columns
columns = [
    "industry_code_of_employment_1st_quarter_after_exit_quarter",
    "industry_code_of_employment_2nd_quarter_after_exit_quarter",
    "industry_code_of_employment_3rd_quarter_after_exit_quarter",
    "industry_code_of_employment_4th_quarter_after_exit_quarter"
]

# Use backfill to get the latest non-null industry code
data["latest_industry_code_of_employment_after_exit_quarter"] = data[columns].ffill(axis=1).iloc[:, -1]


In [None]:
data["latest_industry_code_of_employment_after_exit_quarter"].value_counts()

### Retention stats

In [None]:
data['retention_with_the_same_employer_in_the_2nd_quarter_and_the_4th_quarter_wioa'].value_counts()

In [None]:
industry_codes = data.groupby('latest_industry_code_of_employment_after_exit_quarter')['retention_with_the_same_employer_in_the_2nd_quarter_and_the_4th_quarter_wioa'].value_counts().reset_index()

retained_industry_codes = industry_codes.loc[industry_codes.retention_with_the_same_employer_in_the_2nd_quarter_and_the_4th_quarter_wioa == 1].sort_values(['retention_with_the_same_employer_in_the_2nd_quarter_and_the_4th_quarter_wioa','count'], ascending=False)
dropped_industry_codes = industry_codes.loc[industry_codes.retention_with_the_same_employer_in_the_2nd_quarter_and_the_4th_quarter_wioa == 0].sort_values(['retention_with_the_same_employer_in_the_2nd_quarter_and_the_4th_quarter_wioa','count'], ascending=False)

In [None]:
retained_industry_codes[1:21]

In [None]:
dropped_industry_codes[1:21]

### Total Industry Counts 

In [None]:
columns = [
    'industry_code_of_employment_1st_quarter_after_exit_quarter',
    "industry_code_of_employment_2nd_quarter_after_exit_quarter",
    "industry_code_of_employment_3rd_quarter_after_exit_quarter",
    "industry_code_of_employment_4th_quarter_after_exit_quarter"
]


df1 =  data["industry_code_of_employment_1st_quarter_after_exit_quarter"].value_counts().reset_index()
df2 = data["industry_code_of_employment_2nd_quarter_after_exit_quarter"].value_counts().reset_index()
df3 = data["industry_code_of_employment_3rd_quarter_after_exit_quarter"].value_counts().reset_index()
df4 = data["industry_code_of_employment_4th_quarter_after_exit_quarter"].value_counts().reset_index()

df1.columns = ['col1', 'count']
df2.columns = ['col1', 'count']
df3.columns = ['col1', 'count']
df4.columns = ['col1', 'count']

df = pd.concat([df1, df2, df3, df4])
df = df.groupby('col1').sum().reset_index()
df.sort_values('count', ascending=False)[:30]

### Consolidating employment status to see if they were ever employed after the program


In [None]:
temp = data.copy()

cols = ["type_of_employment_match_1st_quarter_after_exit_quarter_wioa",
    "type_of_employment_match_2nd_quarter_after_exit_quarter_wioa",
    "type_of_employment_match_3rd_quarter_after_exit_quarter_wioa",
    "type_of_employment_match_4th_quarter_after_exit_quarter_wioa"
]

temp[cols] = temp[cols].replace(0,pd.NA)

data['employment_status_after_program'] = temp[cols].bfill(axis=1).iloc[:, 0]

data['employment_status_after_program'] = data['employment_status_after_program'].fillna(0)

data['employment_status_after_program'].value_counts()

#### Employment status at each quarter

In [None]:

# Define the relevant columns and corresponding quarter numbers
quarters = {
    "type_of_employment_match_1st_quarter_after_exit_quarter_wioa": 1,
    "type_of_employment_match_2nd_quarter_after_exit_quarter_wioa": 2,
    "type_of_employment_match_3rd_quarter_after_exit_quarter_wioa": 3,
    "type_of_employment_match_4th_quarter_after_exit_quarter_wioa": 4,
}

# Process all quarters in a single step
df_list = [
    data[col].value_counts().reset_index().assign(quarter=quarter).rename(columns={"index": "col1", col: "count"})
    for col, quarter in quarters.items()
]

# Concatenate all quarter data into one DataFrame
df = pd.concat(df_list, ignore_index=True)





In [None]:
df.columns = ['col1', 'count', 'quarter']

In [None]:
# Plot the data in a grouped bar chart
sns.barplot(x="quarter", y="count", hue="col1", data=df)
plt.xlabel("Quarter")
plt.ylabel("Count")
plt.title("Employment Status by Quarter")
plt.show()

In [None]:
# Filter the data for col1 == 1 and sort by quarter
df_col1_1 = df[df["col1"] == 1].sort_values("quarter")

# Calculate the percentage change
df_col1_1["pct_change"] = df_col1_1["count"].pct_change() * 100

df_col1_1

In [None]:
df_col1_1['pct_change'].mean()

#### PCT Change by Industry

In [None]:
df1 = data.groupby('industry_code_of_employment_1st_quarter_after_exit_quarter')['type_of_employment_match_1st_quarter_after_exit_quarter_wioa'].value_counts().reset_index()
df2 = data.groupby('industry_code_of_employment_2nd_quarter_after_exit_quarter')['type_of_employment_match_2nd_quarter_after_exit_quarter_wioa'].value_counts().reset_index()
df3 = data.groupby('industry_code_of_employment_3rd_quarter_after_exit_quarter')['type_of_employment_match_3rd_quarter_after_exit_quarter_wioa'].value_counts().reset_index()
df4 = data.groupby('industry_code_of_employment_4th_quarter_after_exit_quarter')['type_of_employment_match_4th_quarter_after_exit_quarter_wioa'].value_counts().reset_index()

df1.columns = ['industry_code', 'employed', 'count']
df2.columns = ['industry_code', 'employed', 'count']
df3.columns = ['industry_code', 'employed', 'count']
df4.columns = ['industry_code', 'employed', 'count']

df1['quarter'] = 1
df2['quarter'] = 2
df3['quarter'] = 3
df4['quarter'] = 4

df = pd.concat([df1, df2, df3, df4])
employed = df.loc[df.employed == 1].groupby(['industry_code', 'quarter'])['count'].sum().reset_index()
unemployed = df.loc[df.employed == 0].groupby(['industry_code', 'quarter'])['count'].sum().reset_index()

employed['pct_change'] = employed['count'].pct_change() 
unemployed['pct_change'] = unemployed['count'].pct_change()

employed_group = employed.groupby('industry_code')['pct_change'].mean().sort_values(ascending=False)
unemployed_group = unemployed.groupby('industry_code')['pct_change'].mean().sort_values(ascending=False)


In [89]:
employed_group[:20]

industry_code
Temporary Help Services                                                     124.747146
Hotels (except Casino Hotels) and Motels                                     93.713948
Security Guards and Patrol Services                                          88.554594
Elementary and Secondary Schools                                             61.042195
Department Stores                                                            58.686728
Motion Picture and Video Production                                          50.998115
Limited-Service Restaurants                                                  50.224985
General Warehousing and Storage                                              42.155627
Scheduled Passenger Air Transportation                                       41.794501
Child Care Services                                                          40.762275
All Other General Merchandise Retailers                                      36.393569
Custom Computer Programming S

In [90]:
employed_group[-20:]

industry_code
Other Electronic Component Manufacturing                      -0.613793
Wineries                                                      -0.645652
Deep Sea Freight Transportation                               -0.666667
Bed-and-Breakfast Inns                                        -0.694444
Port and Harbor Operations                                    -0.698214
Dry, Condensed, and Evaporated Dairy Product Manufacturing    -0.800000
Spring Manufacturing                                          -0.812500
Line-Haul Railroads                                           -0.823529
Other Communication and Energy Wire Manufacturing             -0.840000
Power-Driven Handtool Manufacturing                           -0.850000
Apple Orchards                                                -0.851852
Tire Retreading                                               -0.875000
Steel Wire Drawing                                            -0.882353
Air and Gas Compressor Manufacturing              