In [None]:
import csv
import getpass
import json
import os
import random as rnd
from functools import reduce

import jgraph
import matplotlib.pyplot as plt
import mysql.connector
import networkx as nx
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.graph_objects as go
import pymysql
import seaborn as sns
from cyjupyter import Cytoscape
from mysql.connector import Error
from networkx_viewer import Viewer
from scipy.spatial.distance import pdist, squareform
from scipy.stats import kendalltau
from scipy.stats import weightedtau
from sqlalchemy import create_engine

# Getting data from SQL database

In [None]:
# Connection settings are read from the environment so that no credential is
# stored in the notebook. The password is prompted for when the variable is
# not set; host, database and user fall back to the local development values.
connection = mysql.connector.connect(
    host=os.environ.get('EMPLOYMENT_DB_HOST', '127.0.0.1'),
    database=os.environ.get('EMPLOYMENT_DB_NAME', 'employment'),
    user=os.environ.get('EMPLOYMENT_DB_USER', 'root'),
    password=os.environ.get('EMPLOYMENT_DB_PASSWORD') or getpass.getpass('MySQL password: '),
)

In [None]:
def get_data(connection, descriptor_domain,filt = False,val = None):
    '''
    Load one descriptor domain from the `employment` database.

    Most domains are returned in wide occupation-by-descriptor form: one row
    per occupation title, one column per element name, cells holding
    data_value.

    descriptor_domain:
        'work_context', 'work_values', 'interests', 'work_styles'
            rows restricted to scale_id 'CX', 'EX', 'OI' and 'IM' respectively
        'abilities'
            averaged per (title, element_name); with filt=True only elements
            whose element_id starts with the string `val` are kept
        'job_zones'
            long frame with columns title and job_zone (not pivoted); with
            filt=True only rows whose job_zone is in the iterable `val` are kept
        'name'
            the full occupation_data table, unchanged
        anything else (e.g. 'skills', 'knowledge', 'work_activities')
            used as the table name and averaged per (title, element_name)

    filt: apply the `val` filter (only honoured for 'job_zones' and 'abilities')
    val:  filter value, see above

    Returns a pandas DataFrame.
    Raises ValueError if descriptor_domain is not a plain table name.
    '''
    # A table name cannot be sent as a bound parameter, so only letters, digits
    # and underscores are accepted before it is spliced into the SQL text.
    if not isinstance(descriptor_domain, str) or not descriptor_domain.replace('_', '').isalnum():
        raise ValueError('invalid descriptor domain: {!r}'.format(descriptor_domain))

    if descriptor_domain == 'name':
        return pd.read_sql('''
            SELECT * FROM employment.occupation_data;''', con = connection)

    if descriptor_domain == 'job_zones':
        df = pd.read_sql('''
            SELECT title,job_zone FROM employment.job_zones
            LEFT JOIN employment.occupation_data USING (onetsoc_code);''', con = connection)
        if filt:
            df = df[df['job_zone'].isin(val)]
        return df

    # Every remaining domain shares the same SELECT and joins; only the table
    # name and the optional WHERE clause differ.
    base_query = '''
            SELECT title, element_name, data_value FROM employment.''' + descriptor_domain + '''
            LEFT JOIN employment.content_model_reference USING (element_id)
            LEFT JOIN employment.occupation_data USING (onetsoc_code)'''

    # Domains that are read for a single scale and pivoted without averaging.
    scale_by_domain = {
        'work_context': 'CX',
        'work_values': 'EX',
        'interests': 'OI',
        'work_styles': 'IM',
    }
    if descriptor_domain in scale_by_domain:
        scale_clause = "\n            WHERE scale_id LIKE '" + scale_by_domain[descriptor_domain] + "';"
        df = pd.read_sql(base_query + scale_clause, con = connection)
        return df.pivot(index = 'title', columns = 'element_name', values = 'data_value')

    if descriptor_domain == 'abilities' and filt:
        # The prefix is bound as a parameter (MySQL drivers use %s placeholders)
        # rather than concatenated into the statement.
        df = pd.read_sql(base_query + '\n            WHERE element_id LIKE %s;',
                         con = connection, params = (val + '%',))
    else:
        df = pd.read_sql(base_query + ';', con = connection)

    # Several rows can exist per (title, element_name); average them so the
    # pivot has exactly one value per cell.
    df = df.groupby(by=['title','element_name'])['data_value'].mean()
    return df.reset_index().pivot(index = 'title', columns = 'element_name', values = 'data_value')

# Finding distance measures from ONET 

In [None]:
def distance_table(df, jz=False):
    '''
    Build a z-scored pairwise Euclidean distance matrix between occupations.

    df: when jz is False, a frame indexed by occupation title whose columns
        are all used as features; when jz is True, a frame with a 'title'
        column plus feature columns (the job-zone layout).
    jz: selects which of the two layouts `df` has.

    Returns a square DataFrame labelled by title on both axes. The diagonal
    is NaN, and every other cell is (distance - mean) / std, where mean and
    std (sample std, ddof=1) are taken over all off-diagonal distances.
    '''
    if jz:
        # Labels are taken in row order so that they line up with the rows
        # pdist sees; sorting the titles here would attach names to the wrong
        # distances whenever the frame is not already sorted by title.
        names = df['title'].to_list()
        features = df.drop(columns='title')
    else:
        names = df.index.to_list()
        features = df

    dist = squareform(pdist(features, 'euclidean'))  # condensed -> N x N
    np.fill_diagonal(dist, np.nan)  # self-distance must not enter the statistics

    mean = np.nanmean(dist)
    std = np.nanstd(dist, ddof=1)
    return pd.DataFrame((dist - mean) / std, index=names, columns=names)

In [None]:
# Get CPS data

In [None]:
# CPS job-change table, read from a path relative to the notebook's working
# directory. Later cells use its columns 'ONET18_Title_LY', 'ONET18_Title',
# 'pct_tot' and 'pct_socly'.
df_cps = pd.read_csv('../CPS Job Changes/JobChanges_2011to19.csv')

In [None]:
# Show the CPS rows ordered from the largest pct_tot to the smallest.
df_cps.sort_values('pct_tot', ascending=False)

## Skills 

In [None]:
# Skills descriptors per occupation and their z-scored distance matrix.
df_skills = get_data(connection, descriptor_domain='skills')
skills_dist = distance_table(df_skills)
skills_dist

In [None]:
# The ten smallest skills distances from Retail Salespersons.
skills_dist.loc['Retail Salespersons'].sort_values().head(10)

### Percentages

In [None]:
# CPS rows whose previous-year occupation is Registered Nurses, largest pct_tot first.
# NOTE(review): the variable is named "retail" but the filter selects
# 'Registered Nurses' — confirm which occupation was intended.
cps_retail_pct = df_cps[df_cps['ONET18_Title_LY'] == 'Registered Nurses'].sort_values(by = 'pct_tot', ascending = False)
# cps_retail_pct[cps_retail_pct['ONET18_Title'] == 'Cashiers']['pct_tot']
# cps_retail_pct[0:10]['pct_tot'].sum()
cps_retail_pct

In [None]:
def percentage_default(occ,df_cps,a,b):
    '''
    Sum the pct_socly values ranked a..b-1 among the CPS rows for `occ`.

    @param occ value matched against the 'ONET18_Title_LY' column
    @param df_cps CPS DataFrame with 'ONET18_Title_LY' and 'pct_socly' columns
    @param a first position (0-based) after sorting pct_socly descending
    @param b position one past the last row included

    return the sum of pct_socly over that slice
    '''
    from_occ = df_cps.loc[df_cps['ONET18_Title_LY'] == occ]
    ranked_share = from_occ['pct_socly'].sort_values(ascending=False)
    return ranked_share.iloc[a:b].sum()

In [None]:
# Sum of the ten largest pct_socly values for Retail Salespersons.
retail_default = percentage_default(occ='Retail Salespersons', df_cps=df_cps, a=0, b=10)

In [None]:
# Sum of the ten largest pct_socly values for Registered Nurses.
nurse_default = percentage_default(occ='Registered Nurses', df_cps=df_cps, a=0, b=10)

In [None]:
# Sum of the ten largest pct_socly values for Janitors and Cleaners.
janitors_default = percentage_default(
    occ='Janitors and Cleaners, Except Maids and Housekeeping Cleaners',
    df_cps=df_cps,
    a=0,
    b=10,
)

In [None]:
# Sum of the ten largest pct_socly values for Sales Managers.
sales_default = percentage_default(occ='Sales Managers', df_cps=df_cps, a=0, b=10)

In [None]:
def percentage(occ,df_onet,df_cps,a,b):
    '''
    Sum the CPS pct_socly of the occupations that ONET ranks a..b-1 for `occ`.

    @param occ occupation of interest (a row label of df_onet and a value of
           the CPS 'ONET18_Title_LY' column)
    @param df_onet square ONET distance DataFrame labelled by occupation title
    @param df_cps DataFrame of CPS data
    @param a index of the first ONET position (0-based, smallest distance first)
    @param b index one past the last ONET position

    return the summed pct_socly of those occupations; an occupation with no
    CPS row for `occ` adds nothing, and only its first matching row is used
    '''
    nearest_titles = df_onet.loc[occ].sort_values().index.to_list()[a:b]
    transitions = df_cps[df_cps['ONET18_Title_LY'] == occ]

    total = 0
    for title in nearest_titles:
        shares = transitions.loc[transitions['ONET18_Title'] == title, 'pct_socly'].values
        if len(shares) > 0:
            total += shares[0]
    return total


In [None]:
retail_skills = percentage('Retail Salespersons', skills_dist, df_cps, a=0, b=10)
# Baseline share, skills-based share, then their ratio in percent.
print(retail_default, retail_skills, retail_skills/retail_default*100, sep='\n')

In [None]:
nurse_skills = percentage('Registered Nurses', skills_dist, df_cps, a=0, b=10)
# Baseline share, skills-based share, then their ratio in percent.
print(nurse_default, nurse_skills, nurse_skills/nurse_default*100, sep='\n')

In [None]:
janitors_skills = percentage(
    'Janitors and Cleaners, Except Maids and Housekeeping Cleaners',
    skills_dist, df_cps, a=0, b=10,
)
# Baseline share, skills-based share, then their ratio in percent.
print(janitors_default, janitors_skills, janitors_skills/janitors_default*100, sep='\n')

In [None]:
sales_skills = percentage('Sales Managers',skills_dist,df_cps,0,10)
# The baseline shown must be the Sales Managers one; this cell previously
# printed janitors_default next to the Sales Managers figures.
print(sales_default)
print(sales_skills)
print(sales_skills/sales_default*100)

## ONET Regular

In [None]:
# One frame per descriptor domain, all read through get_data.
df_ab = get_data(connection, 'abilities')
df_in = get_data(connection, 'interests')
df_ws = get_data(connection, 'work_styles')
df_wv = get_data(connection, 'work_values')
df_wc = get_data(connection, 'work_context')
df_knowledge = get_data(connection, 'knowledge')
df_gwas = get_data(connection, 'work_activities')
# Job zones stay in long form; the 'Legislators' rows are dropped.
df_jz = get_data(connection, 'job_zones').loc[lambda zones: zones['title'] != 'Legislators']


In [None]:
# z-scored distance matrix for every descriptor domain
wc_dist = distance_table(df_wc)
knowledge_dist = distance_table(df_knowledge)
skills_dist = distance_table(df_skills)
gwas_dist = distance_table(df_gwas)
jz_dist = distance_table(df_jz, jz=True)
wv_dist = distance_table(df_wv)
in_dist = distance_table(df_in)
ab_dist = distance_table(df_ab)
ws_dist = distance_table(df_ws)
# Stack five of the matrices (job zones scaled by 1.3) and average the rows
# that share an occupation label.
df_concat = (
    pd.concat([wc_dist, knowledge_dist, skills_dist, gwas_dist, jz_dist.multiply(1.3)])
    .groupby(level=0)
    .mean()
)
# df_concat

In [None]:
retail_onet = percentage('Retail Salespersons', df_concat, df_cps, a=0, b=10)
# Baseline share, combined-ONET share, then their ratio in percent.
print(retail_default, retail_onet, retail_onet/retail_default*100, sep='\n')

In [None]:
nurse_onet = percentage('Registered Nurses', df_concat, df_cps, a=0, b=10)
# Baseline share, combined-ONET share, then their ratio in percent.
print(nurse_default, nurse_onet, nurse_onet/nurse_default*100, sep='\n')

In [None]:
janitors_onet = percentage(
    'Janitors and Cleaners, Except Maids and Housekeeping Cleaners',
    df_concat, df_cps, a=0, b=10,
)
# Baseline share, combined-ONET share, then their ratio in percent.
print(janitors_default, janitors_onet, janitors_onet/janitors_default*100, sep='\n')

In [None]:
sales_onet = percentage('Sales Managers', df_concat, df_cps, a=0, b=10)
# Baseline share, combined-ONET share, then their ratio in percent.
print(sales_default, sales_onet, sales_onet/sales_default*100, sep='\n')

In [None]:
## ONET Custom

In [None]:
def filt(df,d):
    '''
    Keep only the rows of `df` whose index label is contained in `d`.

    @params: df: DataFrame to filter, d: collection of index labels to keep
    @returns: the filtered DataFrame, rows in their original order
    '''
    keep = df.index.isin(d)
    return df[keep]

In [None]:
# df_rel = pd.concat([knowledge_dist.multiply(2),skills_dist.multiply(3),gwas_dist]).groupby(level =0).mean()
# df_jz_filt = get_data(connection,'job_zones',True,[2])
# Job zones and abilities are loaded without a filter here; the filtered
# variants are kept above/below as commented-out alternatives.
df_jz_filt = get_data(connection,'job_zones')
# df_ab_filt = get_data(connection,'abilities',True,'1.A.1.')
df_ab_filt = get_data(connection,'abilities')
# jz_dist_filt = distance_table(df_jz_filt,True)
# Titles present in the job-zone frame; passed to filt() below to restrict the
# descriptor frames to these occupations.
jz_set = set(df_jz_filt['title'].unique()) #to filter out same job_zones jobs

In [None]:
# Restrict each descriptor frame to the occupations listed in jz_set.
abilities_filtered = filt(df=df_ab_filt, d=jz_set)
skills_filtered = filt(df=df_skills, d=jz_set)
knowledge_filtered = filt(df=df_knowledge, d=jz_set)
in_filtered = filt(df=df_in, d=jz_set)
gwas_filtered = filt(df=df_gwas, d=jz_set)

In [None]:
# Distance matrices for the filtered frames.
ab_dist_filt = distance_table(abilities_filtered) #NaN?
skills_dist_filt = distance_table(skills_filtered)
knowledge_dist_filt = distance_table(knowledge_filtered)
in_dist_filt = distance_table(in_filtered)
gwas_dist_filt = distance_table(gwas_filtered)
# Weighted stack (abilities x4, skills x3, knowledge x2, interests and work
# activities x1), averaged over the rows that share an occupation label.
df_concat_filt = (
    pd.concat([
        ab_dist_filt.multiply(4.0),
        skills_dist_filt.multiply(3.0),
        knowledge_dist_filt.multiply(2.0),
        in_dist_filt,
        gwas_dist_filt,
    ])
    .groupby(level=0)
    .mean()
)

In [None]:
retail_alg1 = percentage('Retail Salespersons', df_concat_filt, df_cps, a=0, b=10)
# Baseline share, custom-weighted share, then their ratio in percent.
print(retail_default, retail_alg1, retail_alg1/retail_default*100, sep='\n')

In [None]:
nurse_alg1 = percentage('Registered Nurses', df_concat_filt, df_cps, a=0, b=10)
# Baseline share, custom-weighted share, then their ratio in percent.
print(nurse_default, nurse_alg1, nurse_alg1/nurse_default*100, sep='\n')

In [None]:
janitor_alg1 = percentage(
    'Janitors and Cleaners, Except Maids and Housekeeping Cleaners',
    df_concat_filt, df_cps, a=0, b=10,
)
# Baseline share, custom-weighted share, then their ratio in percent.
print(janitors_default, janitor_alg1, janitor_alg1/janitors_default*100, sep='\n')

In [None]:
sales_alg1 = percentage('Sales Managers', df_concat_filt, df_cps, a=0, b=10)
# Baseline share, custom-weighted share, then their ratio in percent.
print(sales_default, sales_alg1, sales_alg1/sales_default*100, sep='\n')

## Comparison using Kendall Tau

In [None]:
# kendalltau([1,2,3,4,5], [1,2,3,4,5])
# Toy check: the second ranking swaps the first two of five items.
kendalltau([1,2,3,4,5], [2,1,3,4,5])

In [None]:
# Toy check: the second ranking swaps the last two of five items.
kendalltau([1,2,3,4,5], [1,2,3,5,4])

In [None]:
# Same head-swap input as the kendalltau cell, scored with weightedtau.
weightedtau([1,2,3,4,5], [2,1,3,4,5])

In [None]:
# Same tail-swap input as the kendalltau cell, scored with weightedtau.
weightedtau([1,2,3,4,5], [1,2,3,5,4])

In [None]:
# First 50 CPS destination titles for Retail Salespersons, largest pct_tot first.
list_cps = (
    df_cps.loc[df_cps['ONET18_Title_LY'] == 'Retail Salespersons']
    .sort_values(by='pct_tot', ascending=False)['ONET18_Title']
    .to_list()[0:50]
)
# First 50 titles by ascending skills distance from Retail Salespersons.
list_onet = skills_dist.loc['Retail Salespersons'].sort_values().index.to_list()[0:50]
# cps_array = np.array(list_cps).astype(float)


In [None]:
# weightedtau(list_cps, list_onet)

In [None]:
def kendall_tau_rank(df_cps,df_onet,a,b,occ):
    '''
    Kendall's tau between the CPS ranking and the ONET ranking for `occ`.

    The CPS destination titles for `occ` are ordered by pct_tot (largest
    first) and positions a..b-1 are kept. Each kept title is paired with its
    position in the ONET ordering of `occ` (smallest distance first, NaN
    last). Tau is computed on those two position vectors, so a positive value
    means ONET tends to place the more frequent CPS destinations closer.

    Passing the title strings themselves to kendalltau would rank them
    alphabetically, which says nothing about how well the two orderings agree.

    @param df_cps CPS DataFrame with 'ONET18_Title_LY', 'ONET18_Title', 'pct_tot'
    @param df_onet square distance DataFrame labelled by occupation title
    @param a first CPS position used (0-based)
    @param b position one past the last CPS position used
    @param occ occupation of interest

    return the result of scipy.stats.kendalltau (correlation, p-value);
    CPS titles that do not appear in df_onet are left out of the comparison
    '''
    cps_titles = (
        df_cps[df_cps['ONET18_Title_LY'] == occ]
        .sort_values(by='pct_tot', ascending=False)['ONET18_Title']
        .to_list()[a:b]
    )

    # Position of every title in the ONET ordering; the first occurrence wins
    # if a label is repeated.
    onet_position = {}
    for position, title in enumerate(df_onet.loc[occ].sort_values().index.to_list()):
        onet_position.setdefault(title, position)

    cps_ranks = []
    onet_ranks = []
    for cps_rank, title in enumerate(cps_titles):
        if title in onet_position:
            cps_ranks.append(cps_rank)
            onet_ranks.append(onet_position[title])

    return kendalltau(cps_ranks, onet_ranks)

In [None]:
# Rank agreement between CPS and the skills distances for Sales Managers.
kendall_tau_rank(df_cps, skills_dist, a=0, b=50, occ='Sales Managers')

In [None]:
# Rank agreement between CPS and the skills distances for Retail Salespersons.
kendall_tau_rank(df_cps, skills_dist, a=0, b=50, occ='Retail Salespersons')

In [None]:
# Occupations that appear both as a CPS previous-year title and in the skills table.
cps_occ = set(df_cps['ONET18_Title_LY'].unique())
onet_occ = set(df_skills.index.unique())

relevant_occ = cps_occ.intersection(onet_occ)
relevant_occ

In [None]:
# The two bare names below are evaluated but not displayed, because they are
# not the last expression of the cell.
skills_dist
df_concat
# Rebuild the weighted combination without the abilities matrix
# (skills x3, knowledge x2, interests and work activities x1).
df_concat_filt = pd.concat([skills_dist_filt.multiply(3.0),knowledge_dist_filt.multiply(2.0),in_dist_filt,gwas_dist_filt]).groupby(level =0).mean()
# Second weighting: skills x1.3, knowledge x1.2, interests and work activities x1.
df_new = pd.concat([skills_dist_filt.multiply(1.3),knowledge_dist_filt.multiply(1.2),in_dist_filt,gwas_dist_filt]).groupby(level =0).mean()

In [None]:
# Average, best and worst Kendall tau over the occupations that have more than
# 50 CPS destination rows.
avg_occ = 0
count = 0
occ_max = ''
# Tau lies in [-1, 1], so the running extremes start at -inf / +inf; starting
# at 0 and 1 would miss a maximum below 0.
max_r = float('-inf')
occ_min = ''
min_r = float('inf')

for occ in relevant_occ:
    # relevant_occ is built from df_skills, which may hold occupations that
    # df_new does not.
    if occ not in df_new.index:
        continue
    cps_len = int((df_cps['ONET18_Title_LY'] == occ).sum())
    if cps_len > 50:
        corr, p = kendall_tau_rank(df_cps,df_new,0,50,occ)
        if np.isnan(corr):
            # An undefined tau would turn the running average into NaN.
            continue
        count += 1
        avg_occ += corr
        if corr >= max_r:
            occ_max = occ
            max_r = corr
        if corr <= min_r:
            occ_min = occ
            min_r = corr

if count:
    print(avg_occ/count)
    print(count)
    print(occ_max, max_r)
    print(occ_min, min_r)
else:
    print('no occupation with more than 50 CPS rows and a defined tau')

In [None]:
# df_cps[df_cps['ONET18_Title_LY'] == 'Etchers and Engravers'].sort_values(by = 'pct_tot', ascending = False)['ONET18_Title']
# df_cps

In [None]:
# The two bare names below are evaluated but not displayed, because they are
# not the last expression of the cell.
skills_dist
df_concat
# Weighted combination without abilities (skills x3, knowledge x2, others x1).
df_concat_filt = pd.concat([skills_dist_filt.multiply(3.0),knowledge_dist_filt.multiply(2.0),in_dist_filt,gwas_dist_filt]).groupby(level =0).mean()
# Second weighting: skills x1.3, knowledge x1.2, others x1.
df_new = pd.concat([skills_dist_filt.multiply(1.3),knowledge_dist_filt.multiply(1.2),in_dist_filt,gwas_dist_filt]).groupby(level =0).mean()
# Currently the same expression as df_new; a separate name for trying other weights.
df_test = pd.concat([skills_dist_filt.multiply(1.3),knowledge_dist_filt.multiply(1.2),in_dist_filt,gwas_dist_filt]).groupby(level =0).mean()


In [None]:
# avg_occ = 0
# count = 0
# occ_max = ''
# max_r = 0
# occ_min = ''
# min_r = 1

# cov_max = ''
# max_c = 0
# cov_min = ''
# min_c = 1

# # occ_dictionary = {}
# occ_dictionary_skills = {}
# for occ in relevant_occ:
#     cps_len = len(df_cps[df_cps['ONET18_Title_LY'] == occ].sort_values(by = 'pct_tot', ascending = False)['ONET18_Title'])
#     if cps_len > 50:
#         n = percentage(occ,skills_dist,df_cps,0,50)
#         d = percentage_default(occ,df_cps,0,50)
#         perc = n/d
#         count += 1 
#         avg_occ += perc
#         occ_dictionary_skills[occ] = perc
#         if n >= max_c:
#             cov_max =occ
#             max_c = n 
#         if n <= max_c:
#             cov_min =occ
#             min_c = n  
#         if perc >= max_r: 
#             occ_max = occ
#             max_r = perc
#         if perc <= min_r: 
#             occ_min = occ
#             min_r = perc
# print(100*avg_occ/count)
# print(count)
# print(occ_max, 100*max_r)
# print(occ_min, min_r)
# print(cov_max, max_c)
# print(cov_min, min_c)

In [None]:
def percentage_coverage(df_onet,df_cps,top_n=10):
    '''
    Measure, per occupation, how much of the CPS top-N share ONET recovers.

    For every occupation found both in df_cps['ONET18_Title_LY'] and in the
    index of df_onet, and having more than top_n CPS rows:
        n    = percentage(occ, df_onet, df_cps, 0, top_n)
        d    = percentage_default(occ, df_cps, 0, top_n)
        perc = n / d
    Occupations whose d is 0 are skipped because the ratio is undefined.

    Prints, in order: the mean of perc in percent, the number of occupations
    scored, the occupation with the largest perc and with the smallest perc
    (both in percent), then the occupation with the largest n and with the
    smallest n.

    @param df_onet square ONET distance DataFrame labelled by occupation title
    @param df_cps CPS DataFrame
    @param top_n size of the window compared (defaults to 10)

    return dict mapping occupation -> perc
    '''
    avg_occ = 0
    count = 0
    occ_max, max_r = '', float('-inf')
    occ_min, min_r = '', float('inf')
    cov_max, max_c = '', float('-inf')
    cov_min, min_c = '', float('inf')

    cps_occ = set(df_cps['ONET18_Title_LY'].unique())
    onet_occ = set(df_onet.index.unique())
    relevant_occ = cps_occ & onet_occ

    occ_dictionary = {}
    for occ in relevant_occ:
        cps_len = len(df_cps[df_cps['ONET18_Title_LY'] == occ]['ONET18_Title'])
        if cps_len <= top_n:
            continue
        n = percentage(occ,df_onet,df_cps,0,top_n)
        d = percentage_default(occ,df_cps,0,top_n)
        if d == 0:
            continue
        perc = n/d
        count += 1
        avg_occ += perc
        occ_dictionary[occ] = perc
        if n >= max_c:
            cov_max = occ
            max_c = n
        # Compare against the running minimum. Testing against max_c here was
        # always true, so cov_min ended up being the last occupation visited.
        if n <= min_c:
            cov_min = occ
            min_c = n
        if perc >= max_r:
            occ_max = occ
            max_r = perc
        if perc <= min_r:
            occ_min = occ
            min_r = perc

    if count == 0:
        print('no occupation has more than', top_n, 'CPS rows')
        return occ_dictionary

    print(100*avg_occ/count)
    print(count)
    print(occ_max, 100*max_r)
    print(occ_min, 100*min_r)  # scaled like max_r so both read as percent
    print(cov_max, max_c)
    print(cov_min, min_c)

    return occ_dictionary

In [None]:
# kendalltau([1,2,3,4,5], [5,4,3,2,1])

In [None]:
# kendalltau([1,2,3,4,5], [6,8,8,9,10])

In [None]:
# Bare names: each is evaluated, but only the last one (ws_dist) is displayed
# as the cell output.
wc_dist 
knowledge_dist
skills_dist 
gwas_dist 
jz_dist 
wv_dist 
in_dist 
ab_dist 
ws_dist

In [None]:
# Coverage per distance table. This cell must run: the distribution() cells
# further down read occ_skills, occ_default, occ_alg1 and occ_alg2, which are
# only defined here.
print("ONET default")
occ_default = percentage_coverage(df_concat,df_cps)
print("Algo 1")
occ_alg1 = percentage_coverage(df_concat_filt,df_cps)
print("algo 2")
occ_alg2 = percentage_coverage(df_new,df_cps)

print("skills")
occ_skills = percentage_coverage(skills_dist,df_cps)
print("work context")
occ_wc = percentage_coverage(wc_dist,df_cps)
print("knowledge")
occ_knowledge = percentage_coverage(knowledge_dist,df_cps)
print("genral work activities")
occ_gwa = percentage_coverage(gwas_dist,df_cps)
print("work value")
occ_wv = percentage_coverage(wv_dist,df_cps)
print("interest")
occ_in = percentage_coverage(in_dist,df_cps)
print("abilities")
occ_ab = percentage_coverage(ab_dist,df_cps)
# ws_dist is built from the 'work_styles' domain; the label is kept as it
# appears in the recorded outputs below.
print("work setting")
occ_ws = percentage_coverage(ws_dist,df_cps)

## Top 10

In [None]:
# ONET default
# 12.30327998606195
# 610
# Occupational Therapists 83.19753814184264
# Audiologists 0.0
# Occupational Therapists 77.4223233536707
# Psychiatrists 9.39391078293398
# Algo 1
# 14.957654963925167
# 610
# First-Line Supervisors of Non-Retail Sales Workers 93.65570632675418
# Biological Science Teachers, Postsecondary 0.0
# Occupational Therapists 79.21193429924357
# Psychiatrists 9.39391078293398
# algo 2
# 15.150203611587056
# 610
# First-Line Supervisors of Non-Retail Sales Workers 93.58518644453213
# Biological Science Teachers, Postsecondary 0.0
# Occupational Therapists 76.9736307458273
# Psychiatrists 9.39391078293398
# skills
# 10.761938070708014
# 610
# First-Line Supervisors of Non-Retail Sales Workers 93.45955638335593
# Psychiatrists 0.0
# Title Examiners, Abstractors, and Searchers 68.44221971384758
# Psychiatrists 0
# work context
# 10.657859811575516
# 610
# Occupational Therapists 80.77457658839083
# Psychiatrists 0.0
# Occupational Therapists 75.16755335620954
# Psychiatrists 0
# knowledge
# 12.671715501339706
# 610
# First-Line Supervisors of Non-Retail Sales Workers 93.59691732177119
# Audiologists 0.0
# Occupational Therapists 71.56777726748102
# Psychiatrists 9.39391078293398
# genral work activities
# 11.832327429486321
# 610
# Optometrists 87.4116643432956
# Audiologists 0.0
# Title Examiners, Abstractors, and Searchers 69.44929984347104
# Psychiatrists 9.39391078293398
# work value
# 6.0683701226956055
# 610
# Radiation Therapists 70.63941879537158
# Nurse Anesthetists 0.0
# Radiation Therapists 61.58833798592869
# Psychiatrists 19.53329345958491
# interest
# 8.414766411210467
# 610
# First-Line Supervisors of Non-Retail Sales Workers 93.82561635343902
# Curators 0.0
# First-Line Supervisors of Non-Retail Sales Workers 62.215262666975434
# Psychiatrists 0.3604869380908128
# abilities
# 10.457553558969988
# 610
# Title Examiners, Abstractors, and Searchers 82.30568161448473
# Audiologists 0.0
# Title Examiners, Abstractors, and Searchers 67.99255631465411
# Psychiatrists 0.317934589004297
# work setting
# 6.070237629881494
# 610
# Title Examiners, Abstractors, and Searchers 82.30568161448473
# Audiologists 0.0
# Title Examiners, Abstractors, and Searchers 67.99255631465411
# Psychiatrists 2.0379341981291943

## Top 20

In [None]:
# ONET default
# 17.00152350132241
# 569
# First-Line Supervisors of Non-Retail Sales Workers 87.19720582811054
# Petroleum Engineers 0.0
# Occupational Therapists 79.71861719209504
# Psychiatrists 9.436463132020497
# Algo 1
# 20.157148088139543
# 569
# First-Line Supervisors of Non-Retail Sales Workers 89.3305850797751
# Private Detectives and Investigators 0.0
# Occupational Therapists 83.76743245968721
# Psychiatrists 9.436463132020497
# algo 2
# 19.845296270656902
# 569
# First-Line Supervisors of Non-Retail Sales Workers 89.23131888818325
# Private Detectives and Investigators 0.0
# Occupational Therapists 83.71387649923727
# Psychiatrists 9.754397721024795
# skills
# 14.50071320223328
# 569
# First-Line Supervisors of Non-Retail Sales Workers 87.99728791972292
# Audiologists 0.0
# Title Examiners, Abstractors, and Searchers 68.51075986436102
# Psychiatrists 9.570318319104194
# work context
# 14.698606243888175
# 569
# Occupational Therapists 80.08669903163094
# Petroleum Engineers 0.0
# Occupational Therapists 79.26549025969348
# Psychiatrists 9.244104885791575
# knowledge
# 18.27738898232911
# 569
# First-Line Supervisors of Non-Retail Sales Workers 87.20541559832638
# Audiologists 0.0
# Occupational Therapists 75.67014849552311
# Psychiatrists 9.733121546481536
# genral work activities
# 15.745385724125168
# 569
# Occupational Therapists 80.08669903163094
# Audiologists 0.0
# Occupational Therapists 79.26549025969348
# Psychiatrists 15.990771674096809
# work value
# 8.305812619552738
# 569
# Nurse Practitioners 65.48348694740163
# Nurse Anesthetists 0.0
# Radiation Therapists 61.58833798592869
# Psychiatrists 24.637312362381486
# interest
# 11.946956823721644
# 569
# First-Line Supervisors of Non-Retail Sales Workers 87.70273233909465
# Curators 0.0
# Postal Service Clerks 84.043529949622
# Psychiatrists 0.5094201598936182
# abilities
# 13.640188056376925
# 569
# Title Examiners, Abstractors, and Searchers 75.51224315682042
# Audiologists 0.0
# Title Examiners, Abstractors, and Searchers 68.44221971384758
# Psychiatrists 2.6472439946287474
# work setting
# 8.710758743472496
# 569
# Title Examiners, Abstractors, and Searchers 75.01612990858587
# Audiologists 0.0
# Respiratory Therapists 70.77391178357794
# Psychiatrists 11.431844981063175

## Top 50

In [None]:
# ONET default
# 26.620927695763267
# 434
# Occupational Therapists 84.63236189185216
# Museum Technicians and Conservators 0.0
# Occupational Therapists 84.54229292527187
# Psychiatrists 19.202197876361616
# Algo 1
# 30.065967663680865
# 434
# Occupational Therapists 84.02195401297996
# Museum Technicians and Conservators 0.0
# Occupational Therapists 83.93253466559514
# Psychiatrists 22.848961919671954
# algo 2
# 30.445321068270225
# 434
# Occupational Therapists 84.69041395806892
# Museum Technicians and Conservators 0.0
# Occupational Therapists 84.60028321027994
# Psychiatrists 16.506389973814574
# skills
# 22.128976873277793
# 434
# First-Line Supervisors of Non-Retail Sales Workers 78.79857653826994
# Audiologists 0.0
# Title Examiners, Abstractors, and Searchers 71.41240257266489
# Psychiatrists 9.804356239080036
# work context
# 23.713811141031805
# 434
# Occupational Therapists 80.74047037766475
# Museum Technicians and Conservators 0.0
# Occupational Therapists 80.65454330951303
# Psychiatrists 14.433168301049843
# knowledge
# 27.98233953212392
# 434
# Occupational Therapists 83.96390194676319
# Materials Scientists 0.002744464047177644
# Occupational Therapists 83.87454438058705
# Psychiatrists 28.11578404984587
# genral work activities
# 23.093421339453133
# 434
# Occupational Therapists 81.41336937151577
# Museum Technicians and Conservators 0.0
# Occupational Therapists 81.32672617875599
# Psychiatrists 16.614730670584397
# work value
# 14.735038211529552
# 434
# Occupational Therapists 83.73795579085653
# Mechanical Drafters 0.0
# Occupational Therapists 83.64883868514151
# Psychiatrists 35.93617801837723
# interest
# 21.332168245358215
# 434
# Occupational Therapists 78.47689809054984
# Curators 0.0
# Occupational Therapists 78.39338000180177
# Psychiatrists 17.43556422986619
# abilities
# 22.075793929028823
# 434
# Occupational Therapists 80.92923823839733
# Community Health Workers 0.0
# Occupational Therapists 80.84311027633505
# Psychiatrists 14.214946244347924
# work setting
# 14.117143131454771
# 434
# Title Examiners, Abstractors, and Searchers 68.24014209372007
# Audiologists 0.0
# Title Examiners, Abstractors, and Searchers 67.99255631465411
# Psychiatrists 18.313271225235116


In [None]:
# df_score = pd.DataFrame.from_dict(occ_dictionary)
# list(occ_dictionary.keys())

# df_score = pd.DataFrame.from_dict(occ_dictionary, orient = 'index',  columns= ['score'])
# df_score = df_score.sort_values(by = 'score', ascending = False)

In [None]:
# Use textposition='auto' for direct text
# fig = go.Figure(data=[go.Bar(
#             x=df_score.index[:30], y=df_score['score'][:30],
#             textposition='auto',
#         )])
# fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})

# fig.show()

In [None]:
def distribution(df, top_n=30):
    '''
    Show a bar chart of the highest-scoring occupations.

    @param df mapping of occupation -> score, as returned by percentage_coverage
    @param top_n number of bars to draw (defaults to 30)

    The same top_n rows feed both axes. Slicing the labels and the values to
    different lengths left x with more entries than y.
    '''
    df_output = pd.DataFrame.from_dict(df, orient = 'index',  columns= ['score'])
    df_output = df_output.sort_values(by = 'score', ascending = False).head(top_n)
    fig = go.Figure(data=[go.Bar(
                x=df_output.index, y=df_output['score'],
                textposition='auto',
            )])
    fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})

    fig.show()

In [None]:
# Bar chart of the skills-only coverage scores.
distribution(df=occ_skills)

In [None]:
# Bar chart of the combined-ONET coverage scores.
distribution(df=occ_default)

In [None]:
# Bar chart of the "Algo 1" coverage scores.
distribution(df=occ_alg1)

In [None]:
# Bar chart of the "algo 2" coverage scores.
distribution(df=occ_alg2)

### Occupational Therapists

In [None]:
# Ten smallest combined-ONET distances from Occupational Therapists.
df_concat.loc['Occupational Therapists'].sort_values().head(10)

In [None]:
# Ten CPS rows with the largest pct_tot for Occupational Therapists.
(
    df_cps.loc[df_cps['ONET18_Title_LY'] == 'Occupational Therapists']
    .sort_values(by='pct_tot', ascending=False)
    .head(10)
)

## First-Line Supervisors of Non-Retail Sales Workers

In [None]:
# Twenty smallest combined-ONET distances from First-Line Supervisors of Non-Retail Sales Workers.
df_concat.loc['First-Line Supervisors of Non-Retail Sales Workers'].sort_values().head(20)

In [None]:
# Ten smallest df_new distances from First-Line Supervisors of Non-Retail Sales Workers.
df_new.loc['First-Line Supervisors of Non-Retail Sales Workers'].sort_values().head(10)

In [None]:
# Ten CPS rows with the largest pct_socly for this occupation, selected columns only.
(
    df_cps.loc[df_cps['ONET18_Title_LY'] == 'First-Line Supervisors of Non-Retail Sales Workers']
    .sort_values(by='pct_socly', ascending=False)
    .head(10)[['ONET18_Title', 'employment_projection_NewJob', 'wage_change_BetweenJobs', 'pct_socly']]
)

## Retail Salespersons

In [None]:
# Ten smallest df_new distances from Retail Salespersons.
df_new.loc['Retail Salespersons'].sort_values().head(10)

In [None]:
# Ten CPS rows with the largest pct_socly for Retail Salespersons, selected columns only.
(
    df_cps.loc[df_cps['ONET18_Title_LY'] == 'Retail Salespersons']
    .sort_values(by='pct_socly', ascending=False)
    .head(10)[['ONET18_Title', 'employment_projection_NewJob', 'wage_change_BetweenJobs', 'pct_socly']]
)

## Tutors

In [None]:
# Ten smallest df_new distances from Tutors.
df_new.loc['Tutors'].sort_values().head(10)

In [None]:
# Ten CPS rows with the largest pct_socly for Tutors, selected columns only.
(
    df_cps.loc[df_cps['ONET18_Title_LY'] == 'Tutors']
    .sort_values(by='pct_socly', ascending=False)
    .head(10)[['ONET18_Title', 'employment_projection_NewJob', 'wage_change_BetweenJobs', 'pct_socly']]
)

## Private Detectives and Investigators 

In [None]:
# Ten smallest df_new distances from Private Detectives and Investigators.
df_new.loc['Private Detectives and Investigators'].sort_values().head(10)

In [None]:
# Ten CPS rows with the largest pct_socly for Private Detectives and Investigators, selected columns only.
(
    df_cps.loc[df_cps['ONET18_Title_LY'] == 'Private Detectives and Investigators']
    .sort_values(by='pct_socly', ascending=False)
    .head(10)[['ONET18_Title', 'employment_projection_NewJob', 'wage_change_BetweenJobs', 'pct_socly']]
)