In [1]:
import pandas as pd
import random as rnd 
import mysql.connector
from mysql.connector import Error
import plotly
import json
import csv
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pymysql
import plotly.express as px
import plotly.graph_objects as go
from sqlalchemy import create_engine
import networkx as nx
from networkx_viewer import Viewer
from functools import reduce
from scipy.spatial.distance import pdist, squareform
import igraph
from cyjupyter import Cytoscape
from scipy.stats import kendalltau
from scipy.stats import weightedtau

# Getting data from SQL database

In [2]:
connection = mysql.connector.connect(host='localhost',
                                         database='employment',
                                         user='root',
                                         password='1234')

In [3]:
def get_data(connection, descriptor_domain,filt = False,val = None):
    '''
    Return df in the SOCs-decriptor format of a specific domain 
    descriptor_domeain:
        work context
        knowledge
        skills
        gwas
        job_zones, filter
        Abilities, filter
    
    '''
    df= pd.DataFrame(data=None)
    if descriptor_domain == 'work_context':
        df= pd.read_sql('''
            SELECT title, element_name, data_value FROM employment.work_context
            LEFT JOIN employment.content_model_reference USING (element_id) 
            LEFT JOIN employment.occupation_data USING (onetsoc_code)
            WHERE scale_id LIKE 'CX';''', con = connection)
        df = df.reset_index().pivot(index = 'title', columns = 'element_name', values = 'data_value')
    elif descriptor_domain == 'job_zones':
        if filt:
            df= pd.read_sql('''
                SELECT title,job_zone FROM employment.job_zones
                LEFT JOIN employment.occupation_data USING (onetsoc_code); ''', con = connection)
            df = df[df['job_zone'].isin(val)]
        else:
            df= pd.read_sql('''
                SELECT title,job_zone FROM employment.job_zones
                LEFT JOIN employment.occupation_data USING (onetsoc_code);''', con = connection)
    elif descriptor_domain == 'abilities':
        if filt:
#             df= pd.read_sql('''
#             SELECT title, element_name, element_id,data_value FROM employment.''' + descriptor_domain + '''
#             LEFT JOIN employment.content_model_reference USING (element_id) 
#             LEFT JOIN employment.occupation_data USING (onetsoc_code); ''', con = connection)
#             df = df[df['element_id'].contains(val)]
            df= pd.read_sql('''
            SELECT title, element_name, data_value FROM employment.''' + descriptor_domain + '''
            LEFT JOIN employment.content_model_reference USING (element_id) 
            LEFT JOIN employment.occupation_data USING (onetsoc_code)
            WHERE element_id LIKE\''''+ val +'''%\'; ''', con = connection)
            
        else:
            df= pd.read_sql('''
            SELECT title, element_name,element_id, data_value FROM employment.''' + descriptor_domain + '''
            LEFT JOIN employment.content_model_reference USING (element_id) 
            LEFT JOIN employment.occupation_data USING (onetsoc_code);''', con = connection)
        df = df.groupby(by=['title','element_name'])['data_value'].mean()
        df = df.reset_index().pivot(index = 'title', columns = 'element_name', values = 'data_value')
    elif descriptor_domain == 'work_values':
        df= pd.read_sql('''
            SELECT title, element_name, data_value FROM employment.work_values
            LEFT JOIN employment.content_model_reference USING (element_id) 
            LEFT JOIN employment.occupation_data USING (onetsoc_code)
            WHERE scale_id LIKE 'EX';''', con = connection)
        df = df.reset_index().pivot(index = 'title', columns = 'element_name', values = 'data_value')
    elif descriptor_domain == 'interests':
        df= pd.read_sql('''
            SELECT title, element_name, data_value FROM employment.interests
            LEFT JOIN employment.content_model_reference USING (element_id) 
            LEFT JOIN employment.occupation_data USING (onetsoc_code)
            WHERE scale_id LIKE 'OI';''', con = connection)
        df = df.reset_index().pivot(index = 'title', columns = 'element_name', values = 'data_value')
    elif descriptor_domain == 'work_styles':
        df= pd.read_sql('''
            SELECT title, element_name, data_value FROM employment.work_styles
            LEFT JOIN employment.content_model_reference USING (element_id) 
            LEFT JOIN employment.occupation_data USING (onetsoc_code)
            WHERE scale_id LIKE 'IM';''', con = connection)
        df = df.reset_index().pivot(index = 'title', columns = 'element_name', values = 'data_value')
    elif descriptor_domain == 'name':
        df= pd.read_sql('''
            SELECT * FROM employment.occupation_data;''', con = connection)
    else:
        df= pd.read_sql('''
            SELECT title, element_name, data_value FROM employment.''' + descriptor_domain + '''
            LEFT JOIN employment.content_model_reference USING (element_id) 
            LEFT JOIN employment.occupation_data USING (onetsoc_code);''', con = connection)
        df = df.groupby(by=['title','element_name'])['data_value'].mean()
        df = df.reset_index().pivot(index = 'title', columns = 'element_name', values = 'data_value')
    return df

# Finding distance measures from ONET 

In [4]:
def distance_table(df, jz=False):
    
#     squareform(pdist(df.iloc[:, 1:])), columns=df.title.unique(), index=df.title.unique()
    dist = []
    name_dict = {}
    if jz:
        name_dict = df['title'].sort_values().reset_index()['title'].to_dict()
        dist = pdist(df[df.columns[1:]],'euclidean')
    else:
        for i in range(len(df.index)):
            name_dict[i] = df.index[i]
        dist = pdist(df,'euclidean') #Euclidean distance for 1XN
    df_dist = pd.DataFrame(squareform(dist)) #Euclidean distance for NxN
#     return df_dist
    columns = list(df_dist.index) 
    for i in range(len(df.index)): #convert 0 values to NaN
        df_dist.loc[i,i] = np.NaN
    mean = df_dist.mean().mean()
    std = df_dist.stack().std() 

    for col in columns: #Z-score calculated 
        df_dist[col] = (df_dist[col] - mean)/std
#     return df_dist
    return df_dist.rename(columns = name_dict, index = name_dict)

In [5]:
# Get CPS data

In [6]:
df_cps = pd.read_csv('../CPS Job Changes/JobChanges_2011to19.csv')

In [7]:
df_cps.sort_values(by = 'pct_tot', ascending = False)

Unnamed: 0,ONET18_SOC,ONET18_SOC_LY,ONET18_Title,ONET18_Title_LY,count,pct_tot,count_socly,pct_socly,in_grp,in_minor_grp,in_broad_occ
72958,41-1012,41-1011,First-Line Supervisors of Non-Retail Sales Wor...,First-Line Supervisors of Retail Sales Workers,734264.510000,4.395486e-01,2207170.940,33.267224,1,1,1
72703,41-1011,41-1012,First-Line Supervisors of Retail Sales Workers,First-Line Supervisors of Non-Retail Sales Wor...,603472.900000,3.612536e-01,999832.010,60.357429,1,1,1
74337,41-2031,41-2012,Retail Salespersons,Gambling Change Persons and Booth Cashiers,509346.270000,3.049071e-01,2630467.290,19.363338,1,1,0
74338,41-2031,41-2011,Retail Salespersons,Cashiers,509346.270000,3.049071e-01,2630467.290,19.363338,1,1,0
27,11-1011,11-1021,Chief Executives,General and Operations Managers,352402.430000,2.109567e-01,751730.590,46.878820,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...
26910,25-1194,45-2091,"Career/Technical Education Teachers, Postsecon...",Agricultural Equipment Operators,1.135947,6.800058e-07,184416.524,0.000616,0,0,0
26911,25-1051,45-2021,"Atmospheric, Earth, Marine, and Space Sciences...",Animal Breeders,1.135947,6.800058e-07,184416.524,0.000616,0,0,0
26912,25-1041,45-2091,"Agricultural Sciences Teachers, Postsecondary",Agricultural Equipment Operators,1.135947,6.800058e-07,184416.524,0.000616,0,0,0
26913,25-1051,45-2091,"Atmospheric, Earth, Marine, and Space Sciences...",Agricultural Equipment Operators,1.135947,6.800058e-07,184416.524,0.000616,0,0,0


## Skills 

In [8]:
df_skills = get_data(connection,'skills')
df_skills

skills_dist = distance_table(df_skills)
skills_dist

Unnamed: 0,Accountants,Actors,Actuaries,Acupuncturists,Acute Care Nurses,Adapted Physical Education Specialists,Adhesive Bonding Machine Operators and Tenders,"Administrative Law Judges, Adjudicators, and Hearing Officers",Administrative Services Managers,Adult Basic and Secondary Education and Literacy Teachers and Instructors,...,"Welding, Soldering, and Brazing Machine Setters, Operators, and Tenders",Wellhead Pumpers,"Wholesale and Retail Buyers, Except Farm Products",Wind Energy Engineers,Wind Energy Operations Managers,Wind Energy Project Managers,Wind Turbine Service Technicians,"Woodworking Machine Setters, Operators, and Tenders, Except Sawing",Word Processors and Typists,Zoologists and Wildlife Biologists
Accountants,,-0.232192,-1.231166,-1.343769,-0.572105,-1.103413,1.134355,-1.129204,-1.140781,-1.063139,...,0.726581,1.567913,-1.525925,-0.817695,-0.316332,-1.144954,1.109038,1.435817,-0.110304,-0.934922
Actors,-0.232192,,1.136940,-0.746946,0.663868,-0.444427,1.199026,-0.376028,0.049845,-0.946980,...,0.331044,1.243432,0.199842,1.012206,1.071827,0.547006,1.572133,1.323007,-0.682780,0.499415
Actuaries,-1.231166,1.136940,,-0.292468,-0.288049,-0.276207,1.966989,-0.646019,-0.467885,0.037310,...,1.808031,2.491332,-0.975188,-0.933491,-0.088138,-0.834385,1.662193,2.283982,1.218300,-0.706944
Acupuncturists,-1.343769,-0.746946,-0.292468,,-0.939257,-1.496659,0.573997,-0.909846,-0.954193,-1.141138,...,0.083364,0.990593,-1.083227,-0.670565,-0.312289,-0.976759,0.607291,0.893135,-0.566595,-1.162061
Acute Care Nurses,-0.572105,0.663868,-0.288049,-0.939257,,-1.168764,0.831865,-0.345816,-0.839026,-0.281779,...,0.677219,1.419451,-0.558503,-1.306457,-0.923236,-1.050700,0.542298,1.359072,0.914576,-1.552209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wind Energy Project Managers,-1.144954,0.547006,-0.834385,-0.976759,-1.050700,-1.114667,0.909341,-0.597025,-1.836890,-0.282424,...,0.802140,1.459114,-1.836637,-0.914224,-1.597539,,0.645006,1.390519,0.473065,-1.145214
Wind Turbine Service Technicians,1.109038,1.572133,1.662193,0.607291,0.542298,0.753377,-1.281451,1.703529,0.831136,1.345588,...,-0.789641,-0.855040,1.207617,0.564500,-0.199079,0.645006,,-0.965950,0.924188,0.745823
"Woodworking Machine Setters, Operators, and Tenders, Except Sawing",1.435817,1.323007,2.283982,0.893135,1.359072,1.333042,-1.608119,2.094264,1.386387,1.653915,...,-1.714726,-1.738756,1.662305,1.222440,0.910910,1.390519,-0.965950,,0.158089,1.482784
Word Processors and Typists,-0.110304,-0.682780,1.218300,-0.566595,0.914576,0.154761,0.149986,0.560918,0.231741,-0.010130,...,-0.508398,0.314869,0.345421,0.764722,0.926314,0.473065,0.924188,0.158089,,0.632334


In [9]:
skills_dist.loc['Retail Salespersons'].sort_values()[0:10]

Demonstrators and Product Promoters                                                            -1.970922
Bartenders                                                                                     -1.942456
Psychiatric Aides                                                                              -1.928493
Home Health Aides                                                                              -1.889651
Reservation and Transportation Ticket Agents and Travel Clerks                                 -1.885947
Sales Representatives, Wholesale and Manufacturing, Except Technical and Scientific Products   -1.885568
Travel Agents                                                                                  -1.856308
Order Clerks                                                                                   -1.822059
Counter and Rental Clerks                                                                      -1.810728
Concierges                                             

### Percentages

In [10]:
cps_retail_pct = df_cps[df_cps['ONET18_Title_LY'] == 'Registered Nurses'].sort_values(by = 'pct_tot', ascending = False)
# cps_retail_pct[cps_retail_pct['ONET18_Title'] == 'Cashiers']['pct_tot']
# cps_retail_pct[0:10]['pct_tot'].sum()
cps_retail_pct

Unnamed: 0,ONET18_SOC,ONET18_SOC_LY,ONET18_Title,ONET18_Title_LY,count,pct_tot,count_socly,pct_socly,in_grp,in_minor_grp,in_broad_occ
47123,29-1123,29-1141,Physical Therapists,Registered Nurses,208251.380000,0.124664,1137881.06,18.301683,1,1,0
47164,29-1126,29-1141,Respiratory Therapists,Registered Nurses,133490.700000,0.079911,1137881.06,11.731516,1,1,0
47379,29-1128,29-1141,Exercise Physiologists,Registered Nurses,84033.560000,0.050305,1137881.06,7.385092,1,1,0
47378,29-1129,29-1141,"Therapists, All Other",Registered Nurses,84033.560000,0.050305,1137881.06,7.385092,1,1,0
47225,29-1127,29-1141,Speech-Language Pathologists,Registered Nurses,78260.440000,0.046849,1137881.06,6.877735,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...
33339,25-4013,29-1141,Museum Technicians and Conservators,Registered Nurses,104.406667,0.000063,1137881.06,0.009176,0,0,0
110210,53-3053,29-1141,Shuttle Drivers and Chauffeurs,Registered Nurses,47.747500,0.000029,1137881.06,0.004196,0,0,0
110211,53-3054,29-1141,Taxi Drivers,Registered Nurses,47.747500,0.000029,1137881.06,0.004196,0,0,0
110212,53-3051,29-1141,"Bus Drivers, School",Registered Nurses,47.747500,0.000029,1137881.06,0.004196,0,0,0


In [11]:
def percentage_default(occ,df_cps,a,b):
    df_occ = df_cps[df_cps['ONET18_Title_LY'] == occ].sort_values(by = 'pct_socly', ascending = False)
    return df_occ[a:b]['pct_socly'].sum()

In [12]:
retail_default = percentage_default('Retail Salespersons',df_cps,0,10)

In [13]:
nurse_default = percentage_default('Registered Nurses',df_cps,0,10)

In [14]:
janitors_default = percentage_default('Janitors and Cleaners, Except Maids and Housekeeping Cleaners',df_cps,0,10)

In [15]:
sales_default = percentage_default('Sales Managers',df_cps,0,10)

In [16]:
def percentage(occ,df_onet,df_cps,a,b):
    '''
    Get total percentage that ONET transition takes ups
    @param occ occupation of interests
    @param df_onet dataFrame for ONET
    @param df_cps dataFrame for CPS data
    @param a index of first position for onet
    @param index of last position
    
    return the percentage composition for the ath to bth place of ONET
    '''
    
    total = 0
    onet_list = df_onet.loc[occ].sort_values()[a:b].index.to_list()
    cps_occ = df_cps[df_cps['ONET18_Title_LY'] == occ]
    for occ in onet_list:
#         print(occ)
#         print(cps_occ[cps_occ['ONET18_Title'] == occ]['pct_tot'].values[0])
        val = cps_occ[cps_occ['ONET18_Title'] == occ]['pct_socly'].values
        if len(val) == 0:
            continue
        else:
            total += val[0]
    
    return total


In [17]:
retail_skills = percentage('Retail Salespersons',skills_dist,df_cps,0,10)
print(retail_default)
print(retail_skills)
print(retail_skills/retail_default*100)

41.541673571317794
6.247645518462117
15.039465147537454


In [18]:
nurse_skills = percentage('Registered Nurses',skills_dist,df_cps,0,10)
print(nurse_default)
print(nurse_skills)
print(nurse_skills/nurse_default*100)

69.29806178512185
9.807177002520186
14.152166380829092


In [19]:
janitors_skills = percentage('Janitors and Cleaners, Except Maids and Housekeeping Cleaners',skills_dist,df_cps,0,10)
print(janitors_default)
print(janitors_skills)
print(janitors_skills/janitors_default*100)

59.08144679332252
7.397082505916141
12.520144491031948


In [20]:
sales_skills = percentage('Sales Managers',skills_dist,df_cps,0,10)
print(janitors_default)
print(sales_skills)
print(sales_skills/sales_default*100)

59.08144679332252
3.536867901091421
7.171481406185616


## ONET Regular

In [21]:
df_ab = get_data(connection,'abilities')
df_in = get_data(connection,'interests')
df_ws = get_data(connection,'work_styles')
df_wv = get_data(connection, 'work_values')
df_wc = get_data(connection,'work_context')
df_knowledge = get_data(connection,'knowledge')
df_gwas = get_data(connection,'work_activities')
df_jz = get_data(connection,'job_zones')
df_jz = df_jz[df_jz['title'] != 'Legislators']


In [22]:
wc_dist = distance_table(df_wc)
knowledge_dist = distance_table(df_knowledge)
skills_dist = distance_table(df_skills)
gwas_dist = distance_table(df_gwas)
jz_dist = distance_table(df_jz,True)
wv_dist = distance_table(df_wv)
in_dist = distance_table(df_in)
ab_dist = distance_table(df_ab)
ws_dist = distance_table(df_ws)
df_concat = pd.concat([wc_dist,knowledge_dist,skills_dist,gwas_dist,jz_dist.multiply(1.3)]).groupby(level =0).mean()
# df_concat

In [23]:
retail_onet = percentage('Retail Salespersons',df_concat,df_cps,0,10)
print(retail_default)
print(retail_onet)
print(retail_onet/retail_default*100)

41.541673571317794
13.782679061142858
33.17795812313885


In [24]:
nurse_onet = percentage('Registered Nurses',df_concat,df_cps,0,10)
print(nurse_default)
print(nurse_onet)
print(nurse_onet/nurse_default*100)

69.29806178512185
27.79892566275773
40.1150118006996


In [25]:
janitors_onet = percentage('Janitors and Cleaners, Except Maids and Housekeeping Cleaners',df_concat,df_cps,0,10)
print(janitors_default)
print(janitors_onet)
print(janitors_onet/janitors_default*100)

59.08144679332252
27.73909571228287
46.95060330753103


In [26]:
sales_onet = percentage('Sales Managers',df_concat,df_cps,0,10)
print(sales_default)
print(sales_onet)
print(sales_onet/sales_default*100)

49.31851176579454
3.274858378111049
6.640221411511382


In [27]:
## ONET Custom

In [28]:
def filt(df,d):
    '''
    @params: df: Dataframe to filter, d: set of all relevant names
    @returns: filtered Dataframe
    '''
    return df.loc[df.index.isin(d)]

In [29]:
# df_rel = pd.concat([knowledge_dist.multiply(2),skills_dist.multiply(3),gwas_dist]).groupby(level =0).mean()
# df_jz_filt = get_data(connection,'job_zones',True,[2])
df_jz_filt = get_data(connection,'job_zones')
# df_ab_filt = get_data(connection,'abilities',True,'1.A.1.')
df_ab_filt = get_data(connection,'abilities')
# jz_dist_filt = distance_table(df_jz_filt,True)
jz_set = set(df_jz_filt['title'].unique()) #to filter out same job_zones jobs

In [30]:
abilities_filtered = filt(df_ab_filt,jz_set)
skills_filtered = filt(df_skills,jz_set)
knowledge_filtered = filt(df_knowledge,jz_set)
in_filtered = filt(df_in,jz_set)
gwas_filtered = filt(df_gwas,jz_set)

In [31]:
ab_dist_filt = distance_table(abilities_filtered) #NaN?
skills_dist_filt = distance_table(skills_filtered)
knowledge_dist_filt = distance_table(knowledge_filtered)
in_dist_filt = distance_table(in_filtered)
gwas_dist_filt = distance_table(gwas_filtered)
df_concat_filt = pd.concat([ab_dist_filt.multiply(4.0),skills_dist_filt.multiply(3.0),knowledge_dist_filt.multiply(2.0),in_dist_filt,gwas_dist_filt]).groupby(level =0).mean()

In [32]:
retail_alg1 = percentage('Retail Salespersons',df_concat_filt,df_cps,0,10)
print(retail_default)
print(retail_alg1)
print(retail_alg1/retail_default*100)

41.541673571317794
13.568072196671913
32.661351915393006


In [33]:
nurse_alg1 = percentage('Registered Nurses',df_concat_filt,df_cps,0,10)
print(nurse_default)
print(nurse_alg1)
print(nurse_alg1/nurse_default*100)

69.29806178512185
41.8177848702394
60.34481166285317


In [34]:
janitor_alg1 = percentage('Janitors and Cleaners, Except Maids and Housekeeping Cleaners',df_concat_filt,df_cps,0,10)
print(janitors_default)
print(janitor_alg1)
print(janitor_alg1/janitors_default*100)

59.08144679332252
29.03568131531777
49.1451765169018


In [35]:
sales_alg1 = percentage('Sales Managers',df_concat_filt,df_cps,0,10)
print(sales_default)
print(sales_alg1)
print(sales_alg1/sales_default*100)

49.31851176579454
3.536867901091421
7.171481406185616


## Comparison using Kendall Tau

In [36]:
# kendalltau([1,2,3,4,5], [1,2,3,4,5])
kendalltau([1,2,3,4,5], [2,1,3,4,5])

KendalltauResult(correlation=0.7999999999999999, pvalue=0.08333333333333333)

In [37]:
kendalltau([1,2,3,4,5], [1,2,3,5,4])

KendalltauResult(correlation=0.7999999999999999, pvalue=0.08333333333333333)

In [38]:
weightedtau([1,2,3,4,5], [2,1,3,4,5])

WeightedTauResult(correlation=0.9014598540145985, pvalue=nan)

In [39]:
weightedtau([1,2,3,4,5], [1,2,3,5,4])

WeightedTauResult(correlation=0.6715328467153284, pvalue=nan)

In [40]:
list_cps = df_cps[df_cps['ONET18_Title_LY'] == 'Retail Salespersons'].sort_values(by = 'pct_tot', ascending = False)['ONET18_Title']
list_cps= list_cps.to_list()[0:50]
list_onet = skills_dist.loc['Retail Salespersons'].sort_values().index.to_list()[0:50]
# cps_array = np.array(list_cps).astype(float)


In [41]:
# weightedtau(list_cps, list_onet)

In [42]:
def kendall_tau_rank(df_cps,df_onet,a,b,occ):
    list_cps = df_cps[df_cps['ONET18_Title_LY'] == occ].sort_values(by = 'pct_tot', ascending = False)['ONET18_Title']
    list_cps= list_cps.to_list()[a:b]
    list_onet = df_onet.loc[occ].sort_values().index.to_list()[a:b]
    
    return kendalltau(list_cps, list_onet)

In [43]:
kendall_tau_rank(df_cps,skills_dist,0,50,'Sales Managers')

KendalltauResult(correlation=-0.08493263264200439, pvalue=0.38431371414818993)

In [44]:
kendall_tau_rank(df_cps,skills_dist,0,50,'Retail Salespersons')

KendalltauResult(correlation=0.05795918367346938, pvalue=0.552575720493856)

In [45]:
#find all occupations that are in both cps and onet
cps_occ = set(df_cps['ONET18_Title_LY'].unique())
onet_occ = set(df_skills.index.unique())

relevant_occ = cps_occ & onet_occ
relevant_occ

{'Actors',
 'Actuaries',
 'Acupuncturists',
 'Adhesive Bonding Machine Operators and Tenders',
 'Administrative Law Judges, Adjudicators, and Hearing Officers',
 'Administrative Services Managers',
 'Advertising Sales Agents',
 'Advertising and Promotions Managers',
 'Aerospace Engineers',
 'Agents and Business Managers of Artists, Performers, and Athletes',
 'Agricultural Engineers',
 'Agricultural Equipment Operators',
 'Agricultural Inspectors',
 'Agricultural Sciences Teachers, Postsecondary',
 'Agricultural Technicians',
 'Air Traffic Controllers',
 'Aircraft Cargo Handling Supervisors',
 'Aircraft Mechanics and Service Technicians',
 'Aircraft Structure, Surfaces, Rigging, and Systems Assemblers',
 'Airfield Operations Specialists',
 'Airline Pilots, Copilots, and Flight Engineers',
 'Ambulance Drivers and Attendants, Except Emergency Medical Technicians',
 'Amusement and Recreation Attendants',
 'Anesthesiologists',
 'Animal Breeders',
 'Animal Control Workers',
 'Animal Scienti

In [46]:
skills_dist
df_concat
df_concat_filt = pd.concat([skills_dist_filt.multiply(3.0),knowledge_dist_filt.multiply(2.0),in_dist_filt,gwas_dist_filt]).groupby(level =0).mean()
df_new = pd.concat([skills_dist_filt.multiply(1.3),knowledge_dist_filt.multiply(1.2),in_dist_filt,gwas_dist_filt]).groupby(level =0).mean()
df_test = pd.concat([skills_dist_filt.multiply(1.3),knowledge_dist_filt.multiply(1.2),in_dist_filt,gwas_dist_filt]).groupby(level =0).mean()

In [47]:
avg_occ = 0
count = 0
occ_max = ''
max_r = 0
occ_min = ''
min_r = 1

for occ in relevant_occ:
    cps_len = len(df_cps[df_cps['ONET18_Title_LY'] == occ].sort_values(by = 'pct_tot', ascending = False)['ONET18_Title'])
    if cps_len > 50:
        corr, p = kendall_tau_rank(df_cps,df_new,0,50,occ)
        count += 1 
        avg_occ += corr
        if corr >= max_r: 
            occ_max = occ
            max_r = corr
        if corr <= min_r: 
            occ_min = occ
            min_r = corr
print(avg_occ/count)
print(count)
print(occ_max, max_r)
print(occ_min, min_r)

-0.0032435164145185403
434
Coaches and Scouts 0.3273469387755102
Electrical Power-Line Installers and Repairers -0.33387755102040817


In [48]:
# df_cps[df_cps['ONET18_Title_LY'] == 'Etchers and Engravers'].sort_values(by = 'pct_tot', ascending = False)['ONET18_Title']
# df_cps

In [None]:
skills_dist
df_concat
df_concat_filt = pd.concat([skills_dist_filt.multiply(3.0),knowledge_dist_filt.multiply(2.0),in_dist_filt,gwas_dist_filt]).groupby(level =0).mean()
df_new = pd.concat([skills_dist_filt.multiply(1.3),knowledge_dist_filt.multiply(1.2),in_dist_filt,gwas_dist_filt]).groupby(level =0).mean()
df_test = pd.concat([skills_dist_filt.multiply(1.3),knowledge_dist_filt.multiply(1.2),in_dist_filt,gwas_dist_filt]).groupby(level =0).mean()


In [102]:
avg_occ = 0
count = 0
occ_max = ''
max_r = 0
occ_min = ''
min_r = 1

cov_max = ''
max_c = 0
cov_min = ''
min_c = 1

# occ_dictionary = {}
occ_dictionary_skills = {}
for occ in relevant_occ:
    cps_len = len(df_cps[df_cps['ONET18_Title_LY'] == occ].sort_values(by = 'pct_tot', ascending = False)['ONET18_Title'])
    if cps_len > 50:
        n = percentage(occ,skills_dist,df_cps,0,50)
        d = percentage_default(occ,df_cps,0,50)
        perc = n/d
        count += 1 
        avg_occ += perc
        occ_dictionary_skills[occ] = perc
        if n >= max_c:
            cov_max =occ
            max_c = n 
        if n <= max_c:
            cov_min =occ
            min_c = n  
        if perc >= max_r: 
            occ_max = occ
            max_r = perc
        if perc <= min_r: 
            occ_min = occ
            min_r = perc
print(100*avg_occ/count)
print(count)
print(occ_max, 100*max_r)
print(occ_min, min_r)
print(cov_max, max_c)
print(cov_min, min_c)

22.018790913158497
434
First-Line Supervisors of Non-Retail Sales Workers 78.34674622646243
Office Machine Operators, Except Computer 0.0
Title Examiners, Abstractors, and Searchers 71.41240257266489
Phlebotomists 54.58734158301426


In [57]:
# kendalltau([1,2,3,4,5], [5,4,3,2,1])

KendalltauResult(correlation=-0.9999999999999999, pvalue=0.016666666666666666)

In [58]:
# kendalltau([1,2,3,4,5], [6,8,8,9,10])

KendalltauResult(correlation=0.9486832980505137, pvalue=0.022977401503206086)

In [60]:
occ_dictionary

{'Advertising Sales Agents': 0.47791291637827404,
 'Administrative Law Judges, Adjudicators, and Hearing Officers': 0.08979431572515985,
 'Database Administrators': 0.4311419527978906,
 'Area, Ethnic, and Cultural Studies Teachers, Postsecondary': 0.13402182638596113,
 'Lifeguards, Ski Patrol, and Other Recreational Protective Service Workers': 0.3788194675851128,
 'Broadcast Technicians': 0.01552836987257468,
 'Education Administrators, Postsecondary': 0.3464248028971981,
 'Couriers and Messengers': 0.09600641393203023,
 'Philosophy and Religion Teachers, Postsecondary': 0.12329502304478811,
 'Web Developers': 0.4677216008643017,
 'General and Operations Managers': 0.11762128630524298,
 'Tapers': 0.16394308362466173,
 'Nuclear Medicine Technologists': 0.23360165980704603,
 'Residential Advisors': 0.10932848087927546,
 'Farm Equipment Mechanics and Service Technicians': 0.4031956974793097,
 'Directors, Religious Activities and Education': 0.28421489492260527,
 'Crushing, Grinding, and 

In [98]:
# df_score = pd.DataFrame.from_dict(occ_dictionary)
# list(occ_dictionary.keys())
df_score = pd.DataFrame.from_dict(occ_dictionary, orient = 'index',  columns= ['score'])
df_score = df_score.sort_values(by = 'score', ascending = False)

In [106]:
x = list(occ_dictionary.keys())[:20]
y = list(occ_dictionary.values())[:20]

# Use textposition='auto' for direct text
fig = go.Figure(data=[go.Bar(
            x=df_score.index[:30], y=df_score['score'][:30],
            textposition='auto',
        )])
fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})

fig.show()

In [107]:

df_score_skills = pd.DataFrame.from_dict(occ_dictionary_skills, orient = 'index',  columns= ['score'])
df_score_skills = df_score_skills.sort_values(by = 'score', ascending = False)
fig = go.Figure(data=[go.Bar(
            x=df_score_skills.index[:30], y=df_score_skills['score'][:30],
            textposition='auto',
        )])
fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})

fig.show()