In [1]:
import pandas as pd
import random as rnd 
import mysql.connector
from mysql.connector import Error
import plotly
import json
import csv
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pymysql
import plotly.express as px
import plotly.graph_objects as go
from sqlalchemy import create_engine
import networkx as nx
from networkx_viewer import Viewer
from functools import reduce
from scipy.spatial.distance import pdist, squareform
import igraph
from cyjupyter import Cytoscape

# Getting data from SQL database

In [2]:
connection = mysql.connector.connect(host='localhost',
                                         database='employment',
                                         user='root',
                                         password='1234')

In [3]:
def get_data(connection, descriptor_domain,filt = False,val = None):
    '''
    Return df in the SOCs-decriptor format of a specific domain 
    descriptor_domeain:
        work context
        knowledge
        skills
        gwas
        job_zones, filter
        Abilities, filter
    
    '''
    df= pd.DataFrame(data=None)
    if descriptor_domain == 'work_context':
        df= pd.read_sql('''
            SELECT title, element_name, data_value FROM employment.work_context
            LEFT JOIN employment.content_model_reference USING (element_id) 
            LEFT JOIN employment.occupation_data USING (onetsoc_code)
            WHERE scale_id LIKE 'CX';''', con = connection)
        df = df.reset_index().pivot(index = 'title', columns = 'element_name', values = 'data_value')
    elif descriptor_domain == 'job_zones':
        if filt:
            df= pd.read_sql('''
                SELECT title,job_zone FROM employment.job_zones
                LEFT JOIN employment.occupation_data USING (onetsoc_code); ''', con = connection)
            df = df[df['job_zone'].isin(val)]
        else:
            df= pd.read_sql('''
                SELECT title,job_zone FROM employment.job_zones
                LEFT JOIN employment.occupation_data USING (onetsoc_code);''', con = connection)
    elif descriptor_domain == 'abilities':
        if filt:
#             df= pd.read_sql('''
#             SELECT title, element_name, element_id,data_value FROM employment.''' + descriptor_domain + '''
#             LEFT JOIN employment.content_model_reference USING (element_id) 
#             LEFT JOIN employment.occupation_data USING (onetsoc_code); ''', con = connection)
#             df = df[df['element_id'].contains(val)]
            df= pd.read_sql('''
            SELECT title, element_name, data_value FROM employment.''' + descriptor_domain + '''
            LEFT JOIN employment.content_model_reference USING (element_id) 
            LEFT JOIN employment.occupation_data USING (onetsoc_code)
            WHERE element_id LIKE\''''+ val +'''%\'; ''', con = connection)
            
        else:
            df= pd.read_sql('''
            SELECT title, element_name,element_id, data_value FROM employment.''' + descriptor_domain + '''
            LEFT JOIN employment.content_model_reference USING (element_id) 
            LEFT JOIN employment.occupation_data USING (onetsoc_code);''', con = connection)
        df = df.groupby(by=['title','element_name'])['data_value'].mean()
        df = df.reset_index().pivot(index = 'title', columns = 'element_name', values = 'data_value')
    elif descriptor_domain == 'work_values':
        df= pd.read_sql('''
            SELECT title, element_name, data_value FROM employment.work_values
            LEFT JOIN employment.content_model_reference USING (element_id) 
            LEFT JOIN employment.occupation_data USING (onetsoc_code)
            WHERE scale_id LIKE 'EX';''', con = connection)
        df = df.reset_index().pivot(index = 'title', columns = 'element_name', values = 'data_value')
    elif descriptor_domain == 'interests':
        df= pd.read_sql('''
            SELECT title, element_name, data_value FROM employment.interests
            LEFT JOIN employment.content_model_reference USING (element_id) 
            LEFT JOIN employment.occupation_data USING (onetsoc_code)
            WHERE scale_id LIKE 'OI';''', con = connection)
        df = df.reset_index().pivot(index = 'title', columns = 'element_name', values = 'data_value')
    elif descriptor_domain == 'work_styles':
        df= pd.read_sql('''
            SELECT title, element_name, data_value FROM employment.work_styles
            LEFT JOIN employment.content_model_reference USING (element_id) 
            LEFT JOIN employment.occupation_data USING (onetsoc_code)
            WHERE scale_id LIKE 'IM';''', con = connection)
        df = df.reset_index().pivot(index = 'title', columns = 'element_name', values = 'data_value')
    elif descriptor_domain == 'name':
        df= pd.read_sql('''
            SELECT * FROM employment.occupation_data;''', con = connection)
    else:
        df= pd.read_sql('''
            SELECT title, element_name, data_value FROM employment.''' + descriptor_domain + '''
            LEFT JOIN employment.content_model_reference USING (element_id) 
            LEFT JOIN employment.occupation_data USING (onetsoc_code);''', con = connection)
        df = df.groupby(by=['title','element_name'])['data_value'].mean()
        df = df.reset_index().pivot(index = 'title', columns = 'element_name', values = 'data_value')
    return df

# Finding distance measures from ONET 

In [4]:
def distance_table(df, jz=False):
    
#     squareform(pdist(df.iloc[:, 1:])), columns=df.title.unique(), index=df.title.unique()
    dist = []
    name_dict = {}
    if jz:
        name_dict = df['title'].sort_values().reset_index()['title'].to_dict()
        dist = pdist(df[df.columns[1:]],'euclidean')
    else:
        for i in range(len(df.index)):
            name_dict[i] = df.index[i]
        dist = pdist(df,'euclidean') #Euclidean distance for 1XN
    df_dist = pd.DataFrame(squareform(dist)) #Euclidean distance for NxN
#     return df_dist
    columns = list(df_dist.index) 
    for i in range(len(df.index)): #convert 0 values to NaN
        df_dist.loc[i,i] = np.NaN
    mean = df_dist.mean().mean()
    std = df_dist.stack().std() 

    for col in columns: #Z-score calculated 
        df_dist[col] = (df_dist[col] - mean)/std
#     return df_dist
    return df_dist.rename(columns = name_dict, index = name_dict)

In [5]:
# Get CPS data

In [6]:
df_cps = pd.read_csv('../CPS Job Changes/JobChanges_2011to19.csv')

In [7]:
df_cps.sort_values(by = 'pct_tot', ascending = False)

Unnamed: 0,ONET18_SOC,ONET18_SOC_LY,ONET18_Title,ONET18_Title_LY,count,pct_tot,count_socly,pct_socly,in_grp,in_minor_grp,in_broad_occ
2933,41-1012,41-1011,First-Line Supervisors of Non-Retail Sales Wor...,First-Line Supervisors of Retail Sales Workers,734264.510000,4.375723e-01,2207424.090,33.263409,1.0,1.0,1.0
1553,41-1011,41-1012,First-Line Supervisors of Retail Sales Workers,First-Line Supervisors of Non-Retail Sales Wor...,603472.900000,3.596293e-01,999832.010,60.357429,1.0,1.0,1.0
1088,41-2031,41-2011,Retail Salespersons,Cashiers,509346.270000,3.035362e-01,2630467.290,19.363338,1.0,1.0,0.0
1089,41-2031,41-2012,Retail Salespersons,Gambling Change Persons and Booth Cashiers,509346.270000,3.035362e-01,2630467.290,19.363338,1.0,1.0,0.0
31,11-1011,11-1021,Chief Executives,General and Operations Managers,352402.430000,2.100082e-01,751730.590,46.878820,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
95437,25-1111,45-2091,"Criminal Justice and Law Enforcement Teachers,...",Agricultural Equipment Operators,1.135947,6.769483e-07,185207.396,0.000613,0.0,0.0,0.0
95438,25-1067,45-2093,"Sociology Teachers, Postsecondary","Farmworkers, Farm, Ranch, and Aquacultural Ani...",1.135947,6.769483e-07,185207.396,0.000613,0.0,0.0,0.0
95439,25-1043,45-2093,"Forestry and Conservation Science Teachers, Po...","Farmworkers, Farm, Ranch, and Aquacultural Ani...",1.135947,6.769483e-07,185207.396,0.000613,0.0,0.0,0.0
95440,25-1192,45-2099,"Family and Consumer Sciences Teachers, Postsec...","Agricultural Workers, All Other",1.135947,6.769483e-07,185207.396,0.000613,0.0,0.0,0.0


## Skills 

In [8]:
df_skills = get_data(connection,'skills')
df_skills

skills_dist = distance_table(df_skills)
skills_dist

Unnamed: 0,Accountants,Actors,Actuaries,Acupuncturists,Acute Care Nurses,Adapted Physical Education Specialists,Adhesive Bonding Machine Operators and Tenders,"Administrative Law Judges, Adjudicators, and Hearing Officers",Administrative Services Managers,Adult Basic and Secondary Education and Literacy Teachers and Instructors,...,"Welding, Soldering, and Brazing Machine Setters, Operators, and Tenders",Wellhead Pumpers,"Wholesale and Retail Buyers, Except Farm Products",Wind Energy Engineers,Wind Energy Operations Managers,Wind Energy Project Managers,Wind Turbine Service Technicians,"Woodworking Machine Setters, Operators, and Tenders, Except Sawing",Word Processors and Typists,Zoologists and Wildlife Biologists
Accountants,,-0.232192,-1.231166,-1.343769,-0.572105,-1.103413,1.134355,-1.129204,-1.140781,-1.063139,...,0.726581,1.567913,-1.525925,-0.817695,-0.316332,-1.144954,1.109038,1.435817,-0.110304,-0.934922
Actors,-0.232192,,1.136940,-0.746946,0.663868,-0.444427,1.199026,-0.376028,0.049845,-0.946980,...,0.331044,1.243432,0.199842,1.012206,1.071827,0.547006,1.572133,1.323007,-0.682780,0.499415
Actuaries,-1.231166,1.136940,,-0.292468,-0.288049,-0.276207,1.966989,-0.646019,-0.467885,0.037310,...,1.808031,2.491332,-0.975188,-0.933491,-0.088138,-0.834385,1.662193,2.283982,1.218300,-0.706944
Acupuncturists,-1.343769,-0.746946,-0.292468,,-0.939257,-1.496659,0.573997,-0.909846,-0.954193,-1.141138,...,0.083364,0.990593,-1.083227,-0.670565,-0.312289,-0.976759,0.607291,0.893135,-0.566595,-1.162061
Acute Care Nurses,-0.572105,0.663868,-0.288049,-0.939257,,-1.168764,0.831865,-0.345816,-0.839026,-0.281779,...,0.677219,1.419451,-0.558503,-1.306457,-0.923236,-1.050700,0.542298,1.359072,0.914576,-1.552209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wind Energy Project Managers,-1.144954,0.547006,-0.834385,-0.976759,-1.050700,-1.114667,0.909341,-0.597025,-1.836890,-0.282424,...,0.802140,1.459114,-1.836637,-0.914224,-1.597539,,0.645006,1.390519,0.473065,-1.145214
Wind Turbine Service Technicians,1.109038,1.572133,1.662193,0.607291,0.542298,0.753377,-1.281451,1.703529,0.831136,1.345588,...,-0.789641,-0.855040,1.207617,0.564500,-0.199079,0.645006,,-0.965950,0.924188,0.745823
"Woodworking Machine Setters, Operators, and Tenders, Except Sawing",1.435817,1.323007,2.283982,0.893135,1.359072,1.333042,-1.608119,2.094264,1.386387,1.653915,...,-1.714726,-1.738756,1.662305,1.222440,0.910910,1.390519,-0.965950,,0.158089,1.482784
Word Processors and Typists,-0.110304,-0.682780,1.218300,-0.566595,0.914576,0.154761,0.149986,0.560918,0.231741,-0.010130,...,-0.508398,0.314869,0.345421,0.764722,0.926314,0.473065,0.924188,0.158089,,0.632334


In [9]:
skills_dist.loc['Retail Salespersons'].sort_values()[0:10]

Demonstrators and Product Promoters                                                            -1.970922
Bartenders                                                                                     -1.942456
Psychiatric Aides                                                                              -1.928493
Home Health Aides                                                                              -1.889651
Reservation and Transportation Ticket Agents and Travel Clerks                                 -1.885947
Sales Representatives, Wholesale and Manufacturing, Except Technical and Scientific Products   -1.885568
Travel Agents                                                                                  -1.856308
Order Clerks                                                                                   -1.822059
Counter and Rental Clerks                                                                      -1.810728
Concierges                                             

### Percentages

In [10]:
cps_retail_pct = df_cps[df_cps['ONET18_Title_LY'] == 'Retail Salespersons'].sort_values(by = 'pct_tot', ascending = False)
# cps_retail_pct[cps_retail_pct['ONET18_Title'] == 'Cashiers']['pct_tot']
cps_retail_pct[0:10]['pct_tot'].sum()

1.170055727346882

In [11]:
def percentage_default(occ,df_cps,a,b):
    df_occ = df_cps[df_cps['ONET18_Title_LY'] == occ].sort_values(by = 'pct_tot', ascending = False)
    return df_occ[a:b]['pct_tot'].sum()

In [12]:
retail_default = percentage_default('Retail Salespersons',df_cps,0,10)

In [13]:
nurse_default = percentage_default('Registered Nurses',df_cps,0,10)

In [14]:
janitors_default = percentage_default('Janitors and Cleaners, Except Maids and Housekeeping Cleaners',df_cps,0,10)

In [15]:
sales_default = percentage_default('Sales Managers',df_cps,0,10)

In [39]:
def percentage(occ,df_onet,df_cps,a,b):
    '''
    Get total percentage that ONET transition takes ups
    @param occ occupation of interests
    @param df_onet dataFrame for ONET
    @param df_cps dataFrame for CPS data
    @param a index of first position for onet
    @param index of last position
    
    return the percentage composition for the ath to bth place of ONET
    '''
    
    total = 0
    onet_list = df_onet.loc[occ].sort_values()[a:b].index.to_list()
    cps_occ = df_cps[df_cps['ONET18_Title_LY'] == occ]
    for occ in onet_list:
#         print(occ)
#         print(cps_occ[cps_occ['ONET18_Title'] == occ]['pct_tot'].values[0])
        val = cps_occ[cps_occ['ONET18_Title'] == occ]['pct_tot'].values
        if len(val) == 0:
            continue
        else:
            total += val[0]
    
    return total


In [59]:
retail_skills = percentage('Retail Salespersons',skills_dist,df_cps,0,10)
print(retail_default)
print(retail_skills)
print(retail_skills/retail_default*100)

1.170055727346882
0.17881689447701657
15.28276733301297


In [61]:
nurse_skills = percentage('Registered Nurses',skills_dist,df_cps,0,10)
print(nurse_default)
print(nurse_skills)
print(nurse_skills/nurse_default*100)

0.4699106328500127
0.06650253460214071
14.15216638082909


In [62]:
janitors_skills = percentage('Janitors and Cleaners, Except Maids and Housekeeping Cleaners',skills_dist,df_cps,0,10)
print(janitors_default)
print(janitors_skills)
print(janitors_skills/janitors_default*100)

0.38844678703671565
0.048634099007768
12.52014449103196


In [63]:
sales_skills = percentage('Sales Managers',skills_dist,df_cps,0,10)
print(janitors_default)
print(sales_skills)
print(sales_skills/sales_default*100)

0.38844678703671565
0.010855691314113004
7.085190227974743


## ONET Regular

In [44]:
df_ab = get_data(connection,'abilities')
df_in = get_data(connection,'interests')
df_ws = get_data(connection,'work_styles')
df_wv = get_data(connection, 'work_values')
df_wc = get_data(connection,'work_context')
df_knowledge = get_data(connection,'knowledge')
df_gwas = get_data(connection,'work_activities')
df_jz = get_data(connection,'job_zones')
df_jz = df_jz[df_jz['title'] != 'Legislators']


In [45]:
wc_dist = distance_table(df_wc)
knowledge_dist = distance_table(df_knowledge)
skills_dist = distance_table(df_skills)
gwas_dist = distance_table(df_gwas)
jz_dist = distance_table(df_jz,True)
wv_dist = distance_table(df_wv)
in_dist = distance_table(df_in)
ab_dist = distance_table(df_ab)
ws_dist = distance_table(df_ws)
df_concat = pd.concat([wc_dist,knowledge_dist,skills_dist,gwas_dist,jz_dist.multiply(1.3)]).groupby(level =0).mean()
# df_concat

In [64]:
retail_onet = percentage('Retail Salespersons',df_concat,df_cps,0,10)
print(retail_default)
print(retail_onet)
print(retail_onet/retail_default*100)

1.170055727346882
0.3882005992365365
33.17795812313888


In [66]:
nurse_onet = percentage('Registered Nurses',df_concat,df_cps,0,10)
print(nurse_default)
print(nurse_onet)
print(nurse_onet/nurse_default*100)

0.4699106328500127
0.188504705820525
40.11501180069965


In [67]:
janitors_onet = percentage('Janitors and Cleaners, Except Maids and Housekeeping Cleaners',df_concat,df_cps,0,10)
print(janitors_default)
print(janitors_onet)
print(janitors_onet/janitors_default*100)

0.38844678703671565
0.18237811004245813
46.950603307531004


In [68]:
sales_onet = percentage('Sales Managers',df_concat,df_cps,0,10)
print(sales_default)
print(sales_onet)
print(sales_onet/sales_default*100)

0.15321665283242558
0.010330260312581583
6.742256877181544


In [50]:
## ONET Custom

In [51]:
def filt(df,d):
    '''
    @params: df: Dataframe to filter, d: set of all relevant names
    @returns: filtered Dataframe
    '''
    return df.loc[df.index.isin(d)]

In [52]:
# df_rel = pd.concat([knowledge_dist.multiply(2),skills_dist.multiply(3),gwas_dist]).groupby(level =0).mean()
# df_jz_filt = get_data(connection,'job_zones',True,[2])
df_jz_filt = get_data(connection,'job_zones')
# df_ab_filt = get_data(connection,'abilities',True,'1.A.1.')
df_ab_filt = get_data(connection,'abilities')
# jz_dist_filt = distance_table(df_jz_filt,True)
jz_set = set(df_jz_filt['title'].unique()) #to filter out same job_zones jobs

In [53]:
abilities_filtered = filt(df_ab_filt,jz_set)
skills_filtered = filt(df_skills,jz_set)
knowledge_filtered = filt(df_knowledge,jz_set)
in_filtered = filt(df_in,jz_set)
gwas_filtered = filt(df_gwas,jz_set)

In [54]:
ab_dist_filt = distance_table(abilities_filtered) #NaN?
skills_dist_filt = distance_table(skills_filtered)
knowledge_dist_filt = distance_table(knowledge_filtered)
in_dist_filt = distance_table(in_filtered)
gwas_dist_filt = distance_table(gwas_filtered)
df_concat_filt = pd.concat([ab_dist_filt.multiply(4.0),skills_dist_filt.multiply(3.0),knowledge_dist_filt.multiply(2.0),in_dist_filt,gwas_dist_filt]).groupby(level =0).mean()

In [69]:
retail_alg1 = percentage('Retail Salespersons',df_concat_filt,df_cps,0,10)
print(retail_default)
print(retail_alg1)
print(retail_alg1/retail_default*100)

1.170055727346882
0.3821560187149767
32.661351915393034


In [70]:
nurse_alg1 = percentage('Registered Nurses',df_concat_filt,df_cps,0,10)
print(nurse_default)
print(nurse_alg1)
print(nurse_alg1/nurse_default*100)

0.4699106328500127
0.2835666863770614
60.344811662853125


In [73]:
janitor_alg1 = percentage('Janitors and Cleaners, Except Maids and Housekeeping Cleaners',df_concat_filt,df_cps,0,10)
print(janitors_default)
print(janitor_alg1)
print(janitor_alg1/janitors_default*100)

0.38844678703671565
0.19090285916342736
49.145176516901756


In [74]:
sales_alg1 = percentage('Sales Managers',df_concat_filt,df_cps,0,10)
print(sales_default)
print(sales_alg1)
print(sales_alg1/sales_default*100)

0.15321665283242558
0.010855691314113004
7.085190227974743
