In [1]:
import ast
from pathlib import Path

import pandas as pd
from datasets import load_dataset

# Load the two WORKBank CSVs from the "SALT-NLP/WORKBank" dataset repository.
# Each call returns a dataset dict; its "train" split is converted to a
# pandas DataFrame.

# Expert ratings (one of the columns used later is "Human Agency Scale Rating").
expert_ratings = load_dataset(
    "SALT-NLP/WORKBank",
    data_files="expert_ratings/expert_rated_technological_capability.csv",
)["train"].to_pandas()

# Task statements with their metadata (skills, wages, ...).
task_statement_metadata = load_dataset(
    "SALT-NLP/WORKBank",
    data_files="task_data/task_statement_with_metadata.csv",
)["train"].to_pandas()

In [2]:
# Collect the distinct skill names (O*NET work activities) across all tasks.
#
# Every cell of the skill column holds the string form of a Python list. It is
# parsed with ast.literal_eval, which accepts only literals; unlike eval() it
# cannot execute code embedded in the downloaded dataset.
all_skills = set()
for raw_skills in task_statement_metadata['Skill (O*NET Work Activity)']:
	for skill in ast.literal_eval(raw_skills):
		# Only string entries count as skill names; anything else is ignored.
		if isinstance(skill, str):
			all_skills.add(skill)

print(len(all_skills))

35


In [3]:
# Compute the average Human Agency Scale (HAS) rating of each task across
# all expert ratings of that task.
avg_has = expert_ratings.groupby("Task")["Human Agency Scale Rating"].mean()

# Attach the average to the task metadata. The merge uses the default inner
# join, so only tasks that also appear in `expert_ratings` are kept.
# NOTE(review): rows are matched on the "Task" text alone; if the same
# statement can occur under several occupations their ratings are pooled.
# Confirm this is intended.
#
# A rating column left over from a previous run of this cell is dropped
# first. Without that, re-running the cell would merge a second copy, pandas
# would rename the columns with "_x"/"_y" suffixes, and the cells below would
# fail to find 'Human Agency Scale Rating'.
task_statement_metadata = (
	task_statement_metadata
	.drop(columns="Human Agency Scale Rating", errors="ignore")
	.merge(avg_has, on="Task")
)
task_statement_metadata.head()

Unnamed: 0,O*NET-SOC Code,Occupation (O*NET-SOC Title),Task ID,Task,Task Type,Date,Category,Frequency,Importance,Relevance,Occupation Mean Annual Wage,Occupation Employment,Skill (O*NET Work Activity),Skill ID (O*NET Generalized Work Activity ID),Human Agency Scale Rating
0,11-2011.00,Advertising and Promotions Managers,3226,"Inspect layouts and advertising copy, and edit...",Core,08/2018,3.0,3.0,4.07,87.26,149270.0,21100.0,['Evaluating Information to Determine Complian...,['4.A.2.a.3'],2.0
1,11-2011.00,Advertising and Promotions Managers,3242,Coordinate with the media to disseminate adver...,Core,08/2018,3.0,3.0,3.87,80.19,149270.0,21100.0,"['Communicating with Supervisors, Peers, or Su...",['4.A.4.a.2'],3.0
2,11-2011.00,Advertising and Promotions Managers,3223,Prepare budgets and submit estimates for progr...,Core,08/2018,3.0,3.0,3.67,81.73,149270.0,21100.0,"['Guiding, Directing, and Motivating Subordina...",['4.A.4.b.4'],2.0
3,11-2011.00,Advertising and Promotions Managers,3243,Contact organizations to explain services and ...,Core,08/2018,5.0,5.0,3.61,79.02,149270.0,21100.0,['Selling or Influencing Others'],['4.A.4.a.6'],4.0
4,11-2011.00,Advertising and Promotions Managers,3233,Monitor and analyze sales promotion results to...,Core,08/2018,3.0,3.0,3.59,77.04,149270.0,21100.0,['Analyzing Data or Information'],['4.A.2.a.4'],1.5


In [4]:
# Map every skill to the list of average HAS ratings of the tasks that list
# it (one entry per task row, so a skill's list may contain repeated values).
#
# The skill column stores the string form of a Python list; it is parsed with
# ast.literal_eval instead of eval() so that content of the downloaded
# dataset can never be executed as code.
skill_to_has_ratings = {}
for raw_skills, has_rating in zip(
	task_statement_metadata['Skill (O*NET Work Activity)'],
	task_statement_metadata['Human Agency Scale Rating'],
):
	for skill in ast.literal_eval(raw_skills):
		# Only string entries count as skill names; anything else is ignored.
		if isinstance(skill, str):
			skill_to_has_ratings.setdefault(skill, []).append(has_rating)

In [5]:
# Map every skill to the list of occupation mean annual wages of the tasks
# that list it (one entry per task row; missing wages are kept here and
# filtered out where the mean is computed).
#
# The skill column stores the string form of a Python list; it is parsed with
# ast.literal_eval instead of eval() so that content of the downloaded
# dataset can never be executed as code.
skill_to_wage = {}
for raw_skills, wage in zip(
	task_statement_metadata['Skill (O*NET Work Activity)'],
	task_statement_metadata['Occupation Mean Annual Wage'],
):
	for skill in ast.literal_eval(raw_skills):
		# Only string entries count as skill names; anything else is ignored.
		if isinstance(skill, str):
			skill_to_wage.setdefault(skill, []).append(wage)

In [6]:
# Build one summary row per skill:
#   - mean of the per-task average HAS ratings,
#   - how many of those ratings fall into each of the five HAS levels,
#   - mean wage over the tasks that list the skill (missing wages ignored).

# Count column -> [low, high) rating interval. Per-task ratings are averages,
# so they can lie between the integer scale points; each is assigned to the
# nearest level with halves going up. The top bin additionally admits exactly
# 5.0 (handled in the loop). Ratings outside [1.0, 5.0] and NaN are counted
# in no bin.
has_level_bins = (
	("1_rating_cnt", 1.0, 1.5),
	("2_rating_cnt", 1.5, 2.5),
	("3_rating_cnt", 2.5, 3.5),
	("4_rating_cnt", 3.5, 4.5),
	("5_rating_cnt", 4.5, 5.0),
)

skill_info = {
	"skill": [],
	"mean_has_rating": [],
	"1_rating_cnt": [],
	"2_rating_cnt": [],
	"3_rating_cnt": [],
	"4_rating_cnt": [],
	"5_rating_cnt": [],
	"mean_wage": []
}
for skill, ratings in skill_to_has_ratings.items():
	skill_info["skill"].append(skill)
	skill_info["mean_has_rating"].append(sum(ratings) / len(ratings))

	level_counts = {column: 0 for column, _, _ in has_level_bins}
	for rating in ratings:
		for column, low, high in has_level_bins:
			in_bin = low <= rating < high
			at_scale_max = high == 5.0 and rating == 5.0
			if in_bin or at_scale_max:
				level_counts[column] += 1
				break
	for column, count in level_counts.items():
		skill_info[column].append(count)

	wage_without_nan = [wage for wage in skill_to_wage.get(skill, []) if not pd.isna(wage)]
	# 0 is a sentinel for "no wage data"; such rows are written to the CSV
	# but removed from the DataFrame before plotting (see below).
	mean_wage = sum(wage_without_nan) / len(wage_without_nan) if wage_without_nan else 0
	skill_info["mean_wage"].append(mean_wage)

skill_info_df = pd.DataFrame(skill_info)
skill_info_df = skill_info_df.sort_values("mean_wage", ascending=False)

# Create the output directory if needed so the export also works on a fresh
# checkout where ../local does not exist yet.
skill_info_path = Path("../local/skill_info.csv")
skill_info_path.parent.mkdir(parents=True, exist_ok=True)
skill_info_df.to_csv(skill_info_path, index=False)

# Drop points with mean_wage == 0
skill_info_df = skill_info_df[skill_info_df["mean_wage"] > 0]

In [7]:
import plotly.express as px

# Interactive scatter plot with one point per skill: mean HAS rating on the
# x-axis, mean wage on the y-axis, and the skill name shown on hover.
axis_labels = {
    "mean_has_rating": "Mean HAS Rating",
    "mean_wage": "Mean Wage",
}
marker_style = {
    "size": 10,
    "opacity": 0.7,
    "line": {"width": 1, "color": 'DarkSlateGrey'},
}

fig = px.scatter(
    skill_info_df,
    x="mean_has_rating",
    y="mean_wage",
    hover_name="skill",
    labels=axis_labels,
    title="Mean HAS vs. Mean Wage (by Skill)",
)

# Restyle only the marker-mode traces.
fig.update_traces(marker=marker_style, selector={"mode": "markers"})

fig.show()
