In [None]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import pandas as pd
from collections import defaultdict

# Load the job-description table. Later cells read its "women_proportion"
# and "description" columns, so both must be present in the CSV.
df = pd.read_csv('jobs_jd.csv')  

In [None]:
# Word stems to look for in the job descriptions, one per line. Each entry
# ends with "-", which is removed when the list is parsed below.
# NOTE(review): presumably two groups of gender-coded stems (the first ending
# at "unreasonab-", the second starting at "agree-") -- confirm the source.
stem_text = """active-
adventurous-
aggress-
ambitio-
analy-
assert-
athlet-
autonom-
battle-
boast-
challeng-
champion-
compet-
confident-
courag-
decid-
decision-
decisive-
defend-
determin-
domina-
dominant-
driven-
fearless-
fight-
force-
greedy-
head-strong-
headstrong-
hierarch-
hostil-
impulsive-
independen-
individual-
intellect-
lead-
logic-
objective-
opinion-
outspoken-
persist-
principle-
reckless-
self-confiden-
self-relian-
self-sufficien-
selfconfiden-
selfrelian-
selfsufficien-
stubborn-
superior-
unreasonab-
agree-
affectionate-
child-
cheer-
collab-
commit-
communal-
compassion-
connect-
considerate-
cooperat-
co-operat-
depend-
emotiona-
empath-
feel-
flatterable-
gentle-
honest-
interpersonal-
interdependen-
interpersona-
inter-personal-
inter-dependen-
inter-persona-
kind-
kinship-
loyal-
modesty-
nag-
nurtur-
pleasant-
polite-
quiet-
respon-
sensitiv-
submissive-
support-
sympath-
tender-
together-
trust-
understand-
warm-
whin-
enthusias-
inclusive-
yield-
share-
sharin-"""

# Parse the block above: split it into lines, ignore blank ones, and for each
# remaining line remove the surrounding "-" characters and lower-case it.
_stem_lines = stem_text.strip().split('\n')
stems = [
    line.strip('-').lower()
    for line in _stem_lines
    if line.strip()
]


In [None]:
# Pretrained uncased BERT tokenizer and encoder. output_attentions=True makes
# each forward pass also return the attention weights that
# extract_stem_attention reads from `outputs.attentions`.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# .eval() switches the network to inference mode and returns the module
# itself, so `model` is bound to the same object as before.
model = BertModel.from_pretrained("bert-base-uncased", output_attentions=True).eval()

def _group_wordpieces(tokens):
    """Merge WordPiece continuation tokens ("##xx") back into whole words.

    Returns a list of ``(word, positions)`` pairs: ``word`` is the lower-cased
    reassembled text and ``positions`` lists the indices in ``tokens`` it spans.
    """
    groups = []
    for position, token in enumerate(tokens):
        if token.startswith("##") and groups:
            groups[-1][0] += token[2:]
            groups[-1][1].append(position)
        else:
            groups.append([token.replace("##", ""), [position]])
    return [(text.lower(), positions) for text, positions in groups]


def extract_stem_attention(texts, stem_list):
    """Collect the attention BERT pays to words that start with each stem.

    Every text is run through the module-level ``tokenizer`` and ``model``.
    The last layer's attention is averaged over heads, and the attention a
    token *receives* is the mean of its column in that matrix. Word pieces
    are merged back into words and a stem matches a word only as a prefix,
    so "depend" matches "dependable" but not "independent". Stems containing
    "-" (e.g. "self-confiden") are matched against ``word-word`` compounds,
    because the tokenizer emits the hyphen as a separate token.

    Args:
        texts: iterable of documents. Entries that are not non-blank strings
            (e.g. NaN from a DataFrame column) are skipped.
        stem_list: iterable of lower-case stems without the trailing "-".

    Returns:
        defaultdict(list) mapping each matched stem to a list of floats, one
        per matching word occurrence: the received attention averaged over
        the word's pieces.
    """
    stem_attention = defaultdict(list)
    plain_stems = [stem for stem in stem_list if "-" not in stem]
    hyphen_stems = [stem for stem in stem_list if "-" in stem]

    for text in texts:
        # Missing descriptions arrive as float NaN and would crash the tokenizer.
        if not isinstance(text, str) or not text.strip():
            continue

        inputs = tokenizer(text, return_tensors='pt', truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)

        tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
        attentions = outputs.attentions[-1].mean(dim=1).squeeze(0)  # shape: [seq_len, seq_len]
        # Column means: average attention each position receives from all positions.
        # NOTE(review): attention rows presumably sum to 1, so this value shrinks
        # as the text gets longer -- confirm the groups have comparable lengths.
        received = attentions.mean(dim=0)

        words = _group_wordpieces(tokens)
        for index, (word, positions) in enumerate(words):
            for stem in plain_stems:
                if word.startswith(stem):
                    stem_attention[stem].append(received[positions].mean().item())

            # Rebuild "left-right" compounds so hyphenated stems can match.
            has_compound = index + 2 < len(words) and words[index + 1][0] == "-"
            if hyphen_stems and has_compound:
                right_word, right_positions = words[index + 2]
                compound = word + "-" + right_word
                span = positions + words[index + 1][1] + right_positions
                for stem in hyphen_stems:
                    if compound.startswith(stem):
                        stem_attention[stem].append(received[span].mean().item())

    return stem_attention

# Compare the attention stems receive in descriptions of jobs where women make
# up at least half of the workforce ("high") with the remaining jobs ("low").
# Rows without a description are dropped first because the tokenizer only
# accepts strings. Rows with a missing women_proportion satisfy neither
# comparison and therefore end up in neither group.
df_described = df.dropna(subset=["description"])
high_texts = df_described[df_described["women_proportion"] >= 0.5]["description"].tolist()
low_texts = df_described[df_described["women_proportion"] < 0.5]["description"].tolist()

high_scores = extract_stem_attention(high_texts, stems)
low_scores = extract_stem_attention(low_texts, stems)

# Mean attention per stem; stems that never occurred in a group are left out.
high_avg = {stem: sum(scores) / len(scores) for stem, scores in high_scores.items() if len(scores) > 0}
low_avg = {stem: sum(scores) / len(scores) for stem, scores in low_scores.items() if len(scores) > 0}

# A stem that is absent from a group has no measured attention. Report NaN
# rather than 0 so a missing observation is not mistaken for "zero attention"
# and does not produce an invented difference.
missing = float("nan")
attention_diff = {
    stem: high_avg.get(stem, missing) - low_avg.get(stem, missing)
    for stem in stems
}

diff_columns = ["Stem", "High Avg Attention", "Low Avg Attention", "High - Low Attention Δ"]
diff_df = pd.DataFrame(
    [
        {
            "Stem": stem,
            "High Avg Attention": high_avg.get(stem, missing),
            "Low Avg Attention": low_avg.get(stem, missing),
            "High - Low Attention Δ": diff
        }
        for stem, diff in attention_diff.items()
    ],
    # Explicit columns keep the frame sortable even when `stems` is empty.
    columns=diff_columns,
)

# Most negative difference first; stems without a comparable pair go last.
diff_df = diff_df.sort_values(by="High - Low Attention Δ", ascending=True, na_position="last")

diff_df.to_csv("bert_attention_diff_sorted.csv", index=False, encoding="utf-8-sig")
