In [1]:
import warnings
warnings.filterwarnings(
    "ignore",
    message="`tokenizer` is deprecated and will be removed in version 5.0.0"
)

import transformers
transformers.utils.logging.set_verbosity_error()

In [2]:
%%bash
cd /root/science-society

START_MODE="uniform"

if [ "${START_MODE}" = "per_node" ]; then
  MODE_TAG="start=${START_MODE}__scheme=${PER_NODE_SCHEME}"
else
  MODE_TAG="start=${START_MODE}"
fi

CORPUS_DIR="./corpus/${MODE_TAG}"
MAP_DIR="./tokenizer/${MODE_TAG}"
mkdir -p "${CORPUS_DIR}" "${MAP_DIR}"

for Y in 1990 1991 1992 1993 1994 1995; do
  python ./scripts/make_walk_corpus_encoded.py \
    --pkl ./data/news/news_network_${Y}_m0.0-M1.0.pkl \
    --out "${CORPUS_DIR}/news_${Y}.txt" \
    --walk_length 20 --num_walks 20000 --seed 42 \
    --start_mode "${START_MODE}"\
    --mapping_json "${MAP_DIR}/news_mapping_${Y}.json"

  python ./scripts/make_walk_corpus_encoded.py \
    --pkl ./data/paper/paper_network_${Y}_m0.0-M0.5.pkl \
    --out "${CORPUS_DIR}/paper_${Y}.txt" \
    --walk_length 20 --num_walks 20000 --seed 42 \
    --start_mode "${START_MODE}" \
    --mapping_json "${MAP_DIR}/paper_mapping_${Y}.json"
done


[OK] start_mode=uniform wrote 20000 walks to ./corpus/start=uniform/news_1990.txt
[OK] start_mode=uniform wrote 20000 walks to ./corpus/start=uniform/paper_1990.txt
[OK] start_mode=uniform wrote 20000 walks to ./corpus/start=uniform/news_1991.txt
[OK] start_mode=uniform wrote 20000 walks to ./corpus/start=uniform/paper_1991.txt
[OK] start_mode=uniform wrote 20000 walks to ./corpus/start=uniform/news_1992.txt
[OK] start_mode=uniform wrote 20000 walks to ./corpus/start=uniform/paper_1992.txt
[OK] start_mode=uniform wrote 20000 walks to ./corpus/start=uniform/news_1993.txt
[OK] start_mode=uniform wrote 20000 walks to ./corpus/start=uniform/paper_1993.txt
[OK] start_mode=uniform wrote 20000 walks to ./corpus/start=uniform/news_1994.txt
[OK] start_mode=uniform wrote 20000 walks to ./corpus/start=uniform/paper_1994.txt
[OK] start_mode=uniform wrote 20000 walks to ./corpus/start=uniform/news_1995.txt
[OK] start_mode=uniform wrote 20000 walks to ./corpus/start=uniform/paper_1995.txt


In [3]:
%%bash
cd /root/science-society

START_MODE="uniform"

if [ "${START_MODE}" = "per_node" ]; then
  MODE_TAG="start=${START_MODE}__scheme=${PER_NODE_SCHEME}"
else
  MODE_TAG="start=${START_MODE}"
fi

CORPUS_DIR="./corpus/${MODE_TAG}"
DATASET_DIR="./datasets/${MODE_TAG}"
mkdir -p "${DATASET_DIR}"

python scripts/make_hf_dataset.py \
  --corpus_root "${CORPUS_DIR}" \
  --out_dir "${DATASET_DIR}" \
  --years 1990 1991 1992 1993 1994 1995 \
  --domains news paper

Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1151759.23 examples/s]


[OK] saved news_1990 -> ./datasets/start=uniform/news_1990 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1516872.45 examples/s]


[OK] saved paper_1990 -> ./datasets/start=uniform/paper_1990 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1513833.94 examples/s]


[OK] saved news_1991 -> ./datasets/start=uniform/news_1991 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1525922.80 examples/s]


[OK] saved paper_1991 -> ./datasets/start=uniform/paper_1991 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1577873.75 examples/s]


[OK] saved news_1992 -> ./datasets/start=uniform/news_1992 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1523096.81 examples/s]


[OK] saved paper_1992 -> ./datasets/start=uniform/paper_1992 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1574467.99 examples/s]


[OK] saved news_1993 -> ./datasets/start=uniform/news_1993 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1497350.73 examples/s]


[OK] saved paper_1993 -> ./datasets/start=uniform/paper_1993 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1545261.76 examples/s]


[OK] saved news_1994 -> ./datasets/start=uniform/news_1994 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1484761.94 examples/s]


[OK] saved paper_1994 -> ./datasets/start=uniform/paper_1994 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1491944.65 examples/s]


[OK] saved news_1995 -> ./datasets/start=uniform/news_1995 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1519565.25 examples/s]


[OK] saved paper_1995 -> ./datasets/start=uniform/paper_1995 (n=20000)


In [8]:
%%bash
cd /root/science-society

START_MODE="uniform"

if [ "${START_MODE}" = "per_node" ]; then
  MODE_TAG="start=${START_MODE}__scheme=${PER_NODE_SCHEME}"
else
  MODE_TAG="start=${START_MODE}"
fi

y=1990
DATASET_DIR="./datasets/${MODE_TAG}"
CKPT_DIR="./checkpoints/${MODE_TAG}"
mkdir -p "${CKPT_DIR}"

python scripts/train_mlm_preinit.py \
  --dataset_dir "${DATASET_DIR}/news_${y}" \
  --node_vocab_txt ./tokenizer/concept_vocab_encoded.txt \
  --output_dir "${CKPT_DIR}/news_${y}_seed" \
  --num_train_epochs 5 \
  --learning_rate 5e-5 \
  --mlm_prob 0.15

python scripts/train_mlm_preinit.py \
  --dataset_dir "${DATASET_DIR}/paper_${y}" \
  --node_vocab_txt ./tokenizer/concept_vocab_encoded.txt \
  --output_dir "${CKPT_DIR}/paper_${y}_seed" \
  --num_train_epochs 5 \
  --learning_rate 5e-5 \
  --mlm_prob 0.15

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[vocab] loaded node tokens: 22577
[tokenizer] added tokens: 22577 (new vocab size=50345)
[init] initialized added node token embeddings: 19823, fallback_to_UNK: 0
[mask] node_token_ids in tokenizer: 22577


100%|██████████| 3095/3095 [03:06<00:00, 16.55it/s]


{'loss': 7.4658, 'grad_norm': 9.799673080444336, 'learning_rate': 4.975936748023376e-05, 'epoch': 0.32}
{'loss': 6.2519, 'grad_norm': 7.620432376861572, 'learning_rate': 4.6321760055001723e-05, 'epoch': 0.65}
{'loss': 5.964, 'grad_norm': 8.510808944702148, 'learning_rate': 4.288415262976968e-05, 'epoch': 0.97}
{'loss': 5.8287, 'grad_norm': 7.5809760093688965, 'learning_rate': 3.9446545204537646e-05, 'epoch': 1.29}
{'loss': 5.6457, 'grad_norm': 8.092015266418457, 'learning_rate': 3.600893777930561e-05, 'epoch': 1.62}
{'loss': 5.5564, 'grad_norm': 8.533291816711426, 'learning_rate': 3.257133035407357e-05, 'epoch': 1.94}
{'loss': 5.418, 'grad_norm': 10.043675422668457, 'learning_rate': 2.913372292884153e-05, 'epoch': 2.26}
{'loss': 5.3086, 'grad_norm': 8.49070930480957, 'learning_rate': 2.569611550360949e-05, 'epoch': 2.58}
{'loss': 5.259, 'grad_norm': 8.21644401550293, 'learning_rate': 2.225850807837745e-05, 'epoch': 2.91}
{'loss': 5.1515, 'grad_norm': 9.598251342773438, 'learning_rate':

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[vocab] loaded node tokens: 22577
[tokenizer] added tokens: 22577 (new vocab size=50345)
[init] initialized added node token embeddings: 19823, fallback_to_UNK: 0
[mask] node_token_ids in tokenizer: 22577


100%|██████████| 3095/3095 [03:07<00:00, 16.48it/s]


{'loss': 8.5941, 'grad_norm': 9.487297058105469, 'learning_rate': 4.975936748023376e-05, 'epoch': 0.32}
{'loss': 7.517, 'grad_norm': 8.29940414428711, 'learning_rate': 4.6321760055001723e-05, 'epoch': 0.65}
{'loss': 7.2814, 'grad_norm': 9.577744483947754, 'learning_rate': 4.288415262976968e-05, 'epoch': 0.97}
{'loss': 7.1115, 'grad_norm': 7.894964218139648, 'learning_rate': 3.9446545204537646e-05, 'epoch': 1.29}
{'loss': 7.016, 'grad_norm': 9.48759651184082, 'learning_rate': 3.600893777930561e-05, 'epoch': 1.62}
{'loss': 6.9868, 'grad_norm': 8.025230407714844, 'learning_rate': 3.257133035407357e-05, 'epoch': 1.94}
{'loss': 6.8655, 'grad_norm': 9.795790672302246, 'learning_rate': 2.913372292884153e-05, 'epoch': 2.26}
{'loss': 6.7965, 'grad_norm': 8.805621147155762, 'learning_rate': 2.569611550360949e-05, 'epoch': 2.58}
{'loss': 6.7957, 'grad_norm': 7.913365364074707, 'learning_rate': 2.225850807837745e-05, 'epoch': 2.91}
{'loss': 6.6985, 'grad_norm': 8.156784057617188, 'learning_rate': 

In [9]:
%%bash
cd /root/science-society

START_MODE="uniform"

if [ "${START_MODE}" = "per_node" ]; then
  MODE_TAG="start=${START_MODE}__scheme=${PER_NODE_SCHEME}"
else
  MODE_TAG="start=${START_MODE}"
fi

init_y=1990
DATASET_DIR="./datasets/${MODE_TAG}"
CKPT_DIR="./checkpoints/${MODE_TAG}"
mkdir -p "${CKPT_DIR}"

d="news"
python scripts/train_annual_2stage_preinit.py \
  --init_dir "${CKPT_DIR}/${d}_${init_y}_seed" \
  --node_vocab_txt ./tokenizer/concept_vocab_encoded.txt \
  --years 1991 \
  --dataset_template "${DATASET_DIR}/${d}_{year}" \
  --output_base_dir "${CKPT_DIR}/${d}_annual" \
  --exact_k_masks 1 \
  --num_train_epochs 2 \
  --learning_rate 1e-5 \

d="paper"
python scripts/train_annual_2stage_preinit.py \
  --init_dir "${CKPT_DIR}/${d}_${init_y}_seed" \
  --node_vocab_txt ./tokenizer/concept_vocab_encoded.txt \
  --years 1991 \
  --dataset_template "${DATASET_DIR}/${d}_{year}" \
  --output_base_dir "${CKPT_DIR}/${d}_annual" \
  --exact_k_masks 1 \
  --num_train_epochs 2 \
  --learning_rate 1e-5

[plan]
  - 1991: ./datasets/start=uniform/news_1991
[vocab] node tokens: 22577

[year 1991] init_dir = ./checkpoints/start=uniform/news_1990_seed
[year 1991] dataset_dir = ./datasets/start=uniform/news_1991
[year 1991] node_token_ids in tokenizer: 22577


Map: 100%|██████████| 20000/20000 [00:00<00:00, 28260.43 examples/s]
100%|██████████| 1238/1238 [01:18<00:00, 15.71it/s]


{'loss': 6.6131, 'grad_norm': 15.475471496582031, 'learning_rate': 8.92519346517627e-06, 'epoch': 0.32}
{'loss': 5.9081, 'grad_norm': 15.723024368286133, 'learning_rate': 7.2055030094582974e-06, 'epoch': 0.65}
{'loss': 5.7181, 'grad_norm': 14.931378364562988, 'learning_rate': 5.485812553740327e-06, 'epoch': 0.97}
{'loss': 5.5019, 'grad_norm': 16.500917434692383, 'learning_rate': 3.7661220980223563e-06, 'epoch': 1.29}
{'loss': 5.4637, 'grad_norm': 16.86391830444336, 'learning_rate': 2.0464316423043853e-06, 'epoch': 1.62}
{'loss': 5.357, 'grad_norm': 19.0028018951416, 'learning_rate': 3.2674118658641445e-07, 'epoch': 1.94}
{'train_runtime': 78.8105, 'train_samples_per_second': 502.471, 'train_steps_per_second': 15.709, 'train_loss': 5.746613140445149, 'epoch': 2.0}
[year 1991] saved: ./checkpoints/start=uniform/news_annual/1991

[done] all years finished.
[done] final checkpoint: ./checkpoints/start=uniform/news_annual/1991
[plan]
  - 1991: ./datasets/start=uniform/paper_1991
[vocab] nod

Map: 100%|██████████| 20000/20000 [00:00<00:00, 28075.88 examples/s]
100%|██████████| 1238/1238 [01:18<00:00, 15.78it/s]


{'loss': 6.9059, 'grad_norm': 15.247794151306152, 'learning_rate': 8.92519346517627e-06, 'epoch': 0.32}
{'loss': 6.7895, 'grad_norm': 13.682376861572266, 'learning_rate': 7.2055030094582974e-06, 'epoch': 0.65}
{'loss': 6.7953, 'grad_norm': 14.370524406433105, 'learning_rate': 5.485812553740327e-06, 'epoch': 0.97}
{'loss': 6.7622, 'grad_norm': 13.888494491577148, 'learning_rate': 3.7661220980223563e-06, 'epoch': 1.29}
{'loss': 6.7942, 'grad_norm': 15.463550567626953, 'learning_rate': 2.0464316423043853e-06, 'epoch': 1.62}
{'loss': 6.7655, 'grad_norm': 13.951034545898438, 'learning_rate': 3.2674118658641445e-07, 'epoch': 1.94}
{'train_runtime': 78.4688, 'train_samples_per_second': 504.659, 'train_steps_per_second': 15.777, 'train_loss': 6.798774337152287, 'epoch': 2.0}
[year 1991] saved: ./checkpoints/start=uniform/paper_annual/1991

[done] all years finished.
[done] final checkpoint: ./checkpoints/start=uniform/paper_annual/1991


In [10]:
%%bash
cd /root/science-society

START_MODE="uniform"

if [ "${START_MODE}" = "per_node" ]; then
  MODE_TAG="start=${START_MODE}__scheme=${PER_NODE_SCHEME}"
else
  MODE_TAG="start=${START_MODE}"
fi

CORPUS_DIR="./corpus/${MODE_TAG}"
CKPT_DIR="./checkpoints/${MODE_TAG}"
OUT_DIR="./outputs/${MODE_TAG}"
mkdir -p "${OUT_DIR}"

python scripts/predict_neighbors_by_distance.py \
  --years 1991 \
  --domains news paper \
  --ckpt_root "${CKPT_DIR}" \
  --corpus_root "${CORPUS_DIR}" \
  --node_vocab_txt ./tokenizer/concept_vocab_encoded.txt \
  --targets automation productivity health_care legislation \
  --distance 1 \
  --topk 30 \
  --max_contexts 1000 \
  --out_root "${OUT_DIR}/mask_pred" \

[INFO] loaded node vocab: 22577
[OK] saved ./outputs/start=uniform/mask_pred/dist=1/year=1991/target=automation.tsv
[OK] saved ./outputs/start=uniform/mask_pred/dist=1/year=1991/target=productivity.tsv
[OK] saved ./outputs/start=uniform/mask_pred/dist=1/year=1991/target=health_care.tsv
[OK] saved ./outputs/start=uniform/mask_pred/dist=1/year=1991/target=legislation.tsv


In [11]:
import numpy as np, pandas as pd

targets = [
    "automation", "productivity", "health_care", "legislation",
]

base = "/root/science-society/outputs/start=uniform/mask_pred/dist=1/year=1991"
for t in targets:
    path = f"{base}/target={t}.tsv"
    print(f"\n============================== TARGET: {t} ==============================")
    df = pd.read_csv(path, sep="\t")
    display(df.head(20))





Unnamed: 0,rank,news_neighbor,news_avg_prob,paper_neighbor,paper_avg_prob
0,1,dining_rooms,0.389015,bidding,0.141014
1,2,bureaucracy,0.248747,thematic_map,0.080273
2,3,firearms,0.169783,attendance,0.062786
3,4,golf,0.153054,drilling,0.060991
4,5,puzzles,0.151183,nuclear_reactor,0.059015
5,6,golf_courses,0.114111,buddhism,0.04264
6,7,mathematics,0.105563,traffic_congestion,0.040231
7,8,influence,0.099648,hazard,0.039555
8,9,appliances,0.098346,accreditation,0.039422
9,10,deafness,0.093943,geometric_design,0.034473





Unnamed: 0,rank,news_neighbor,news_avg_prob,paper_neighbor,paper_avg_prob
0,1,appropriations,0.491235,early_childhood,0.073032
1,2,influence,0.127337,scalp,0.046143
2,3,watches_and_clocks,0.092129,lease,0.04206
3,4,vendors,0.086512,exhibition,0.041916
4,5,cameras,0.069361,network_packet,0.041766
5,6,natural_gas,0.063958,database_transaction,0.039842
6,7,technology_transfer,0.058605,greenhouse,0.037109
7,8,advertising_agencies,0.055034,active_listening,0.031579
8,9,tolls,0.054505,air_pollution,0.031273
9,10,conveyor_lines,0.052666,bearing_capacity,0.031188





Unnamed: 0,rank,news_neighbor,news_avg_prob,paper_neighbor,paper_avg_prob
0,1,building_materials,0.279601,fishing,0.087897
1,2,stock_exchanges,0.187968,palestine,0.069444
2,3,acquisitions_&_mergers,0.105882,self,0.06707
3,4,robots,0.081753,primate,0.053679
4,5,parents_&_parenting,0.06968,skepticism,0.052955
5,6,stock_prices,0.065067,font,0.052327
6,7,heart,0.060778,craft,0.047408
7,8,spelling,0.059364,retail_banking,0.038284
8,9,politics,0.056352,news_media,0.036774
9,10,chemicals,0.05579,journalism,0.034827





Unnamed: 0,rank,news_neighbor,news_avg_prob,paper_neighbor,paper_avg_prob
0,1,mathematicians,0.299448,enlightenment,0.110821
1,2,bids,0.264196,diesel_fuel,0.064273
2,3,heart,0.148599,cracking,0.03699
3,4,nuclear_weapons,0.13801,engineering_design_process,0.034086
4,5,corrections,0.129359,petroleum,0.028044
5,6,sensors,0.084653,national_security,0.027946
6,7,treasury,0.078682,nuclear_power,0.027069
7,8,tuition,0.07684,resource_allocation,0.026697
8,9,treasury_notes,0.063426,documentation,0.023119
9,10,weapons,0.057049,cancer,0.019358


In [12]:
%%bash
cd /root/science-society

python scripts/check_conditional_from_corpus.py \
  --corpus ./corpus/news_1991.txt \
  --targets automation productivity health_care legislation \
  --distance 1 \
  --topk 30

python scripts/check_conditional_from_corpus.py \
  --corpus ./corpus/paper_1991.txt \
  --targets automation productivity health_care legislation \
  --distance 1 \
  --topk 30


Traceback (most recent call last):
  File "/root/science-society/scripts/check_conditional_from_corpus.py", line 110, in <module>
    main()
  File "/root/science-society/scripts/check_conditional_from_corpus.py", line 63, in main
    for li, toks in enumerate(iter_corpus(args.corpus)):
  File "/root/science-society/scripts/check_conditional_from_corpus.py", line 7, in iter_corpus
    with open(path, encoding="utf-8") as f:
FileNotFoundError: [Errno 2] No such file or directory: './corpus/news_1991.txt'
Traceback (most recent call last):
  File "/root/science-society/scripts/check_conditional_from_corpus.py", line 110, in <module>
    main()
  File "/root/science-society/scripts/check_conditional_from_corpus.py", line 63, in main
    for li, toks in enumerate(iter_corpus(args.corpus)):
  File "/root/science-society/scripts/check_conditional_from_corpus.py", line 7, in iter_corpus
    with open(path, encoding="utf-8") as f:
FileNotFoundError: [Errno 2] No such file or directory: './corp

CalledProcessError: Command 'b'cd /root/science-society\n\npython scripts/check_conditional_from_corpus.py \\\n  --corpus ./corpus/news_1991.txt \\\n  --targets automation productivity health_care legislation \\\n  --distance 1 \\\n  --topk 30\n\npython scripts/check_conditional_from_corpus.py \\\n  --corpus ./corpus/paper_1991.txt \\\n  --targets automation productivity health_care legislation \\\n  --distance 1 \\\n  --topk 30\n'' returned non-zero exit status 1.