In [1]:
import warnings
warnings.filterwarnings(
    "ignore",
    message="`tokenizer` is deprecated and will be removed in version 5.0.0"
)

import transformers
transformers.utils.logging.set_verbosity_error()

In [2]:
%%bash
cd /root/science-society

START_MODE="uniform"

if [ "${START_MODE}" = "per_node" ]; then
  MODE_TAG="start=${START_MODE}__scheme=${PER_NODE_SCHEME}"
else
  MODE_TAG="start=${START_MODE}"
fi

CORPUS_DIR="./corpus/${MODE_TAG}"
MAP_DIR="./tokenizer/${MODE_TAG}"
mkdir -p "${CORPUS_DIR}" "${MAP_DIR}"

for Y in 1990 1991 1992 1993 1994 1995; do
  python ./scripts/make_walk_corpus_encoded.py \
    --pkl ./data/news/news_network_${Y}_m0.0-M1.0.pkl \
    --out "${CORPUS_DIR}/news_${Y}.txt" \
    --walk_length 20 --num_walks 20000 --seed 42 \
    --start_mode "${START_MODE}"\
    --mapping_json "${MAP_DIR}/news_mapping_${Y}.json"

  python ./scripts/make_walk_corpus_encoded.py \
    --pkl ./data/paper/paper_network_${Y}_m0.0-M0.5.pkl \
    --out "${CORPUS_DIR}/paper_${Y}.txt" \
    --walk_length 20 --num_walks 20000 --seed 42 \
    --start_mode "${START_MODE}" \
    --mapping_json "${MAP_DIR}/paper_mapping_${Y}.json"
done


[OK] start_mode=uniform wrote 20000 walks to ./corpus/start=uniform/news_1990.txt
[OK] start_mode=uniform wrote 20000 walks to ./corpus/start=uniform/paper_1990.txt
[OK] start_mode=uniform wrote 20000 walks to ./corpus/start=uniform/news_1991.txt
[OK] start_mode=uniform wrote 20000 walks to ./corpus/start=uniform/paper_1991.txt
[OK] start_mode=uniform wrote 20000 walks to ./corpus/start=uniform/news_1992.txt
[OK] start_mode=uniform wrote 20000 walks to ./corpus/start=uniform/paper_1992.txt
[OK] start_mode=uniform wrote 20000 walks to ./corpus/start=uniform/news_1993.txt
[OK] start_mode=uniform wrote 20000 walks to ./corpus/start=uniform/paper_1993.txt
[OK] start_mode=uniform wrote 20000 walks to ./corpus/start=uniform/news_1994.txt
[OK] start_mode=uniform wrote 20000 walks to ./corpus/start=uniform/paper_1994.txt
[OK] start_mode=uniform wrote 20000 walks to ./corpus/start=uniform/news_1995.txt
[OK] start_mode=uniform wrote 20000 walks to ./corpus/start=uniform/paper_1995.txt


In [3]:
%%bash
cd /root/science-society

START_MODE="uniform"

if [ "${START_MODE}" = "per_node" ]; then
  MODE_TAG="start=${START_MODE}__scheme=${PER_NODE_SCHEME}"
else
  MODE_TAG="start=${START_MODE}"
fi

CORPUS_DIR="./corpus/${MODE_TAG}"
DATASET_DIR="./datasets/${MODE_TAG}"
mkdir -p "${DATASET_DIR}"

python scripts/make_hf_dataset.py \
  --corpus_root "${CORPUS_DIR}" \
  --out_dir "${DATASET_DIR}" \
  --years 1990 1991 1992 1993 1994 1995 \
  --domains news paper

Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1151759.23 examples/s]


[OK] saved news_1990 -> ./datasets/start=uniform/news_1990 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1516872.45 examples/s]


[OK] saved paper_1990 -> ./datasets/start=uniform/paper_1990 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1513833.94 examples/s]


[OK] saved news_1991 -> ./datasets/start=uniform/news_1991 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1525922.80 examples/s]


[OK] saved paper_1991 -> ./datasets/start=uniform/paper_1991 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1577873.75 examples/s]


[OK] saved news_1992 -> ./datasets/start=uniform/news_1992 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1523096.81 examples/s]


[OK] saved paper_1992 -> ./datasets/start=uniform/paper_1992 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1574467.99 examples/s]


[OK] saved news_1993 -> ./datasets/start=uniform/news_1993 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1497350.73 examples/s]


[OK] saved paper_1993 -> ./datasets/start=uniform/paper_1993 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1545261.76 examples/s]


[OK] saved news_1994 -> ./datasets/start=uniform/news_1994 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1484761.94 examples/s]


[OK] saved paper_1994 -> ./datasets/start=uniform/paper_1994 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1491944.65 examples/s]


[OK] saved news_1995 -> ./datasets/start=uniform/news_1995 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1519565.25 examples/s]


[OK] saved paper_1995 -> ./datasets/start=uniform/paper_1995 (n=20000)


In [1]:
%%bash
cd /root/science-society

START_MODE="uniform"

if [ "${START_MODE}" = "per_node" ]; then
  MODE_TAG="start=${START_MODE}__scheme=${PER_NODE_SCHEME}"
else
  MODE_TAG="start=${START_MODE}"
fi

y=1990
DATASET_DIR="./datasets/${MODE_TAG}"
CKPT_DIR="./checkpoints/${MODE_TAG}"
mkdir -p "${CKPT_DIR}"

python scripts/train_mlm_preinit.py \
  --dataset_dir "${DATASET_DIR}/news_${y}" \
  --node_vocab_txt ./tokenizer/concept_vocab_encoded.txt \
  --output_dir "${CKPT_DIR}/news_${y}_seed" \
  --num_train_epochs 5 \
  --learning_rate 5e-5 \
  --mlm_prob 0.15 \
  --mask_replace_prob 0.9 \
  --random_replace_prob 0.1

python scripts/train_mlm_preinit.py \
  --dataset_dir "${DATASET_DIR}/paper_${y}" \
  --node_vocab_txt ./tokenizer/concept_vocab_encoded.txt \
  --output_dir "${CKPT_DIR}/paper_${y}_seed" \
  --num_train_epochs 5 \
  --learning_rate 5e-5 \
  --mlm_prob 0.15 \
  --mask_replace_prob 0.9 \
  --random_replace_prob 0.1

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[vocab] loaded node tokens: 22577
[tokenizer] added tokens: 22577 (new vocab size=50345)
[init] initialized added node token embeddings: 19823, fallback_to_UNK: 0
[mask] node_token_ids in tokenizer: 22577


100%|██████████| 3095/3095 [03:09<00:00, 16.37it/s]


{'loss': 7.5141, 'grad_norm': 8.922683715820312, 'learning_rate': 4.975936748023376e-05, 'epoch': 0.32}
{'loss': 6.3255, 'grad_norm': 7.7600884437561035, 'learning_rate': 4.6321760055001723e-05, 'epoch': 0.65}
{'loss': 6.0977, 'grad_norm': 7.388082027435303, 'learning_rate': 4.288415262976968e-05, 'epoch': 0.97}
{'loss': 5.9262, 'grad_norm': 7.509083271026611, 'learning_rate': 3.9446545204537646e-05, 'epoch': 1.29}
{'loss': 5.7389, 'grad_norm': 9.221393585205078, 'learning_rate': 3.600893777930561e-05, 'epoch': 1.62}
{'loss': 5.6237, 'grad_norm': 8.383805274963379, 'learning_rate': 3.257133035407357e-05, 'epoch': 1.94}
{'loss': 5.4937, 'grad_norm': 9.7520751953125, 'learning_rate': 2.913372292884153e-05, 'epoch': 2.26}
{'loss': 5.4093, 'grad_norm': 9.862661361694336, 'learning_rate': 2.569611550360949e-05, 'epoch': 2.58}
{'loss': 5.3022, 'grad_norm': 8.153643608093262, 'learning_rate': 2.225850807837745e-05, 'epoch': 2.91}
{'loss': 5.2436, 'grad_norm': 8.328288078308105, 'learning_rate

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[vocab] loaded node tokens: 22577
[tokenizer] added tokens: 22577 (new vocab size=50345)
[init] initialized added node token embeddings: 19823, fallback_to_UNK: 0
[mask] node_token_ids in tokenizer: 22577


100%|██████████| 3095/3095 [03:07<00:00, 16.53it/s]


{'loss': 8.6766, 'grad_norm': 10.301163673400879, 'learning_rate': 4.975936748023376e-05, 'epoch': 0.32}
{'loss': 7.6179, 'grad_norm': 8.883499145507812, 'learning_rate': 4.6321760055001723e-05, 'epoch': 0.65}
{'loss': 7.4292, 'grad_norm': 7.8002610206604, 'learning_rate': 4.288415262976968e-05, 'epoch': 0.97}
{'loss': 7.2233, 'grad_norm': 8.817094802856445, 'learning_rate': 3.9446545204537646e-05, 'epoch': 1.29}
{'loss': 7.1313, 'grad_norm': 8.841720581054688, 'learning_rate': 3.600893777930561e-05, 'epoch': 1.62}
{'loss': 7.07, 'grad_norm': 7.936330318450928, 'learning_rate': 3.257133035407357e-05, 'epoch': 1.94}
{'loss': 6.9678, 'grad_norm': 8.389918327331543, 'learning_rate': 2.913372292884153e-05, 'epoch': 2.26}
{'loss': 6.9288, 'grad_norm': 8.752935409545898, 'learning_rate': 2.569611550360949e-05, 'epoch': 2.58}
{'loss': 6.8726, 'grad_norm': 8.191651344299316, 'learning_rate': 2.225850807837745e-05, 'epoch': 2.91}
{'loss': 6.8351, 'grad_norm': 9.218463897705078, 'learning_rate':

In [2]:
%%bash
cd /root/science-society

START_MODE="uniform"

if [ "${START_MODE}" = "per_node" ]; then
  MODE_TAG="start=${START_MODE}__scheme=${PER_NODE_SCHEME}"
else
  MODE_TAG="start=${START_MODE}"
fi

init_y=1990
DATASET_DIR="./datasets/${MODE_TAG}"
CKPT_DIR="./checkpoints/${MODE_TAG}"
mkdir -p "${CKPT_DIR}"

d="news"
python scripts/train_annual_2stage_preinit.py \
  --init_dir "${CKPT_DIR}/${d}_${init_y}_seed" \
  --node_vocab_txt ./tokenizer/concept_vocab_encoded.txt \
  --years 1991 1992 1993 1994 1995 \
  --dataset_template "${DATASET_DIR}/${d}_{year}" \
  --output_base_dir "${CKPT_DIR}/${d}_annual" \
  --exact_k_masks 1 \
  --num_train_epochs 2 \
  --learning_rate 1e-5 \
  --mask_replace_prob 0.8 \
  --random_replace_prob 0.1

d="paper"
python scripts/train_annual_2stage_preinit.py \
  --init_dir "${CKPT_DIR}/${d}_${init_y}_seed" \
  --node_vocab_txt ./tokenizer/concept_vocab_encoded.txt \
  --years 1991 1992 1993 1994 1995 \
  --dataset_template "${DATASET_DIR}/${d}_{year}" \
  --output_base_dir "${CKPT_DIR}/${d}_annual" \
  --exact_k_masks 1 \
  --num_train_epochs 2 \
  --learning_rate 1e-5 \
  --mask_replace_prob 0.8 \
  --random_replace_prob 0.1

[plan]
  - 1991: ./datasets/start=uniform/news_1991
[vocab] node tokens: 22577

[year 1991] init_dir = ./checkpoints/start=uniform/news_1990_seed
[year 1991] dataset_dir = ./datasets/start=uniform/news_1991
[year 1991] node_token_ids in tokenizer: 22577


100%|██████████| 1238/1238 [01:17<00:00, 15.91it/s]


{'loss': 7.1425, 'grad_norm': 16.134113311767578, 'learning_rate': 8.92519346517627e-06, 'epoch': 0.32}
{'loss': 6.5777, 'grad_norm': 18.4716796875, 'learning_rate': 7.2055030094582974e-06, 'epoch': 0.65}
{'loss': 6.2562, 'grad_norm': 17.341468811035156, 'learning_rate': 5.485812553740327e-06, 'epoch': 0.97}
{'loss': 6.1633, 'grad_norm': 16.555570602416992, 'learning_rate': 3.7661220980223563e-06, 'epoch': 1.29}
{'loss': 5.9661, 'grad_norm': 17.260927200317383, 'learning_rate': 2.0464316423043853e-06, 'epoch': 1.62}
{'loss': 5.9699, 'grad_norm': 19.834794998168945, 'learning_rate': 3.2674118658641445e-07, 'epoch': 1.94}
{'train_runtime': 77.8009, 'train_samples_per_second': 508.992, 'train_steps_per_second': 15.912, 'train_loss': 6.334916837381045, 'epoch': 2.0}
[year 1991] saved: ./checkpoints/start=uniform/news_annual/1991

[done] all years finished.
[done] final checkpoint: ./checkpoints/start=uniform/news_annual/1991
[plan]
  - 1991: ./datasets/start=uniform/paper_1991
[vocab] node

100%|██████████| 1238/1238 [01:18<00:00, 15.87it/s]


{'loss': 7.7113, 'grad_norm': 14.740870475769043, 'learning_rate': 8.92519346517627e-06, 'epoch': 0.32}
{'loss': 7.6502, 'grad_norm': 16.468698501586914, 'learning_rate': 7.2055030094582974e-06, 'epoch': 0.65}
{'loss': 7.5727, 'grad_norm': 14.696656227111816, 'learning_rate': 5.485812553740327e-06, 'epoch': 0.97}
{'loss': 7.5372, 'grad_norm': 14.171396255493164, 'learning_rate': 3.7661220980223563e-06, 'epoch': 1.29}
{'loss': 7.5144, 'grad_norm': 16.50183868408203, 'learning_rate': 2.0464316423043853e-06, 'epoch': 1.62}
{'loss': 7.5124, 'grad_norm': 15.455684661865234, 'learning_rate': 3.2674118658641445e-07, 'epoch': 1.94}
{'train_runtime': 78.0063, 'train_samples_per_second': 507.651, 'train_steps_per_second': 15.871, 'train_loss': 7.578599871264136, 'epoch': 2.0}
[year 1991] saved: ./checkpoints/start=uniform/paper_annual/1991

[done] all years finished.
[done] final checkpoint: ./checkpoints/start=uniform/paper_annual/1991


In [3]:
%%bash
cd /root/science-society

START_MODE="uniform"

if [ "${START_MODE}" = "per_node" ]; then
  MODE_TAG="start=${START_MODE}__scheme=${PER_NODE_SCHEME}"
else
  MODE_TAG="start=${START_MODE}"
fi

CORPUS_DIR="./corpus/${MODE_TAG}"
CKPT_DIR="./checkpoints/${MODE_TAG}"
OUT_DIR="./outputs/${MODE_TAG}"
mkdir -p "${OUT_DIR}"

python scripts/predict_neighbors_by_distance.py \
  --years 1991 \
  --domains news paper \
  --ckpt_root "${CKPT_DIR}" \
  --corpus_root "${CORPUS_DIR}" \
  --node_vocab_txt ./tokenizer/concept_vocab_encoded.txt \
  --targets automation productivity health_care legislation \
  --distance 1 \
  --topk 30 \
  --max_contexts 1000 \
  --out_root "${OUT_DIR}/mask_pred" \

[INFO] loaded node vocab: 22577
[OK] saved ./outputs/start=uniform/mask_pred/dist=1/year=1991/target=automation.tsv
[OK] saved ./outputs/start=uniform/mask_pred/dist=1/year=1991/target=productivity.tsv
[OK] saved ./outputs/start=uniform/mask_pred/dist=1/year=1991/target=health_care.tsv
[OK] saved ./outputs/start=uniform/mask_pred/dist=1/year=1991/target=legislation.tsv


In [4]:
import numpy as np, pandas as pd

targets = [
    "automation", "productivity", "health_care", "legislation",
]

base = "/root/science-society/outputs/start=uniform/mask_pred/dist=1/year=1991"
for t in targets:
    path = f"{base}/target={t}.tsv"
    print(f"\n============================== TARGET: {t} ==============================")
    df = pd.read_csv(path, sep="\t")
    display(df.head(20))





Unnamed: 0,rank,news_neighbor,news_avg_prob,paper_neighbor,paper_avg_prob
0,1,dining_rooms,0.274881,bidding,0.083016
1,2,firearms,0.177718,nuclear_reactor,0.065119
2,3,appliances,0.146838,gasoline,0.042574
3,4,airlines_and_airplanes,0.123106,buddhism,0.033282
4,5,bureaucracy,0.114293,traumatic_stress,0.033275
5,6,golf,0.104868,rehabilitation,0.032782
6,7,reconciliation,0.097166,drilling,0.02842
7,8,poultry,0.088445,methane,0.027653
8,9,organizational_behavior,0.061421,censorship,0.026326
9,10,rubber,0.06118,antigen,0.025978





Unnamed: 0,rank,news_neighbor,news_avg_prob,paper_neighbor,paper_avg_prob
0,1,appropriations,0.287721,kannada,0.070551
1,2,senators,0.102447,lease,0.039882
2,3,influence,0.080878,peat,0.036954
3,4,petitions,0.070119,telugu,0.03458
4,5,environmental_cleanup,0.069194,doors,0.033539
5,6,mars,0.065319,collar,0.033268
6,7,planets,0.059461,bengali,0.030059
7,8,globalization,0.055566,database_transaction,0.029016
8,9,watches_and_clocks,0.05437,exhibition,0.028796
9,10,salmonella,0.052094,corrosion,0.027157





Unnamed: 0,rank,news_neighbor,news_avg_prob,paper_neighbor,paper_avg_prob
0,1,spelling,0.27793,saint,0.139366
1,2,acquisitions_&_mergers,0.214094,craft,0.109672
2,3,stock_exchanges,0.172921,fishing,0.063341
3,4,robots,0.073005,occupational_exposure,0.04946
4,5,data_processing_(computers),0.066685,teamwork,0.049407
5,6,stock_prices,0.061635,retail_banking,0.04922
6,7,legislatures,0.061521,early_childhood,0.041809
7,8,health_care,0.054959,primate,0.039752
8,9,politics,0.052042,self,0.029486
9,10,heart,0.051772,early_childhood_education,0.028903





Unnamed: 0,rank,news_neighbor,news_avg_prob,paper_neighbor,paper_avg_prob
0,1,mathematicians,0.35789,cracking,0.096354
1,2,nuclear_weapons,0.218887,enlightenment,0.095019
2,3,treasury_notes,0.100481,cancer,0.063249
3,4,sensors,0.079233,diesel_fuel,0.061703
4,5,bids,0.070528,swallowing,0.051184
5,6,treasury_bonds,0.06885,gasoline,0.049381
6,7,poultry,0.068645,petroleum,0.035806
7,8,recycling,0.063552,recession,0.03373
8,9,mathematics,0.063363,nuclear_power,0.027603
9,10,personality,0.057247,resource_allocation,0.027365


In [5]:
import numpy as np, pandas as pd

targets = [
    "automation", "productivity", "health_care", "legislation",
]

base = "/root/science-society/outputs/start=uniform/mask_pred/dist=1/year=1994"
for t in targets:
    path = f"{base}/target={t}.tsv"
    print(f"\n============================== TARGET: {t} ==============================")
    df = pd.read_csv(path, sep="\t")
    display(df.head(20))





FileNotFoundError: [Errno 2] No such file or directory: '/root/science-society/outputs/start=uniform/mask_pred/dist=1/year=1994/target=automation.tsv'