In [32]:
# Quiet down the notebook kernel: hide one known deprecation warning and
# restrict transformers' own logger to errors.
# NOTE(review): both settings are process-local, so scripts launched from
# %%bash cells (separate Python processes) are presumably unaffected - confirm
# whether they need the same treatment.
import warnings

# `message` is a regular expression matched against the start of the warning.
warnings.filterwarnings(
    action="ignore",
    message="`tokenizer` is deprecated and will be removed in version 5.0.0",
)

# Imported after the filter is installed, keeping the original statement order.
import transformers

transformers.logging.set_verbosity_error()

In [25]:
%%bash
# Build the walk corpora (one text file per domain and year) together with
# their mapping JSON files.
#
# Fail fast: without this, a failed `cd` or a failed run in the middle of the
# loop would be ignored and the cell would still look successful as long as
# the very last command exits with status 0.
set -euo pipefail
cd /root/science-society

START_MODE="per_node"
PER_NODE_SCHEME="degree"

# The per-node scheme only becomes part of the tag when START_MODE is "per_node".
if [ "${START_MODE}" = "per_node" ]; then
  MODE_TAG="start=${START_MODE}__scheme=${PER_NODE_SCHEME}"
else
  MODE_TAG="start=${START_MODE}"
fi

CORPUS_DIR="./corpus/${MODE_TAG}"
MAP_DIR="./tokenizer/${MODE_TAG}"
mkdir -p "${CORPUS_DIR}" "${MAP_DIR}"

# Each entry is "<domain>:<pickle filename suffix>"; the suffix differs per domain.
NETWORKS=("news:m0.0-M1.0" "paper:m0.0-M0.5")

for Y in 1990 1991 1992 1993 1994 1995; do
  for NETWORK in "${NETWORKS[@]}"; do
    DOMAIN="${NETWORK%%:*}"   # text before the first ':'
    SUFFIX="${NETWORK#*:}"    # text after the first ':'

    python ./scripts/make_walk_corpus_encoded.py \
      --pkl "./data/${DOMAIN}/${DOMAIN}_network_${Y}_${SUFFIX}.pkl" \
      --out "${CORPUS_DIR}/${DOMAIN}_${Y}.txt" \
      --walk_length 20 --num_walks 20000 --seed 42 \
      --start_mode "${START_MODE}" --per_node_scheme "${PER_NODE_SCHEME}" \
      --mapping_json "${MAP_DIR}/${DOMAIN}_mapping_${Y}.json"
  done
done


[OK] start_mode=per_node scheme=degree wrote 20000 walks to ./corpus/start=per_node__scheme=degree/news_1990.txt
[OK] start_mode=per_node scheme=degree wrote 20000 walks to ./corpus/start=per_node__scheme=degree/paper_1990.txt
[OK] start_mode=per_node scheme=degree wrote 20000 walks to ./corpus/start=per_node__scheme=degree/news_1991.txt
[OK] start_mode=per_node scheme=degree wrote 20000 walks to ./corpus/start=per_node__scheme=degree/paper_1991.txt
[OK] start_mode=per_node scheme=degree wrote 20000 walks to ./corpus/start=per_node__scheme=degree/news_1992.txt
[OK] start_mode=per_node scheme=degree wrote 20000 walks to ./corpus/start=per_node__scheme=degree/paper_1992.txt
[OK] start_mode=per_node scheme=degree wrote 20000 walks to ./corpus/start=per_node__scheme=degree/news_1993.txt
[OK] start_mode=per_node scheme=degree wrote 20000 walks to ./corpus/start=per_node__scheme=degree/paper_1993.txt
[OK] start_mode=per_node scheme=degree wrote 20000 walks to ./corpus/start=per_node__scheme=

In [26]:
%%bash
# Turn the walk corpora under CORPUS_DIR into datasets saved under DATASET_DIR,
# covering every listed year and domain in a single script invocation.
cd /root/science-society

START_MODE="per_node"
PER_NODE_SCHEME="degree"

# Start from the plain tag and append the scheme only for per-node starts.
MODE_TAG="start=${START_MODE}"
if [ "${START_MODE}" = "per_node" ]; then
  MODE_TAG="${MODE_TAG}__scheme=${PER_NODE_SCHEME}"
fi

CORPUS_DIR="./corpus/${MODE_TAG}"
DATASET_DIR="./datasets/${MODE_TAG}"
mkdir -p "${DATASET_DIR}"

YEARS=(1990 1991 1992 1993 1994 1995)
DOMAINS=(news paper)

python scripts/make_hf_dataset.py \
  --corpus_root "${CORPUS_DIR}" \
  --out_dir "${DATASET_DIR}" \
  --years "${YEARS[@]}" \
  --domains "${DOMAINS[@]}"

Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1598347.66 examples/s]


[OK] saved news_1990 -> ./datasets/start=per_node__scheme=degree/news_1990 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1586587.99 examples/s]


[OK] saved paper_1990 -> ./datasets/start=per_node__scheme=degree/paper_1990 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1649255.45 examples/s]


[OK] saved news_1991 -> ./datasets/start=per_node__scheme=degree/news_1991 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1588721.43 examples/s]


[OK] saved paper_1991 -> ./datasets/start=per_node__scheme=degree/paper_1991 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1641895.44 examples/s]


[OK] saved news_1992 -> ./datasets/start=per_node__scheme=degree/news_1992 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1510834.79 examples/s]


[OK] saved paper_1992 -> ./datasets/start=per_node__scheme=degree/paper_1992 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1664010.16 examples/s]


[OK] saved news_1993 -> ./datasets/start=per_node__scheme=degree/news_1993 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1521798.16 examples/s]


[OK] saved paper_1993 -> ./datasets/start=per_node__scheme=degree/paper_1993 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1637057.10 examples/s]


[OK] saved news_1994 -> ./datasets/start=per_node__scheme=degree/news_1994 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1480987.25 examples/s]


[OK] saved paper_1994 -> ./datasets/start=per_node__scheme=degree/paper_1994 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1531830.10 examples/s]


[OK] saved news_1995 -> ./datasets/start=per_node__scheme=degree/news_1995 (n=20000)


Saving the dataset (1/1 shards): 100%|██████████| 20000/20000 [00:00<00:00, 1487658.37 examples/s]


[OK] saved paper_1995 -> ./datasets/start=per_node__scheme=degree/paper_1995 (n=20000)


In [31]:
%%bash
# Train the initial ("seed") masked-LM checkpoint for each domain on the
# first year's dataset.
#
# Fail fast: previously a failed news run was ignored, because only the exit
# status of the last command (the paper run) reached the notebook.
set -euo pipefail
cd /root/science-society

START_MODE="per_node"
PER_NODE_SCHEME="degree"
# The per-node scheme only becomes part of the tag when START_MODE is "per_node".
if [ "${START_MODE}" = "per_node" ]; then
  MODE_TAG="start=${START_MODE}__scheme=${PER_NODE_SCHEME}"
else
  MODE_TAG="start=${START_MODE}"
fi

y=1990
DATASET_DIR="./datasets/${MODE_TAG}"
CKPT_DIR="./checkpoints/${MODE_TAG}"
mkdir -p "${CKPT_DIR}"

# Same hyperparameters for both domains; only the dataset and output paths differ.
for d in news paper; do
  python scripts/train_mlm_preinit.py \
    --dataset_dir "${DATASET_DIR}/${d}_${y}" \
    --node_vocab_txt ./tokenizer/concept_vocab_encoded.txt \
    --output_dir "${CKPT_DIR}/${d}_${y}_seed" \
    --num_train_epochs 5 \
    --learning_rate 5e-5 \
    --mlm_prob 0.15
done

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[vocab] loaded node tokens: 22577
[tokenizer] added tokens: 22577 (new vocab size=50345)
[init] initialized added node token embeddings: 19823, fallback_to_UNK: 0


Map: 100%|██████████| 20000/20000 [00:00<00:00, 27320.66 examples/s]


[mask] node_token_ids in tokenizer: 22577


100%|██████████| 3095/3095 [03:04<00:00, 16.75it/s]


{'loss': 7.9307, 'grad_norm': 8.476469039916992, 'learning_rate': 4.975936748023376e-05, 'epoch': 0.32}
{'loss': 6.8197, 'grad_norm': 6.869620323181152, 'learning_rate': 4.6321760055001723e-05, 'epoch': 0.65}
{'loss': 6.6206, 'grad_norm': 8.264772415161133, 'learning_rate': 4.288415262976968e-05, 'epoch': 0.97}
{'loss': 6.4218, 'grad_norm': 7.83885383605957, 'learning_rate': 3.9446545204537646e-05, 'epoch': 1.29}
{'loss': 6.2753, 'grad_norm': 8.379073143005371, 'learning_rate': 3.600893777930561e-05, 'epoch': 1.62}
{'loss': 6.1651, 'grad_norm': 8.86072063446045, 'learning_rate': 3.257133035407357e-05, 'epoch': 1.94}
{'loss': 6.0564, 'grad_norm': 9.469405174255371, 'learning_rate': 2.913372292884153e-05, 'epoch': 2.26}
{'loss': 5.9395, 'grad_norm': 9.132473945617676, 'learning_rate': 2.569611550360949e-05, 'epoch': 2.58}
{'loss': 5.8618, 'grad_norm': 8.650469779968262, 'learning_rate': 2.225850807837745e-05, 'epoch': 2.91}
{'loss': 5.7618, 'grad_norm': 10.042160034179688, 'learning_rate

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[vocab] loaded node tokens: 22577
[tokenizer] added tokens: 22577 (new vocab size=50345)
[init] initialized added node token embeddings: 19823, fallback_to_UNK: 0


Map: 100%|██████████| 20000/20000 [00:00<00:00, 30231.97 examples/s]


[mask] node_token_ids in tokenizer: 22577


100%|██████████| 3095/3095 [03:04<00:00, 16.76it/s]


{'loss': 9.1048, 'grad_norm': 8.945556640625, 'learning_rate': 4.975936748023376e-05, 'epoch': 0.32}
{'loss': 8.1501, 'grad_norm': 7.651319980621338, 'learning_rate': 4.6321760055001723e-05, 'epoch': 0.65}
{'loss': 7.9542, 'grad_norm': 7.676267623901367, 'learning_rate': 4.288415262976968e-05, 'epoch': 0.97}
{'loss': 7.7789, 'grad_norm': 7.2110981941223145, 'learning_rate': 3.9446545204537646e-05, 'epoch': 1.29}
{'loss': 7.7018, 'grad_norm': 9.12881088256836, 'learning_rate': 3.600893777930561e-05, 'epoch': 1.62}
{'loss': 7.6291, 'grad_norm': 8.238570213317871, 'learning_rate': 3.257133035407357e-05, 'epoch': 1.94}
{'loss': 7.5294, 'grad_norm': 11.218135833740234, 'learning_rate': 2.913372292884153e-05, 'epoch': 2.26}
{'loss': 7.4944, 'grad_norm': 9.016424179077148, 'learning_rate': 2.569611550360949e-05, 'epoch': 2.58}
{'loss': 7.439, 'grad_norm': 7.707988262176514, 'learning_rate': 2.225850807837745e-05, 'epoch': 2.91}
{'loss': 7.3573, 'grad_norm': 9.342020988464355, 'learning_rate':

In [41]:
%%bash
# Continue training year by year for each domain, starting from that domain's
# seed checkpoint for init_y.
#
# Fail fast: previously a failed news run was ignored, because only the exit
# status of the last command (the paper run) reached the notebook.
set -euo pipefail
cd /root/science-society

START_MODE="per_node"
PER_NODE_SCHEME="degree"
# The per-node scheme only becomes part of the tag when START_MODE is "per_node".
if [ "${START_MODE}" = "per_node" ]; then
  MODE_TAG="start=${START_MODE}__scheme=${PER_NODE_SCHEME}"
else
  MODE_TAG="start=${START_MODE}"
fi

init_y=1990
DATASET_DIR="./datasets/${MODE_TAG}"
CKPT_DIR="./checkpoints/${MODE_TAG}"
mkdir -p "${CKPT_DIR}"

# One invocation per domain. The last argument deliberately has no trailing
# backslash: the old news command ended in a dangling continuation that only
# worked because the following line happened to be blank.
for d in news paper; do
  # "{year}" is passed through literally; the script fills it in per year.
  python scripts/train_annual_2stage_preinit.py \
    --init_dir "${CKPT_DIR}/${d}_${init_y}_seed" \
    --node_vocab_txt ./tokenizer/concept_vocab_encoded.txt \
    --years 1991 1992 1993 1994 \
    --dataset_template "${DATASET_DIR}/${d}_{year}" \
    --output_base_dir "${CKPT_DIR}/${d}_annual" \
    --exact_k_masks 1 \
    --num_train_epochs 2 \
    --learning_rate 1e-5
done

[plan]
  - 1991: ./datasets/start=per_node__scheme=degree/news_1991
  - 1992: ./datasets/start=per_node__scheme=degree/news_1992
  - 1993: ./datasets/start=per_node__scheme=degree/news_1993
  - 1994: ./datasets/start=per_node__scheme=degree/news_1994
[vocab] node tokens: 22577

[year 1991] init_dir = ./checkpoints/start=per_node__scheme=degree/news_1990_seed
[year 1991] dataset_dir = ./datasets/start=per_node__scheme=degree/news_1991
[year 1991] node_token_ids in tokenizer: 22577


100%|██████████| 1238/1238 [02:24<00:00,  8.56it/s]


{'loss': 7.1423, 'grad_norm': 15.847662925720215, 'learning_rate': 8.92519346517627e-06, 'epoch': 0.32}
{'loss': 6.5609, 'grad_norm': 14.771403312683105, 'learning_rate': 7.2055030094582974e-06, 'epoch': 0.65}
{'loss': 6.3241, 'grad_norm': 14.427084922790527, 'learning_rate': 5.485812553740327e-06, 'epoch': 0.97}
{'loss': 6.1209, 'grad_norm': 15.409149169921875, 'learning_rate': 3.7661220980223563e-06, 'epoch': 1.29}
{'loss': 6.0364, 'grad_norm': 15.750102996826172, 'learning_rate': 2.0464316423043853e-06, 'epoch': 1.62}
{'loss': 6.0232, 'grad_norm': 16.35198974609375, 'learning_rate': 3.2674118658641445e-07, 'epoch': 1.94}
{'train_runtime': 144.6702, 'train_samples_per_second': 273.726, 'train_steps_per_second': 8.557, 'train_loss': 6.355736592282001, 'epoch': 2.0}
[year 1991] saved: ./checkpoints/start=per_node__scheme=degree/news_annual/1991

[year 1992] init_dir = ./checkpoints/start=per_node__scheme=degree/news_annual/1991
[year 1992] dataset_dir = ./datasets/start=per_node__schem

Map: 100%|██████████| 20000/20000 [00:00<00:00, 31618.89 examples/s]
100%|██████████| 1238/1238 [02:11<00:00,  9.39it/s]


{'loss': 7.1675, 'grad_norm': 11.60882568359375, 'learning_rate': 8.92519346517627e-06, 'epoch': 0.32}
{'loss': 6.8658, 'grad_norm': 12.282975196838379, 'learning_rate': 7.2055030094582974e-06, 'epoch': 0.65}
{'loss': 6.6841, 'grad_norm': 13.733546257019043, 'learning_rate': 5.485812553740327e-06, 'epoch': 0.97}
{'loss': 6.5773, 'grad_norm': 12.897802352905273, 'learning_rate': 3.7661220980223563e-06, 'epoch': 1.29}
{'loss': 6.5494, 'grad_norm': 12.483098030090332, 'learning_rate': 2.0464316423043853e-06, 'epoch': 1.62}
{'loss': 6.499, 'grad_norm': 14.429282188415527, 'learning_rate': 3.2674118658641445e-07, 'epoch': 1.94}
{'train_runtime': 131.7898, 'train_samples_per_second': 300.479, 'train_steps_per_second': 9.394, 'train_loss': 6.718625920654691, 'epoch': 2.0}
[year 1992] saved: ./checkpoints/start=per_node__scheme=degree/news_annual/1992

[year 1993] init_dir = ./checkpoints/start=per_node__scheme=degree/news_annual/1992
[year 1993] dataset_dir = ./datasets/start=per_node__scheme

Map: 100%|██████████| 20000/20000 [00:00<00:00, 28827.22 examples/s]
100%|██████████| 1238/1238 [02:24<00:00,  8.58it/s]


{'loss': 6.9908, 'grad_norm': 13.227694511413574, 'learning_rate': 8.92519346517627e-06, 'epoch': 0.32}
{'loss': 6.7722, 'grad_norm': 14.232114791870117, 'learning_rate': 7.2055030094582974e-06, 'epoch': 0.65}
{'loss': 6.6868, 'grad_norm': 12.257463455200195, 'learning_rate': 5.485812553740327e-06, 'epoch': 0.97}
{'loss': 6.5788, 'grad_norm': 12.413376808166504, 'learning_rate': 3.7661220980223563e-06, 'epoch': 1.29}
{'loss': 6.5024, 'grad_norm': 13.275470733642578, 'learning_rate': 2.0464316423043853e-06, 'epoch': 1.62}
{'loss': 6.5039, 'grad_norm': 13.886439323425293, 'learning_rate': 3.2674118658641445e-07, 'epoch': 1.94}
{'train_runtime': 144.2377, 'train_samples_per_second': 274.547, 'train_steps_per_second': 8.583, 'train_loss': 6.666881173030625, 'epoch': 2.0}
[year 1993] saved: ./checkpoints/start=per_node__scheme=degree/news_annual/1993

[year 1994] init_dir = ./checkpoints/start=per_node__scheme=degree/news_annual/1993
[year 1994] dataset_dir = ./datasets/start=per_node__sche

Map: 100%|██████████| 20000/20000 [00:00<00:00, 26730.69 examples/s]
100%|██████████| 1238/1238 [02:10<00:00,  9.49it/s]


{'loss': 7.0321, 'grad_norm': 12.06840991973877, 'learning_rate': 8.92519346517627e-06, 'epoch': 0.32}
{'loss': 6.878, 'grad_norm': 11.390673637390137, 'learning_rate': 7.2055030094582974e-06, 'epoch': 0.65}
{'loss': 6.7715, 'grad_norm': 13.0507173538208, 'learning_rate': 5.485812553740327e-06, 'epoch': 0.97}
{'loss': 6.688, 'grad_norm': 13.103570938110352, 'learning_rate': 3.7661220980223563e-06, 'epoch': 1.29}
{'loss': 6.5971, 'grad_norm': 12.631701469421387, 'learning_rate': 2.0464316423043853e-06, 'epoch': 1.62}
{'loss': 6.5846, 'grad_norm': 13.459339141845703, 'learning_rate': 3.2674118658641445e-07, 'epoch': 1.94}
{'train_runtime': 130.4332, 'train_samples_per_second': 303.604, 'train_steps_per_second': 9.491, 'train_loss': 6.752154126883709, 'epoch': 2.0}
[year 1994] saved: ./checkpoints/start=per_node__scheme=degree/news_annual/1994

[done] all years finished.
[done] final checkpoint: ./checkpoints/start=per_node__scheme=degree/news_annual/1994
[plan]
  - 1991: ./datasets/start

100%|██████████| 1238/1238 [02:13<00:00,  9.26it/s]


{'loss': 7.5964, 'grad_norm': 15.339761734008789, 'learning_rate': 8.92519346517627e-06, 'epoch': 0.32}
{'loss': 7.4909, 'grad_norm': 14.729979515075684, 'learning_rate': 7.2055030094582974e-06, 'epoch': 0.65}
{'loss': 7.5198, 'grad_norm': 14.912036895751953, 'learning_rate': 5.485812553740327e-06, 'epoch': 0.97}
{'loss': 7.4521, 'grad_norm': 17.014968872070312, 'learning_rate': 3.7661220980223563e-06, 'epoch': 1.29}
{'loss': 7.4064, 'grad_norm': 16.822351455688477, 'learning_rate': 2.0464316423043853e-06, 'epoch': 1.62}
{'loss': 7.4745, 'grad_norm': 16.453340530395508, 'learning_rate': 3.2674118658641445e-07, 'epoch': 1.94}
{'train_runtime': 133.6495, 'train_samples_per_second': 296.297, 'train_steps_per_second': 9.263, 'train_loss': 7.485237158558095, 'epoch': 2.0}
[year 1991] saved: ./checkpoints/start=per_node__scheme=degree/paper_annual/1991

[year 1992] init_dir = ./checkpoints/start=per_node__scheme=degree/paper_annual/1991
[year 1992] dataset_dir = ./datasets/start=per_node__sc

Map: 100%|██████████| 20000/20000 [00:00<00:00, 31430.77 examples/s]
100%|██████████| 1238/1238 [02:12<00:00,  9.35it/s]


{'loss': 7.5887, 'grad_norm': 13.813695907592773, 'learning_rate': 8.92519346517627e-06, 'epoch': 0.32}
{'loss': 7.517, 'grad_norm': 15.242385864257812, 'learning_rate': 7.2055030094582974e-06, 'epoch': 0.65}
{'loss': 7.501, 'grad_norm': 17.38028335571289, 'learning_rate': 5.485812553740327e-06, 'epoch': 0.97}
{'loss': 7.4708, 'grad_norm': 15.156035423278809, 'learning_rate': 3.7661220980223563e-06, 'epoch': 1.29}
{'loss': 7.454, 'grad_norm': 14.998969078063965, 'learning_rate': 2.0464316423043853e-06, 'epoch': 1.62}
{'loss': 7.4685, 'grad_norm': 16.39737319946289, 'learning_rate': 3.2674118658641445e-07, 'epoch': 1.94}
{'train_runtime': 132.4396, 'train_samples_per_second': 299.004, 'train_steps_per_second': 9.348, 'train_loss': 7.497270353006045, 'epoch': 2.0}
[year 1992] saved: ./checkpoints/start=per_node__scheme=degree/paper_annual/1992

[year 1993] init_dir = ./checkpoints/start=per_node__scheme=degree/paper_annual/1992
[year 1993] dataset_dir = ./datasets/start=per_node__scheme=

Map: 100%|██████████| 20000/20000 [00:00<00:00, 24388.91 examples/s]
100%|██████████| 1238/1238 [01:26<00:00, 14.31it/s]


{'loss': 7.5388, 'grad_norm': 14.742157936096191, 'learning_rate': 8.92519346517627e-06, 'epoch': 0.32}
{'loss': 7.5076, 'grad_norm': 14.776463508605957, 'learning_rate': 7.2055030094582974e-06, 'epoch': 0.65}
{'loss': 7.5252, 'grad_norm': 15.644247055053711, 'learning_rate': 5.485812553740327e-06, 'epoch': 0.97}
{'loss': 7.4702, 'grad_norm': 15.093833923339844, 'learning_rate': 3.7661220980223563e-06, 'epoch': 1.29}
{'loss': 7.473, 'grad_norm': 14.166508674621582, 'learning_rate': 2.0464316423043853e-06, 'epoch': 1.62}
{'loss': 7.469, 'grad_norm': 15.781371116638184, 'learning_rate': 3.2674118658641445e-07, 'epoch': 1.94}
{'train_runtime': 86.5005, 'train_samples_per_second': 457.801, 'train_steps_per_second': 14.312, 'train_loss': 7.494220764455965, 'epoch': 2.0}
[year 1993] saved: ./checkpoints/start=per_node__scheme=degree/paper_annual/1993

[year 1994] init_dir = ./checkpoints/start=per_node__scheme=degree/paper_annual/1993
[year 1994] dataset_dir = ./datasets/start=per_node__sche

Map: 100%|██████████| 20000/20000 [00:00<00:00, 28696.08 examples/s]
100%|██████████| 1238/1238 [01:16<00:00, 16.24it/s]


{'loss': 7.5824, 'grad_norm': 14.554081916809082, 'learning_rate': 8.92519346517627e-06, 'epoch': 0.32}
{'loss': 7.5948, 'grad_norm': 13.38991928100586, 'learning_rate': 7.2055030094582974e-06, 'epoch': 0.65}
{'loss': 7.5493, 'grad_norm': 15.710368156433105, 'learning_rate': 5.485812553740327e-06, 'epoch': 0.97}
{'loss': 7.5371, 'grad_norm': 14.368940353393555, 'learning_rate': 3.7661220980223563e-06, 'epoch': 1.29}
{'loss': 7.5192, 'grad_norm': 13.160619735717773, 'learning_rate': 2.0464316423043853e-06, 'epoch': 1.62}
{'loss': 7.4537, 'grad_norm': 18.83925437927246, 'learning_rate': 3.2674118658641445e-07, 'epoch': 1.94}
{'train_runtime': 76.24, 'train_samples_per_second': 519.412, 'train_steps_per_second': 16.238, 'train_loss': 7.537785716511321, 'epoch': 2.0}
[year 1994] saved: ./checkpoints/start=per_node__scheme=degree/paper_annual/1994

[done] all years finished.
[done] final checkpoint: ./checkpoints/start=per_node__scheme=degree/paper_annual/1994


In [42]:
%%bash
# Run the neighbor-prediction script for the selected target concepts and
# write one TSV per year and target under OUT_DIR/mask_pred.
#
# Fail fast so a failed `cd` cannot make the script run from the wrong directory.
set -euo pipefail
cd /root/science-society

START_MODE="per_node"
PER_NODE_SCHEME="degree"
# The per-node scheme only becomes part of the tag when START_MODE is "per_node".
if [ "${START_MODE}" = "per_node" ]; then
  MODE_TAG="start=${START_MODE}__scheme=${PER_NODE_SCHEME}"
else
  MODE_TAG="start=${START_MODE}"
fi

CORPUS_DIR="./corpus/${MODE_TAG}"
CKPT_DIR="./checkpoints/${MODE_TAG}"
OUT_DIR="./outputs/${MODE_TAG}"
mkdir -p "${OUT_DIR}"

# The final argument has no trailing backslash: the previous dangling
# continuation would have swallowed any command appended after it.
python scripts/predict_neighbors_by_distance.py \
  --years 1991 1992 1993 1994 \
  --domains news paper \
  --ckpt_root "${CKPT_DIR}" \
  --corpus_root "${CORPUS_DIR}" \
  --node_vocab_txt ./tokenizer/concept_vocab_encoded.txt \
  --targets automation productivity health_care legislation \
  --distance 1 \
  --topk 30 \
  --max_contexts 1000 \
  --out_root "${OUT_DIR}/mask_pred"

[INFO] loaded node vocab: 22577
[OK] saved ./outputs/start=per_node__scheme=degree/mask_pred/dist=1/year=1991/target=automation.tsv
[OK] saved ./outputs/start=per_node__scheme=degree/mask_pred/dist=1/year=1991/target=productivity.tsv
[OK] saved ./outputs/start=per_node__scheme=degree/mask_pred/dist=1/year=1991/target=health_care.tsv
[OK] saved ./outputs/start=per_node__scheme=degree/mask_pred/dist=1/year=1991/target=legislation.tsv
[OK] saved ./outputs/start=per_node__scheme=degree/mask_pred/dist=1/year=1992/target=automation.tsv
[OK] saved ./outputs/start=per_node__scheme=degree/mask_pred/dist=1/year=1992/target=productivity.tsv
[OK] saved ./outputs/start=per_node__scheme=degree/mask_pred/dist=1/year=1992/target=health_care.tsv
[OK] saved ./outputs/start=per_node__scheme=degree/mask_pred/dist=1/year=1992/target=legislation.tsv
[OK] saved ./outputs/start=per_node__scheme=degree/mask_pred/dist=1/year=1993/target=automation.tsv
[OK] saved ./outputs/start=per_node__scheme=degree/mask_pred

In [43]:
import numpy as np
import pandas as pd

# Target concepts whose saved prediction tables are previewed below.
targets = ["automation", "productivity", "health_care", "legislation"]

# Directory holding the distance-1 prediction tables for year 1991.
base = "/root/science-society/outputs/start=per_node__scheme=degree/mask_pred/dist=1/year=1991"

for t in targets:
    # Banner first, so every rendered table is labelled with its target.
    print(f"\n============================== TARGET: {t} ==============================")
    path = f"{base}/target={t}.tsv"
    df = pd.read_csv(path, sep="\t")
    # Show at most the first 20 rows of the tab-separated table.
    display(df.head(20))





Unnamed: 0,rank,news_neighbor,news_avg_prob,paper_neighbor,paper_avg_prob
0,1,poultry,0.386409,video_production,0.157205
1,2,golf,0.377192,downtown,0.081776
2,3,warships,0.245459,drilling,0.073364
3,4,tax_incentives,0.150103,cost_estimate,0.046444
4,5,speech,0.117597,queue,0.042701
5,6,public_hearings,0.112206,space_exploration,0.038435
6,7,renminbi,0.105525,library_catalog,0.038322
7,8,golf_courses,0.081944,video_editing,0.034282
8,9,railroads,0.076622,camouflage,0.034101
9,10,complaints,0.068055,critical_thinking,0.030201





Unnamed: 0,rank,news_neighbor,news_avg_prob,paper_neighbor,paper_avg_prob
0,1,clergy,0.275154,recreation,0.243869
1,2,health_care,0.100561,food_products,0.088015
2,3,bonds,0.087122,real_estate,0.063439
3,4,demographics,0.084714,food_packaging,0.049514
4,5,indictments,0.070439,property_rights,0.038939
5,6,air_pollution,0.066885,marketing_mix,0.038795
6,7,trucks,0.065691,polymer,0.038467
7,8,globalization,0.061236,estate,0.037392
8,9,subsidies,0.052057,humidity,0.037056
9,10,influence,0.047745,radioactive_waste,0.033142





Unnamed: 0,rank,news_neighbor,news_avg_prob,paper_neighbor,paper_avg_prob
0,1,motion_pictures,0.069192,lease,0.103274
1,2,heart,0.06361,acute_pain,0.089652
2,3,acquisitions_&_mergers,0.060948,right_hemisphere,0.067478
3,4,families_&_family_life,0.050937,supreme_court,0.062931
4,5,books,0.048548,nosology,0.06226
5,6,stock_exchanges,0.045253,teamwork,0.051941
6,7,health_care,0.044998,outbreak,0.048319
7,8,medical_technology,0.04102,management_accounting,0.047588
8,9,robots,0.040828,manufacturing,0.043547
9,10,international_trade,0.040127,national_accounts,0.037417





Unnamed: 0,rank,news_neighbor,news_avg_prob,paper_neighbor,paper_avg_prob
0,1,accreditation,0.177031,nuclear_weapon,0.264267
1,2,exports,0.152879,ozone,0.082742
2,3,grants,0.086077,regression,0.082009
3,4,professionals,0.078219,foraging,0.073079
4,5,algorithms,0.076359,urban_planning,0.064802
5,6,landfill,0.072879,nuclear_material,0.054423
6,7,poultry,0.067716,transportation_planning,0.05385
7,8,recycling,0.062029,resistance_(ecology),0.032839
8,9,reorganization,0.060299,drama,0.032238
9,10,shipments,0.046145,addiction,0.031689


In [44]:
import numpy as np
import pandas as pd

# Target concepts whose saved prediction tables are previewed below.
targets = ["automation", "productivity", "health_care", "legislation"]

# Directory holding the distance-1 prediction tables for year 1994.
base = "/root/science-society/outputs/start=per_node__scheme=degree/mask_pred/dist=1/year=1994"

for t in targets:
    # Banner first, so every rendered table is labelled with its target.
    print(f"\n============================== TARGET: {t} ==============================")
    path = f"{base}/target={t}.tsv"
    df = pd.read_csv(path, sep="\t")
    # Show at most the first 20 rows of the tab-separated table.
    display(df.head(20))





Unnamed: 0,rank,news_neighbor,news_avg_prob,paper_neighbor,paper_avg_prob
0,1,silver,0.379643,daylight,0.148422
1,2,fish,0.16755,drilling,0.074541
2,3,timber,0.137276,hazard,0.064758
3,4,sanctions,0.093883,mill,0.04952
4,5,united_states_economy,0.087518,infantry,0.043443
5,6,corn,0.063737,drill,0.03298
6,7,candidates,0.05352,video_camera,0.031461
7,8,political_advertising,0.049332,hierarchy,0.029253
8,9,mining,0.046791,graphical_display,0.028804
9,10,society,0.045992,runway,0.028672





Unnamed: 0,rank,news_neighbor,news_avg_prob,paper_neighbor,paper_avg_prob
0,1,fish,0.116435,longitudinal_study,0.088931
1,2,tobacco,0.105407,foraging,0.063066
2,3,psychologists,0.053729,thermal_comfort,0.035309
3,4,beer,0.051262,shoot,0.035083
4,5,smoking,0.048254,metadata,0.034167
5,6,municipal_bonds,0.044027,taste,0.032196
6,7,laser_discs,0.043616,determinism,0.031673
7,8,bonds,0.042641,nuclear_reactor,0.027797
8,9,satellites,0.040279,christian_ministry,0.027512
9,10,travel,0.039783,thermal,0.027231





Unnamed: 0,rank,news_neighbor,news_avg_prob,paper_neighbor,paper_avg_prob
0,1,fish,0.192984,blindness,0.181522
1,2,state_elections,0.125496,autopsy,0.109895
2,3,candidates,0.065064,ultrasonography,0.061465
3,4,tax_increases,0.059185,standardized_test,0.059703
4,5,politics,0.055014,lumbar,0.054498
5,6,elections,0.042189,space_shuttle,0.044384
6,7,expansion,0.038477,club,0.043305
7,8,training,0.030406,commit,0.036683
8,9,surgery,0.029247,research_ethics,0.034512
9,10,grants,0.028958,tomography,0.033743





Unnamed: 0,rank,news_neighbor,news_avg_prob,paper_neighbor,paper_avg_prob
0,1,cooperation,0.06373,wheelchair,0.76084
1,2,budgets,0.047247,undo,0.391474
2,3,computer_programming,0.041471,breathing,0.100009
3,4,credit,0.038547,motor_control,0.04211
4,5,engineers,0.037664,ask_price,0.041984
5,6,nonfiction,0.037007,alcohol,0.041323
6,7,recordings_(video),0.031603,asthma,0.037035
7,8,railroads,0.030052,sanctions,0.035191
8,9,fares,0.02725,verb,0.033581
9,10,legislation,0.026786,nuclear_weapon,0.032731
