# Makefile.riksprot
# Pull in local settings. A plain `include` (no leading `-`) makes a missing
# .env a fatal error.
include .env

# Base locations/format used by the recipes in this file. They are assigned
# after the include, so they take precedence over same-named definitions in .env.
SPEACH_CORPUS_FORMAT = feather
DATA_FOLDER = /data/riksdagen_corpus_data

# Date stamp (YYYYMMDD). Recursive assignment: `date` only runs where $(today)
# is actually expanded.
today = $(shell date '+%Y%m%d')

# RIKSPROT_VERSION is required (it is embedded in the data paths built from
# DATA_FOLDER); stop parsing right away when it has no value.
$(if $(value RIKSPROT_VERSION),,$(error RIKSPROT_VERSION is undefined))
##########################################################################################################
# Default DTM model recipes (penelope)
##########################################################################################################
# Build one document-term matrix (DTM) per options file found in ./opts/dtm/*.yml.
#
# For each options file:
#   - `.lemmatize` (read with yq) selects the "lemma" or the "text" tagged
#     speech corpus under $(DATA_FOLDER)/$(RIKSPROT_VERSION)/corpus,
#   - the options file's base name with its extension stripped becomes the
#     DTM output tag,
#   - the file is skipped (with a message) when the corpus folder or its
#     corpus.yml is missing,
#   - otherwise `vectorize-id` is run with $(DATA_FOLDER)/$(RIKSPROT_VERSION)/dtm
#     as its output folder.
#
# The string comparison uses POSIX `=`: make runs recipes with /bin/sh unless
# SHELL is overridden, and `==` inside `[ ]` is a bash extension that e.g. dash
# rejects, which would silently select the "text" corpus every time.
.PHONY: default-riksprot-dtms
default-riksprot-dtms: data_dirs
	@for opts_file in ./opts/dtm/*.yml; do \
		lemmatize=$$(yq '.lemmatize' $${opts_file}) ; \
		if [ "$${lemmatize}" = "true" ]; then \
			tok="lemma"; \
		else \
			tok="text"; \
		fi; \
		filename=$$(basename $${opts_file}); \
		dtm_tag="$${filename%.*}" ; \
		vrt_speech_corpus_folder=$(DATA_FOLDER)/$(RIKSPROT_VERSION)/corpus/$${tok}_vrt_speeches.$(SPEACH_CORPUS_FORMAT) ; \
		dtm_target_folder=$(DATA_FOLDER)/$(RIKSPROT_VERSION)/dtm ; \
		if [ ! -d "$${vrt_speech_corpus_folder}" ]; then \
			echo "skipping: VRT corpus $${vrt_speech_corpus_folder} not found"; \
			continue; \
		fi; \
		if [ ! -f "$${vrt_speech_corpus_folder}/corpus.yml" ]; then \
			echo "skipping: VRT corpus config file $${vrt_speech_corpus_folder}/corpus.yml not found"; \
			continue; \
		fi; \
		echo "info: creating $${tok} DTM speeches in $(SPEACH_CORPUS_FORMAT) format"; \
		echo "info: target tag: $${dtm_tag} "; \
		PYTHONPATH=. poetry run vectorize-id \
			--options-filename $${opts_file} \
			--config-filename $${vrt_speech_corpus_folder}/corpus.yml \
			--corpus-source $${vrt_speech_corpus_folder} \
			--output-folder $${dtm_target_folder} \
			--output-tag $${dtm_tag} ; \
	done
##########################################################################################################
# Default topic model recipes (penelope)
##########################################################################################################
# Inputs for naming and locating topic models.
TM_OPTS_FILE=./opts/tm_riksprot.yml
TM_TARGET_FOLDER=$(DATA_FOLDER)/$(RIKSPROT_VERSION)/topic_models

# Settings read from $(TM_OPTS_FILE) with yq. Simple assignment (`:=`) runs each
# yq call exactly once at parse time; with recursive `=` the call would be
# repeated on every expansion of TM_OPTS / TM_NAME.
OPTS_TM_MAX_TOKENS := $(shell yq '.max_tokens' ${TM_OPTS_FILE})
# NOTE: OPTS_TM_TOP_WORDS is not referenced in the visible part of this file.
OPTS_TM_TOP_WORDS := $(shell yq '.num_top_words' ${TM_OPTS_FILE})
OPTS_TM_TF_THRESHOLD := $(shell yq '.tf_threshold' ${TM_OPTS_FILE})
OPTS_TM_TF_MASK := $(shell yq '.tf_threshold_mask' ${TM_OPTS_FILE})
OPTS_TM_LEMMATIZED := $(shell yq '.lemmatize' ${TM_OPTS_FILE})
OPTS_TM_ENGINE := $(shell yq '.engine' ${TM_OPTS_FILE})

# Turn the boolean `.tf_threshold_mask` into a name fragment: ".mask" or nothing.
ifeq ($(OPTS_TM_TF_MASK), true)
OPTS_TM_TF_MASK := .mask
else
OPTS_TM_TF_MASK :=
endif

# Token kind ("lemma" or "text"), derived from `.lemmatize`.
ifeq ($(OPTS_TM_LEMMATIZED), true)
OPTS_TM_TOK=lemma
else
OPTS_TM_TOK=text
endif

# Model name parts: <prefix>_<n-topics>-<max_tokens>.TF<threshold>[.mask]_<tok>.<engine>
# The `%` in TM_NAME is the pattern stem that stands for the number of topics.
TM_PREFIX=tm_riksprot_$(RIKSPROT_VERSION)
TM_OPTS=$(OPTS_TM_MAX_TOKENS).TF$(OPTS_TM_TF_THRESHOLD)$(OPTS_TM_TF_MASK)_$(OPTS_TM_TOK).$(OPTS_TM_ENGINE)
TM_NAME=$(TM_PREFIX)_%-$(TM_OPTS)

# Topic counts for which default models are built.
NS := 100 200 500
# Aggregate target: one topic model per topic count listed in NS.
.PHONY: default-riksprot-topic-models
default-riksprot-topic-models: $(patsubst %,$(TM_TARGET_FOLDER)/$(TM_NAME),$(NS))

# Pattern rule for a single model; the stem ($*) is the number of topics.
#
# Steps: wipe and recreate the model folder, check that the tagged speech corpus
# selected by $(OPTS_TM_TOK) exists (exit status 64 if not), then emit the
# `tm-train-id` command line.
#
# Fixes relative to the previous version:
#   - the corpus path uses the make variable $(OPTS_TM_TOK); the former
#     $${OPTS_TM_TOK} was an (unset) shell variable and expanded to nothing,
#   - the existence test is `! -d`, matching its "not found" message,
#   - the options file is $(TM_OPTS_FILE), the same file the model name is
#     derived from, instead of a hard-coded copy of its default value.
#
# NOTE(review): the training command is only echoed, not executed, and the log
# redirection below is commented out - confirm whether this dry run is intended.
# NOTE(review): TM_NAME joins prefix and topic count with "_", while the folder
# created here uses "."; the rule's target path is therefore never created and
# the recipe runs on every invocation - confirm which separator is intended.
$(TM_TARGET_FOLDER)/$(TM_NAME): data_dirs
	@rm -rf $(TM_TARGET_FOLDER)/$(TM_PREFIX).$*-$(TM_OPTS)
	@mkdir -p $(TM_TARGET_FOLDER)/$(TM_PREFIX).$*-$(TM_OPTS)
	@vrt_speech_corpus_folder=$(DATA_FOLDER)/$(RIKSPROT_VERSION)/corpus/$(OPTS_TM_TOK)_vrt_speeches.$(SPEACH_CORPUS_FORMAT) ; \
	if [ ! -d "$${vrt_speech_corpus_folder}" ]; then \
		echo "skipping: VRT source corpus $${vrt_speech_corpus_folder} not found"; \
		exit 64; \
	fi; \
	echo PYTHONPATH=. poetry run tm-train-id \
		--options-filename $(TM_OPTS_FILE) \
		--corpus-source $${vrt_speech_corpus_folder} \
		--n-topics $* \
		--target-folder $(TM_TARGET_FOLDER) \
		$${vrt_speech_corpus_folder}/corpus.yml \
		$(TM_PREFIX).$*-$(TM_OPTS)
# &> $(TM_TARGET_FOLDER)/$(TM_PREFIX).$*-$(TM_OPTS)/tm_run_$*_$(today).log &
# Ensure ./logs and the topic-model target folder exist.
# Declared phony so a file or folder named `data_dirs` cannot mask the recipe.
.PHONY: data_dirs
data_dirs:
	@mkdir -p ./logs $(TM_TARGET_FOLDER)
# Download the latest linux/amd64 yq release binary to $(HOME)/bin/yq and make
# it executable. The folder is created first because `wget -O` does not create
# missing parent directories.
.PHONY: yq-install
yq-install:
	mkdir -p $(HOME)/bin
	wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O $(HOME)/bin/yq \
		&& chmod +x $(HOME)/bin/yq
##########################################################################################################
# Test recipes
##########################################################################################################
TEST_DATA_FOLDER=tests/test_data/riksprot/$(RIKSPROT_VERSION)
PYRIKSPROT_TESTDATA_TAR=riksprot_sample_testdata.$(RIKSPROT_VERSION).tar.gz

# Refresh the sample test data for $(RIKSPROT_VERSION) from the pyriksprot repo.
#
# Steps:
#   1. download the test-data tarball to /tmp (done first, so a failed download
#      leaves the existing test data untouched),
#   2. park the test DTM / topic model in /tmp, if they exist,
#   3. delete the test data folder and re-extract it from the tarball,
#   4. install corpus.id.yml as the folder's corpus.yml,
#   5. move the parked models back.
#
# The model folders are only moved when present, so the recipe also works on a
# fresh checkout where they have not been created yet.
.PHONY: riksprot-test-data
riksprot-test-data:
	@wget -O /tmp/$(PYRIKSPROT_TESTDATA_TAR) https://raw.githubusercontent.com/welfare-state-analytics/pyriksprot/dev/tests/test_data/dists/$(PYRIKSPROT_TESTDATA_TAR)
	@for model in dtm_test.5files tm_test.5files.mallet; do \
		if [ -e "$(TEST_DATA_FOLDER)/$${model}" ]; then \
			mv -f "$(TEST_DATA_FOLDER)/$${model}" /tmp || exit 1; \
		fi; \
	done
	@rm -rf $(TEST_DATA_FOLDER)
	@tar -C tests/test_data/riksprot --strip 3 -xvf /tmp/$(PYRIKSPROT_TESTDATA_TAR) tests/test_data/source/$(RIKSPROT_VERSION)
	@rm -f /tmp/$(PYRIKSPROT_TESTDATA_TAR)
	@cp -f tests/test_data/riksprot/corpus.id.yml $(TEST_DATA_FOLDER)/corpus.yml
	@echo "NOTE: test tm model and dtm model must also be updated"
	@for model in dtm_test.5files tm_test.5files.mallet; do \
		if [ -e "/tmp/$${model}" ]; then \
			mv -f "/tmp/$${model}" $(TEST_DATA_FOLDER)/ || exit 1; \
		fi; \
	done
# Fetch the test topic model into the test data folder for $(RIKSPROT_VERSION).
# The model itself is not built here: the recipe prints the Penelope command
# that creates it, then copies the result from a fixed remote host with scp.
# Declared phony because it never produces a file named after the target.
.PHONY: riksprot-test-topic-model
riksprot-test-topic-model:
	@echo "FIXME: this is not complete, TM must be created by penelope and copied to tests/test_data/riksprot/$(RIKSPROT_VERSION)"
	@echo "Create model using Penelope:"
	@echo PYTHONPATH=. poetry run python penelope/scripts/tm/train_id.py --options-filename opts/opts_tm_mallet_riksprot.test.yml
	@scp -r roger@130.239.57.54:/home/roger/source/penelope/data/tm_test.5files.mallet tests/test_data/riksprot/$(RIKSPROT_VERSION)
# Fetch the test DTM into the test data folder for $(RIKSPROT_VERSION).
# The DTM itself is not built here: the recipe prints the Penelope command
# that creates it, then copies the result from a fixed remote host with scp.
# Declared phony because it never produces a file named after the target.
.PHONY: riksprot-test-dtm
riksprot-test-dtm:
	@echo "FIXME: this is not complete, DTM must be created by penelope and copied to tests/test_data/riksprot/$(RIKSPROT_VERSION)"
	@echo "Create model using Penelope:"
	@echo PYTHONPATH=. poetry run python penelope/scripts/dtm/vectorize_id.py --options-filename opts/opts_vectorize_id_riksprot_test.yml
	@echo "NOTE: Make sure vectorize uses the document_index (specified in corpus.yml) found in tagged speech corpus (i.e. feather folder)"
	@scp -r roger@130.239.57.54:/home/roger/source/penelope/data/dtm_test.5files tests/test_data/riksprot/$(RIKSPROT_VERSION)
# Print (do not run) the scp command that fetches the pyriksprot test corpus.yml.
# Declared phony because it never produces a file named after the target.
.PHONY: dummy-test-id-config
dummy-test-id-config:
	echo "scp -r roger@130.239.57.54:/home/roger/source/welfare-state-analytics/pyriksprot/tests/test_data/source/corpus.yml corpus.yml"
# Glob for the protocol XML files; it is expanded by the shell when the recipe
# runs, not by make.
RIKSPROT_XML_PATTERN=/data/westac/riksdagen_corpus_data/riksdagen-corpus/corpus/protocols/*/*.xml

# List every TEI <note type="speaker"> element in the protocol files as
#   <type>;<n>;"<text>"
# where <text> has semicolons removed and whitespace normalized, so the output
# can be read as semicolon-separated values.
# Declared phony because it never produces a file named after the target.
# NOTE(review): xmlstarlet documents the newline option as `-n` / `--nl`;
# confirm that `-nl` is accepted by the installed version.
.PHONY: riksprot-speaker-notes
riksprot-speaker-notes:
	@xmlstarlet sel -N x="http://www.tei-c.org/ns/1.0" -t -m "//x:note[@type='speaker']" \
		-v "concat(@type,';',@n,';','\"',normalize-space(translate(text(),';','')),'\"')" -nl $(RIKSPROT_XML_PATTERN)