/
cr-retrieve.sh
executable file
·276 lines (257 loc) · 12.3 KB
/
cr-retrieve.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
#!/bin/bash
#
#3> <> prov:specializationOf <https://github.com/timrdf/csv2rdf4lod-automation/blob/master/bin/cr-retrieve.sh> ;
#3> prov:wasRevisionOf <https://github.com/timrdf/csv2rdf4lod-automation/blob/master/bin/cr-publish-cockpit.sh> .
# Print usage and stop when help is requested.
# The exit status stays 1, as before, so existing callers see no change.
if [[ "$1" == "--help" || "$1" == "-h" ]]; then
  echo "usage: $(basename "$0") [-w|--write] [--skip-if-exists]"
  echo
  echo " Retrieve the source data for every dataset version within the current directory tree,"
  echo " using access metadata (access.ttl or dcat.ttl) or a custom retrieve.sh trigger."
  echo
  echo " -w : Avoid dryrun; do it. If not provided, will only dry run."
  echo " --skip-if-exists : If a version exists for the dataset, do not retrieve it."
  exit 1
fi
#see='https://github.com/timrdf/csv2rdf4lod-automation/wiki/CSV2RDF4LOD-not-set'
#CSV2RDF4LOD_HOME=${CSV2RDF4LOD_HOME:?"not set; source csv2rdf4lod/source-me.sh or see $see"}

# Installation root: the parent of the directory that holds this script.
# It is kept in a private variable rather than in HOME: HOME is exported, so
# overwriting it would hand every child process (git, the generated
# retrieve.sh, ...) the install directory as the user's home directory.
cr_home=$(cd -- "$(dirname -- "$0")" > /dev/null && echo "${PWD%/*}")
export CLASSPATH="$CLASSPATH$("$cr_home/bin/util/cr-situate-classpaths.sh")"

# Default CSV2RDF4LOD_HOME to the location derived from $0 when it is unset or
# empty. (The ':?' form aborts the script with the word as an error message
# instead of supplying a default.)
CSV2RDF4LOD_HOME=${CSV2RDF4LOD_HOME:-$cr_home}

# cr:data-root cr:source cr:directory-of-datasets cr:dataset cr:directory-of-versions cr:conversion-cockpit
# NOTE(review): cr:directory-of-datasets is listed above but not accepted
# below, although the dispatch further down has a branch for it -- confirm
# whether the omission is intentional.
ACCEPTABLE_PWDs="cr:data-root cr:source cr:dataset cr:directory-of-versions cr:conversion-cockpit"

# $ACCEPTABLE_PWDs is deliberately unquoted: each type is its own argument.
# The command substitution IS quoted, so empty output (helper missing or
# failing) counts as "not acceptable" instead of turning `[` into a syntax
# error that silently skips this guard.
if [ "$("${CSV2RDF4LOD_HOME}/bin/util/is-pwd-a.sh" $ACCEPTABLE_PWDs)" != "yes" ]; then
  "${CSV2RDF4LOD_HOME}/bin/util/pwd-not-a.sh" $ACCEPTABLE_PWDs
  exit 1
fi
#######################################
# Retrieve the distribution(s) named in an access metadata file.
#
# Collects the second whitespace-separated field of every line that mentions
# dcat:downloadURL (a line-oriented grep/awk scan, not an RDF parse), then
# generates ./retrieve.sh in the current directory and runs it. The FIRST URL
# selects the strategy:
#   - Google spreadsheet URL -> instantiate "$0.template_gs" with its key.
#   - git repository URL     -> write a script that clones it into ./source.
#   - anything else          -> instantiate "$0.template" with ALL the URLs.
# When the global $dryrun is "yes", nothing is written or run; the planned
# action is only printed.
#
# Globals:   dryrun (read)
# Arguments: $1 - path to the access metadata file (e.g. access.ttl)
#            $2 - version identifier substituted into the template; may be ''
# Outputs:   progress and dry-run messages on stdout
# Returns:   status of the last command run (e.g. the generated retrieve.sh)
#######################################
function retrieve_from_metadata {
  local dcat="$1"
  local versionID="$2"
  local url urls download google_key
  local google_re='https://docs\.google\.com/spreadsheet'
  # A git repository is recognized by the git:// scheme or a trailing ".git".
  # The former pattern '.*.git' was unanchored with an unescaped dot, so it
  # also matched every github.com URL (via "/git") and tried to clone plain
  # file downloads hosted there.
  local git_re='^git://|\.git/?$'

  if [[ -e "$dcat" ]]; then
    # First URL only: decides which retrieval strategy is used.
    # TODO: query it as RDF...
    url=$(grep "dcat:downloadURL" "$dcat" | head -1 | awk '{print $2}' | sed 's/<//; s/>.*$//')

    # All URLs, accumulated as:  'u1' 'u2' ...
    # alternative: rdf2nt.sh access.ttl | grep '<http://www.w3.org/ns/dcat#downloadURL>' | awk '{print $3}' | grep http | sed 's/<//;s/>//' | grep -v " "
    urls=''
    while IFS= read -r download; do
      [[ -n "$download" ]] || continue
      urls="$urls '$download'"
    done < <(grep dcat:downloadURL "$dcat" | awk '{print $2}' | sed 's/^.*<//;s/>.*$//')
    # Strip the outermost quotes so the result reads u1' 'u2; presumably the
    # template already wraps DOWNLOAD_URL in single quotes -- TODO confirm.
    urls=${urls# \'}
    urls=${urls%\'}

    # e.g. lodcloud/data/source/harth-org/btc-2012/version/latest
    google_key=''
    if [[ "$url" =~ $google_re ]]; then
      # e.g. https://docs.google.com/spreadsheet/ccc?key=tejNArOGrsY_mV1VeZhYCYg#gid=0
      #      -> 'tejNArOGrsY_mV1VeZhYCYg'
      # e.g. https://docs.google.com/spreadsheet/ccc?key=0An84UEjofnaydFRrUF9YWk03Y3NHNjJqUEg0NUhUZXc&usp=sharing#gid=0
      #      -> '0An84UEjofnaydFRrUF9YWk03Y3NHNjJqUEg0NUhUZXc'
      google_key=$(echo "$url" | sed 's/^.*key=//;s/&.*$//;s/#.*$//')
      if [ "$dryrun" != "yes" ]; then
        cat "$0.template_gs" > retrieve.sh # NOTE: chmod +w /opt/csv2rdf4lod-automation/bin/cr-retrieve.sh.template
        # Values reach perl through the environment rather than being pasted
        # into the perl program, so '@', '$' or '|' in them are taken literally.
        CR_RETRIEVE_KEY="$google_key" \
          perl -pi -e 's|SPREADSHEET_KEY|$ENV{CR_RETRIEVE_KEY}|' retrieve.sh
        if [[ ${#versionID} -gt 0 ]]; then
          # NOTE(review): this template's placeholder is 'auto' while the
          # generic template below uses 'cr:auto' -- confirm both are intended.
          CR_RETRIEVE_VERSION="$versionID" \
            perl -pi -e 's|auto|$ENV{CR_RETRIEVE_VERSION}|' retrieve.sh
        fi
        chmod +x retrieve.sh
        ./retrieve.sh
      else
        echo "$(cr-dataset-uri.sh --uri):"
        echo " Will retrieve google spreadsheet $google_key b/c not yet retrieved $url"
      fi
    elif [[ "$url" =~ $git_re ]]; then
      if [ "$dryrun" != "yes" ]; then
        echo "#!/bin/bash" > retrieve.sh
        echo "mkdir -p source && pushd source" >> retrieve.sh
        # %q shell-quotes the URL so characters such as '&' or '?' survive in
        # the generated script.
        printf 'git clone %q\n' "$url" >> retrieve.sh
        echo "popd" >> retrieve.sh
        chmod +x retrieve.sh
        ./retrieve.sh
      else
        echo "$(cr-dataset-uri.sh --uri):"
        echo " Will git clone b/c not yet retrieved $url"
      fi
    else
      if [ "$dryrun" != "yes" ]; then
        cat "$0.template" > retrieve.sh # NOTE: chmod +w /opt/csv2rdf4lod-automation/bin/cr-retrieve.sh.template
        CR_RETRIEVE_URLS="$urls" \
          perl -pi -e 's|DOWNLOAD_URL|$ENV{CR_RETRIEVE_URLS}|' retrieve.sh
        if [[ ${#versionID} -gt 0 ]]; then
          CR_RETRIEVE_VERSION="$versionID" \
            perl -pi -e 's|cr:auto|$ENV{CR_RETRIEVE_VERSION}|' retrieve.sh
        fi
        chmod +x retrieve.sh
        ./retrieve.sh
      else
        echo "$(cr-dataset-uri.sh --uri):"
        echo " Will retrieve b/c not yet retrieved $urls"
      fi
    fi
  else
    echo "$dcat" does not exist
  fi
}
# Turtle fragment attached to the start record of every retrieval activity
# (consumed by log_start).
wasInformed="a prov:Activity; prov:wasInformedBy <#cr-retrieve>;"
# Identifier used in the <retrieval/...> provenance URIs; presumably the
# source/dataset/version of the current directory -- see cr-sdv.sh.
sdv="$(cr-sdv.sh --slashes --fast)"
# Emit a "#3>" provenance comment recording when the retrieval activity began.
# Globals: sdv (read), wasInformed (read)
# Outputs: one line on stdout
log_start() {
  local started_at
  started_at="$(dateInXSDDateTime.sh --turtle)"
  printf '%s\n' "#3> <retrieval/${sdv}> ${wasInformed} prov:startedAtTime ${started_at} ."
}
# Emit a "#3>" provenance comment recording when the retrieval activity ended.
# Globals: sdv (read)
# Outputs: one line on stdout
log_end() {
  local ended_at
  ended_at="$(dateInXSDDateTime.sh --turtle)"
  printf '%s\n' "#3> <retrieval/${sdv}> prov:endedAtTime ${ended_at} ."
}
# ---------------------------------------------------------------------------
# Main dispatch.
#
# Decides what to do from the type of the current directory, as reported by
# is-pwd-a.sh. A conversion cockpit and a directory of versions are handled
# here; every broader directory type pushes into its children and re-invokes
# this script with the same arguments.
# ---------------------------------------------------------------------------

# Absolute path of this script. Every recursive call below happens after a
# pushd, where a relative "$0" (e.g. ../bin/cr-retrieve.sh) would no longer
# resolve.
cr_retrieve_name=$(basename -- "$0")
cr_retrieve_dir=$(cd -- "$(dirname -- "$0")" > /dev/null 2>&1 && pwd)
if [[ -n "$cr_retrieve_dir" ]]; then
  cr_retrieve_self="$cr_retrieve_dir/$cr_retrieve_name"
else
  cr_retrieve_self="$0"
fi

if [[ "$(is-pwd-a.sh cr:conversion-cockpit)" == "yes" ]]; then

  # NOTE(review): -w / --skip-if-exists are not parsed in this branch, so
  # $dryrun is unset here and retrieve_from_metadata performs the retrieval
  # even when -w was not given. Confirm whether a dry run should be honored.
  if [[ -e access.ttl && ! -e source ]]; then
    access=access.ttl #`basename $PWD`/access.ttl
    versionID=$(basename "$PWD")
    echo "INFO: $cr_retrieve_name: retrieving un-retrieved version-specific access metadata for $(cr-pwd-type.sh) $(cr-source-id.sh) $(cr-dataset-id.sh) $(cr-version-id.sh): $access.2"
    log_start
    retrieve_from_metadata "$access" "$versionID"
    log_end
  elif [[ -e retrieve.sh && ! -e source ]]; then
    if [[ ! -x retrieve.sh ]]; then
      chmod +x retrieve.sh
    fi
    # NOTE(review): $access is never assigned in this branch, so it expands
    # to the empty string in the message below.
    echo "INFO: $cr_retrieve_name: pulling un-retrieved retrieval trigger for $(cr-pwd-type.sh) $(cr-source-id.sh) $(cr-dataset-id.sh) $(cr-version-id.sh): $access. 93"
    log_start
    ./retrieve.sh
    log_end
  fi

elif [[ "$(is-pwd-a.sh cr:directory-of-versions)" == "yes" ]]; then

  # TODO: generalize this; https://github.com/timrdf/csv2rdf4lod-automation/issues/323
  # Load the project-wide environment, creating a stub file if none exists.
  if [ -e "$(cr-conversion-root.sh)/csv2rdf4lod-source-me.sh" ]; then
    source "$(cr-conversion-root.sh)/csv2rdf4lod-source-me.sh"
  else
    see='https://github.com/timrdf/csv2rdf4lod-automation/wiki/CSV2RDF4LOD-environment-variables-(considerations-for-a-distributed-workflow)'
    echo "#3> <> rdfs:seeAlso <$see> ." > "$(cr-conversion-root.sh)/csv2rdf4lod-source-me.sh"
  fi
  # Include project-specific https://github.com/timrdf/csv2rdf4lod-automation/wiki/CSV2RDF4LOD-environment-variables
  for sourceme in $(find "$(cr-conversion-root.sh)" -maxdepth 1 -name "csv2rdf4lod-source-me-for-*"); do
    source "$sourceme"
  done
  # ... and the settings specific to the invoking user, when present.
  if [ -e "$(cr-conversion-root.sh)/csv2rdf4lod-source-me-as-$(whoami).sh" ]; then
    source "$(cr-conversion-root.sh)/csv2rdf4lod-source-me-as-$(whoami).sh"
  fi

  # Flags are positional: -w/--write must come first, then --skip-if-exists.
  w=''
  dryrun="yes"
  if [[ "$1" == "-w" || "$1" == "--write" ]]; then
    w='-w'
    dryrun="no"
    shift
  fi

  skip_if_exists=""
  if [[ "$1" == "--skip-if-exists" ]]; then
    skip_if_exists="$1"
    shift
  fi

  # It seems as though the pattern should be to find most-specific cases and work backwards,
  # allowing all cases to trigger.

  # Version-specific access metadata, with no custom retrieval trigger.
  #
  # e.g. working directory: data/source/us/cr-sparql-sd/version
  #      find returns:      ./latest/access.ttl   # depth = 2
  for access in $(find . -mindepth 2 -maxdepth 2 -name access.ttl); do
    version_dir=$(dirname "$access")
    if [[ ! -e "$version_dir/source" && ! -e "$version_dir/retrieve.sh" ]]; then
      echo "INFO: $cr_retrieve_name: found un-retrieved version-specific access metadata for $(cr-pwd-type.sh) $(cr-source-id.sh) $(cr-dataset-id.sh): $access. 94"
      # Skip on a failed pushd; recursing from this same directory would loop.
      pushd "$version_dir" &> /dev/null || continue
      # $w and $skip_if_exists are deliberately unquoted: an empty flag must
      # disappear rather than be passed as an empty argument.
      "$cr_retrieve_self" $w $skip_if_exists
      popd &> /dev/null
    fi
  done

  latest_version=$(cr-list-versions.sh)

  # NOTE(review): the branches below that run ./retrieve.sh directly do so
  # regardless of $dryrun; only retrieve_from_metadata consults it.
  if [[ -e retrieve.sh && "$(cr-idempotent.sh retrieve.sh)" == 'yes' ]]; then
    # A dataset-level trigger that cr-idempotent.sh reports as 'yes'.
    if [[ ! -x retrieve.sh ]]; then
      chmod +x retrieve.sh
    fi
    echo "INFO: $cr_retrieve_name: pulling custom retrieval trigger in $(cr-pwd-type.sh) $(cr-source-id.sh) $(cr-dataset-id.sh). 95"
    log_start
    ./retrieve.sh
    log_end
  elif [[ $(find . -mindepth 2 -maxdepth 2 -name retrieve.sh | wc -l | awk '{print $1}') -gt 0 ]]; then
    # A version-specific custom retrieval trigger.
    #
    # e.g. working directory: data/source/us/cr-sparql-sd/version
    #      find returns:      ./latest/retrieve.sh   # depth = 2
    for trigger in $(find . -mindepth 2 -maxdepth 2 -name retrieve.sh); do
      trigger_dir=$(dirname "$trigger")
      echo "INFO: $cr_retrieve_name: found custom retrieval trigger in $(cr-pwd-type.sh) $(cr-source-id.sh) $(cr-dataset-id.sh) $trigger. 96"
      if [[ "$(cr-idempotent.sh "$trigger")" == 'yes' || ! -e "$trigger_dir/source" ]]; then
        pushd "$trigger_dir" &> /dev/null || continue
        "$cr_retrieve_self" $w $skip_if_exists
        popd &> /dev/null
      else
        echo "INFO: $cr_retrieve_name: (but it wasn't idempotent, or it has already been retrieved) $(cr-pwd-type.sh) $(cr-source-id.sh) $(cr-dataset-id.sh) $trigger."
      fi
    done
  elif [[ -n "$skip_if_exists" && ${#latest_version} -gt 0 ]]; then
    not='not retrieving b/c --skip-if-exists was specified'
    echo "INFO: $cr_retrieve_name: version for $(cr-source-id.sh)/$(cr-dataset-id.sh) already exists ($latest_version); $not."
  elif [[ -e access.ttl || -e dcat.ttl || -e ../access.ttl || -e ../dcat.ttl ]]; then
    # dcat.ttl was a bad choice of name. It should be named after its purpose, not the specific vocab.
    # that currently achieves it. Still triggering on dcat.ttl for backward compatibility.
    access='' # RDF file containing distribution information - which file to download for this dataset?
    if [ -e access.ttl ]; then
      access='access.ttl'
    elif [ -e dcat.ttl ]; then
      access='dcat.ttl'
    elif [ -e ../access.ttl ]; then
      access='../access.ttl'
    elif [ -e ../dcat.ttl ]; then
      access='../dcat.ttl'
    fi
    if [ -e "$access" ]; then
      echo "INFO: $cr_retrieve_name: retrieving un-retrieved version-specific access metadata for $(cr-pwd-type.sh) $(cr-source-id.sh) $(cr-dataset-id.sh) $(cr-version-id.sh): $access. 97"
      log_start
      retrieve_from_metadata "$access" "" # versionID
      log_end
    fi
  elif [[ ${#latest_version} -eq 0 && ! -e dcat.ttl && ! -e ../dcat.ttl && -e "ls retrieve.*" ]]; then
    # There is no version yet, there is no dcat.ttl, but there is a retrieve.sh
    # NOTE(review): -e "ls retrieve.*" tests for a file literally named
    # 'ls retrieve.*', so this branch is effectively unreachable. Left as-is
    # because enabling it would start executing ./retrieve.* globs; decide
    # whether it should be repaired or removed.
    chmod +x retrieve.*
    echo "INFO: $cr_retrieve_name: pulling custom retrieval trigger in $(cr-pwd-type.sh) $(cr-source-id.sh) $(cr-dataset-id.sh). 98"
    log_start
    ./retrieve.*
    log_end
  elif [[ -e retrieve.sh ]]; then
    if [[ ! -x retrieve.sh ]]; then
      chmod +x retrieve.sh
    fi
    echo "INFO: $cr_retrieve_name: pulling custom retrieval trigger in $(cr-pwd-type.sh) $(cr-source-id.sh) $(cr-dataset-id.sh). 99"
    log_start
    ./retrieve.sh
    log_end
  else
    echo "[WARNING]: did not know how to handle $(cr-pwd.sh); no access metadata available."
  fi

elif [[ "$(is-pwd-a.sh cr:dataset)" == "yes" ]]; then

  if [ ! -e version ]; then
    mkdir version # See https://github.com/timrdf/csv2rdf4lod-automation/wiki/Directory-Conventions
  fi
  # Recurse only when the directory change succeeded; otherwise the child
  # would start from this same directory and recurse without end.
  if pushd version > /dev/null; then
    "$cr_retrieve_self" "$@" # Recursive call
    popd > /dev/null
  fi

elif [[ "$(is-pwd-a.sh cr:directory-of-datasets)" == "yes" ]]; then

  for next in $(directories.sh); do
    pushd "$next" > /dev/null || continue
    "$cr_retrieve_self" "$@" # Recursive call
    popd > /dev/null
  done

elif [[ "$(is-pwd-a.sh cr:source)" == "yes" ]]; then

  if [ -d dataset ]; then
    # This would conform to the directory structure if
    # we had included 'dataset' in the convention.
    # This is here in case we ever fully support it.
    if pushd dataset > /dev/null; then
      "$cr_retrieve_self" "$@" # Recursive call
      popd > /dev/null
    fi
  else
    # Handle the original (3-year old) directory structure
    # that does not include 'dataset' as a directory.
    for dataset in $(cr-list-datasets.sh); do
      pushd "$dataset" > /dev/null || continue
      "$cr_retrieve_self" "$@" # Recursive call
      popd > /dev/null
    done
  fi

elif [[ "$(is-pwd-a.sh cr:data-root)" == "yes" ]]; then

  for sourceID in $(cr-list-sources.sh); do
    pushd "$sourceID" > /dev/null || continue
    "$cr_retrieve_self" "$@" # Recursive call
    popd > /dev/null
  done

fi