/
datasets.clj
153 lines (134 loc) · 7.98 KB
/
datasets.clj
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
(ns com.eldrix.deprivare.datasets
(:require [clj-http.client :as client]
[clojure.java.io :as io]
[clojure.set :as set]
[clojure.string :as str]
[clojure.core.async :as a]
[com.eldrix.deprivare.odf :as odf]
[clojure.data.csv :as csv]
[clojure.edn :as edn])
(:import [java.io File]))
(defn parse-double
  "Parses the string `s` as a double via `Double/parseDouble`.
  No error handling is done here: whatever exception the JVM raises for
  malformed input propagates to the caller."
  [s]
  (Double/parseDouble s))
(defn parse-long
  "Parses the string `s` as a base-10 long via `Long/parseLong`.
  No error handling is done here: whatever exception the JVM raises for
  malformed input propagates to the caller."
  [s]
  (Long/parseLong s))
(defn parse-double-as-long
  "Parses the string `s` as a double and then coerces it to a long with
  `long`, discarding any fractional part (e.g. \"3.0\" and \"3.9\" both give 3)."
  [s]
  (-> s
      Double/parseDouble
      long))
(def property-parsers
  "Map of namespaced property keyword to the one-argument function used to
  convert that property's raw string value. Properties with no entry here
  have no parser registered."
  #:uk-composite-imd-2020-mysoc{;; decimal scores
                                :income_score          parse-double
                                :employment_score      parse-double
                                :overall_local_score   parse-double
                                :UK_IMD_E_score        parse-double
                                ;; integers written as integers
                                :original_decile       parse-long
                                :UK_IMD_E_pop_decile   parse-long
                                :UK_IMD_E_pop_quintile parse-long
                                ;; integers parsed via double and then coerced to long
                                :E_expanded_decile     parse-double-as-long
                                :UK_IMD_E_rank         parse-double-as-long})
(defn parse
  "Returns a map with the same keys as `m` in which every value whose key has
  an entry in `property-parsers` is replaced by the result of applying that
  parser to it. Values for keys without a parser are copied unchanged.

  Throws an `ex-info` with message \"failed to parse\" and data `{:k k :v v}`
  when a parser throws; the parser's own exception is attached as the cause
  so the underlying reason is not lost."
  [m]
  (reduce-kv
    (fn [acc k v]
      (if-let [parser (get property-parsers k)]
        (assoc acc k (try
                       (parser v)
                       (catch Exception e
                         ;; keep the original exception as the cause
                         (throw (ex-info "failed to parse" {:k k :v v} e)))))
        (assoc acc k v)))
    {}
    m))
(defn- download-file
  "Downloads a file from a URL to a newly created temporary file, which is
  returned. The temporary file is created with the given `prefix` and `suffix`.

  Sets the user agent header explicitly; some URLs return a 403 if there is
  no defined user agent, including gov.wales.

  If the request or the copy fails, the temporary file is deleted before the
  exception is rethrown, so failed downloads do not leave files behind.
  On success the caller owns the returned file and is responsible for
  deleting it."
  [url prefix suffix]
  (let [f (File/createTempFile prefix suffix)]
    (try
      (with-open [is (:body (client/get url {:headers {"User-Agent" "deprivare v0.1"} :as :stream}))
                  os (io/output-stream f)]
        (io/copy is os))
      f
      (catch Throwable t
        ;; best-effort cleanup of the empty / partially written file
        (io/delete-file f true)
        (throw t)))))
(def uk-composite-imd-2020-mysoc-url
  "Download URL for the 2020 composite UK index of multiple deprivation:
  England-based, with scores for the other nations adjusted as per Abel, Payne
  and Barclay, as calculated by Alex Parsons on behalf of MySociety.
  The URL is pinned to a specific commit of the source repository."
  (let [repo   "https://github.com/mysociety/composite_uk_imd"
        commit "e7a14d3317d9462890c28513866687a3a35adc8d"
        path   "uk_index/UK_IMD_E.csv"]
    (str repo "/blob/" commit "/" path "?raw=true")))
(def headers-uk-composite-2020-mysoc
  "The exact CSV header row, in column order, that the composite UK IMD 2020
  file is expected to start with."
  ["nation" "lsoa"
   "overall_local_score" "income_score" "employment_score"
   "UK_IMD_E_score"
   "original_decile" "E_expanded_decile"
   "UK_IMD_E_rank" "UK_IMD_E_pop_decile" "UK_IMD_E_pop_quintile"])
(defn stream-uk-composite-imd-2020
  "Streams the uk-composite-imd-2020-mysoc data to the channel specified.

  Reads the CSV at `uk-composite-imd-2020-mysoc-url` and requires its first row
  to equal `headers-uk-composite-2020-mysoc`, throwing an `ex-info`
  (\"invalid CSV headers\") otherwise. Each remaining row becomes a map keyed
  by keywords in the `uk-composite-imd-2020-mysoc` namespace, is run through
  `parse`, has its LSOA moved to `:uk.gov.ons/lsoa`, is tagged with `:dataset`,
  and is put on `ch` with a blocking put. The channel is not closed here.
  Returns the realised sequence of put results."
  [ch]
  (with-open [rdr (io/reader uk-composite-imd-2020-mysoc-url)]
    (let [lines  (csv/read-csv rdr)
          header (first lines)]
      ;; guard: refuse to process a file whose layout has changed
      (when-not (= headers-uk-composite-2020-mysoc header)
        (throw (ex-info "invalid CSV headers" {:expected headers-uk-composite-2020-mysoc :actual header})))
      (let [ks       (map #(keyword "uk-composite-imd-2020-mysoc" %) header)
            ->record (fn [row]
                       (let [rec (parse (zipmap ks row))]
                         (-> rec
                             (assoc :uk.gov.ons/lsoa (:uk-composite-imd-2020-mysoc/lsoa rec)
                                    :dataset :uk-composite-imd-2020-mysoc)
                             (dissoc :uk-composite-imd-2020-mysoc/lsoa))))]
        ;; doall forces every row to be read and sent before the reader closes
        (doall (map (fn [row] (a/>!! ch (->record row)))
                    (rest lines)))))))
(defn stream-wales-imd-2019-ranks
  "Streams the Welsh Index of Multiple Deprivation 2019 rank data to the
  channel `ch`.

  Downloads the gov.wales spreadsheet to a temporary file and reads the
  `WIMD_2019_ranks` sheet, keeping only rows that have exactly 12 cells and
  whose first cell is a string starting with 'W'. Each row map has its LSOA
  moved to `:uk.gov.ons/lsoa`, is tagged with
  `:dataset :wales-imd-2019-ranks`, and is put on `ch` with a blocking put.
  The channel is not closed here. Returns the realised sequence of put results.

  The temporary file is always deleted once the rows have been sent, or if
  processing fails."
  [ch]
  (let [f (download-file "https://gov.wales/sites/default/files/statistics-and-research/2019-11/welsh-index-multiple-deprivation-2019-index-and-domain-ranks-by-small-area.ods"
                         "wimd-2019-" ".ods")]
    (try
      (let [data (odf/sheet-data f "WIMD_2019_ranks"
                                 :headings (map #(keyword "wales-imd-2019" (name %))
                                                [:lsoa :lsoa_name :authority_name :wimd_2019 :income :employment :health :education :access_to_services :housing :community_safety :physical_environment])
                                 ;; string? guard: a blank or numeric first cell means
                                 ;; "not a data row" rather than an exception
                                 :pred (fn [row]
                                         (let [code (first row)]
                                           (and (= (count row) 12)
                                                (string? code)
                                                (str/starts-with? code "W")))))]
        ;; doall realises everything inside the try, so the file is no longer
        ;; needed by the time the finally clause runs
        (doall (->> data
                    (map #(assoc % :uk.gov.ons/lsoa (:wales-imd-2019/lsoa %)
                                   :dataset :wales-imd-2019-ranks))
                    (map #(dissoc % :wales-imd-2019/lsoa))
                    (map #(a/>!! ch %)))))
      (finally
        (io/delete-file f true)))))
(defn stream-wales-imd-2019-quantiles
  "Streams the Welsh Index of Multiple Deprivation 2019 decile / quintile /
  quartile data to the channel `ch`.

  Downloads the gov.wales spreadsheet to a temporary file and reads the
  `Deciles_quintiles_quartiles` sheet, keeping only rows that have exactly 7
  cells and whose first cell is a string starting with 'W'. Each row map has
  its LSOA moved to `:uk.gov.ons/lsoa`, is tagged with
  `:dataset :wales-imd-2019-quantiles`, and is put on `ch` with a blocking put.
  The channel is not closed here. Returns the realised sequence of put results.

  The temporary file is always deleted once the rows have been sent, or if
  processing fails."
  [ch]
  (let [f (download-file "https://gov.wales/sites/default/files/statistics-and-research/2019-11/welsh-index-multiple-deprivation-2019-index-and-domain-ranks-by-small-area.ods"
                         "wimd-2019-" ".ods")]
    (try
      (let [data (odf/sheet-data f "Deciles_quintiles_quartiles"
                                 :headings (map #(keyword "wales-imd-2019" (name %))
                                                [:lsoa :lsoa_name :authority_name :wimd_2019 :wimd_2019_decile :wimd_2019_quintile :wimd_2019_quartile])
                                 ;; string? guard: a blank or numeric first cell means
                                 ;; "not a data row" rather than an exception
                                 :pred (fn [row]
                                         (let [code (first row)]
                                           (and (= (count row) 7)
                                                (string? code)
                                                (str/starts-with? code "W")))))]
        ;; doall realises everything inside the try, so the file is no longer
        ;; needed by the time the finally clause runs
        (doall (->> data
                    (map #(assoc % :uk.gov.ons/lsoa (:wales-imd-2019/lsoa %)
                                   :dataset :wales-imd-2019-quantiles))
                    (map #(dissoc % :wales-imd-2019/lsoa))
                    (map #(a/>!! ch %)))))
      (finally
        (io/delete-file f true)))))
(def available-data
  "Registry of the datasets this namespace can stream, keyed by dataset
  identifier. Each entry carries a :title, :year and :description, the list of
  :properties (un-namespaced names) the dataset provides, the :stream-fn that
  sends its rows to a channel and, optionally, the :namespace used to qualify
  its property names when that differs from the dataset identifier."
  {:uk-composite-imd-2020-mysoc
   {:title       "UK composite index of multiple deprivation, 2020 (MySociety)"
    :year        2020
    :description (str/join "\n" ["A composite UK score for deprivation indices for 2020 - based on England"
                                 "with adjusted scores for the other nations as per Abel, Payne and Barclay but"
                                 "calculated by Alex Parsons on behalf of MySociety."])
    :properties  ["E_expanded_decile" "UK_IMD_E_pop_decile" "UK_IMD_E_pop_quintile"
                  "UK_IMD_E_rank" "UK_IMD_E_score" "employment_score" "income_score"
                  "nation" "original_decile" "overall_local_score"]
    :stream-fn   stream-uk-composite-imd-2020}

   ;; the two Welsh datasets share the "wales-imd-2019" property namespace
   :wales-imd-2019-ranks
   {:title       "Welsh Index of Deprivation - ranks, 2019"
    :year        2019
    :description "Welsh Index of Deprivation - raw ranks for each domain, by LSOA."
    :namespace   "wales-imd-2019"
    :properties  ["lsoa_name" "authority_name" "access_to_services" "community_safety"
                  "education" "employment" "health" "housing"
                  "income" "physical_environment" "wimd_2019"]
    :stream-fn   stream-wales-imd-2019-ranks}

   :wales-imd-2019-quantiles
   {:title       "Welsh Index of Deprivation - quantiles, 2019"
    :year        2019
    :description "Welsh Index of Deprivation - with composite rank with decile, quintile and quartile."
    :namespace   "wales-imd-2019"
    :properties  ["lsoa_name" "authority_name" "wimd_2019"
                  "wimd_2019_decile" "wimd_2019_quintile" "wimd_2019_quartile"]
    :stream-fn   stream-wales-imd-2019-quantiles}})
(defn properties-for-dataset
  "Returns the set of namespaced property keywords for `dataset` (a keyword or
  string identifier looked up in `available-data`). Keywords are qualified
  with the dataset's declared :namespace when it has one, otherwise with the
  dataset's own name. An unknown dataset yields an empty set."
  [dataset]
  (let [{declared-ns :namespace
         props       :properties} (get available-data (keyword dataset))
        prefix (or declared-ns (name dataset))]
    (into #{}
          (map (fn [prop] (keyword prefix (name prop))))
          props)))
(defn properties-for-datasets
  "Returns the union of `properties-for-dataset` over every identifier in
  `datasets`, as a single set. An empty collection yields an empty set."
  [datasets]
  (into #{} (mapcat properties-for-dataset) datasets))
;; REPL scratchpad: the forms below are wrapped in `comment`, so they are never
;; evaluated when the namespace is loaded.
(comment
;; namespaced property keywords for one dataset, then for several combined
(properties-for-dataset :uk-composite-imd-2020-mysoc)
(properties-for-datasets [:uk-composite-imd-2020-mysoc :wales-imd-2019-quantiles])
;; channel with a buffer of 16 and a partition-all transducer, so each take
;; yields a batch of up to 5 records
(def ch (a/chan 16 (partition-all 5)))
;; the stream functions use blocking puts, so run them on a separate thread
(a/thread (stream-wales-imd-2019-ranks ch))
;; take one batch
(a/<!! ch)
;; same again for the composite UK dataset, on a fresh channel
(def ch (a/chan 16 (partition-all 5)))
(a/thread (stream-uk-composite-imd-2020 ch))
(a/<!! ch))