-
Notifications
You must be signed in to change notification settings - Fork 0
/
pgday_02_02_24_mini
387 lines (267 loc) · 11 KB
/
pgday_02_02_24_mini
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
////////////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////// SETUP //////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////////
# Start a PostgreSQL 16 container named "postgres" in the background, with the superuser password set to "postgres".
docker run -d -it -e POSTGRES_PASSWORD=postgres --name postgres postgres:16
# Open an interactive shell inside that container; the remaining commands are typed there.
docker exec -it postgres bash
# Install git (needed for the clone below) plus wget and bzip2 (presumably used by the repo's scripts - TODO confirm).
apt update
apt install -y git wget bzip2
# Fetch the omdb-postgresql helper repository and enter it.
git clone https://github.com/credativ/omdb-postgresql.git
cd omdb-postgresql
# Repo-provided script; presumably downloads the OMDB data files - see the repo README.
./download
# Switch to the postgres OS user before running the import.
su postgres
# Repo-provided script; presumably creates and loads the "omdb" database used below - verify in the repo.
./import
# Connect to the omdb database; everything after this line is typed at the psql prompt.
psql omdb
-- Indexes for the lookups used in the rest of the script: category by name,
-- category by parent, and movie/category links by category.
CREATE INDEX ON categories (name);
CREATE INDEX ON categories (parent_id);
CREATE INDEX ON movie_categories (category_id);
-- Refresh planner statistics for the three tables queried below.
ANALYZE categories;
ANALYZE movie_categories;
ANALYZE movies;
-- psql setting: disable the pager so results print inline.
\pset pager 0
////////////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////// CONTEXT //////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////////
-- Describe the three tables the demo relies on.
\d+ movies
\d+ movie_categories
\d+ categories
-- Pick one movie to use as the running example.
SELECT
    id,
    name
FROM movies
WHERE id = 77;
-- Its raw category links (18 rows).
SELECT *
FROM movie_categories
WHERE movie_id = 77;
-- Resolve those links to category names, showing each category's parent.
SELECT
    c.name,
    c.parent_id
FROM movie_categories AS mc
INNER JOIN categories AS c
    ON c.id = mc.category_id
WHERE mc.movie_id = 77;
// We have parent categories
// Now try to fetch categories AND their parent categories
-- Walk upward from movie 77's direct categories to every ancestor category.
WITH RECURSIVE category_chain AS (
    -- Anchor: categories attached directly to the movie.
    SELECT
        direct.name,
        direct.id,
        direct.parent_id,
        direct.root_id
    FROM movie_categories AS mc
    INNER JOIN categories AS direct
        ON direct.id = mc.category_id
    WHERE mc.movie_id = 77
    UNION
    -- Step: the parent of each category collected so far; UNION drops repeated rows.
    SELECT
        ancestor.name,
        ancestor.id,
        ancestor.parent_id,
        ancestor.root_id
    FROM categories AS ancestor
    INNER JOIN category_chain AS found
        ON found.parent_id = ancestor.id
)
SELECT
    name,
    id,
    parent_id
FROM category_chain
ORDER BY
    id,
    parent_id;
// we have more results, and important ones like 'thriller'
////////////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////// PROBLEM //////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// Now, how do we find movies tagged Thriller + Justice Drama + Nominated for Academy Award?
// naive join wouldn't work
-- Naive attempt: match the three names directly and keep movies with three hits.
-- Only categories linked directly to a movie are looked at here.
SELECT m.name
FROM movies AS m
INNER JOIN movie_categories AS mc
    ON mc.movie_id = m.id
INNER JOIN categories AS c
    ON c.id = mc.category_id
WHERE c.name IN ('Thriller', 'Justice Drama', 'Nominated for Academy Award')
GROUP BY m.name
HAVING count(c.name) = 3;
// We have to use RECURSIVE CTE + a GROUP BY trick
-- Find movies tagged with at least one category from EACH of the three trees
-- rooted at 'Thriller', 'Nominated for Academy Award' and 'Justice Drama'.
--
-- cte_cs lists every category in those trees together with group_id, the id of
-- the requested category the row descends from. The recursive step carries the
-- parent row's group_id forward (not the parent's own id), so categories nested
-- more than one level deep stay in the group of their requested ancestor.
-- Otherwise a deep sub-category would start a group of its own and a movie
-- could reach three distinct groups without covering all three requested trees.
WITH RECURSIVE cte_cs AS (
    -- Anchor: the three requested categories, each forming its own group.
    SELECT
        c.id,
        c.id AS group_id,
        c.name
    FROM categories c
    WHERE c.name IN ('Thriller', 'Nominated for Academy Award', 'Justice Drama')
    UNION
    -- Step: children of already-collected categories inherit the group.
    SELECT
        c2.id,
        cte_cs.group_id,
        c2.name
    FROM categories c2
    JOIN cte_cs ON cte_cs.id = c2.parent_id
)
SELECT m.id, m.name, string_agg(cte_cs.name, ',')
FROM movies m
JOIN movie_categories mc ON m.id = mc.movie_id
JOIN cte_cs ON cte_cs.id = mc.category_id
GROUP BY m.id, m.name
-- One hit in each of the three groups is required.
HAVING count(DISTINCT cte_cs.group_id) = 3;
-- Some movies qualify through subcategories ('Courtroom Drama', a subcategory of 'Justice Drama', for example).
-- Plan of the same query.
EXPLAIN WITH RECURSIVE cte_cs AS (
    SELECT
        c.id,
        c.id AS group_id,
        c.name
    FROM categories c
    WHERE c.name IN ('Thriller', 'Nominated for Academy Award', 'Justice Drama')
    UNION
    SELECT
        c2.id,
        cte_cs.group_id,
        c2.name
    FROM categories c2
    JOIN cte_cs ON cte_cs.id = c2.parent_id
)
SELECT m.id, m.name, string_agg(cte_cs.name, ',')
FROM movies m
JOIN movie_categories mc ON m.id = mc.movie_id
JOIN cte_cs ON cte_cs.id = mc.category_id
GROUP BY m.id, m.name
HAVING count(DISTINCT cte_cs.group_id) = 3;
////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////// ARRAY //////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// Now, let's see how we could aggregate all of that into arrays using array_agg.
-- For every movie, collect the names of its categories and of all their
-- ancestors into one array, then show a small page of the result.
WITH RECURSIVE movie_category_chain AS (
    -- Anchor: categories linked directly to each movie.
    SELECT
        mc.movie_id,
        direct.id,
        direct.name,
        direct.parent_id
    FROM movie_categories AS mc
    INNER JOIN categories AS direct
        ON direct.id = mc.category_id
    UNION
    -- Step: climb to the parent category, keeping the originating movie.
    SELECT
        found.movie_id,
        ancestor.id,
        ancestor.name,
        ancestor.parent_id
    FROM categories AS ancestor
    INNER JOIN movie_category_chain AS found
        ON found.parent_id = ancestor.id
),
movie_category_names AS (
    -- One row per movie with all collected category names.
    SELECT
        movie_id,
        array_agg(name) AS array_agg
    FROM movie_category_chain
    GROUP BY movie_id
)
SELECT
    agg.movie_id,
    movies.name,
    agg.array_agg
FROM movie_category_names AS agg
INNER JOIN movies
    ON movies.id = agg.movie_id
ORDER BY agg.movie_id
LIMIT 5 OFFSET 51;
///////////////////////////////////
///// New movies table ! ////////
/////////////////////////////////
-- Copy of movies with an extra array column that will hold category names.
CREATE TABLE movies_array (LIKE movies INCLUDING ALL);
INSERT INTO movies_array SELECT * FROM movies;
ALTER TABLE movies_array ADD COLUMN categories varchar(64)[];
-- Fill the array column inside a transaction and peek at the result before committing.
BEGIN;
WITH RECURSIVE movie_category_chain AS (
    -- Anchor: categories linked directly to each movie.
    SELECT
        mc.movie_id,
        direct.id,
        direct.name,
        direct.parent_id
    FROM movie_categories AS mc
    INNER JOIN categories AS direct
        ON direct.id = mc.category_id
    UNION
    -- Step: add each collected category's parent, keeping the originating movie.
    SELECT
        found.movie_id,
        ancestor.id,
        ancestor.name,
        ancestor.parent_id
    FROM categories AS ancestor
    INNER JOIN movie_category_chain AS found
        ON found.parent_id = ancestor.id
),
movie_category_names AS (
    SELECT
        movie_id,
        array_agg(name) AS category_names
    FROM movie_category_chain
    GROUP BY movie_id
)
UPDATE movies_array AS mar
SET categories = agg.category_names
FROM movie_category_names AS agg
WHERE agg.movie_id = mar.id;
SELECT
    id,
    name,
    categories
FROM movies_array
ORDER BY id
LIMIT 15;
COMMIT;
-- GIN index on the array column, used by the array operator queries that follow.
CREATE INDEX ON movies_array USING GIN (categories);
ANALYZE movies_array;
///////////////////////////////////
///////// Now using it //////////
/////////////////////////////////
-- Movies carrying all three categories: one containment test per category.
SELECT
    id,
    name
FROM movies_array
WHERE categories @> '{"Thriller"}'::varchar(64)[]
    AND categories @> '{"Nominated for Academy Award"}'::varchar(64)[]
    AND categories @> '{"Justice Drama"}'::varchar(64)[]
ORDER BY id;
-- The same filter as a single containment test against a three-element array.
SELECT
    id,
    name
FROM movies_array
WHERE categories @> '{"Justice Drama", "Nominated for Academy Award", "Thriller"}'::varchar(64)[]
ORDER BY id;
-- Plan for the single-containment form.
EXPLAIN
SELECT
    id,
    name
FROM movies_array
WHERE categories @> '{"Justice Drama", "Nominated for Academy Award", "Thriller"}'::varchar(64)[]
ORDER BY id;
-- "Related" movies: && (overlap) keeps rows sharing at least one listed category;
-- vary the list to widen or narrow the match.
SELECT
    id,
    name
FROM movies_array
WHERE categories && '{"Bleak","Justice Drama"}'::varchar(64)[]
ORDER BY id;
-- Movies without any category are still easy to find: their array is NULL.
SELECT
    id,
    name
FROM movies_array
WHERE categories IS NULL;
////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////// LTREE //////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////
/////// New category table ///////
/////////////////////////////////
-- Enable ltree and create a working copy of categories with two extra columns:
-- name_sanitized (label text) and name_tree (the ltree value), both filled in later.
CREATE EXTENSION ltree;
CREATE TABLE categories_ltree (LIKE categories INCLUDING ALL);
INSERT INTO categories_ltree SELECT * FROM categories;
ALTER TABLE categories_ltree
    ADD COLUMN name_sanitized text,
    ADD COLUMN name_tree ltree;
///////////////////////////////////
//// Sanitizing category name ////
/////////////////////////////////
// ltree is not compatible because of symbols:
-- Try converting raw category names to ltree; names containing symbols or spaces
-- (second statement: '&') are expected to be rejected as invalid ltree labels.
SELECT
    id,
    name,
    text2ltree(name)
FROM categories_ltree
ORDER BY id;
SELECT text2ltree('Apocalypse & Post-Apocalypse');
-- Extract the runs of letters, digits and underscores from each name (one row per run).
SELECT
    name,
    regexp_matches(name, '[a-zA-Z0-9_]+', 'g')
FROM categories
ORDER BY name;
-- Same idea with \w+, plus the position of each word inside its name.
SELECT
    c.name,
    unnest(w.match),
    w.sort_order
FROM categories AS c
CROSS JOIN LATERAL regexp_matches(c.name, '(\w+)', 'g') WITH ORDINALITY AS w (match, sort_order)
ORDER BY c.name;
-- Preview: rebuild each category name as its words joined with '_'.
-- The word order is fixed with ORDER BY inside string_agg: the order of rows
-- coming out of a sorted subquery is not guaranteed to survive aggregation.
-- The pattern has a single capture group, so match[1] is the matched word.
SELECT
    c.name,
    string_agg(w.match[1], '_' ORDER BY w.sort_order)
FROM categories AS c
CROSS JOIN LATERAL regexp_matches(c.name, '(\w+)', 'g') WITH ORDINALITY AS w (match, sort_order)
GROUP BY c.name
ORDER BY c.name;
-- Now sanitize every category name and store the result in categories_ltree.name_sanitized.
WITH cte_sanitized AS (
    -- One sanitized label per category id, words kept in their original order.
    SELECT
        c.id,
        string_agg(w.match[1], '_' ORDER BY w.sort_order) AS name
    FROM categories AS c
    CROSS JOIN LATERAL regexp_matches(c.name, '(\w+)', 'g') WITH ORDINALITY AS w (match, sort_order)
    GROUP BY c.id
)
UPDATE categories_ltree AS ctg_ltree
SET name_sanitized = cte_sanitized.name
FROM cte_sanitized
WHERE cte_sanitized.id = ctg_ltree.id;
// ltree is now compatible
-- Sanity check: convert the sanitized names to ltree for the first ten categories.
SELECT
    id,
    name,
    name_sanitized,
    text2ltree(name_sanitized)
FROM categories_ltree
ORDER BY id
LIMIT 10;
///////////////////////////////////
/////// Ltree categories ////////
/////////////////////////////////
// Setting proper tree on categories
-- Build each category's dotted path from the top of its tree down to itself
-- and store it in name_tree as an ltree value.
WITH RECURSIVE category_path AS (
    -- Anchor: categories without a parent start a path with their own label.
    SELECT
        top.id,
        top.name_sanitized AS tree,
        top.parent_id
    FROM categories_ltree AS top
    WHERE top.parent_id IS NULL
    UNION
    -- Step: append a child's label to the path of its parent.
    SELECT
        child.id,
        up.tree || '.' || child.name_sanitized AS tree,
        child.parent_id
    FROM categories_ltree AS child
    INNER JOIN category_path AS up
        ON up.id = child.parent_id
)
UPDATE categories_ltree AS ctg_ltree
SET name_tree = text2ltree(category_path.tree)
FROM category_path
WHERE category_path.id = ctg_ltree.id;
-- GiST index on the path column, then refresh statistics.
CREATE INDEX ON categories_ltree USING GIST (name_tree);
ANALYZE categories_ltree;
-- ltxtquery match: categories whose path contains any of the three labels.
SELECT *
FROM categories_ltree
WHERE name_tree @ 'Justice_Drama | Thriller | Nominated_for_Academy_Award';
///////////////////////////////////
/// Adding LTREEs into movies ///
/////////////////////////////////
-- Copy of movies with an ltree[] column holding the paths of the movie's categories.
CREATE TABLE movies_arrayltree (LIKE movies INCLUDING ALL);
INSERT INTO movies_arrayltree SELECT * FROM movies;
ALTER TABLE movies_arrayltree ADD COLUMN categories ltree[];
-- Only the directly linked categories are aggregated; no recursion is used here.
WITH movie_paths AS (
    SELECT
        mc.movie_id,
        array_agg(c.name_tree) AS paths
    FROM movie_categories AS mc
    INNER JOIN categories_ltree AS c
        ON c.id = mc.category_id
    GROUP BY mc.movie_id
)
UPDATE movies_arrayltree AS mar
SET categories = movie_paths.paths
FROM movie_paths
WHERE movie_paths.movie_id = mar.id;
-- GiST index on the ltree[] column, then refresh statistics.
CREATE INDEX ON movies_arrayltree USING GIST (categories);
ANALYZE movies_arrayltree;
-- Movies having, for each of the three labels, at least one category path containing it.
SELECT
    id,
    name
FROM movies_arrayltree
WHERE categories @ 'Justice_Drama'
    AND categories @ 'Nominated_for_Academy_Award'
    AND categories @ 'Thriller'
ORDER BY id;
-- Plan of the same query.
EXPLAIN
SELECT
    id,
    name
FROM movies_arrayltree
WHERE categories @ 'Justice_Drama'
    AND categories @ 'Nominated_for_Academy_Award'
    AND categories @ 'Thriller'
ORDER BY id;
-- Variant using '!' in the ltxtquery patterns to exclude specific subcategory labels.
SELECT
    id,
    name
FROM movies_arrayltree
WHERE categories @ 'Justice_Drama & !Courtroom_Drama'
    AND categories @ 'Nominated_for_Academy_Award'
    AND categories @ 'Thriller & !Conspiracy_Thriller & !Political_Thriller'
ORDER BY id;