-
Notifications
You must be signed in to change notification settings - Fork 0
/
needle.rb
220 lines (202 loc) · 8.03 KB
/
needle.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
# Full text search is like finding a needle in a hay stack.
#
# Table fields:
#
# string :model_class, :default => '', :null => false
# integer :model_id, :null => false
# string :attribute_name, :default => '', :null => false
# string :content, :default => '', :null => false
# integer :user_id
# boolean :public
#
# Note that for speed neither a real ActiveRecord "belongs_to" association,
# nor timestamp fields (updated_at, created_at), nor validations, are used.
#
class Needle < ActiveRecord::Base
# internally generated from "safe" inputs, so basically everything is
# accessible
attr_accessible :model_class, :model_id, :attribute_name, :content, :user_id,
:public
def self.tokenize string
@@wikitext_parser ||= Wikitext::Parser.new
@@wikitext_parser.fulltext_tokenize string
end
# Options:
# <tt>:user</tt>:: a user model object (if admin finds all records, if nil
# finds only public records, otherwise finds records
# visible to that user)
# <tt>:type</tt>:: either <tt>:or</tt> (the default) to find models which
# feature any of the words in the query string, or
# <tt>:and</tt> to find models which feature all of the
# words in the query string.
def self.find_with_query_string query, options = {}
sql = NeedleQuery.new(query, options).sql
prefetch_models(sql ? Needle.find_by_sql(sql) : [])
end
private
# Normally a Needle.find_by_sql query would return an array of Needle objects
# like this:
#
# [#<Needle model_class: "Post", model_id: 13>]
#
# To actual get any useful information out of this array would require
# an additional database query per result, so it could get quite expensive
# (the typical "N + 1 SELECT" problem).
#
# Here we try to minimize the number of additional queries by "prefetching"
# the models in groups. Just say we had 5 articles, 5 issues, 5 posts and 5
# topics, we would get all 20 objects using 4 queries (1 per model type) and
# return them in an array.
#
def self.prefetch_models models
model_cache = Hash.new { |hash, key| hash[key] = {} }
models.each { |model| model_cache[model.model_class][model.model_id] = nil }
model_cache.each do |model_class, ids|
results = model_class.constantize.where(:id => ids.keys).each do |result|
model_cache[model_class][result.id] = result
end
if ids.length > results.length
self.logger.warn \
"warning: expected #{ids.length} #{model_class} instance(s) but found only #{results.length} (search index out of date)"
end
end
models.collect { |model| model_cache[model.model_class][model.model_id] }
end
class NeedleQuery
AND_QUERY_LIMIT = 5 # AND queries are slow and heavy because they require multiple joins
OR_QUERY_LIMIT = 10 # OR queries are much faster, so allow up to ten components
ROW_LIMIT = 20 # will try to fetch twenty rows at a time
attr_reader :clauses
def initialize query, options = {}
defaults = { :type => :or, :user => nil, :offset => 0 }
@query = query
@options = defaults.merge(options)
@columns = 'model_class, model_id, COUNT(*) AS count'
# preprocessing
@ignored = [] # TODO: report back ignored words (in errors?); these will get shown in the flash
prepare_clauses
end
def sql
clause_count = @clauses.length
if clause_count == 0
nil
elsif clause_count == 1 || @options[:type] == :or
sql_for_OR_query
elsif @options[:type] == :and
sql_for_AND_query
else
raise 'unrecognized type'
end
end
# Given a query string like: "title:hello here there", and user with id 1, we want to produce a query like:
#
# SELECT model_class, model_id, COUNT(*) AS count FROM needles
# JOIN (SELECT model_class, model_id FROM needles WHERE content = 'here') AS sub1
# USING (model_class, model_id)
# JOIN (SELECT model_class, model_id FROM needles WHERE content = 'are') AS sub2
# USING (model_class, model_id)
# WHERE content = 'hello' AND attribute_name = 'title'
# AND (user_id = 1 OR public = TRUE OR public IS NULL)
# GROUP BY model_class, model_id
# ORDER BY count DESC;
#
def sql_for_AND_query
sql = self.base_query
@columns = 'model_class, model_id'
count = 1
first = true
pending = nil
@clauses[0...AND_QUERY_LIMIT].each do |clause|
if first
pending = clause
first = false
next
end
sql << " JOIN (#{self.base_query} WHERE #{clause}) AS sub#{count} USING (model_class, model_id)"
count += 1
end
sql << " WHERE #{pending}"
sql << " AND #{self.user_constraint}" unless self.user_constraint.blank?
sql << " #{self.group_by} #{self.order_by} #{self.limit}"
end
# Given a query string like: "title:hello here there", and user with id 1, we want to produce a query like:
#
# SELECT model_class, model_id, COUNT(*) AS count
# FROM needles
# WHERE ((content = 'hello' AND attribute_name = 'title') -- first criterion
# OR (content = 'here') -- second criterion
# OR (content = 'there')) -- third criterion
# AND (user_id = 1 OR public = TRUE OR public IS NULL) -- user constraint
# GROUP BY model_class, model_id
# ORDER BY count DESC;
#
# Which will yield a result like this:
#
# +-------------+----------+-------+
# | model_class | model_id | count |
# +-------------+----------+-------+
# | article | 1 | 3 |
# | article | 3 | 2 |
# +-------------+----------+-------+
#
def sql_for_OR_query
sql = "#{base_query} WHERE ("
sql << @clauses[0...OR_QUERY_LIMIT].join(" OR ")
sql << ")"
sql << " AND #{self.user_constraint}" unless self.user_constraint.blank?
sql << " #{self.group_by} #{self.order_by} #{self.limit}"
end
def base_query
"SELECT #{@columns} FROM needles"
end
def group_by
'GROUP BY model_class, model_id'
end
def order_by
'ORDER BY count DESC'
end
def limit
# we try to retrieve one more row than we actually intend to use
# this is how we detect when to display a "more" link in the search results
"LIMIT #{@options[:offset].to_i}, #{ROW_LIMIT + 1}"
end
def user_constraint
if @options[:user].nil? # no user: public records only
'(public = TRUE OR public IS NULL)'
elsif @options[:user].superuser? # admin user: no constraint
''
else # normal user: user's own records plus public records
"(user_id = #{@options[:user].id} OR public = TRUE OR public IS NULL)"
end
end
def tokenize_and_sanitize_clause attribute_name, content
Needle.tokenize(content).collect do |token|
unless attribute_name.blank?
conditions = ["content = '%s' AND attribute_name = '%s'", token, attribute_name]
else
conditions = ["content = '%s'", token]
end
# use an Array rather than a hash, as we need to control order
# so as to make optimal use of the index
Needle.send :sanitize_sql_for_conditions, conditions
end
end
def prepare_clauses
@clauses = []
@query.split(/\s+/).each do |clause|
attribute_name, content = '', ''
if index = clause.index(':')
if index > 0
attribute_name = clause[0..index - 1]
word = clause[index + 1..-1]
unless word.blank? or attribute_name =~ /\A(https?|ftp|svn|mailto)\z/i
@clauses.push *tokenize_and_sanitize_clause(attribute_name, word)
next
end
end
end
@clauses.push *tokenize_and_sanitize_clause(nil, clause) # fallback case: no valid attribute supplied
end
end
end # class NeedleQuery
end