Skip to content

Commit

Permalink
Use Nilsimsa to generate locality-sensitive hashes and compare using …
Browse files Browse the repository at this point in the history
…Levenshtein distance
  • Loading branch information
Gargron committed Jun 30, 2019
1 parent fd23b73 commit 320eb3a
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 2 deletions.
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ gem 'idn-ruby', require: 'idn'
gem 'kaminari', '~> 1.1'
gem 'link_header', '~> 0.0'
gem 'mime-types', '~> 3.2', require: 'mime/types/columnar'
# Sourced from a git repository and pinned to one exact commit via `ref:`,
# unlike the neighbouring gems, which use `~>` version constraints.
gem 'nilsimsa', git: 'https://github.com/witgo/nilsimsa', ref: 'fd184883048b922b176939f851338d0a4971a532'
gem 'nokogiri', '~> 1.10'
gem 'nsa', '~> 0.2'
gem 'oj', '~> 3.7'
Expand Down
8 changes: 8 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,13 @@ GIT
specs:
http_parser.rb (0.6.1)

GIT
remote: https://github.com/witgo/nilsimsa
revision: fd184883048b922b176939f851338d0a4971a532
ref: fd18488
specs:
nilsimsa (1.1.2)

GEM
remote: https://rubygems.org/
specs:
Expand Down Expand Up @@ -703,6 +710,7 @@ DEPENDENCIES
microformats (~> 4.1)
mime-types (~> 3.2)
net-ldap (~> 0.10)
nilsimsa!
nokogiri (~> 1.10)
nsa (~> 0.2)
oj (~> 3.7)
Expand Down
28 changes: 26 additions & 2 deletions app/lib/spam_check.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ class SpamCheck
include Redisable
include ActionView::Helpers::TextHelper

LEVENSHTEIN_THRESHOLD = 10

def initialize(status)
@account = status.account
@status = status
Expand All @@ -14,7 +16,8 @@ def skip?
end

# Reports whether this status resembles one already remembered for the account:
# true when any digest stored under "spam_check:<account id>" is fewer than
# LEVENSHTEIN_THRESHOLD edits away from this status's digest.
def spam?
# NOTE(review): this exact-match ZRANK lookup looks like the pre-change line
# retained by a diff view; as written its result is discarded. Confirm it is
# not present in the real file.
!redis.zrank("spam_check:#{@account.id}", digest).nil?
# Read every member of the key (indices 0 through -1) for fuzzy comparison.
other_digests = redis.zrange("spam_check:#{@account.id}", '0', '-1')
# Strict comparison: a distance exactly equal to the threshold is not a match.
other_digests.any? { |other_digest| levenshtein(digest, other_digest) < LEVENSHTEIN_THRESHOLD }
end

def flag!
Expand All @@ -40,7 +43,7 @@ def hashable_text
end

# Returns the hex fingerprint of hashable_text, memoized in @digest so the
# hash is computed at most once per instance.
def digest
# NOTE(review): an MD5 and a Nilsimsa assignment are both visible here, which
# looks like the removed and added lines of a diff shown together. Read
# literally, the second `||=` never runs once the first has set @digest.
# Confirm which single line the real file keeps.
@digest ||= Digest::MD5.hexdigest(hashable_text)
@digest ||= Nilsimsa.new(hashable_text).hexdigest
end

def remove_mentions(text)
Expand Down Expand Up @@ -76,4 +79,25 @@ def already_flagged?
# True when every mention on @status is acceptable, i.e. each one is either
# silent, addressed to a non-local account, or addressed to an account that
# follows @account. Also true when the status has no mentions at all.
def no_unsolicited_mentions?
  @status.mentions.all? do |m|
    # Checks run in this order and stop at the first one that passes.
    next true if m.silent?
    next true unless m.account.local?

    m.account.following?(@account)
  end
end

# Computes the Levenshtein (edit) distance between two strings: the minimum
# number of single-character insertions, deletions and substitutions needed
# to turn +first+ into +second+.
#
# Only the previous row of the dynamic-programming table is kept, so memory
# use grows with +second.length+ instead of with the product of both lengths.
#
# @param first [String]
# @param second [String]
# @return [Integer] the edit distance (0 when the strings are equal)
def levenshtein(first, second)
  return 0 if first == second
  return first.length if second.empty?
  return second.length if first.empty?

  # previous[j] is the distance between the prefix of `first` handled so far
  # and the first j characters of `second`. Row zero is 0, 1, 2, ... because
  # building j characters from an empty prefix takes j insertions.
  previous = (0..second.length).to_a

  first.each_char.with_index(1) do |char_a, i|
    # Column zero: deleting all i characters of the current prefix.
    current = [i]

    second.each_char.with_index(1) do |char_b, j|
      current << if char_a == char_b
                   # Matching characters cost nothing; carry the diagonal over.
                   previous[j - 1]
                 else
                   # Cheapest of deletion, insertion and substitution, plus one edit.
                   [previous[j], current[j - 1], previous[j - 1]].min + 1
                 end
    end

    previous = current
  end

  previous.last
end
end
8 changes: 8 additions & 0 deletions spec/lib/spam_check_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,14 @@ def status_with_html(text)
status2 = status_with_html('@bob Hello')
expect(described_class.new(status2).spam?).to be true
end

it 'returns true for nearly identical statuses with random numbers' do
  source_text = 'Sodium, atomic number 11, was first isolated by Humphry Davy in 1807. A chemical component of salt, he named it Na in honor of the saltiest region on earth, North America.'

  # Remember a first status, then check a second one that differs only in the
  # mentioned user and the trailing four-digit number.
  remembered_status = status_with_html("@alice #{source_text} 1234")
  described_class.new(remembered_status).remember!

  near_duplicate = status_with_html("@bob #{source_text} 9568")
  expect(described_class.new(near_duplicate).spam?).to be true
end
end

describe '#skip?' do
Expand Down

0 comments on commit 320eb3a

Please sign in to comment.