Skip to content
Find file
Fetching contributors…
Cannot retrieve contributors at this time
345 lines (333 sloc) 9.29 KB
# encoding: UTF-8
require 'set'
require 'algorithms'
# mod for doing stuff w/ abbrevs
module Abbrev
# a list of abbreviations culled, at least in part, from the USPS
# official list https://www.usps.com/send/official-abbreviations.htm#2
# this might be too country-specific, but it's a start.
classes = [
["alley", "aly"],
["and", "&"],
["annex", "anx"],
["arcade", "arc"],
["avenue", "ave"],
["beach", "bch"],
["burg", "bg"],
["bluff", "blf"],
["boulevard", "blvd"],
["bend", "bnd"],
["branch", "br"],
["bridge", "brg"],
["brook", "brk"],
["bottom", "btm"],
["bayoo", "byu"],
["circle", "cir"],
["club", "clb"],
["cliff", "clf"],
["common", "cmn"],
["corner", "cor"],
["camp", "cp"],
["cape", "cpe"],
["crescent", "cr","cres"],
["creek", "crk"],
["course", "crse"],
["crest", "crst"],
["causeway", "cswy"],
["court", "ct"],
["center", "ctr"],
["curve", "curv"],
["cove", "cv"],
["canyon", "cyn"],
["dale", "dl"],
["dam", "dm"],
["drive", "dr"],
["doctor", "dr"],
["divide", "dv"],
["east", "e"],
["estate", "est"],
["expressway", "expy"],
["extension", "ext"],
["field", "fld"],
["flat", "flt"],
["ford", "frd"],
["forge", "frg"],
["fork", "frk"],
["forest", "frst"],
["ferry", "fry"],
["fort", "ft"],
["freeway", "fwy"],
["garden", "gdn"],
["glen", "gln"],
["green", "grn"],
["grove", "grv"],
["gateway", "gtwy"],
["harbor", "hbr"],
["hill", "hl"],
["hollow", "holw"],
["haven", "hvn"],
["highway", "hwy"],
["inlet", "inlt"],
["island", "is"],
["junction", "jct"],
["knoll", "knl"],
["key", "ky"],
["lock", "lck"],
["lodge", "ldg"],
["loaf", "lf"],
["light", "lgt"],
["lake", "lk"],
["lane", "ln"],
["landing", "lndg"],
["meadow", "mdw"],
["mill", "ml"],
["manor", "mnr"],
["mission", "msn"],
["mount", "mt"],
["mountain", "mtn"],
["motorway", "mtwy"],
["neck", "nck"],
["north", "n"],
["orchard", "orch"],
["parkway", "pkwy"],
["place", "pl"],
["plain", "pln"],
["plaza", "plz"],
["pine", "pne"],
["prairie", "pr"],
["port", "prt"],
["passage", "psge"],
["point", "pt"],
["radial", "radl"],
["road", "rd"],
["ridge", "rdg"],
["river", "riv"],
["ranch", "rnch"],
["row", "row"],
["rapid", "rpd"],
["rest", "rst"],
["route", "rte"],
["shoal", "shl"],
["shore", "shr"],
["skyway", "skwy"],
["south", "s"],
["summit", "smt"],
["spring", "spg"],
["square", "sq"],
["street", "st"],
["station", "sta"],
["stravenue", "stra"],
["stream", "strm"],
["terrace", "ter"],
["turnpike", "tpke"],
["track", "trak"],
["trace", "trce"],
["trafficway", "trfy"],
["trail", "trl"],
["throughway", "trwy"],
["tunnel", "tunl"],
["union", "un"],
["viaduct", "via"],
["vista", "vis"],
["ville", "vl"],
["village", "vlg"],
["valley", "vly"],
["view", "vw"],
["way", "wy"],
["well", "wl"],
["west", "w"],
["crossing", "xing"],
["crossroad", "xrd"],
# Russian abbreviations
# Copyright (C) 2011-2012 Dmitry Marakasov
# from https://github.com/AMDmi3/streetmangler/blob/master/lib/locales/ru.cc#L27
["улица", "ул"],
["площадь", "пл"],
["переулок", "пер", "пер-к"],
["проезд", "пр-д"],
["шоссе", "ш"],
["бульвар", "бул", "б-р"],
["тупик", "туп"],
["набережная", "наб"],
["проспект", "просп", "пр-кт", "пр-т"],
["тракт", "тр-т", "тр"],
["эстакада", "эст"],
["район", "р-н"],
["микрорайон", "мкр-н", "мк-н", "мкр", "мкрн"],
["посёлок", "поселок", "пос"],
["деревня", "дер", "д"],
["квартал", "кв-л", "кв"],
# German abbreviations
["anschlussstelle", "as"],
["an", "a"],
["bahnhof", "bf"],
["bei", "b"],
["bürgermeister", "bgm"],
["der", "d"],
["den", "d"],
["dem", "d"],
["evangelische", "ev", "evang"],
["evangelischer", "ev", "evang"],
["evangelisches", "ev", "evang"],
["evangelisch", "ev", "evang"],
["fachhochschule", "fh"],
["gasse", "g"],
["gemeinschaft", "gem"],
["gemeinschafts", "gem"],
["georg", "gg"],
["groß", "gr"],
["große", "gr"],
["großer", "gr"],
["großes", "gr"],
["grundschule", "gs"],
["gymnasium", "gym", "gymn"],
["hauptbahnhof", "hbf"],
["hauptschule", "hs"],
["hochschule", "hs"],
["in", "i"],
["johann", "joh"],
["johannes", "joh"],
["katholische", "kath"],
["katholischer", "kath"],
["katholisches", "kath"],
["katholisch", "kath"],
["kindergarten", "kiga"],
["kindertagesstätte", "kita"],
["klein", "kl"],
["kleine", "kl"],
["kleiner", "kl"],
["kleines", "kl"],
["krankenhaus", "kh", "krkh", "krh", "krhs"],
["obere", "ob"],
["oberer", "ob"],
["oberes", "ob"],
["platz", "pl"],
["realschule", "rs"],
["römisch", "röm"],
["samtgemeinde", "sg"],
["sankt", "st"],
["sebastian", "seb"],
["straße", "str"],
["und", "u","&"],
["universität", "uni"],
["unterer", "unt","u"],
["unteres", "unt","u"],
["untere", "unt","u"],
["unter", "u"],
["vom", "v"],
["von", "v"],
["weg", "wg"],
["zur", "z"],
["zum", "z"],
["zu", "z"],
# Swiss German
["strasse", "str"],
#other
["ulica","u"],
#compass
# of course, this is horribly english-specific...
# but how would one expand this in a sensible fashion to
# cover other languages?
["n", "north"],
["e", "east"],
["s", "south"],
["w", "west"],
# german
["n", "nord"],
["o", "ost"],
["s", "süd"],
]
#build substitution rules out of classes (plz do this only once per redactionbot-start)
@@rules = Hash.new(Set.new)
for clazz in classes
for elem in clazz
@@rules[elem] = @@rules[elem] | ((Set.new clazz) - (Set.new [elem]))
end
end
#special rules like kill spaces, dashes and dots
@@rules[' '] = @@rules[elem] | Set.new([' ', '', '-', '.', '. '])
@@rules['-'] = @@rules[elem] | Set.new(['-', ' ', ''])
@@rules['.'] = @@rules[elem] | Set.new(['.', ' ', ''])
# function for expanding a string into a list of strings
def self.manglenext(heap, manglerules, target)
if !heap.empty?()
#remove the best unvisited word from queue and mangle it
wordstart, wordend = heap.next!()
#call every rule
for rule in manglerules
#and try to use it
if wordend.size() < rule.size() # TODO: BENCHMARKTEST NEEDED
next
end
#execute rule (just split once!!)
newsplit = []
if rule == ' '
newsplit = wordend.split(/ /,2)
else
newsplit = wordend.split(rule,2)
end
# if rule doesn't apply len != 2
if newsplit.size() == 2
for substitute in @@rules[rule]
newwordstart = wordstart + newsplit[0] + substitute
newwordend = newsplit[1]
#everything in wordstart have to match targets first characters
if target.start_with?(newwordstart)
#if we found our string we're happy
if target == newwordstart + newwordend
return true
end
heap.push([newwordstart,newwordend],newwordstart.size())
#to avoid loops with insert space (and insert special rule ' ')
if rule != ' '
newwordspaceend = ' ' + newwordend
#if we found our string we're happy
if target == newwordstart + newwordspaceend
return true
end
heap.push([newwordstart,newwordspaceend],newwordstart.size()) # insert space
end
end
end
end
end
end
return false
end
# TODO: may need some work for internationalisation
def self.equal_expansions(a, b)
input1 = a.downcase() + ' '
input2 = b.downcase() + ' '
if input1 == input2
#shortcut if string a matches string b
return true
end
# filter rules? maybe if words are long enough / dont remove special rules
forwardrules = [] # TODO Benchmark needed
backwardrules = []
for rule in @@rules.keys()
if rule == ' ' or rule == '-' or rule == '.' or input1.include? rule
forwardrules.push(rule)
end
end
for rule in @@rules.keys()
if rule == ' ' or rule == '-' or rule == '.' or input2.include? rule
backwardrules.push(rule)
end
end
#init toextend (priorityqueue)
extendforwpq = Containers::PriorityQueue.new
extendforwpq.push(['',input1],0)
extendbackwpq = Containers::PriorityQueue.new
extendbackwpq.push(['',input2],0)
until extendforwpq.empty?() and extendbackwpq.empty?()
if manglenext(extendforwpq, forwardrules, input2)
return true
end
if manglenext(extendbackwpq, backwardrules, input1)
return true
end
end
return false
end
end
Jump to Line
Something went wrong with that request. Please try again.