-
Notifications
You must be signed in to change notification settings - Fork 1
/
inclass.h
175 lines (163 loc) · 7.83 KB
/
inclass.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
/**ADDRESS_STANDARDIZER***************************************************
*
* Address Standardizer
* A collection of C++ classes for parsing street addresses
* and standardizing them for the purpose of Geocoding.
*
* Copyright 2016 Stephen Woodbridge <woodbri@imaptools.com>
*
* This is free software; you can redistribute and/or modify it under
* the terms of the MIT License. Please see file LICENSE for details.
*
***************************************************ADDRESS_STANDARDIZER**/
#ifndef INCLASS_H
#define INCLASS_H
#include <map>
#include <set>
#include <string>
class InClass {
public:

    /// Input token classes. The numeric values are fixed explicitly;
    /// note that 4 and 5 are not assigned to any enumerator.
    enum Type {
        STOP     = -1,  ///< Undefined token.
        NUMBER   = 0,   ///< A string of digits.
        WORD     = 1,   ///< A string of letters of arbitrary length. A single letter can be both a SINGLE and a WORD.
        TYPE     = 2,   ///< Words and abbreviations used to denote street types, e.g. 'ST' or 'AVE'.
        QUALIF   = 3,   ///< Words that qualify a street name, such as 'OLD', 'EXTENSION', 'BUSINESS', 'ALTERNATE', etc.
        ROAD     = 6,   ///< Words and abbreviations used to denote highways and roads, e.g. the 'Interstate' in 'Interstate 5'.
        STOPWORD = 7,   ///< A word with low lexical significance that can be omitted in parsing, e.g. 'THE'. STOPWORDs combine with WORDs: in rules, a string of multiple WORDs and STOPWORDs is represented by a single WORD token.
        RR       = 8,   ///< Words and abbreviations used to denote rural routes, e.g. 'RR'.
        DASH     = 9,   ///< A hyphen.
        CITY     = 10,  ///< City name.
        PROV     = 11,  ///< State, province, or region name or abbreviation.
        NATION   = 12,  ///< Nation name or abbreviation.
        AMPERS   = 13,  ///< The ampersand (&), frequently used to abbreviate the word 'and'.
        BOXH     = 14,  ///< Words used to denote post office boxes, e.g. 'Box' or 'PO Box'.
        ORD      = 15,  ///< Representations such as First or 1st. Often used in street names and often standardized as numbers.
        UNITH    = 16,  ///< Words and abbreviations used to denote internal subaddresses, e.g. 'APT' or 'UNIT'.
        UNITT    = 17,  ///< Which UNIT is being described.
        SINGLE   = 18,  ///< A single letter.
        BUILDH   = 19,  ///< Words used to denote buildings or building complexes, usually as a prefix, e.g. 'Tower' in 'Tower 7A'.
        MILE     = 20,  ///< Words used to denote milepost addresses.
        DOUBLE   = 21,  ///< A sequence of two letters. Often used as identifiers.
        DIRECT   = 22,  ///< Words used to denote directions, e.g. 'North'. These should be standardized to the full word, not the abbreviation.
        MIXED    = 23,  ///< An alphanumeric string containing both letters and digits. Used for identifiers.
        BUILDT   = 24,  ///< Words and abbreviations used to denote buildings or building complexes, usually as a suffix, e.g. 'Shopping Centre'.
        FRACT    = 25,  ///< Fractions, sometimes used in civic numbers or unit numbers.
        PCT      = 26,  ///< A 3 character sequence of number letter number. Identifies an LDU, the last 3 characters of a Canadian postal code.
        PCH      = 27,  ///< A 3 character sequence of letter number letter. Identifies an FSA, the first 3 characters of a Canadian postal code.
        QUINT    = 28,  ///< A 5 digit number. Identifies a Zip Code.
        QUAD     = 29,  ///< A 4 digit number. Identifies ZIP4.
        PUNCT    = 30,  ///< Punctuation character(s).
        SPACE    = 31,  ///< Whitespace.
        PLACEN   = 32,  ///< Placename keyword.
        EMDASH   = 33,  ///< Token used to identify where a word was split by the tokenizer.
        SLASH    = 34,  ///< A slash '/' token as in 1/a or 1/ab, but not a FRACT.
        ATSIGN   = 35,  ///< An at sign '@' token, indicating an intersection.
        COMMA    = 36,  ///< A comma ',' token, commonly used to separate parts of the address.
        BADTOKEN = 99   ///< A bad token type.
    };

    /// Language identifiers. UNKNOWN is -1; the remaining enumerators are
    /// numbered consecutively starting from ALB = 0.
    enum Lang {
        UNKNOWN = -1,   ///< Unknown or undefined
        ALB = 0,        ///< Albanian
        ARA,            ///< Arabic
        BAQ,            ///< Basque
        BEL,            ///< Belarusian
        BET,            ///< Belarusian transcribed
        BEX,            ///< Belarusian transliterated
        BOS,            ///< Bosnian
        BOX,            ///< Bosnian transliterated
        BUL,            ///< Bulgarian
        BUT,            ///< Bulgarian transcribed
        BUX,            ///< Bulgarian transliterated
        CAT,            ///< Catalan
        CHI,            ///< Chinese (modern)
        CHT,            ///< Chinese (traditional)
        CZE,            ///< Czech
        CZX,            ///< Czech transliterated
        DAN,            ///< Danish
        DUT,            ///< Dutch
        ENG,            ///< English
        EST,            ///< Estonian
        ESX,            ///< Estonian transliterated
        FIN,            ///< Finnish
        FRE,            ///< French
        GER,            ///< German
        GLG,            ///< Galician
        GRE,            ///< Greek
        GRN,            ///< Guarani
        GRT,            ///< Greek transcribed
        GRX,            ///< Greek transliterated
        HEB,            ///< Hebrew
        HUN,            ///< Hungarian
        HUX,            ///< Hungarian transliterated
        IND,            ///< Bahasa Indonesia
        ITA,            ///< Italian
        KOR,            ///< Korean
        KOX,            ///< Korean transliterated
        LAV,            ///< Latvian
        LAX,            ///< Latvian transliterated
        LIT,            ///< Lithuanian
        LIX,            ///< Lithuanian transliterated
        MAC,            ///< Macedonian
        MAT,            ///< Macedonian transcribed
        MAY,            ///< Malaysian
        MLT,            ///< Maltese
        MLX,            ///< Maltese transliterated
        MNE,            ///< Montenegrin
        MNX,            ///< Montenegrin transliterated
        MOL,            ///< Moldovan
        MOX,            ///< Moldovan transliterated
        NOR,            ///< Norwegian
        POL,            ///< Polish
        POR,            ///< Portuguese
        POX,            ///< Polish transliterated
        PYN,            ///< Pinyin
        RMX,            ///< Romanian transliterated
        RST,            ///< Russian transcribed
        RUM,            ///< Romanian
        RUS,            ///< Russian
        RUX,            ///< Russian transliterated
        SCR,            ///< Croatian
        SCT,            ///< Serbian transcribed
        SCX,            ///< Serbian transliterated
        SIX,            ///< Slovenian transliterated
        SLO,            ///< Slovak
        SLV,            ///< Slovenian
        SLX,            ///< Slovak transliterated
        SPA,            ///< Spanish
        SRB,            ///< Serbian
        SRX,            ///< Croatian transliterated
        SWE,            ///< Swedish
        THA,            ///< Thai
        TKT,            ///< Turkish transcribed
        TUR,            ///< Turkish
        TUX,            ///< Turkish transliterated
        UKR,            ///< Ukrainian
        UKT,            ///< Ukrainian transcribed
        UKX,            ///< Ukrainian transliterated
        WEL             ///< Welsh
    };

    /// How a prefix or suffix relates to the word it belongs to.
    enum AttachType {
        DET_PRE = 0,    ///< Detached prefix
        DET_SUF = 1,    ///< Detached suffix
        ATT_PRE = 2,    ///< Attached prefix
        ATT_SUF = 3     ///< Attached suffix
        // DETACHED = can be used in the lexicon to indicate NO attachment
    };

    // ---------------------------------------------------------------
    // All conversion helpers below are only declared in this header;
    // their definitions (and therefore the exact string formats and
    // the handling of unrecognised input) live in the implementation.
    // ---------------------------------------------------------------

    // -- InClass::Type conversions --

    /// String form of a set of token types.
    static std::string asString(const std::set<InClass::Type> &t);
    /// String form of a single token type.
    static std::string asString(const InClass::Type &t);
    /// Set of token types obtained from the string @p s.
    static std::set<InClass::Type> asType(const std::string &s);
    /// Single token type obtained from the string @p s.
    static InClass::Type asOneType(const std::string &s);
    /// Token type obtained from the integer @p i.
    static InClass::Type asType(const int i);

    // -- InClass::AttachType conversions --

    /// String form of a set of attachment types.
    static std::string asString(const std::set<InClass::AttachType> &t);
    /// String form of a single attachment type.
    static std::string asString(const InClass::AttachType &t);
    /// Set of attachment types obtained from the string @p s.
    static std::set<InClass::AttachType> asAttachType(const std::string &s);

    // -- InClass::Lang conversions --

    /// String form of a language identifier.
    /// NOTE(review): presumably the short code (e.g. the enumerator
    /// spelling) as opposed to asName() -- confirm against the implementation.
    static std::string asString(const InClass::Lang &lang);
    /// Name of a language identifier.
    /// NOTE(review): presumably the full language name -- confirm
    /// against the implementation.
    static std::string asName(const InClass::Lang &lang);
    /// Language identifier obtained from the string @p s.
    static InClass::Lang asLang(const std::string &s);
};
#endif