/
url.rb
249 lines (196 loc) · 5.68 KB
/
url.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
# frozen_string_literal: true
require "addressable/idna/pure"
require "addressable/uri"
require "public_suffix"
require "twingly/public_suffix_list"
require "twingly/url/null_url"
require "twingly/url/error"
require "twingly/version"
module Twingly
  # Wraps an Addressable URI together with the PublicSuffix domain parsed from
  # its host, and exposes the individual URL parts plus a normalized form.
  #
  # Instances are created through Twingly::URL.parse (`new` is private).
  # Only URLs with an http or https scheme are accepted.
  class URL
    include Comparable

    # Schemes accepted by the parser (case-insensitive).
    ACCEPTED_SCHEMES = /\Ahttps?\z/i.freeze
    # Public suffix list handed to PublicSuffix.parse.
    # NOTE(review): presumably the standard list extended with punycoded
    # names -- see Twingly::PublicSuffixList to confirm.
    CUSTOM_PSL       = PublicSuffixList.with_punycoded_names
    # One or more slashes at the very end of the string. Anchored with \z
    # rather than $, because $ also matches before a line break and would
    # strip slashes in the middle of a path containing a newline.
    ENDS_WITH_SLASH  = /\/+\z/.freeze
    STARTS_WITH_WWW  = /\Awww\./i.freeze
    # Third-party errors that get tagged with Twingly::URL::Error when they
    # escape from internal_parse.
    ERRORS_TO_EXTEND = [
      Addressable::IDNA::PunycodeBigOutput,
      Addressable::URI::InvalidURIError,
      PublicSuffix::DomainInvalid,
    ].freeze

    CARRIAGE_RETURN  = "\u000D"
    LINE_FEED        = "\u000A"
    NBSP             = "\u00A0"
    SPACE            = "\u0020"
    # All characters treated as whitespace, joined into one string so it can
    # be interpolated into a regexp character class.
    WHITESPACE_CHARS = [
      CARRIAGE_RETURN,
      LINE_FEED,
      NBSP,
      SPACE,
    ].join.freeze
    LEADING_AND_TRAILING_WHITESPACE =
      /\A[#{WHITESPACE_CHARS}]+|[#{WHITESPACE_CHARS}]+\z/.freeze

    # NOTE(review): CARRIAGE_RETURN and LINE_FEED are not marked private,
    # unlike NBSP and SPACE. They are left public here so that any external
    # reference keeps working.
    private_constant :ACCEPTED_SCHEMES
    private_constant :CUSTOM_PSL
    private_constant :STARTS_WITH_WWW
    private_constant :ENDS_WITH_SLASH
    private_constant :ERRORS_TO_EXTEND
    private_constant :NBSP
    private_constant :SPACE
    private_constant :WHITESPACE_CHARS
    private_constant :LEADING_AND_TRAILING_WHITESPACE

    class << self
      # Parses +potential_url+ into a Twingly::URL.
      #
      # Returns a NullURL when parsing fails with a Twingly::URL::Error
      # (including ParseError). Any other exception is tagged with
      # Twingly::URL::Error and re-raised.
      def parse(potential_url)
        internal_parse(potential_url)
      rescue Twingly::URL::Error, Twingly::URL::Error::ParseError
        NullURL.new
      rescue Exception => error
        # Rescuing Exception is deliberate: nothing is swallowed, the error is
        # only extended with the library's marker module before re-raising.
        error.extend(Twingly::URL::Error)
        raise
      end

      # Does the actual parsing and validation.
      #
      # Raises Twingly::URL::Error::ParseError when the input cannot be parsed,
      # has a scheme other than http/https, cannot be normalized, or has no
      # second-level domain. Errors listed in ERRORS_TO_EXTEND are tagged with
      # Twingly::URL::Error and re-raised.
      def internal_parse(input)
        potential_url   = clean_input(input)
        addressable_uri = Addressable::URI.heuristic_parse(potential_url)
        raise Twingly::URL::Error::ParseError if addressable_uri.nil?

        scheme = addressable_uri.scheme
        raise Twingly::URL::Error::ParseError unless scheme =~ ACCEPTED_SCHEMES

        # URLs that can't be normalized should not be valid
        try_addressable_normalize(addressable_uri)

        host = addressable_uri.host
        public_suffix_domain = PublicSuffix.parse(host, list: CUSTOM_PSL,
                                                        default_rule: nil)
        raise Twingly::URL::Error::ParseError if public_suffix_domain.nil?
        raise Twingly::URL::Error::ParseError if public_suffix_domain.sld.nil?

        new(addressable_uri, public_suffix_domain)
      rescue *ERRORS_TO_EXTEND => error
        error.extend(Twingly::URL::Error)
        raise
      end

      # Converts the input to a String, replaces invalid byte sequences
      # (String#scrub) and strips surrounding whitespace.
      def clean_input(input)
        strip_whitespace(String(input).scrub)
      end

      # Removes leading and trailing whitespace (CR, LF, NBSP, space).
      # Only UTF-8 strings are processed; input in any other encoding is
      # returned unchanged.
      def strip_whitespace(input)
        return input unless input.encoding == Encoding::UTF_8

        input.gsub(LEADING_AND_TRAILING_WHITESPACE, "")
      end

      # Workaround for the following bug in addressable:
      # https://github.com/sporkmonger/addressable/issues/224
      #
      # Turns the "invalid byte sequence in UTF-8" ArgumentError into a
      # ParseError; every other ArgumentError is re-raised untouched.
      def try_addressable_normalize(addressable_uri)
        addressable_uri.normalize
      rescue ArgumentError => error
        raise unless error.message.include?("invalid byte sequence in UTF-8")

        raise Twingly::URL::Error::ParseError
      end

      private :new
      private :internal_parse
      private :clean_input
      private :strip_whitespace
      private :try_addressable_normalize
    end

    # Stores the parsed URI and its public suffix domain.
    # Not callable from outside; use Twingly::URL.parse.
    def initialize(addressable_uri, public_suffix_domain)
      @addressable_uri      = addressable_uri
      @public_suffix_domain = public_suffix_domain
    end

    # The URL scheme as reported by the underlying Addressable URI.
    def scheme
      addressable_uri.scheme
    end

    # Third-level domain (subdomain part). Always a String: a missing trd is
    # converted to "" by to_s.
    def trd
      public_suffix_domain.trd.to_s
    end

    # Second-level domain.
    def sld
      public_suffix_domain.sld
    end

    # Top-level domain (public suffix), e.g. may contain dots.
    def tld
      public_suffix_domain.tld
    end

    # Many ccTLDs have a second level[1] underneath their ccTLD, use this when
    # you don't care about the second level.
    #
    # [1]: https://en.wikipedia.org/wiki/Second-level_domain
    def ttld
      tld.split(".").last
    end

    # Registrable domain as reported by PublicSuffix.
    def domain
      public_suffix_domain.domain
    end

    # Host of the underlying Addressable URI.
    def host
      addressable_uri.host
    end

    # Origin of the underlying Addressable URI.
    def origin
      addressable_uri.origin
    end

    # Path of the underlying Addressable URI.
    def path
      addressable_uri.path
    end

    # String form of the URL with the leading "<scheme>:" removed.
    def without_scheme
      # The scheme is escaped so it is always matched literally.
      to_s.sub(/\A#{Regexp.escape(scheme)}:/, "")
    end

    # Returns a normalized copy of this URL: downcased scheme, normalized
    # host (see #normalized_host) and path without trailing slashes.
    #
    # The copy is re-parsed through .parse, so the result is whatever .parse
    # returns for the normalized URI.
    def normalized
      normalized_url        = addressable_uri.dup
      normalized_url.scheme = normalized_scheme
      normalized_url.host   = normalized_host
      normalized_url.path   = normalized_path

      self.class.parse(normalized_url)
    end

    # Scheme in lower case.
    def normalized_scheme
      scheme.downcase
    end

    # Addressable's normalized host, prefixed with "www." when the domain has
    # no subdomain, and with blogspot hosts rewritten (see normalize_blogspot).
    def normalized_host
      host = addressable_uri.normalized_host
      host = "www.#{host}" unless public_suffix_domain.subdomain?

      normalize_blogspot(host, public_suffix_domain)
    end

    # Path without trailing slashes; "/" when nothing else remains.
    def normalized_path
      path = strip_trailing_slashes(addressable_uri.path)
      path.empty? ? "/" : path
    end

    # Userinfo part as a String ("" when absent).
    def userinfo
      addressable_uri.userinfo.to_s
    end

    # User part as a String ("" when absent).
    def user
      addressable_uri.user.to_s
    end

    # Password part as a String ("" when absent).
    def password
      addressable_uri.password.to_s
    end

    # Always true for a successfully parsed URL.
    def valid?
      true
    end

    # Orders URLs by their string representation (used by Comparable).
    def <=>(other)
      to_s <=> other.to_s
    end

    # True when +other+ is an instance of this class with the same string
    # representation. Compares the strings themselves rather than their hash
    # values, so a hash collision can never make two different URLs eql?.
    def eql?(other)
      other.is_a?(self.class) && to_s == other.to_s
    end

    # Hash of the string representation; consistent with #eql?.
    def hash
      to_s.hash
    end

    # String form of the underlying Addressable URI.
    def to_s
      addressable_uri.to_s
    end

    def inspect
      format("#<%s:0x%x %s>", self.class.name, __id__, to_s)
    end

    private

    attr_reader :addressable_uri, :public_suffix_domain

    # For blogspot domains: drops a leading "www." and replaces the trailing
    # TLD with "com". Other hosts are returned unchanged.
    def normalize_blogspot(host, domain)
      return host unless domain.sld.downcase == "blogspot"

      # The TLD is escaped so that its dots only match literal dots.
      tld_at_end = /#{Regexp.escape(domain.tld)}\z/i
      host.sub(STARTS_WITH_WWW, "").sub(tld_at_end, "com")
    end

    # Removes all slashes at the end of +path+.
    def strip_trailing_slashes(path)
      path.sub(ENDS_WITH_SLASH, "")
    end
  end
end