-
Notifications
You must be signed in to change notification settings - Fork 0
/
urlutil.go
373 lines (347 loc) · 10.1 KB
/
urlutil.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
package urlutil
import (
	"bytes"
	"net"
	"net/url"
	"strings"

	"github.com/zan8in/rawhttp/errorutil"
	"github.com/zan8in/rawhttp/stringsutil"
)
// DisableAutoCorrect disables the parse-time autocorrect heuristic.
// By default, an input like "admin" (no scheme, no dot) would be treated by
// url.Parse as a host even though it is not a valid domain name; when
// autocorrect is enabled (the default), such inputs are reclassified as
// relative paths. Set this to true to keep net/url's literal interpretation.
var DisableAutoCorrect bool
// URL is a wrapper around net/url.URL that keeps the original input string,
// tracks whether the URL is relative, and manages query parameters separately
// (in Params) to avoid net/url's re-serialization of percent-encoded values.
type URL struct {
*url.URL
Original string // original or given url (without params/fragment; they are stripped by fetchParams)
Unsafe bool // If request is unsafe (skip validation)
IsRelative bool // If URL is relative (no host component)
Params Params // Query Parameters, kept out of url.URL to preserve raw encoding
// Callers must invoke Update() after directly mutating the wrapped url.URL
// or Params, so that RawQuery stays in sync.
}
// MergePath merges the given relative path into the URL: query parameters
// from newrelpath are merged into u.Params, the path components are joined
// via mergePaths, and a non-empty fragment replaces the current one.
// An empty newrelpath is a no-op. When unsafe is true, validation of the
// relative path is skipped.
func (u *URL) MergePath(newrelpath string, unsafe bool) error {
	if newrelpath == "" {
		return nil
	}
	parsed, err := ParseRelativePath(newrelpath, unsafe)
	if err != nil {
		return err
	}
	u.Params.Merge(parsed.Params)
	u.Path = mergePaths(u.Path, parsed.Path)
	if parsed.Fragment != "" {
		u.Fragment = parsed.Fragment
	}
	return nil
}
// UpdateRelPath replaces the current path with newrelpath while keeping any
// previously parsed query parameters (MergePath merges rather than clears
// them). When unsafe is true, validation of the new path is skipped.
func (u *URL) UpdateRelPath(newrelpath string, unsafe bool) error {
	u.Path = ""
	return u.MergePath(newrelpath, unsafe)
}
// Update syncs the wrapped url.URL with any changes made to u.Params.
// This is a hot patch for url.URL:
// parameters are serialized when parsed with `url.Parse()`; to avoid this,
// the url is parsed without parameters and RawQuery is assigned directly here
// to force the unserialized (raw-encoded) parameter representation.
func (u *URL) Update() {
u.RawQuery = u.Params.Encode()
}
// Query returns the URL's query parameters. Note: this shadows
// url.URL.Query() and returns the wrapper's Params map instead.
func (u *URL) Query() Params {
return u.Params
}
// Clone returns a deep-ish copy of the URL: the wrapped url.URL struct and
// the Params map are freshly allocated, so mutating the clone's fields or
// adding/removing params does not affect the original.
// NOTE(review): the per-key param copy is shallow — if Params values are
// slices, those backing arrays remain shared between original and clone;
// confirm against the Params type definition.
func (u *URL) Clone() *URL {
var userinfo *url.Userinfo
if u.User != nil {
// url.Userinfo is opaque/immutable, so the only way to copy it is to
// round-trip it through url.Parse with a throwaway host.
tempurl := "https://" + u.User.String() + "@" + "scanme.sh/"
turl, _ := url.Parse(tempurl)
if turl != nil {
userinfo = turl.User
}
}
ux := &url.URL{
Scheme: u.Scheme,
Opaque: u.Opaque,
User: userinfo,
Host: u.Host,
Path: u.Path,
RawPath: u.RawPath,
RawQuery: u.RawQuery,
Fragment: u.Fragment,
// OmitHost: u.OmitHost, // only supported in 1.19
ForceQuery: u.ForceQuery,
RawFragment: u.RawFragment,
}
// copy params key by key into a fresh map
params := make(Params)
if u.Params != nil {
for k, v := range u.Params {
params[k] = v
}
}
return &URL{
URL: ux,
Params: params,
Original: u.Original,
Unsafe: u.Unsafe,
IsRelative: u.IsRelative,
}
}
// String assembles the full URL string from its parts:
// scheme ("scheme://" when present), userinfo ("user@" when present),
// host, and the relative portion (path, query parameters, fragment).
func (u *URL) String() string {
	var sb strings.Builder
	if u.Scheme != "" {
		sb.WriteString(u.Scheme)
		sb.WriteString("://")
	}
	if u.User != nil {
		sb.WriteString(u.User.String())
		sb.WriteByte('@')
	}
	sb.WriteString(u.Host)
	sb.WriteString(u.GetRelativePath())
	return sb.String()
}
// EscapedString returns a string usable as a filename: the host, followed
// (when the path is non-trivial) by "_" and the path with every '/'
// replaced by '_'. Query parameters and fragment are not included.
func (u *URL) EscapedString() string {
	out := u.Host
	if p := u.Path; p != "" && p != "/" {
		out += "_" + strings.ReplaceAll(p, "/", "_")
	}
	return out
}
// GetRelativePath returns the relative portion of the URL, e.g.
// "/some/path?param=true#fragment". A missing leading '/' on the path is
// added; '?' and '#' sections are emitted only when params/fragment exist.
func (u *URL) GetRelativePath() string {
	var sb strings.Builder
	if u.Path != "" {
		if !strings.HasPrefix(u.Path, "/") {
			sb.WriteByte('/')
		}
		sb.WriteString(u.Path)
	}
	if len(u.Params) > 0 {
		sb.WriteByte('?')
		sb.WriteString(u.Params.Encode())
	}
	if u.Fragment != "" {
		sb.WriteByte('#')
		sb.WriteString(u.Fragment)
	}
	return sb.String()
}
// UpdatePort sets the URL's port to newport, replacing any existing port.
// An empty newport is a no-op.
//
// Fix: the previous implementation used
// strings.Replace(u.Host, u.Port(), newport, 1), which replaces the FIRST
// occurrence of the port digits anywhere in the host — for a host like
// "8080.example.com:80" it would corrupt the hostname instead of swapping
// the port. net.JoinHostPort avoids that and also re-brackets IPv6 hosts
// correctly ("[::1]:8080").
func (u *URL) UpdatePort(newport string) {
	if newport == "" {
		return
	}
	u.Host = net.JoinHostPort(u.Hostname(), newport)
}
// TrimPort removes the port (if any) from the URL's host.
//
// Fix: url.URL.Hostname() strips the square brackets from IPv6 literals,
// so the previous implementation turned "[::1]:8080" into the invalid host
// "::1". Restore the brackets when the remaining hostname is an IPv6
// address (contains ':').
func (u *URL) TrimPort() {
	host := u.Hostname()
	if strings.Contains(host, ":") {
		host = "[" + host + "]"
	}
	u.URL.Host = host
}
// parseUnsafeRelativePath reconstructs the relative path from u.Original
// without relying on net/url.URL.
// url.Parse discards %0a and normalizes other percent-encoded characters in
// the path; to avoid this, when the given url has encoded chars the path is
// parsed manually regardless of whether it is unsafe (ex: /%20test%0a).
// fetchParams must already have stripped params/fragment from u.Original.
func (u *URL) parseUnsafeRelativePath() {
// autocorrect: ensure a non-empty path always has a leading '/'
// (deferred so it applies to every return path below)
defer func() {
if !strings.HasPrefix(u.Path, "/") && u.Path != "" {
u.Path = "/" + u.Path
}
}()
// check path integrity:
// url.Parse() normalizes sequences like ../../ — detect such cases and
// revert to the original text
if u.Original != u.Path {
// params and fragments were removed from Original in fetchParams,
// so Original and Path are directly comparable here
u.Path = u.Original
}
// percent encoding in path
// NOTE(review): len(u.Host) < 4 looks like a heuristic for "no plausible
// host present" (shortest real hostnames are ~4 chars) — confirm intent
if u.Host == "" || len(u.Host) < 4 {
if shouldEscape(u.Original) {
u.Path = u.Original
}
return
}
// host is present: everything after the first occurrence of the host in
// the original string is the raw path
expectedPath := strings.SplitN(u.Original, u.Host, 2)
if len(expectedPath) != 2 {
// something went wrong; fail silently and keep the current path
return
}
u.Path = expectedPath[1]
}
// fetchParams extracts the fragment and query string from u.Original,
// decodes the query into u.Params, and trims u.Original down to the bare
// URL (everything before the first '?' / '#'). Finally it calls Update()
// to sync RawQuery with the decoded params.
func (u *URL) fetchParams() {
if u.Params == nil {
u.Params = make(Params)
}
// strip the fragment first (assuming the ?param=value#highlight order,
// i.e. the fragment follows the query string)
if i := strings.IndexRune(u.Original, '#'); i != -1 {
u.Fragment = u.Original[i+1:]
u.Original = u.Original[:i]
}
// no '?' means no query parameters to decode
if index := strings.IndexRune(u.Original, '?'); index == -1 {
return
} else {
encodedParams := u.Original[index+1:]
u.Params.Decode(encodedParams)
u.Original = u.Original[:index]
}
u.Update()
}
// Parse parses inputURL and returns a wrapped *URL.
// It is shorthand for ParseURL(inputURL, false), i.e. safe mode
// (validation is not skipped).
func Parse(inputURL string) (*URL, error) {
return ParseURL(inputURL, false)
}
// ParseURL parses inputURL and returns a wrapped *URL.
// It first strips params/fragment (fetchParams), then classifies the input
// as absolute or relative, falling back through several heuristics:
//  1. leading "/" (but not "//") => relative path
//  2. http/https/"//" prefix or "://" => absolute, parsed by url.Parse
//     (with an unsafe fallback parser when unsafe is true)
//  3. otherwise try parsing as "https://"+input; on failure treat as relative
// When unsafe is true, parse errors on absolute urls are retried with
// parseUnsafeFullURL instead of failing immediately.
func ParseURL(inputURL string, unsafe bool) (*URL, error) {
u := &URL{
URL: &url.URL{},
Original: inputURL,
Unsafe: unsafe,
}
u.fetchParams()
// filter out fragments and parameters, only then parse the path
inputURL = u.Original
if inputURL == "" {
return nil, errorutil.NewWithTag("urlutil", "failed to parse url got empty input")
}
// Note: we consider //scanme.sh as valid (since all browsers accept this <script src="//ajax.googleapis.com/ajax/xx">)
if strings.HasPrefix(inputURL, "/") && !strings.HasPrefix(inputURL, "//") {
// this is definitely a relative path
u.IsRelative = true
u.Path = u.Original
return u, nil
}
// Try to parse host-related input
if stringsutil.HasPrefixAny(inputURL, "http", "https", "//") || strings.Contains(inputURL, "://") {
u.IsRelative = false
urlparse, parseErr := url.Parse(inputURL)
if parseErr != nil {
// for parse errors, in unsafe mode try parsing again with the
// manual fallback parser
if unsafe {
urlparse = parseUnsafeFullURL(inputURL)
if urlparse != nil {
parseErr = nil
}
}
if parseErr != nil {
return nil, errorutil.NewWithErr(parseErr).Msgf("failed to parse url")
}
}
copy(u.URL, urlparse)
} else {
// no recognizable prefix: try to parse it with an https scheme;
// if that fails we consider it a relative path and not a full url
urlparse, parseErr := url.Parse("https://" + inputURL)
if parseErr != nil {
// most likely a relative url
u.IsRelative = true
// TODO: investigate if prefix / should be added
} else {
urlparse.Scheme = "" // remove the scheme we just added
copy(u.URL, urlparse)
}
}
// try parsing path
if !u.IsRelative {
// if parsing is successful, validate and autocorrect:
// ex: when inputURL is "admin", url.Parse() (with the added https://)
// considers "admin" as Host, i.e. https://admin, which is not a
// valid/accepted domain
// TODO: Properly Validate using regex
if u.Host == "" {
// this is an unexpected case; return err
return nil, errorutil.NewWithTag("urlutil", "failed to parse url %v got empty host", inputURL)
}
// TODO: should use a proper regex to validate hostname/ip
// currently domain names without (.) are not considered valid and are
// autocorrected to relative paths, unless DisableAutoCorrect is set
if !strings.Contains(u.Host, ".") && !strings.Contains(u.Host, ":") && u.Host != "localhost" {
// this does not look like a valid domain, ipv4 or ipv6:
// consider it relative
if !DisableAutoCorrect {
u.IsRelative = true
u.Path = inputURL
u.Host = ""
}
}
}
if !u.IsRelative && u.Host == "" {
return nil, errorutil.NewWithTag("urlutil", "failed to parse url `%v`", inputURL).Msgf("got empty host when url is not relative")
}
if u.IsRelative {
// re-parse through the relative-path code path so unsafe handling and
// path autocorrection apply
return ParseRelativePath(inputURL, unsafe)
}
return u, nil
}
// ParseRelativePath parses inputURL as a relative path and returns the
// wrapped *URL (IsRelative is always true). Params/fragment are stripped
// first via fetchParams. In safe mode a url.Parse failure is returned as an
// error; in unsafe mode the raw input becomes the path and manual path
// reconstruction (parseUnsafeRelativePath) is relied upon instead.
func ParseRelativePath(inputURL string, unsafe bool) (*URL, error) {
u := &URL{
URL: &url.URL{},
Original: inputURL,
Unsafe: unsafe,
IsRelative: true,
}
u.fetchParams()
urlparse, parseErr := url.Parse(inputURL)
if parseErr != nil {
if !unsafe {
// should return error if not an unsafe url
return nil, errorutil.NewWithErr(parseErr).WithTag("urlutil").Msgf("failed to parse input url")
} else {
// if unsafe, do not rely on net/url.Parse
u.Path = inputURL
}
}
if urlparse != nil {
copy(u.URL, urlparse)
}
// revert any normalization url.Parse applied to the path
u.parseUnsafeRelativePath()
return u, nil
}
// parseUnsafeFullURL parses invalid (unsafe) urls (ex: https://scanme.sh/%invalid).
// Such urls are not supported per RFC and url.Parse fails on them.
// We only allow unsupported chars in the path; since url.Parse() returns an
// error there isn't any standard way to do this. Current methodology:
//  1. temporarily remove the `//` scheme separator to avoid collisions
//  2. get the first index of `/`, i.e. the path separator (if none, skip any
//     further preprocessing and return nil)
//  3. if found, split the url into base and path
//     (i.e. https://scanme.sh/%invalid => `https://scanme.sh` + `/%invalid`)
//  4. the host part is parsed by net/url and the path is parsed manually
//
// Returns nil when the input cannot be salvaged.
func parseUnsafeFullURL(urlx string) *url.URL {
temp := strings.Replace(urlx, "//", "", 1)
index := strings.IndexRune(temp, '/')
if index == -1 {
return nil
}
// path as found in the original string (TrimSuffix restores it against
// the unmodified input, so the `//` removal above does not leak through)
urlPath := temp[index:]
urlHost := strings.TrimSuffix(urlx, urlPath)
parseURL, parseErr := url.Parse(urlHost)
if parseErr != nil {
return nil
}
// parse the path leniently (unsafe=true) and graft it onto the host url
if relpath, err := ParseRelativePath(urlPath, true); err == nil {
parseURL.Path = relpath.Path
return parseURL
}
return nil
}
// copy copies the parsed components from src to dst; this intentionally
// excludes Fragment and RawQuery, which are managed separately via
// Params/fetchParams.
// NOTE(review): this package-level func shadows the builtin copy() within
// this package — consider renaming (e.g. copyURL) in a follow-up.
func copy(dst *url.URL, src *url.URL) {
dst.Host = src.Host
// dst.OmitHost = src.OmitHost // only supported in 1.19
dst.Opaque = src.Opaque
dst.Path = src.Path
dst.RawPath = src.RawPath
dst.Scheme = src.Scheme
dst.User = src.User
}