Skip to content

Commit

Permalink
Merge pull request #76 from torden/feature/entitydecode
Browse files Browse the repository at this point in the history
added DecodeURLEncoded,DecodeUnicodeEntities
  • Loading branch information
torden committed Jun 28, 2018
2 parents 2355505 + 2f58fc8 commit 1254ca1
Show file tree
Hide file tree
Showing 8 changed files with 325 additions and 44 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ strictlint: setup
## Run Go Test with Data Race Detection
testassert: clean
@$(CMD_ECHO) -e "\033[1;40;32mRun Go Test.\033[01;m\x1b[0m"
@$(CMD_GO) test -tags unittest -v -test.parallel 4 -race -run Test_strutils_Assert*
@$(CMD_GO) test -v -test.parallel 4 -race -run Test_strutils_Assert*
@$(CMD_ECHO) -e "\033[1;40;36mGenerated a report of data race detection in $(PATH_REPORT)/doc/$(PATH_RACE_REPORT).pid\033[01;m\x1b[0m"
@$(CMD_ECHO) -e "\033[1;40;36mDone\033[01;m\x1b[0m"

Expand Down
73 changes: 72 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ README.md haven't contain all the examples. Please refer to the the XXXtest.go f
- [HumanByteSize](#humanbytesize)
- [HumanFileSize](#humanfilesize)
- [AnyCompare](#anycompare)
- [DecodeUnicodeEntities](#decodeunicodeentities)
- [DecodeURLEncoded](#decodeurlencoded)
- [StripTags](#striptags)
- [ConvertToStr](#converttostr)
- [ReverseStr](#reversestr)
Expand Down Expand Up @@ -462,9 +464,78 @@ Return : false
Error : different value : (obj1[A][name][first][last][F][name][first] := 1) != (obj2[A][name][first][last][F][name][first] := 11)
```

### DecodeUnicodeEntities

DecodeUnicodeEntities decodes Unicode entities (`%uXXXX` escapes, plus plain `%XX` bytes) in a string.

```go
func (s *StringProc) DecodeUnicodeEntities(val string) (string, error)
```

Example:

```go
StrUnicodeEntityEncodedMultipleLine := "%uC548%uB155%uD558%uC138%uC694.%0A%uBC29%uAC11%uC2B5%uB2C8%uB2E4.%0A%uAC10%uC0AC%uD569%uB2C8%uB2E4.%0A%u304A%u306F%u3088%u3046%u3054%u3056%u3044%u307E%u3059%0A%u3053%u3093%u306B%u3061%u306F%uFF0E%0A%u3053%u3093%u3070%u3093%u306F%uFF0E%0A%u304A%u3084%u3059%u307F%u306A%u3055%u3044%uFF0E%0A%u3042%u308A%u304C%u3068%u3046%u3054%u3056%u3044%u307E%u3059%0A%u4F60%u597D%0A%u518D%u898B%0A%u8C22%u8C22%21%u0E2A%u0E27%u0E31%u0E2A%u0E14%u0E35%u0E04%u0E23%u0E31%u0E1A%0A%u0E41%u0E25%u0E49%u0E27%u0E40%u0E08%u0E2D%u0E01%u0E31%u0E19%u0E04%u0E23%u0E31%u0E1A%0A%u0E02%u0E2D%u0E1A%u0E04%u0E38%u0E13%u0E04%u0E23%u0E31%u0E1A%0A%u0421%u0430%u0439%u043D%20%u0431%u0430%u0439%u043D%u0430%u0443%u0443"

retval, err := strproc.DecodeUnicodeEntities(StrUnicodeEntityEncodedMultipleLine)

fmt.Println("Return : ", retval)
fmt.Println("Error : ", err)
```


The above example will output:

```bash
Return : 안녕하세요.
방갑습니다.
감사합니다.
おはようございます
こんにちは．
こんばんは．
おやすみなさい．
ありがとうございます
你好
再見
谢谢!สวัสดีครับ
แล้วเจอกันครับ
ขอบคุณครับ
Сайн байнауу
Error : <nil>
```

### DecodeURLEncoded

DecodeURLEncoded decodes a URL-encoded string (including `%uXXXX` Unicode entities).

```go
func (s *StringProc) DecodeURLEncoded(val string) (string, error)
```

Example:

```go

URLWithJapanWorld := "http://hello.%E4%B8%96%E7%95%8C.com/foo"

retval, err := strproc.DecodeURLEncoded(URLWithJapanWorld)

fmt.Println("Return : ", retval)
fmt.Println("Error : ", err)


```

The above example will output:

```bash
Return : http://hello.世界.com/foo
Error : <nil>
```

### StripTags

StipTags is remove all tag in string (Pure String or URL Encoded or Html Entity Encoded or Mixed String)
StripTags removes all tags from a string (pure string, URL-encoded, HTML (Unicode) entity-encoded, or a mix of these)

```go
func (s *StringProc) StripTags(str string) (string, error)
Expand Down
4 changes: 2 additions & 2 deletions assert.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,15 @@ func NewAssert() *Assert {
return obj
}

//TurnOffUnitTestMode is turn off unitTestMode
// TurnOnUnitTestMode sets unitTestMode to true while holding the mutex.
func (a *Assert) TurnOnUnitTestMode() {
	a.mutx.Lock()
	a.unitTestMode = true
	a.mutx.Unlock()
}

//RevertUnitTestMode is revert unitTestMode
// TurnOffUnitTestMode turns off unitTestMode
func (a *Assert) TurnOffUnitTestMode() {

a.mutx.Lock()
Expand Down
108 changes: 107 additions & 1 deletion stringproc.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ var whiteSpacePattern = regexp.MustCompile(`(?im)\s{2,}`)
var entityEncodedPattern = regexp.MustCompile(`(?ims)(&(?:[a-z0-9]{2,8}|#[0-9]{2,3});)`)
var urlEncodedPattern = regexp.MustCompile(`(?ims)(%[a-zA-Z0-9]{2})`)

// for debug
//var detectUnicodeEntities = regexp.MustCompile(`(?ims)u([0-9a-z]{4})`)

// StringProc is String processing methods, All operations on this object
type StringProc struct {
sync.RWMutex
Expand Down Expand Up @@ -844,6 +847,101 @@ func (s *StringProc) AnyCompare(obj1 interface{}, obj2 interface{}) (bool, error
return true, nil
}

// isHex reports whether c is an ASCII hexadecimal digit: 0-9, A-F or a-f.
func (s *StringProc) isHex(c byte) bool {
	switch {
	case '0' <= c && c <= '9':
		return true
	case 'A' <= c && c <= 'F':
		return true
	case 'a' <= c && c <= 'f':
		return true
	default:
		return false
	}
}

// unHex converts a single ASCII hexadecimal digit to its numeric value
// (0-15). Any byte that is not a hexadecimal digit yields 0.
// Modelled on the unhex helper in the Go standard library.
func (s *StringProc) unHex(c byte) byte {

	if c >= '0' && c <= '9' {
		return c - '0'
	}

	if c >= 'a' && c <= 'f' {
		return 10 + (c - 'a')
	}

	if c >= 'A' && c <= 'F' {
		return 10 + (c - 'A')
	}

	// not a hexadecimal digit
	return 0
}

// DecodeUnicodeEntities decodes percent-style Unicode entities in val.
//
// Two escape forms are recognised:
//   - %uXXXX : four hexadecimal digits naming a code point, emitted as UTF-8
//   - %XX    : two hexadecimal digits naming a single raw byte
//
// A '%' that does not start a complete, valid escape (for example a trailing
// "%", "%A" or "%zz") is copied to the output unchanged. A "%u" followed by
// four characters that are not all hexadecimal digits returns the
// strconv error and an empty string.
func (s *StringProc) DecodeUnicodeEntities(val string) (string, error) {

	l := len(val)
	tmpret := make([]byte, 0, l)

	for i := 0; i < l; i++ {

		if val[i] != '%' {
			tmpret = append(tmpret, val[i])
			continue
		}

		// %uXXXX: the length is checked first so val[i+1..i+5] is never
		// read past the end of the string.
		if l >= i+6 && val[i+1] == 'u' {

			// ParseUint (unlike ParseInt) rejects a leading sign.
			runeval, err := strconv.ParseUint(val[i+2:i+6], 16, 32)
			if err != nil {
				return "", err
			}

			tmpret = append(tmpret, string(rune(runeval))...)
			i += 5 //jump %uXXXX
			continue
		}

		// %XX: control character or any other single byte
		if l >= i+3 {
			if b, err := strconv.ParseUint(val[i+1:i+3], 16, 8); err == nil {
				tmpret = append(tmpret, byte(b))
				i += 2 //jump %XX
				continue
			}
		}

		// incomplete or invalid escape: keep the '%' as a literal
		tmpret = append(tmpret, val[i])
	}

	return string(tmpret), nil
}

// DecodeURLEncoded decodes a URL-encoded string, including the non-standard
// %uXXXX Unicode entities.
//
// Recognised escapes:
//   - %XX    : two hexadecimal digits, decoded to a single raw byte
//   - %uXXXX : four hexadecimal digits naming a code point, emitted as UTF-8
//
// A '%' that does not start a complete, valid escape (for example a trailing
// "%", "%A" or "%zz") is copied to the output unchanged. A "%u" followed by
// four characters that are not all hexadecimal digits returns the
// strconv error and an empty string.
//
// NOTE : the unescape functions in net/url do not support unicode entities (%uXXXX)
func (s *StringProc) DecodeURLEncoded(val string) (string, error) {

	l := len(val)
	tmpret := make([]byte, 0, l)

	for i := 0; i < l; i++ {

		if val[i] != '%' {
			tmpret = append(tmpret, val[i])
			continue
		}

		// %XX: the length is checked before val[i+1] and val[i+2] are read,
		// so a '%' at the very end of the input can no longer panic.
		if l >= i+3 {
			if b, err := strconv.ParseUint(val[i+1:i+3], 16, 8); err == nil {
				tmpret = append(tmpret, byte(b))
				i += 2 //jump %XX
				continue
			}
		}

		// %uXXXX (UnicodeEntity)
		if l >= i+6 && val[i+1] == 'u' {

			// ParseUint (unlike ParseInt) rejects a leading sign.
			runeval, err := strconv.ParseUint(val[i+2:i+6], 16, 32)
			if err != nil {
				return "", err
			}

			tmpret = append(tmpret, string(rune(runeval))...)
			i += 5 //jump %uXXXX
			continue
		}

		// incomplete or invalid escape: keep the '%' as a literal
		tmpret = append(tmpret, val[i])
	}

	return string(tmpret), nil
}

// StripTags removes all tags from a string
func (s *StringProc) StripTags(str string) (string, error) {

Expand All @@ -864,7 +962,15 @@ ENTITY_DECODE:
str = tmpstr
goto ENTITY_DECODE
} else {
return str, err

// url.QueryUnescape does not support Unicode entities (%uXXXX)
tmpstr, err := s.DecodeURLEncoded(str)
if err == nil {
str = tmpstr
goto ENTITY_DECODE
} else {
return str, err
}
}
}

Expand Down
Loading

0 comments on commit 1254ca1

Please sign in to comment.