-
Notifications
You must be signed in to change notification settings - Fork 1
/
htmlUtil.ts
210 lines (180 loc) · 6.14 KB
/
htmlUtil.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
/*
* Copyright (c) 2023, Terwer . All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Terwer designates this
* particular file as subject to the "Classpath" exception as provided
* by Terwer in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Terwer, Shenzhen, Guangdong, China, youweics@163.com
* or visit www.terwer.space if you need additional information or have any
* questions.
*/
/**
 * HTML / Markdown text clean-up helpers.
 *
 * Every member is static and stateless, so the class is used purely as a
 * namespace (e.g. `HtmlUtil.filterHtml(html)`).
 */
class HtmlUtil {
  /**
   * Matches one complete iframe element (opening tag through the nearest
   * closing tag). The lazy `[\s\S]*?` keeps each match confined to a single
   * element and lets it span line breaks.
   */
  private static readonly IFRAME_ELEMENT_REGEX = /<iframe\b[\s\S]*?<\/iframe>/gi

  /**
   * An iframe element is treated as a widget when it contains one of these:
   * the legacy publisher widget, the newer sy-post-publisher widget, or the
   * note-attributes widget. In the last pattern `e*` accepts zero or more
   * "e", exactly as the pattern has always been written.
   */
  private static readonly WIDGET_MARKERS: readonly RegExp[] = [
    /src="\/widgets\/publisher/,
    /src="\/widgets\/sy-post-publisher/,
    /\/widgets\/Note*\sAttrs/,
  ]

  /**
   * Characters that `filterHtml` deletes outright: stray angle brackets,
   * quotes, `*`, `$`, `.`, `+`, all whitespace, ASCII digits, `/`, `-`,
   * the ideographic comma `、`, backslash and `,`.
   */
  private static readonly STRIPPED_CHARS_REGEX = /[<>"'*$.+\s\d\/\-、\\,]/g

  /**
   * Characters that `filterHtml` turns into an underscore. `|` is listed on
   * purpose: the historical patterns (`[:|:]`, `[;|;]`) had it inside a
   * character class, where it is a literal pipe rather than alternation.
   */
  private static readonly UNDERSCORED_CHARS_REGEX = /[:|;^!]/g

  /** Characters in this range count as two columns in `parseHtml`. */
  private static readonly CHINESE_CHAR_REGEX = /[\u4e00-\u9fa5]/

  /**
   * Strips a leading sequence number from a title.
   *
   * Removes any run of ASCII digits at the very start of the string together
   * with one optional following dot (`"1.Title"` -> `"Title"`). Numbers that
   * appear later in the string are left alone, and only the first
   * `digits + dot` group is removed (`"1.2 Title"` -> `"2 Title"`).
   *
   * @param str - title text
   * @returns the title without its leading number
   */
  public static removeTitleNumber(str: string): string {
    // An unanchored, non-global /[0-9]*\.?/ always matches (possibly emptily)
    // at index 0, so anchoring it spells out the same behaviour explicitly.
    return str.replace(/^[0-9]*\.?/, "")
  }

  /**
   * Removes widget iframes (publisher, sy-post-publisher, Note Attrs) from HTML.
   *
   * Each iframe element is inspected on its own, so unrelated iframes and any
   * text sharing a line with a widget iframe are preserved, and widget iframes
   * that span several lines are removed as well.
   *
   * @param str - source HTML
   * @returns the HTML without widget iframes
   */
  public static removeWidgetTag(str: string): string {
    // toString() is kept from the original implementation; presumably some
    // callers hand in values that are not primitive strings at runtime
    // (TODO confirm against callers).
    const source = str.toString()
    return source.replace(HtmlUtil.IFRAME_ELEMENT_REGEX, (element) =>
      HtmlUtil.WIDGET_MARKERS.some((marker) => marker.test(element)) ? "" : element
    )
  }

  /**
   * Removes widget iframes from Markdown text.
   *
   * Currently identical to {@link HtmlUtil.removeWidgetTag}.
   *
   * @param str - source Markdown
   * @returns the Markdown without widget iframes
   */
  public static removeMdWidgetTag(str: string): string {
    // Reference the class explicitly so the method still works when detached
    // from the class (e.g. passed around as a callback).
    return HtmlUtil.removeWidgetTag(str)
  }

  /**
   * Reduces HTML to a compact, lower-case plain-text form. Works on
   * incomplete markup too.
   *
   * Despite the name this does much more than drop tags. In order it:
   * 1. removes `<style>` / `<script>` elements including their content;
   * 2. removes every remaining `<...>` tag;
   * 3. removes character references such as `&nbsp;` or `&#160;`;
   * 4. deletes stray `<` `>`, quotes, `*` `$` `.` `+`, all whitespace, ASCII
   *    digits, `/`, `-`, `、`, backslashes and commas;
   * 5. replaces `:` `|` `;` `^` `!` with `_` and `@` with `at_`;
   * 6. lower-cases the result.
   *
   * @param str - HTML (or HTML fragment)
   * @returns the filtered text
   */
  public static filterHtml(str: string): string {
    return (
      str
        .replace(/<style[\s\S]*?<\/style>/gi, "")
        .replace(/<script[\s\S]*?<\/script>/gi, "")
        .replace(/<[^>]*>/g, "")
        // Match one reference at a time; a greedy /&.*;/ would also swallow
        // the text between the first "&" and the last ";" on a line.
        .replace(/&#?[a-z0-9]+;/gi, "")
        // The per-character steps below are independent of one another
        // because none of the replacement texts ("", "_", "at_") contains a
        // character that another step acts on.
        .replace(HtmlUtil.STRIPPED_CHARS_REGEX, "")
        .replace(HtmlUtil.UNDERSCORED_CHARS_REGEX, "_")
        .replace(/@/g, "at_")
        .toLowerCase()
    )
  }

  /**
   * Builds a plain-text excerpt of at most `length` display columns.
   *
   * The HTML is first passed through {@link HtmlUtil.filterHtml}. Characters
   * in the range U+4E00-U+9FA5 count as two columns; every other UTF-16 code
   * unit counts as one. When the text has to be cut, `"..."` is appended
   * unless `ignoreSign` is true; text that already fits is returned as is.
   *
   * @param html - source HTML
   * @param length - maximum width of the excerpt, in columns
   * @param ignoreSign - when true, do not append the trailing ellipsis
   * @returns the (possibly truncated) filtered text
   */
  public static parseHtml(html: string, length: number, ignoreSign?: boolean): string {
    const allText = HtmlUtil.filterHtml(html)
    const ellipsis = ignoreSign ? "" : "..."

    let width = 0
    let cutIndex = 0
    // Iterate by code point so a surrogate pair is never split by the cut.
    for (const char of allText) {
      width += HtmlUtil.CHINESE_CHAR_REGEX.test(char) ? 2 : char.length
      if (width > length) {
        // Return straight away: an empty excerpt (cut at index 0 without an
        // ellipsis) is a valid result and must not fall back to the full text.
        return allText.slice(0, cutIndex) + ellipsis
      }
      cutIndex += char.length
    }
    return allText
  }

  /**
   * Removes the first `<h1>` element from HTML and strips every `---`
   * separator sequence.
   *
   * The element may carry attributes and may span several lines; the tag
   * name is matched case-insensitively. Later `<h1>` elements are kept.
   *
   * @param html - source HTML
   * @returns the HTML without its first H1 and without `---` sequences
   */
  public static removeH1(html: string): string {
    // No "g" flag: only the first H1 is removed.
    const firstH1Regex = /<h1\b[^>]*>[\s\S]*?<\/h1\s*>/i
    return html.replace(firstH1Regex, "").replace(/---/g, "")
  }

  /**
   * Removes the first level-1 heading (`# ...`) from Markdown.
   *
   * With the `m` flag `^` / `$` match at line boundaries, so the first line
   * that starts with `"# "` is blanked; the line break after it is kept. The
   * regex deliberately has no `g` flag so that later H1 headings survive.
   *
   * @param md - source Markdown
   * @returns the Markdown without the text of its first H1 line
   */
  public static removeMdH1(md: string): string {
    return md.replace(/^# .*$/m, "")
  }
}
export default HtmlUtil