Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat:敏感词过滤 #31

Merged
merged 1 commit into from
Jun 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14,630 changes: 14,630 additions & 0 deletions docs/sensitive_word.sql

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
package com.abin.mallchat.common.common.utils;

import org.apache.commons.lang3.StringUtils;

import java.io.*;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;


/**
* 敏感词过滤
*
* @author zhaoyuhang
* @since 2023/06/11
*/
public final class SensitiveWordUtils {
private static SensitiveWordList wordList;
private final static char replace = '*'; // 替代字符
private final static char[] skip = new char[]{ // 遇到这些字符就会跳过
' ', '!', '*', '-', '+', '_', '=', ',', ',', '.', '@', ';', ':', ';', ':'
};

/**
* 有敏感词
*
* @param text 文本
* @return boolean
*/
public static boolean hasSensitiveWord(String text) {
if (StringUtils.isBlank(text)) return false;
return !Objects.equals(filter(text), text);
}

/**
* 敏感词替换
*
* @param text 待替换文本
* @return 替换后的文本
*/
public static String filter(String text) {
if (wordList == null || wordList.size() == 0 || StringUtils.isBlank(text)) return text;
char[] __char__ = text.toCharArray(); // 把String转化成char数组,便于遍历
int i, j;
Word word;
boolean flag; // 是否需要替换
for (i = 0; i < __char__.length; i++) { // 遍历所有字符
char c = __char__[i];
word = wordList.binaryGet(c); // 使用二分查找来寻找字符,提高效率
if (word != null) { // word != null说明找到了
flag = false;
j = i + 1;
while (j < __char__.length) { // 开始逐个比较后面的字符
if (skip(__char__[j])) { // 跳过空格之类的无关字符
j++;
continue;
}
if (word.next != null) { // 字符串尚未结束,不确定是否存在敏感词
/*
以下代码并没有使用二分查找,因为以同一个字符开头的敏感词较少
例如,wordList中记录了所有敏感词的开头第一个字,它的数量通常会有上千个
假如现在锁定了字符“T”开头的敏感词,而“T”开头的敏感词只有10个,这时使用二分查找的效率反而低于顺序查找
*/
word = word.next.get(__char__[j]);
if (word == null) {
break;
}
j++;
} else { // 字符串已结束,存在敏感词汇
flag = true;
break;
}
}
if (word != null && word.next == null) {
flag = true;
}
if (flag) { // 如果flag==true,说明检测出敏感粗,需要替换
while (i < j) {
// if(skip(__char__[i])){ // 跳过空格之类的无关字符,如果要把空格也替换成'*',则删除这个if语句
// i++;
// continue;
// }
__char__[i] = replace;
i++;
}
i--;
}
}
}
return new String(__char__);
}

/**
* 加载敏感词列表
*
* @param words 敏感词数组
*/
public static void loadWord(List<String> words) {
if (words == null) return;
words = words.stream().distinct().collect(Collectors.toList()); // 去重
char[] chars;
SensitiveWordList now;
Word word;
wordList = new SensitiveWordList();
for (String __word__ : words) {
if (__word__ == null) continue;
chars = __word__.toCharArray();
now = wordList;
word = null;
for (char c : chars) {
if (word != null) {
if (word.next == null) word.next = new SensitiveWordList();
now = word.next;
}
word = now.get(c);
if (word == null) word = now.add(c);
}
}
sort(wordList);
}

/**
* 加载敏感词txt文件,每个敏感词独占一行,不可出现空格,空行,逗号等非文字内容,必须使用UTF-8编码
*
* @param path txt文件的绝对地址
*/
public static void loadWordFromFile(String path) {
String encoding = "UTF-8";
File file = new File(path);
try {
if (file.isFile() && file.exists()) {
InputStreamReader inputStreamReader = new InputStreamReader(
Files.newInputStream(file.toPath()), encoding
);
BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
String line;
ArrayList<String> list = new ArrayList<>();
while ((line = bufferedReader.readLine()) != null) {
list.add(line);
}
bufferedReader.close();
inputStreamReader.close();
loadWord(list);
}
} catch (IOException e) {
e.printStackTrace();
}
}

/**
* 对敏感词多叉树递增排序
*
* @param list 待排序List
*/
private static void sort(SensitiveWordList list) {
if (list == null) return;
Collections.sort(list); // 递增排序
for (Word word : list) {
sort(word.next);
}
}

/**
* 判断是否跳过当前字符
*
* @param c 待检测字符
* @return true:需要跳过 false:不需要跳过
*/
private static boolean skip(char c) {
for (char c1 : skip) {
if (c1 == c) return true;
}
return false;
}

/**
* 敏感词列表
*
* @author zhaoyuhang
* @since 2023/06/11
*/
public static class SensitiveWordList extends ArrayList<Word> {
public Word get(char c) {
for (Word w : this) {
if (w.c == c) return w;
}
return null;
}

/**
* 二分查找,必须先升序排序
*
* @param c 需要查找的字符
* @return Word对象:如果找到 null:如果没找到
*/
public Word binaryGet(char c) {
int left, right, key;
Word word;
left = 0;
right = this.size() - 1;
while (left <= right) {
key = (left + right) / 2;
word = get(key);
if (word.c == c) {
return word;
} else if (word.c > c) {
right = key - 1;
} else {
left = key + 1;
}
}
return null;
}

public Word add(char c) {
Word word = new Word(c);
super.add(word);
return word;
}

}

/**
* 敏感词
*
* @author zhaoyuhang
* @since 2023/06/11
*/
public static class Word implements Comparable<Word> {
public char c;
public SensitiveWordList next = null;

public Word(char c) {
this.c = c;
}

@Override
public int compareTo(Word word) {
return c - word.c;
}

public String toString() {
return c + "(" + (next == null ? null : next.size()) + ")";
}
}
}


Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package com.abin.mallchat.common.sensitive.dao;

import com.abin.mallchat.common.sensitive.domain.SensitiveWord;
import com.abin.mallchat.common.sensitive.mapper.SensitiveWordMapper;
import com.baomidou.mybatisplus.extension.service.impl.ServiceImpl;
import org.springframework.stereotype.Service;

/**
* 敏感词DAO
*
* @author zhaoyuhang
* @since 2023/06/11
*/
@Service
public class SensitiveWordDao extends ServiceImpl<SensitiveWordMapper, SensitiveWord> {

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
package com.abin.mallchat.common.sensitive.domain;

import com.baomidou.mybatisplus.annotation.TableName;
import lombok.Data;
import lombok.EqualsAndHashCode;

/**
* 敏感词
*
* @author zhaoyuhang
* @since 2023/06/11
*/
@Data
@EqualsAndHashCode(callSuper = false)
@TableName("sensitive_word")
public class SensitiveWord {
private String word;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package com.abin.mallchat.common.sensitive.mapper;

import com.abin.mallchat.common.sensitive.domain.SensitiveWord;
import com.baomidou.mybatisplus.core.mapper.BaseMapper;

/**
* 敏感词Mapper
*
* @author zhaoyuhang
* @since 2023-05-21
*/
public interface SensitiveWordMapper extends BaseMapper<SensitiveWord> {

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
package com.abin.mallchat.common.sensitive.service;

public interface ISensitiveWordService {

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package com.abin.mallchat.common.sensitive.service.impl;

import com.abin.mallchat.common.common.utils.SensitiveWordUtils;
import com.abin.mallchat.common.sensitive.dao.SensitiveWordDao;
import com.abin.mallchat.common.sensitive.domain.SensitiveWord;
import com.abin.mallchat.common.sensitive.service.ISensitiveWordService;
import org.apache.commons.collections.CollectionUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import javax.annotation.PostConstruct;
import java.util.List;
import java.util.stream.Collectors;

@Service
public class SensitiveWordServiceImpl implements ISensitiveWordService {
@Autowired
private SensitiveWordDao sensitiveWordDao;

@PostConstruct
public void initSensitiveWord() {
List<SensitiveWord> list = sensitiveWordDao.list();
if (!CollectionUtils.isEmpty(list)) {
List<String> wordList = list.stream()
.map(SensitiveWord::getWord)
.collect(Collectors.toList());
SensitiveWordUtils.loadWord(wordList);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import com.abin.mallchat.common.chat.service.cache.MsgCache;
import com.abin.mallchat.common.common.domain.enums.YesOrNoEnum;
import com.abin.mallchat.common.common.utils.AssertUtil;
import com.abin.mallchat.common.common.utils.SensitiveWordUtils;
import com.abin.mallchat.common.common.utils.discover.PrioritizedUrlTitleDiscover;
import com.abin.mallchat.common.user.domain.entity.User;
import com.abin.mallchat.common.user.domain.enums.RoleEnum;
Expand Down Expand Up @@ -82,7 +83,7 @@ public void saveMsg(Message msg, ChatMessageReq request) {//插入文本内容
MessageExtra extra = Optional.ofNullable(msg.getExtra()).orElse(new MessageExtra());
Message update = new Message();
update.setId(msg.getId());
update.setContent(body.getContent());
update.setContent(SensitiveWordUtils.filter(body.getContent()));
update.setExtra(extra);
//如果有回复消息
if (Objects.nonNull(body.getReplyMsgId())) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import com.abin.mallchat.common.common.event.UserBlackEvent;
import com.abin.mallchat.common.common.event.UserRegisterEvent;
import com.abin.mallchat.common.common.utils.AssertUtil;
import com.abin.mallchat.common.common.utils.SensitiveWordUtils;
import com.abin.mallchat.common.user.dao.BlackDao;
import com.abin.mallchat.common.user.dao.ItemConfigDao;
import com.abin.mallchat.common.user.dao.UserBackpackDao;
Expand Down Expand Up @@ -74,7 +75,9 @@ public UserInfoResp getUserInfo(Long uid) {
@Transactional
public void modifyName(Long uid, ModifyNameReq req) {
//判断名字是不是重复
User oldUser = userDao.getByName(req.getName());
String newName = req.getName();
AssertUtil.isFalse(SensitiveWordUtils.hasSensitiveWord(newName), "名字中包含敏感词,请重新输入"); // 判断名字中有没有敏感词
User oldUser = userDao.getByName(newName);
AssertUtil.isEmpty(oldUser, "名字已经被抢占了,请换一个哦~~");
//判断改名卡够不够
UserBackpack firstValidItem = userBackpackDao.getFirstValidItem(uid, ItemEnum.MODIFY_NAME_CARD.getId());
Expand Down