Skip to content

Commit

Permalink
简单共有词计算方式改进
Browse files Browse the repository at this point in the history
  • Loading branch information
ysc committed May 20, 2015
1 parent 799be49 commit d4f2c05
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 77 deletions.
2 changes: 1 addition & 1 deletion README.md
Expand Up @@ -575,7 +575,7 @@ word分词提供了两种文本相似度计算方式:
运行结果如下: 运行结果如下:


我爱购物 和 我爱购物 的相似度分值:1.0 我爱购物 和 我爱购物 的相似度分值:1.0
我爱购物 和 我爱读书 的相似度分值:0.67 我爱购物 和 我爱读书 的相似度分值:0.5
我爱购物 和 他是黑客 的相似度分值:0.0 我爱购物 和 他是黑客 的相似度分值:0.0
我爱读书 和 我爱读书 的相似度分值:1.0 我爱读书 和 我爱读书 的相似度分值:1.0
我爱读书 和 他是黑客 的相似度分值:0.0 我爱读书 和 他是黑客 的相似度分值:0.0
Expand Down
54 changes: 45 additions & 9 deletions src/main/java/org/apdplat/word/analysis/CosineTextSimilarity.java
Expand Up @@ -23,10 +23,7 @@
import org.apdplat.word.segmentation.Word; import org.apdplat.word.segmentation.Word;


import java.math.BigDecimal; import java.math.BigDecimal;
import java.util.HashSet; import java.util.*;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;


/** /**
Expand All @@ -43,15 +40,22 @@ public class CosineTextSimilarity extends TextSimilarity {
* |a|=根号[(x1)^2+(y1)^2],|b|=根号[(x2)^2+(y2)^2] * |a|=根号[(x1)^2+(y1)^2],|b|=根号[(x2)^2+(y2)^2]
* @param words1 词列表1 * @param words1 词列表1
* @param words2 词列表2 * @param words2 词列表2
* @param frequency1 词列表1的词频统计结果
* @param frequency2 词列表2的词频统计结果
* @return 相似度分值 * @return 相似度分值
*/ */
@Override @Override
protected double scoreImpl(List<Word> words1, List<Word> words2, Map<Word, AtomicInteger> frequency1, Map<Word, AtomicInteger> frequency2) { protected double scoreImpl(List<Word> words1, List<Word> words2) {
//词频统计
Map<Word, AtomicInteger> frequency1 = frequency(words1);
Map<Word, AtomicInteger> frequency2 = frequency(words2);
//输出词频统计信息
if(LOGGER.isDebugEnabled()){
LOGGER.debug("词频统计1:\n{}", formatWordsFrequency(frequency1));
LOGGER.debug("词频统计2:\n{}", formatWordsFrequency(frequency2));
}
//所有的不重复词
Set<Word> words = new HashSet<>(); Set<Word> words = new HashSet<>();
words.addAll(frequency1.keySet()); words.addAll(words1);
words.addAll(frequency2.keySet()); words.addAll(words2);
//向量的维度为words的大小,每一个维度的权重是词频 //向量的维度为words的大小,每一个维度的权重是词频
//a.b //a.b
AtomicInteger ab = new AtomicInteger(); AtomicInteger ab = new AtomicInteger();
Expand Down Expand Up @@ -93,6 +97,38 @@ protected double scoreImpl(List<Word> words1, List<Word> words2, Map<Word, Atomi
return cos; return cos;
} }


/**
 * Counts how many times each word occurs in the given list.
 * @param words word list to tally
 * @return map from each distinct word to its number of occurrences
 */
private Map<Word, AtomicInteger> frequency(List<Word> words){
    Map<Word, AtomicInteger> counts = new HashMap<>();
    for (Word word : words) {
        // create the counter on first sight of the word, then bump it
        counts.computeIfAbsent(word, key -> new AtomicInteger()).incrementAndGet();
    }
    return counts;
}

/**
 * Formats word-frequency statistics as numbered lines, highest count first.
 * Each line is a tab, the 1-based rank, "、", the word, "=", and the count;
 * lines are separated by "\n" and the result has no trailing newline.
 * @param frequency word-frequency statistics; may be null or empty
 * @return the formatted text, or an empty string when there is nothing to format
 */
private String formatWordsFrequency(Map<Word, AtomicInteger> frequency){
    // Return early for null/empty input: trimming the trailing newline from an
    // empty builder would call setLength(-1) and throw.
    if(frequency == null || frequency.isEmpty()) {
        return "";
    }
    StringBuilder str = new StringBuilder();
    // running rank; a mutable holder is needed because the lambda below can only capture effectively final locals
    AtomicInteger c = new AtomicInteger();
    frequency
        .entrySet()
        .stream()
        // descending by count; Integer.compare avoids subtraction-based comparison
        .sorted((a, b) -> Integer.compare(b.getValue().get(), a.getValue().get()))
        .forEach(e -> str.append("\t").append(c.incrementAndGet()).append("、").append(e.getKey()).append("=").append(e.getValue()).append("\n"));
    // drop the trailing newline appended after the last entry
    str.setLength(str.length()-1);
    return str.toString();
}

public static void main(String[] args) { public static void main(String[] args) {
String text1 = "我爱购物"; String text1 = "我爱购物";
String text2 = "我爱读书"; String text2 = "我爱读书";
Expand Down
34 changes: 21 additions & 13 deletions src/main/java/org/apdplat/word/analysis/SimpleTextSimilarity.java
Expand Up @@ -22,8 +22,9 @@


import org.apdplat.word.segmentation.Word; import org.apdplat.word.segmentation.Word;


import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;


/** /**
Expand All @@ -36,25 +37,32 @@ public class SimpleTextSimilarity extends TextSimilarity {
* 判定相似度的方式:简单共有词 * 判定相似度的方式:简单共有词
* @param words1 词列表1 * @param words1 词列表1
* @param words2 词列表2 * @param words2 词列表2
* @param frequency1 词列表1的词频统计结果
* @param frequency2 词列表2的词频统计结果
* @return 相似度分值 * @return 相似度分值
*/ */
@Override @Override
protected double scoreImpl(List<Word> words1, List<Word> words2, Map<Word, AtomicInteger> frequency1, Map<Word, AtomicInteger> frequency2) { protected double scoreImpl(List<Word> words1, List<Word> words2) {
//判断有几个相同的词 //计算词列表1总的字符数
AtomicInteger words1Length = new AtomicInteger();
words1.parallelStream().forEach(word -> words1Length.addAndGet(word.getText().length()));
//计算词列表2总的字符数
AtomicInteger words2Length = new AtomicInteger();
words2.parallelStream().forEach(word -> words2Length.addAndGet(word.getText().length()));
//计算词列表1和词列表2共有的词的总的字符数
//HashSet的contains性能要大于ArrayList的contains
Set<Word> words2Set = new HashSet<>();
words2Set.addAll(words2);
AtomicInteger intersectionLength = new AtomicInteger(); AtomicInteger intersectionLength = new AtomicInteger();
frequency1.keySet().parallelStream().forEach(word -> { words1.parallelStream().forEach(word -> {
if (frequency2.keySet().contains(word)) { if (words2Set.contains(word)) {
intersectionLength.incrementAndGet(); intersectionLength.addAndGet(word.getText().length());
} }
}); });
double score = intersectionLength.get()/(double)Math.min(frequency1.size(), frequency2.size()); double score = intersectionLength.get()/(double)Math.min(words1Length.get(), words2Length.get());
if(LOGGER.isDebugEnabled()) { if(LOGGER.isDebugEnabled()) {
LOGGER.debug("文本1有的词数:" + frequency1.size()); LOGGER.debug("词列表1总的字符数:" + words1Length.get());
LOGGER.debug("文本2有的词数:" + frequency2.size()); LOGGER.debug("词列表2总的字符数:" + words2Length.get());
LOGGER.debug("文本1和2共有的词数:" + intersectionLength.get()); LOGGER.debug("词列表1和2共有的词的总的字符数:" + intersectionLength.get());
LOGGER.debug("相似度分值=" + intersectionLength.get() + "/(double)Math.min(" + frequency1.size() + ", " + frequency2.size() + ")=" + score); LOGGER.debug("相似度分值=" + intersectionLength.get() + "/(double)Math.min(" + words1Length.get() + ", " + words1Length.get() + ")=" + score);
} }
return score; return score;
} }
Expand Down
78 changes: 24 additions & 54 deletions src/main/java/org/apdplat/word/analysis/TextSimilarity.java
Expand Up @@ -39,12 +39,14 @@
*/ */
public abstract class TextSimilarity implements Similarity{ public abstract class TextSimilarity implements Similarity{
protected static final Logger LOGGER = LoggerFactory.getLogger(TextSimilarity.class); protected static final Logger LOGGER = LoggerFactory.getLogger(TextSimilarity.class);

//默认分词器 //默认分词器
protected Segmentation segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.MaxNgramScore); protected Segmentation segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.MaxNgramScore);
//相似性阈值 //相似性阈值
protected float thresholdRate = 0.5F; protected float thresholdRate = 0.5F;
//是否忽略停用词 //是否忽略停用词
protected boolean filterStopWord = false; protected boolean filterStopWord = false;

/** /**
* 文本1和文本2是否相似 * 文本1和文本2是否相似
* @param text1 文本1 * @param text1 文本1
Expand All @@ -53,12 +55,19 @@ public abstract class TextSimilarity implements Similarity{
*/ */
@Override @Override
public boolean isSimilar(String text1, String text2) { public boolean isSimilar(String text1, String text2) {
if(LOGGER.isDebugEnabled()) {
LOGGER.debug("文本1:");
LOGGER.debug("\t" + text1);
LOGGER.debug("文本2:");
LOGGER.debug("\t" + text2);
}
//分词 //分词
List<Word> words1 = seg(text1); List<Word> words1 = seg(text1);
List<Word> words2 = seg(text2); List<Word> words2 = seg(text2);
//判断相似度 //判断相似度
return isSimilar(words1, words2); return isSimilar(words1, words2);
} }

/** /**
* 文本1和文本2的相似度分值 * 文本1和文本2的相似度分值
* @param text1 文本1 * @param text1 文本1
Expand All @@ -67,6 +76,12 @@ public boolean isSimilar(String text1, String text2) {
*/ */
@Override @Override
public double similarScore(String text1, String text2) { public double similarScore(String text1, String text2) {
if(LOGGER.isDebugEnabled()) {
LOGGER.debug("文本1:");
LOGGER.debug("\t" + text1);
LOGGER.debug("文本2:");
LOGGER.debug("\t" + text2);
}
//分词 //分词
List<Word> words1 = seg(text1); List<Word> words1 = seg(text1);
List<Word> words2 = seg(text2); List<Word> words2 = seg(text2);
Expand Down Expand Up @@ -95,7 +110,14 @@ public boolean isSimilar(List<Word> words1, List<Word> words2) {
public double similarScore(List<Word> words1, List<Word> words2) { public double similarScore(List<Word> words1, List<Word> words2) {
if(words1 != null && words2 != null if(words1 != null && words2 != null
&& !words1.isEmpty() && !words2.isEmpty()){ && !words1.isEmpty() && !words2.isEmpty()){
double score = score(words1, words2); //输出词列表信息
if(LOGGER.isDebugEnabled()) {
LOGGER.debug("词列表1:");
LOGGER.debug("\t" + words1);
LOGGER.debug("词列表2:");
LOGGER.debug("\t" + words2);
}
double score = scoreImpl(words1, words2);
if(LOGGER.isDebugEnabled()){ if(LOGGER.isDebugEnabled()){
LOGGER.debug("分值:"+score); LOGGER.debug("分值:"+score);
} }
Expand All @@ -114,28 +136,7 @@ public double similarScore(List<Word> words1, List<Word> words2) {
* @param words2 词列表2 * @param words2 词列表2
* @return 相似度分值 * @return 相似度分值
*/ */
private double score(List<Word> words1, List<Word> words2){ protected abstract double scoreImpl(List<Word> words1, List<Word> words2);
//词频统计
Map<Word, AtomicInteger> frequency1 = frequency(words1);
Map<Word, AtomicInteger> frequency2 = frequency(words2);
//输出详细信息
if(LOGGER.isDebugEnabled()){
showDetail(words1, frequency1);
showDetail(words2, frequency2);
}
//计算相似度分值
return scoreImpl(words1, words2, frequency1, frequency2);
}

/**
* 计算相似度分值
* @param words1 词列表1
* @param words2 词列表2
* @param frequency1 词列表1的词频统计结果
* @param frequency2 词列表2的词频统计结果
* @return 相似度分值
*/
protected abstract double scoreImpl(List<Word> words1, List<Word> words2, Map<Word, AtomicInteger> frequency1, Map<Word, AtomicInteger> frequency2);


/** /**
* 对文本进行分词 * 对文本进行分词
Expand All @@ -150,35 +151,4 @@ private List<Word> seg(String text){
} }
return words; return words;
} }

/**
 * Counts how many times each word occurs in the given list.
 * @param words word list to tally
 * @return map from each distinct word to its number of occurrences
 */
private Map<Word, AtomicInteger> frequency(List<Word> words){
    Map<Word, AtomicInteger> counts = new HashMap<>();
    for (Word word : words) {
        // create the counter on first sight of the word, then bump it
        counts.computeIfAbsent(word, key -> new AtomicInteger()).incrementAndGet();
    }
    return counts;
}

/**
 * Logs the word list and its frequency statistics at debug level.
 * The frequency entries are logged one per line, highest count first,
 * each prefixed with a 1-based rank.
 * @param words word list
 * @param frequency word-frequency statistics for that list
 */
private void showDetail(List<Word> words, Map<Word, AtomicInteger> frequency){
    LOGGER.debug("分词结果:");
    LOGGER.debug("\t"+words);
    LOGGER.debug("词频统计:");
    // running rank; a mutable holder is needed because the lambda below can only capture effectively final locals
    AtomicInteger rank = new AtomicInteger();
    frequency
        .entrySet()
        .stream()
        .sorted((left, right) -> {
            int leftCount = left.getValue().get();
            int rightCount = right.getValue().get();
            // descending by count
            return rightCount - leftCount;
        })
        .forEach(entry -> {
            String line = "\t" + rank.incrementAndGet() + "、" + entry.getKey() + "=" + entry.getValue();
            LOGGER.debug(line);
        });
}
} }

0 comments on commit d4f2c05

Please sign in to comment.