Skip to content

Commit

Permalink
用词频来标注词的权重
Browse files: browse the repository at this point in the history
  • Loading branch information
ysc committed May 21, 2015
1 parent 72e6088 commit 420b18b
Showing 1 changed file with 12 additions and 15 deletions.
Expand Up @@ -21,9 +21,9 @@
package org.apdplat.word.analysis; package org.apdplat.word.analysis;


import org.apdplat.word.segmentation.Word; import org.apdplat.word.segmentation.Word;
import org.apdplat.word.util.AtomicFloat;


import java.util.*; import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;


/** /**
* 文本相似度计算 * 文本相似度计算
Expand All @@ -45,35 +45,32 @@ public class ManhattanDistanceTextSimilarity extends TextSimilarity {
*/ */
@Override @Override
protected double scoreImpl(List<Word> words1, List<Word> words2) { protected double scoreImpl(List<Word> words1, List<Word> words2) {
//词频统计 //用词频来标注词的权重
Map<Word, AtomicInteger> frequency1 = frequency(words1); taggingWeightWithWordFrequency(words1, words2);
Map<Word, AtomicInteger> frequency2 = frequency(words2); //构造权重快速搜索容器
//输出词频统计信息 Map<String, Float> weights1 = toFastSearchMap(words1);
if(LOGGER.isDebugEnabled()){ Map<String, Float> weights2 = toFastSearchMap(words2);
LOGGER.debug("词频统计1:\n{}", formatWordsFrequency(frequency1));
LOGGER.debug("词频统计2:\n{}", formatWordsFrequency(frequency2));
}
//所有的不重复词 //所有的不重复词
Set<Word> words = new HashSet<>(); Set<Word> words = new HashSet<>();
words.addAll(words1); words.addAll(words1);
words.addAll(words2); words.addAll(words2);
//向量的维度为words的大小,每一个维度的权重是词频 //向量的维度为words的大小,每一个维度的权重是词频
//manhattanDistance=|x1-x2|+|y1-y2| //manhattanDistance=|x1-x2|+|y1-y2|
AtomicInteger manhattanDistance = new AtomicInteger(); AtomicFloat manhattanDistance = new AtomicFloat();
//计算 //计算
words words
.parallelStream() .parallelStream()
.forEach(word -> { .forEach(word -> {
AtomicInteger x1 = frequency1.get(word); Float x1 = weights1.get(word.getText());
AtomicInteger x2 = frequency2.get(word); Float x2 = weights2.get(word.getText());
if (x1 == null) { if (x1 == null) {
x1 = new AtomicInteger(0); x1 = 0f;
} }
if (x2 == null) { if (x2 == null) {
x2 = new AtomicInteger(0); x2 = 0f;
} }
//|x1-x2| //|x1-x2|
int oneOfTheDimension = Math.abs(x1.get() - x2.get()); float oneOfTheDimension = Math.abs(x1 - x2);
//+ //+
manhattanDistance.addAndGet(oneOfTheDimension); manhattanDistance.addAndGet(oneOfTheDimension);
}); });
Expand Down

0 comments on commit 420b18b

Please sign in to comment.