Skip to content

Commit

Permalink
用词频来标注词的权重
Browse files: browse the repository at this point in the history
  • Loading branch information
ysc committed May 21, 2015
1 parent 72e6088 commit 420b18b
Showing 1 changed file with 12 additions and 15 deletions.
Expand Up @@ -21,9 +21,9 @@
package org.apdplat.word.analysis;

import org.apdplat.word.segmentation.Word;
import org.apdplat.word.util.AtomicFloat;

import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;

/**
* 文本相似度计算
Expand All @@ -45,35 +45,32 @@ public class ManhattanDistanceTextSimilarity extends TextSimilarity {
*/
@Override
protected double scoreImpl(List<Word> words1, List<Word> words2) {
//词频统计
Map<Word, AtomicInteger> frequency1 = frequency(words1);
Map<Word, AtomicInteger> frequency2 = frequency(words2);
//输出词频统计信息
if(LOGGER.isDebugEnabled()){
LOGGER.debug("词频统计1:\n{}", formatWordsFrequency(frequency1));
LOGGER.debug("词频统计2:\n{}", formatWordsFrequency(frequency2));
}
//用词频来标注词的权重
taggingWeightWithWordFrequency(words1, words2);
//构造权重快速搜索容器
Map<String, Float> weights1 = toFastSearchMap(words1);
Map<String, Float> weights2 = toFastSearchMap(words2);
//所有的不重复词
Set<Word> words = new HashSet<>();
words.addAll(words1);
words.addAll(words2);
//向量的维度为words的大小,每一个维度的权重是词频
//manhattanDistance=|x1-x2|+|y1-y2|
AtomicInteger manhattanDistance = new AtomicInteger();
AtomicFloat manhattanDistance = new AtomicFloat();
//计算
words
.parallelStream()
.forEach(word -> {
AtomicInteger x1 = frequency1.get(word);
AtomicInteger x2 = frequency2.get(word);
Float x1 = weights1.get(word.getText());
Float x2 = weights2.get(word.getText());
if (x1 == null) {
x1 = new AtomicInteger(0);
x1 = 0f;
}
if (x2 == null) {
x2 = new AtomicInteger(0);
x2 = 0f;
}
//|x1-x2|
int oneOfTheDimension = Math.abs(x1.get() - x2.get());
float oneOfTheDimension = Math.abs(x1 - x2);
//+
manhattanDistance.addAndGet(oneOfTheDimension);
});
Expand Down

0 comments on commit 420b18b

Please sign in to comment.