Skip to content

Commit

Permalink
SimHash + 汉明距离
Browse files Browse the repository at this point in the history
  • Loading branch information
ysc committed May 22, 2015
1 parent fb04a99 commit 0715190
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 22 deletions.
10 changes: 5 additions & 5 deletions README.md
Expand Up @@ -719,11 +719,11 @@ word分词提供了多种文本相似度计算方式:
运行结果如下: 运行结果如下:


我爱购物 和 我爱购物 的相似度分值:1.0 我爱购物 和 我爱购物 的相似度分值:1.0
我爱购物 和 我爱读书 的相似度分值:0.86 我爱购物 和 我爱读书 的相似度分值:0.95
我爱购物 和 他是黑客 的相似度分值:0.48 我爱购物 和 他是黑客 的相似度分值:0.83
我爱读书 和 我爱读书 的相似度分值:1.0 我爱读书 和 我爱读书 的相似度分值:1.0
我爱读书 和 他是黑客 的相似度分值:0.57 我爱读书 和 他是黑客 的相似度分值:0.86
他是黑客 和 他是黑客 的相似度分值:1.0 他是黑客 和 他是黑客 的相似度分值:1.0


方式五:Jaccard相似性系数,通过计算两个集合交集的大小除以并集的大小来评估他们的相似度 方式五:Jaccard相似性系数,通过计算两个集合交集的大小除以并集的大小来评估他们的相似度


Expand Down
Expand Up @@ -73,53 +73,63 @@ public void setHashBitCount(int hashBitCount) {
*/ */
@Override @Override
protected double scoreImpl(List<Word> words1, List<Word> words2){ protected double scoreImpl(List<Word> words1, List<Word> words2){
//用词频来标注词的权重
taggingWeightWithWordFrequency(words1, words2);
//计算SimHash //计算SimHash
String simHash1 = simHash(words1); String simHash1 = simHash(words1);
String simHash2 = simHash(words2); String simHash2 = simHash(words2);
//计算SimHash值之间的汉明距离 //计算SimHash值之间的汉明距离
int hammingDistance = hammingDistance(simHash1, simHash2); int hammingDistance = hammingDistance(simHash1, simHash2);
if(hammingDistance == -1){ if(hammingDistance == -1){
LOGGER.error("文本1:" + words1.toString());
LOGGER.error("文本2:" + words2.toString());
LOGGER.error("文本1SimHash值:" + simHash1); LOGGER.error("文本1SimHash值:" + simHash1);
LOGGER.error("文本2SimHash值:" + simHash2); LOGGER.error("文本2SimHash值:" + simHash2);
LOGGER.error("文本1和文本2的SimHash值长度不相等,不能计算汉明距离"); LOGGER.error("文本1和文本2的SimHash值长度不相等,不能计算汉明距离");
return 0; return 0;
} }
double score = (1 - hammingDistance / (double)simHash1.length()); int maxDistance = simHash1.length();
double score = (1 - hammingDistance / (double)maxDistance);
if(LOGGER.isDebugEnabled()){ if(LOGGER.isDebugEnabled()){
LOGGER.debug("文本1:" + words1.toString());
LOGGER.debug("文本2:" + words2.toString());
LOGGER.debug("文本1SimHash值:"+simHash1); LOGGER.debug("文本1SimHash值:"+simHash1);
LOGGER.debug("文本2SimHash值:"+simHash2); LOGGER.debug("文本2SimHash值:"+simHash2);
LOGGER.debug("hashBitCount:"+hashBitCount); LOGGER.debug("hashBitCount:"+hashBitCount);
LOGGER.debug("SimHash值之间的汉明距离:"+hammingDistance); LOGGER.debug("SimHash值之间的汉明距离:"+hammingDistance);
LOGGER.debug("文本1和文本2的相似度分值:1 - "+hammingDistance+" / (double)"+simHash1.length()+"="+score); LOGGER.debug("文本1和文本2的相似度分值:1 - "+hammingDistance+" / (double)"+maxDistance+"="+score);
} }
return score; return score;
} }


/** /**
* 计算词列表的SimHash值 * 计算词列表的SimHash值
* @param words 词列表 * @param words 词列表
* @return SimHash * @return SimHash值
*/ */
private String simHash(List<Word> words) { private String simHash(List<Word> words) {
int[] hashBit = new int[hashBitCount]; float[] hashBit = new float[hashBitCount];
words.forEach(word -> { words.forEach(word -> {
BigInteger t = hash(word.getText()); float weight = word.getWeight()==null?1:word.getWeight();
BigInteger hash = hash(word.getText());
for (int i = 0; i < hashBitCount; i++) { for (int i = 0; i < hashBitCount; i++) {
BigInteger bitmask = new BigInteger("1").shiftLeft(i); BigInteger bitMask = new BigInteger("1").shiftLeft(i);
if (t.and(bitmask).signum() != 0) { if (hash.and(bitMask).signum() != 0) {
hashBit[i] += 1; hashBit[i] += weight;
} else { } else {
hashBit[i] -= 1; hashBit[i] -= weight;
} }
} }
}); });
BigInteger fingerprint = new BigInteger("0"); StringBuffer fingerprint = new StringBuffer();
for (int i = 0; i < hashBitCount; i++) { for (int i = 0; i < hashBitCount; i++) {
if (hashBit[i] >= 0) { if (hashBit[i] >= 0) {
fingerprint = fingerprint.add(new BigInteger("1").shiftLeft(i)); fingerprint.append("1");
}else{
fingerprint.append("0");
} }
} }
return fingerprint.toString(2); return fingerprint.toString();
} }


/** /**
Expand All @@ -135,11 +145,11 @@ private BigInteger hash(String word) {
BigInteger x = BigInteger.valueOf(((long) charArray[0]) << 7); BigInteger x = BigInteger.valueOf(((long) charArray[0]) << 7);
BigInteger m = new BigInteger("1000003"); BigInteger m = new BigInteger("1000003");
BigInteger mask = new BigInteger("2").pow(hashBitCount).subtract(new BigInteger("1")); BigInteger mask = new BigInteger("2").pow(hashBitCount).subtract(new BigInteger("1"));
long wordSum = 0; long sum = 0;
for (char c : charArray) { for (char c : charArray) {
wordSum += (long)c; sum += c;
} }
x = x.multiply(m).xor(BigInteger.valueOf(wordSum)).and(mask); x = x.multiply(m).xor(BigInteger.valueOf(sum)).and(mask);
x = x.xor(new BigInteger(String.valueOf(word.length()))); x = x.xor(new BigInteger(String.valueOf(word.length())));
if (x.equals(new BigInteger("-1"))) { if (x.equals(new BigInteger("-1"))) {
x = new BigInteger("-2"); x = new BigInteger("-2");
Expand All @@ -159,15 +169,16 @@ private int hammingDistance(String simHash1, String simHash2) {
return -1; return -1;
} }
int distance = 0; int distance = 0;
for (int i = 0; i < simHash1.length(); i++) { int len = simHash1.length();
for (int i = 0; i < len; i++) {
if (simHash1.charAt(i) != simHash2.charAt(i)) { if (simHash1.charAt(i) != simHash2.charAt(i)) {
distance++; distance++;
} }
} }
return distance; return distance;
} }


public static void main(String[] args) { public static void main(String[] args) throws Exception{
String text1 = "我爱购物"; String text1 = "我爱购物";
String text2 = "我爱读书"; String text2 = "我爱读书";
String text3 = "他是黑客"; String text3 = "他是黑客";
Expand Down

0 comments on commit 0715190

Please sign in to comment.