-
Notifications
You must be signed in to change notification settings - Fork 87
/
CosineSimilarityComparison.php
64 lines (54 loc) · 1.68 KB
/
CosineSimilarityComparison.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
<?php
declare(strict_types = 1);
namespace TextAnalysis\Comparisons;
use TextAnalysis\Interfaces\IDistance;
use TextAnalysis\Interfaces\ISimilarity;
/**
 * Cosine similarity between two token lists.
 *
 * Each token list is turned into a term => frequency map; the similarity is
 * the dot product of the two frequency vectors divided by the product of
 * their Euclidean lengths.
 *
 * @author yooper
 */
class CosineSimilarityComparison implements IDistance, ISimilarity
{
    /**
     * Compute the cosine similarity of two token lists.
     *
     * If either list is empty its frequency vector has zero length and the
     * cosine is mathematically undefined; 0.0 is returned in that case instead
     * of dividing by zero.
     *
     * @param array $text1 an array of tokens
     * @param array $text2 an array of tokens
     * @return float similarity of the two term-frequency vectors, 0.0 when
     *               either input has no tokens
     */
    public function similarity($text1, $text2)
    {
        $text1Freq = array_count_values($text1);
        $text2Freq = array_count_values($text2);

        $text1VectorSum = $this->vectorLength($text1Freq);
        $text2VectorSum = $this->vectorLength($text2Freq);

        // A zero-length vector would make the denominator 0: division by zero
        // throws DivisionByZeroError on PHP 8 and yields NAN/INF with a
        // warning on PHP 7.
        if ($text1VectorSum === 0.0 || $text2VectorSum === 0.0) {
            return 0.0;
        }

        // Only terms present in both documents contribute to the dot product,
        // so walk the smaller map and look each term up in the larger one.
        if (count($text1Freq) < count($text2Freq)) {
            $smaller = $text1Freq;
            $larger = $text2Freq;
        } else {
            $smaller = $text2Freq;
            $larger = $text1Freq;
        }

        $product = 0.0;
        foreach ($smaller as $term => $freq) {
            if (isset($larger[$term])) {
                $product += $freq * $larger[$term];
            }
        }

        return $product / ($text1VectorSum * $text2VectorSum);
    }

    /**
     * Cosine distance, defined as 1 minus the cosine similarity.
     *
     * @param array $text1 an array of tokens
     * @param array $text2 an array of tokens
     * @return float 1 - similarity($text1, $text2); 1.0 when either input has
     *               no tokens
     */
    public function distance($text1, $text2)
    {
        return 1 - $this->similarity($text1, $text2);
    }

    /**
     * Euclidean length of a term-frequency vector.
     *
     * @param array $frequencies term => occurrence count
     * @return float square root of the sum of squared counts (0.0 for an empty map)
     */
    private function vectorLength(array $frequencies): float
    {
        $sumOfSquares = 0;
        foreach ($frequencies as $freq) {
            $sumOfSquares += $freq ** 2;
        }

        return sqrt($sumOfSquares);
    }
}