-
Notifications
You must be signed in to change notification settings - Fork 5
/
bigram-similarity.pl
59 lines (49 loc) · 1.48 KB
/
bigram-similarity.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/perl
# Dice Coefficient or Bigram String Similarity Calculation
# Written by Ye Kyaw Thu,
# Visiting Professor,
# Language and Semantic Technology Research Team (LST), NECTEC, Thailand
#
# How to use:
# $ perl bigram-similarity.pl <string1> <string2>
# e.g. perl ./bigram-similarity.pl ကိုကို ကိုကြီး
use strict;
use warnings;
use utf8;
# Apply the ":utf8" I/O layer to both standard streams so that character
# strings are written (and would be read) as UTF-8 rather than raw bytes.
binmode(STDOUT, ':utf8');
binmode(STDIN,  ':utf8');
# Split a string into its overlapping two-unit substrings (bigrams).
#
# Argument: the string to split.
# Returns:  the list substr($text, $i, 2) for every start offset $i from 0
#           to length($text) - 2, in order. Repeated bigrams are kept.
#           A string shorter than two units yields an empty list.
#
# "Unit" is whatever length()/substr() count for the given string.
sub bigram {
    my ($text) = @_;
    my $last_start = length($text) - 2;
    my @pairs;
    foreach my $offset (0 .. $last_start) {
        push @pairs, substr($text, $offset, 2);
    }
    return @pairs;
}
# Remove duplicate values from a list, keeping the first occurrence of each.
#
# Arguments: the list to de-duplicate (values are compared as hash keys,
#            i.e. by their string form).
# Returns:   the distinct values in their original order; in scalar context
#            the number of distinct values.
sub uniqArray {
    my %seen;
    my @firsts;
    foreach my $value (@_) {
        # Post-increment: false only the first time a value is met.
        next if $seen{$value}++;
        push @firsts, $value;
    }
    return @firsts;
}
# Print the Dice coefficient (bigram similarity) of two strings.
#
# The coefficient is computed over the sets of distinct bigrams:
#
#     2 * (bigrams common to both) / (distinct in first + distinct in second)
#
# Arguments: the two strings to compare. An undefined argument is treated
#            as the empty string.
# Output:    one line "Similarity Value:\t<value>\n" on STDOUT.
# Returns:   the result of print (true on success).
#
# When neither string produces a bigram (both shorter than two characters)
# the formula would be 0/0. Identical strings are then reported as 1 and
# different ones as 0, rather than dying on a division by zero.
sub calc_similarity {
    my ($inputStr1, $inputStr2) = @_;
    $inputStr1 = '' unless defined $inputStr1;
    $inputStr2 = '' unless defined $inputStr2;

    # Distinct bigrams of each input; array sizes give the set sizes.
    my @uniqStr1 = uniqArray(bigram($inputStr1));
    my @uniqStr2 = uniqArray(bigram($inputStr2));

    # Hash lookup keeps the intersection linear in the number of bigrams.
    my %inStr2;
    $inStr2{$_} = 1 for @uniqStr2;
    my $sharedCount = grep { $inStr2{$_} } @uniqStr1;

    my $totalCount = @uniqStr1 + @uniqStr2;
    my $similarityValue;
    if ($totalCount > 0) {
        $similarityValue = (2.0 * $sharedCount) / $totalCount;
    }
    else {
        $similarityValue = ($inputStr1 eq $inputStr2) ? 1 : 0;
    }

    return print("Similarity Value:\t$similarityValue\n");
}
# Entry point: compare the first two command-line arguments.
die "Usage: perl $0 <string1> <string2>\n" if @ARGV < 2;

# Command-line arguments arrive as raw bytes. Decode them as UTF-8 so that
# bigrams are built from characters rather than from the individual bytes
# of multi-byte characters. Arguments that are already character strings
# (e.g. perl was started with -CA) or are not valid UTF-8 are left as-is.
my @inputStrings = @ARGV[0, 1];
foreach my $inputString (@inputStrings) {
    utf8::decode($inputString) unless utf8::is_utf8($inputString);
}
calc_similarity(@inputStrings);