# Text Similarity
(by Tevfik Aytekin)

In [57]:
from nltk.corpus import wordnet as wn
import nltk
from nltk.corpus import gutenberg, brown
import numpy as np
from scipy.sparse import csr_matrix
from nltk.tokenize import RegexpTokenizer
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize 
from sklearn.feature_extraction.text import CountVectorizer

# WordNet

### Lexical Matrix
table taken from [https://wordnetcode.princeton.edu/5papers.pdf](https://wordnetcode.princeton.edu/5papers.pdf)

<img src="images/lexical_matrix.png" style="width: 400px;"/>


The word meaning $M_1$ in above table can be represented by the set of word forms that can be used to express it: {F1, F2, . . . }. These sets are called synonym sets (or simply synsets).

### WordNet Hierarchy
Below is a simplified illustration of the hierarchy of wordnet.

<img src="images/wordnet_hierarchy.png" style="width: 400px;"/>


### A word is a set of meanings
wn.synsets('word') gives these meanings.

In [2]:
wn.synsets("car")

[Synset('car.n.01'),
 Synset('car.n.02'),
 Synset('car.n.03'),
 Synset('car.n.04'),
 Synset('cable_car.n.01')]

### Synsets

A synset is a set of synonyms (word forms). Each synset corresponds to a concept/meaning. The nodes in the WordNet hierarchy corresponds to synsets. A synset is identified with a 3-part name of the form: word.pos.nn. For example, 'car.n.01' means the first meaning of 'car' used as a noun.

In [3]:
print(wn.synset('car.n.01').definition())

a motor vehicle with four wheels; usually propelled by an internal combustion engine


Lemmas correspond to word forms.

In [4]:
wn.synset('car.n.01').lemma_names()

['car', 'auto', 'automobile', 'machine', 'motorcar']

In [5]:
wn.synset('car.n.01').lemmas()

[Lemma('car.n.01.car'),
 Lemma('car.n.01.auto'),
 Lemma('car.n.01.automobile'),
 Lemma('car.n.01.machine'),
 Lemma('car.n.01.motorcar')]

Hypernyms anf hyponyms

In [6]:
wn.synset('car.n.01').hypernyms()

[Synset('motor_vehicle.n.01')]

In [7]:
wn.synset('car.n.01').hyponyms()

[Synset('ambulance.n.01'),
 Synset('beach_wagon.n.01'),
 Synset('bus.n.04'),
 Synset('cab.n.03'),
 Synset('compact.n.03'),
 Synset('convertible.n.01'),
 Synset('coupe.n.01'),
 Synset('cruiser.n.01'),
 Synset('electric.n.01'),
 Synset('gas_guzzler.n.01'),
 Synset('hardtop.n.01'),
 Synset('hatchback.n.01'),
 Synset('horseless_carriage.n.01'),
 Synset('hot_rod.n.01'),
 Synset('jeep.n.01'),
 Synset('limousine.n.01'),
 Synset('loaner.n.02'),
 Synset('minicar.n.01'),
 Synset('minivan.n.01'),
 Synset('model_t.n.01'),
 Synset('pace_car.n.01'),
 Synset('racer.n.02'),
 Synset('roadster.n.01'),
 Synset('sedan.n.01'),
 Synset('sport_utility.n.01'),
 Synset('sports_car.n.01'),
 Synset('stanley_steamer.n.01'),
 Synset('stock_car.n.01'),
 Synset('subcompact.n.01'),
 Synset('touring_car.n.01'),
 Synset('used-car.n.01')]

In [8]:
wn.synset('car.n.01').root_hypernyms()

[Synset('entity.n.01')]

## Synonymy

### Path Similarity
path_similarity assigns a score in the range 0–1 based on the shortest path that connects the concepts in the hypernym hierarchy (-1 is returned in those cases where a path cannot be found)

In [9]:
right = wn.synset('right_whale.n.01')
orca = wn.synset('orca.n.01')
minke = wn.synset('minke_whale.n.01')
tortoise = wn.synset('tortoise.n.01')
novel = wn.synset('novel.n.01')

In [10]:
print(right.path_similarity(minke))
print(right.path_similarity(orca))
print(right.path_similarity(tortoise))
print(right.path_similarity(novel))


0.25
0.16666666666666666
0.07692307692307693
0.043478260869565216


<img src="images/wordnet_hierarchy.png" style="width: 400px;"/>

In [11]:
motorcar = wn.synset('car.n.01')
compact = wn.synset('compact.n.03')
hatchback = wn.synset('hatchback.n.01')
print(motorcar.path_similarity(compact))

0.5


In [12]:
print(hatchback.path_similarity(compact))

0.3333333333333333


In [13]:
print(hatchback.path_similarity(hatchback))

1.0


# Automated ways for finding synonyms

WordNet is constructed manually by experts of linguistics. There is also the computational approach to semantics. Below we will look at one such approach for finding synonyms. The approach relies on the below fundamental hypothesis:
<br><br>
<center><b>Distributional Hypothesis: similar words appear in similar contexts.</b></center>
<br><br>
We will first need to build a corpus and a co-occurrence matrix.

In [55]:
def build_corpus(text):
    """ 
  
    Parameters: 
    text (string): A (long) string 
  
    Returns: 
    words: A list of unique word names.
    word_to_index: a mapping from word names to integers.
    index_to_word: a mapping from integers to word names.
  
    """
    porter = nltk.PorterStemmer()

    tokenizer = RegexpTokenizer(r'\w+')
    words = [] 
    for i in sent_tokenize(text): 
        for j in tokenizer.tokenize(i):
            words.append(j.lower())
    words = np.unique(words)

    porter = nltk.PorterStemmer()
    words = [porter.stem(t) for t in words]
    words = np.unique(words)
    
    word_to_index = {}
    index_to_word = {}
    counter = 0;
    for w in words:
        word_to_index[w] = counter
        index_to_word[counter] = w
        counter += 1  
    return words, word_to_index, index_to_word

In [15]:
text = "This is data mining course cmp3004. It is about data mining. I like it so much."
words, word_to_index, index_to_word = build_corpus(text)

In [16]:
corpus_size = len(words)
print(words)
print(word_to_index)

['about' 'cmp3004' 'cours' 'data' 'i' 'is' 'it' 'like' 'mine' 'much' 'so'
 'thi']
{'about': 0, 'cmp3004': 1, 'cours': 2, 'data': 3, 'i': 4, 'is': 5, 'it': 6, 'like': 7, 'mine': 8, 'much': 9, 'so': 10, 'thi': 11}


In [59]:
text = brown.raw()
words, word_to_index, index_to_word = build_corpus(text)
corpus_size = len(words)
print(corpus_size)
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(text)
print("number of tokens: ", len(tokens))
words[:1000]

27114
number of tokens:  2084675


array(['0', '00', '000', '001', '002', '005', '01', '014', '018', '02',
       '025', '027', '03', '035', '039', '04', '040', '043', '046', '05',
       '054', '06', '060', '0600', '062', '065', '07', '075', '076', '08',
       '080', '081', '082', '083', '09', '090', '0c', '0f', '1', '10',
       '100', '1000', '1001', '101', '101b', '102', '1020', '1024', '103',
       '104', '1040', '1040a', '1044', '105', '106', '1065', '1066',
       '1068', '107', '108', '109', '10th', '11', '110', '1105', '111',
       '113', '114', '115', '1159', '116', '1162', '117', '1184', '119',
       '11th', '12', '120', '1200', '121', '1213', '1215', '122', '1223',
       '123', '1231', '124', '125', '125th', '126', '127', '128', '129',
       '1290', '1298', '12a', '12th', '13', '130', '1300', '1307', '1310',
       '1311', '132', '133', '135', '137', '138', '139', '13th', '14',
       '140', '1409', '141', '1416', '142', '143', '144', '145', '1450',
       '1453', '147', '1479', '148', '149', '1492', '

In [94]:
def build_co_matrix2(text, words, word_to_index, window=1):
    """ 
    Build a co-occurrence matrix 
    
    Parameters: 
    text (string): A long string to be split into sentences.
    words: A list of unique word names.
    word_to_index: a mapping from word names to integers.
    window: The size of the context window.
  
    Returns: 
    co_matrix: ndarray 
  
    """
    porter = nltk.PorterStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    corpus_size = len(words)
    co_matrix = np.zeros((corpus_size,corpus_size),dtype=int)
    for s in sent_tokenize(text): 
        sent = [] 
        for w in tokenizer.tokenize(s):        
            sent.append(porter.stem(w.lower()))
        for i, w in enumerate(sent):
            for j in range(max(i-window,0),min(i+window+1,len(sent))):
                co_matrix[word_to_index[w],word_to_index[sent[j]]] += 1
        np.fill_diagonal(co_matrix,0)
    return co_matrix

In [61]:
def build_co_matrix(text, window=1):
    tokenizer = RegexpTokenizer(r'\w+')
    counter = 0
    co_matrix = pd.DataFrame();
    for s in sent_tokenize(text): 
        sent = [] 
        for w in tokenizer.tokenize(s):        
            sent.append(w.lower())
        for i, w in enumerate(sent):
            for j in range(max(i-window,0),min(i+window+1,len(sent))):
                if w == sent[j]:# skip the word itself
                    co_matrix.loc[w,sent[j]] = 0
                elif (w in co_matrix.index and sent[j] in co_matrix.columns) and not np.isnan(co_matrix.loc[w,sent[j]]):
                    co_matrix.loc[w,sent[j]] += 1
                else:
                    co_matrix.loc[w,sent[j]] = 1
    co_matrix.fillna(0, inplace=True)
    return co_matrix

How tokenization with regex works

In [89]:
text = "This is data mining course cmp3004. It is about data mining. I like it so much."
tokenizer = RegexpTokenizer(r'\w+')
for w in tokenizer.tokenize(text):  
    print(w)

This
is
data
mining
course
cmp3004
It
is
about
data
mining
I
like
it
so
much


In [90]:
matrix = build_co_matrix(text, 2)
matrix

Unnamed: 0,this,is,data,mining,course,cmp3004,it,about,i,like,so,much
this,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
is,1.0,0.0,2.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
data,1.0,2.0,0.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
mining,0.0,1.0,2.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
course,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
cmp3004,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
it,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
about,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
i,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
like,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [95]:
words, word_to_index, index_to_word = build_corpus(text)
matrix = build_co_matrix2(text, words, word_to_index, 5)
matrix

array([[0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0],
       [0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1],
       [0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1],
       [1, 1, 1, 0, 0, 2, 1, 0, 2, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0],
       [1, 1, 1, 2, 0, 0, 1, 0, 2, 0, 0, 1],
       [1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0],
       [0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0],
       [1, 1, 1, 2, 0, 2, 1, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0],
       [0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0],
       [0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0]])

In [96]:
text = brown.raw()
words, word_to_index, index_to_word = build_corpus(text)
co_matrix = build_co_matrix2(text, words, word_to_index, 5)
print(co_matrix.shape)
print(co_matrix)

(27114, 27114)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


### Cosine Similarity:
Intuition: Dot product increases if both pairs have the same sign and decreases if pairs have different signs (similar to correlation, actually Pearson correlation is just cosine similarity of the mean centered vectors). Division by the norms is necessary to penalize vectors which has large values.

In [99]:
co_matrix[:,0][np.nonzero(co_matrix[:,0])]

array([  2,   1,   1,   1,   1,   1,   1,   1,   2,  29,   2,   1,   2,
         3,   1,   2,  11,   2,   2,   1,   7,   1,   2,   1,   8,   1,
         1,  17,   2,   1,   3,   1,   1,   1,   1,   1,   1,   6,   4,
         1,   1,   1,   5,   7,   1,   9,   1,   1,   1,   1,   2,  10,
         1,   3,   1,   2,   1,   1,  32,   1,  10,   1,   3,   3,   1,
        10,   2,   1,   1,   1,   1,   3,   1,   1,   1,  21, 127,   1,
         1,   1,   1,   1,   1,   1,   3,   1,   1,   1,   1,   1,   2,
         1,   1,   1,   1,   1,   2,   1,   1,   1,   1,   1,   1,   1,
         1,   1,   2,   1,   1,   1,   3,   1,   8,   1,   1,   3,   1,
         1,   1,   1,   4,   1,   1,   1, 106,   1,   1,   1,   2,  10,
        15,   1,   3,   1,   1,   1,   1,   3,   4,   1,   1,   7,   1,
         4,   1,   3,   1,   4,   1,   1,   2,   1,   1,  14, 137,   1,
         2,   1,   5,   1,  31,   3,   1,   9,   2,   1,   1,   1,   5,
         1,   1,   1,   1,   4,   1,   1,   1,   4,   1,   1,   

In [100]:
# Finds cosine similarity between two vectors a and b
def cosine(a, b):
    dot = np.dot(a, b)
    norma = np.linalg.norm(a)
    normb = np.linalg.norm(b)
    return dot / (norma * normb)

Find most similar words to the target word using cosine similarity

In [101]:
target = word_to_index['book']
target

3621

In [102]:
word_vector = co_matrix[target,:]
word_vector.shape

(27114,)

In [103]:
word_vector = np.reshape(word_vector,(word_vector.size,1 ))
word_vector.shape

(27114, 1)

In [104]:
sims = np.dot(word_vector.T,co_matrix)
sims = sims[0,:]
sims

array([117120,  31834, 476572, ...,   3606,    938,   2328])

In [106]:
sims.argsort()[::-1][:10]

array([12229, 16505,  2343, 23958, 13061, 16875,  4753, 24294,  1783,
       24296])

In [107]:
index_to_word[12229]

'in'

In [108]:
norms = np.linalg.norm(co_matrix, axis=0)
norms

array([231.17742104,  61.57921727, 809.71599959, ...,   7.07106781,
         4.24264069,   6.        ])

In [109]:
word_to_index["and"]

1783

In [110]:
norms[345]

12.409673645990857

In [111]:
norm_sims = np.divide(sims,norms)

In [112]:
norm_sims.argsort()[::-1][:10]

array([ 3621,  8957, 13715, 19713,  5609, 26775, 16887, 14957, 17524,
       14157])

In [80]:
index_to_word[3621]

'book'

In [113]:
index_to_word[8957]


'famili'

In [114]:
word_vector = co_matrix[word_to_index['book'],:]
word_vector = np.reshape(word_vector,(1,word_vector.size))
sims = np.dot(word_vector,co_matrix)
sims = sims[0,:]
norm_sims = np.divide(sims,norms)
top10 = norm_sims.argsort()[::-1][:10]
top10

array([ 3621,  8957, 13715, 19713,  5609, 26775, 16887, 14957, 17524,
       14157])

In [115]:
for i in range(len(top10)):
    print(index_to_word[top10[i]])

book
famili
land
record
command
word
offic
mass
paper
line


In [117]:
word_vector = co_matrix[word_to_index['friend'],:]
word_vector = np.reshape(word_vector,(1,word_vector.size))
sims = np.dot(word_vector,co_matrix)
sims = sims[0,:]
norm_sims = np.divide(sims,norms)
top10 = norm_sims.argsort()[::-1][:10]
top10

array([ 9818,  9022, 15911, 17579, 15530, 22291,  9110, 22466, 22349,
       26037])

In [118]:
for i in range(len(top10)):
    print(index_to_word[top10[i]])

friend
father
mother
parent
mine
son
fellow
speech
soul
voic
