# Text Similarity
(by Tevfik Aytekin)

In [2]:
from nltk.corpus import wordnet as wn
import nltk
from nltk.corpus import gutenberg, brown
import numpy as np
from scipy.sparse import csr_matrix
from nltk.tokenize import RegexpTokenizer
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize 
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/tevfik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# WordNet

### Lexical Matrix
table taken from [https://wordnetcode.princeton.edu/5papers.pdf](https://wordnetcode.princeton.edu/5papers.pdf)

<img src="./images/lexical_matrix.png" style="width: 400px;"/>


The word meaning $M_1$ in above table can be represented by the set of word forms that can be used to express it: {F1, F2, . . . }. These sets are called synonym sets (or simply synsets).

### WordNet Hierarchy
Below is a simplified illustration of the hierarchy of wordnet.

<img src="./images/wordnet_hierarchy.png" style="width: 400px;"/>


### A word is a set of meanings
wn.synsets('word') gives these meanings.

In [4]:
wn.synsets("car")

[Synset('car.n.01'),
 Synset('car.n.02'),
 Synset('car.n.03'),
 Synset('car.n.04'),
 Synset('cable_car.n.01')]

### Synsets

A synset is a set of synonyms (word forms). Each synset corresponds to a concept/meaning. The nodes in the WordNet hierarchy corresponds to synsets. A synset is identified with a 3-part name of the form: word.pos.nn. For example, 'car.n.01' means the first meaning of 'car' used as a noun.

In [5]:
print(wn.synset('car.n.01').definition())

a motor vehicle with four wheels; usually propelled by an internal combustion engine


In [6]:
print(wn.synset('car.n.02').definition())

a wheeled vehicle adapted to the rails of railroad


Lemmas correspond to word forms.

In [7]:
wn.synset('car.n.01').lemma_names()

['car', 'auto', 'automobile', 'machine', 'motorcar']

In [8]:
wn.synset('car.n.01').lemmas()

[Lemma('car.n.01.car'),
 Lemma('car.n.01.auto'),
 Lemma('car.n.01.automobile'),
 Lemma('car.n.01.machine'),
 Lemma('car.n.01.motorcar')]

In [9]:
wn.synset('car.n.02').lemma_names()

['car', 'railcar', 'railway_car', 'railroad_car']

Hypernyms anf hyponyms

In linguistics, hyponymy means a subtype and a hypernym means a supertype

In [10]:
wn.synset('car.n.01').hypernyms()

[Synset('motor_vehicle.n.01')]

In [11]:
wn.synset('car.n.01').hyponyms()

[Synset('ambulance.n.01'),
 Synset('beach_wagon.n.01'),
 Synset('bus.n.04'),
 Synset('cab.n.03'),
 Synset('compact.n.03'),
 Synset('convertible.n.01'),
 Synset('coupe.n.01'),
 Synset('cruiser.n.01'),
 Synset('electric.n.01'),
 Synset('gas_guzzler.n.01'),
 Synset('hardtop.n.01'),
 Synset('hatchback.n.01'),
 Synset('horseless_carriage.n.01'),
 Synset('hot_rod.n.01'),
 Synset('jeep.n.01'),
 Synset('limousine.n.01'),
 Synset('loaner.n.02'),
 Synset('minicar.n.01'),
 Synset('minivan.n.01'),
 Synset('model_t.n.01'),
 Synset('pace_car.n.01'),
 Synset('racer.n.02'),
 Synset('roadster.n.01'),
 Synset('sedan.n.01'),
 Synset('sport_utility.n.01'),
 Synset('sports_car.n.01'),
 Synset('stanley_steamer.n.01'),
 Synset('stock_car.n.01'),
 Synset('subcompact.n.01'),
 Synset('touring_car.n.01'),
 Synset('used-car.n.01')]

In [12]:
wn.synset('car.n.01').root_hypernyms()

[Synset('entity.n.01')]

## Synonymy

### Path Similarity
path_similarity assigns a score in the range 0–1 based on the shortest path that connects the concepts in the hypernym hierarchy (-1 is returned in those cases where a path cannot be found)

In [13]:
right = wn.synset('right_whale.n.01')
orca = wn.synset('orca.n.01')
minke = wn.synset('minke_whale.n.01')
tortoise = wn.synset('tortoise.n.01')
novel = wn.synset('novel.n.01')

In [14]:
print(right.path_similarity(minke))
print(right.path_similarity(orca))
print(right.path_similarity(tortoise))
print(right.path_similarity(novel))


0.25
0.16666666666666666
0.07692307692307693
0.043478260869565216


<img src="./images/wordnet_hierarchy.png" style="width: 400px;"/>

In [15]:
motorcar = wn.synset('car.n.01')
compact = wn.synset('compact.n.03')
hatchback = wn.synset('hatchback.n.01')
print(motorcar.path_similarity(compact))

0.5


In [16]:
print(hatchback.path_similarity(compact))

0.3333333333333333


In [17]:
print(hatchback.path_similarity(hatchback))

1.0


### Automated ways for finding synonyms

WordNet is constructed manually by experts of linguistics. There is also the computational approach to semantics. Below we will look at one such approach for finding synonyms. The approach relies on the below fundamental hypothesis:
<br><br>
<center><b>Distributional Hypothesis: similar words appear in similar contexts.</b></center>
<br><br>
We will first need to build a corpus and a co-occurrence matrix.



In [18]:
def build_corpus(text):
    """ 
  
    Parameters: 
    text (string): A (long) string 
  
    Returns: 
    words: A list of unique word names.
    word_to_index: a mapping from word names to integers.
    index_to_word: a mapping from integers to word names.
  
    """
    porter = nltk.PorterStemmer()

    tokenizer = RegexpTokenizer(r'\w+')
    words = [] 
    for i in sent_tokenize(text): 
        for j in tokenizer.tokenize(i):
            if (j.lower() not in stop_words):
                words.append(j.lower())
    words = np.unique(words)

    
    #porter = nltk.PorterStemmer()
    #words = [porter.stem(t) for t in words]
    wn_lemma = nltk.WordNetLemmatizer()
    words = [wn_lemma.lemmatize(t) for t in words]
    
    words = np.unique(words)


    
    word_to_index = {}
    index_to_word = {}
    counter = 0;
    for w in words:
        word_to_index[w] = counter
        index_to_word[counter] = w
        counter += 1  
    return words, word_to_index, index_to_word

In [19]:
text = "This is data mining course cmp5101. It is about data mining. I like it so much."
words, word_to_index, index_to_word = build_corpus(text)

In [20]:
corpus_size = len(words)
print(corpus_size)
print(words)
print(word_to_index)
print(index_to_word)

6
['cmp5101' 'course' 'data' 'like' 'mining' 'much']
{'cmp5101': 0, 'course': 1, 'data': 2, 'like': 3, 'mining': 4, 'much': 5}
{0: 'cmp5101', 1: 'course', 2: 'data', 3: 'like', 4: 'mining', 5: 'much'}


In [21]:
text = brown.raw()
words, word_to_index, index_to_word = build_corpus(text)
corpus_size = len(words)
print(corpus_size)

words[1100:1110]

37011


array(['absolutely', 'absoluteness', 'absolution', 'absorb', 'absorbed',
       'absorbency', 'absorbent', 'absorber', 'absorbing', 'absorbs'],
      dtype='<U21')

In [22]:
def build_co_matrix2(text, words, word_to_index, window=1):
    """ 
    Build a co-occurrence matrix 
    
    Parameters: 
    text (string): A long string to be split into sentences.
    words: A list of unique word names.
    word_to_index: a mapping from word names to integers.
    window: The size of the context window.
  
    Returns: 
    co_matrix: ndarray 
  
    """
    porter = nltk.PorterStemmer()
    wn_lemma = nltk.WordNetLemmatizer()

    tokenizer = RegexpTokenizer(r'\w+')
    corpus_size = len(words)
    co_matrix = np.zeros((corpus_size,corpus_size),dtype=int)
    for s in sent_tokenize(text): 
        sent = [] 
        for w in tokenizer.tokenize(s):   
            if w.lower() not in stop_words:      
                sent.append(wn_lemma.lemmatize(w.lower()))
        for i, w in enumerate(sent):
            for j in range(max(i-window,0),min(i+window+1,len(sent))):
                co_matrix[word_to_index[w],word_to_index[sent[j]]] += 1
        np.fill_diagonal(co_matrix,0)
    return co_matrix

In [23]:
def build_co_matrix(text, window=1):
    tokenizer = RegexpTokenizer(r'\w+')
    wn_lemma = nltk.WordNetLemmatizer()

    counter = 0
    co_matrix = pd.DataFrame();
    for s in sent_tokenize(text): 
        sent = [] 
        for w in tokenizer.tokenize(s):  
            if w.lower() not in stop_words:      
                sent.append(wn_lemma.lemmatize(w.lower()))
        for i, w in enumerate(sent):
            for j in range(max(i-window,0),min(i+window+1,len(sent))):
                if w == sent[j]:# skip the word itself
                    co_matrix.loc[w,sent[j]] = 0
                elif (w in co_matrix.index and sent[j] in co_matrix.columns) and not np.isnan(co_matrix.loc[w,sent[j]]):
                    co_matrix.loc[w,sent[j]] += 1
                else:
                    co_matrix.loc[w,sent[j]] = 1
    co_matrix.fillna(0, inplace=True)
    return co_matrix

How tokenization with regex works

In [24]:
text = "This is data mining course cmp5101. It is about data mining. I like it so much."
tokenizer = RegexpTokenizer(r'\w+')
for w in tokenizer.tokenize(text):  
    print(w)

This
is
data
mining
course
cmp5101
It
is
about
data
mining
I
like
it
so
much


In [25]:
matrix = build_co_matrix(text, 2)
matrix

Unnamed: 0,data,mining,course,cmp5101,like,much
data,0.0,2.0,1.0,0.0,0.0,0.0
mining,2.0,0.0,1.0,1.0,0.0,0.0
course,1.0,1.0,0.0,1.0,0.0,0.0
cmp5101,0.0,1.0,1.0,0.0,0.0,0.0
like,0.0,0.0,0.0,0.0,0.0,1.0
much,0.0,0.0,0.0,0.0,1.0,0.0


In [26]:
words, word_to_index, index_to_word = build_corpus(text)
matrix = build_co_matrix2(text, words, word_to_index, 2)
matrix

array([[0, 1, 0, 0, 1, 0],
       [1, 0, 1, 0, 1, 0],
       [0, 1, 0, 0, 2, 0],
       [0, 0, 0, 0, 0, 1],
       [1, 1, 2, 0, 0, 0],
       [0, 0, 0, 1, 0, 0]])

In [53]:
text = brown.raw()
words, word_to_index, index_to_word = build_corpus(text)
co_matrix = build_co_matrix2(text, words, word_to_index, 5)
print(co_matrix.shape)
print(co_matrix)

(37011, 37011)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


### Cosine Similarity:
Intuition: Dot product increases if both pairs have the same sign and decreases if pairs have different signs (similar to correlation, actually Pearson correlation is just cosine similarity of the mean centered vectors). Division by the norms is necessary to penalize vectors which has large values.

In [54]:
co_matrix[:,0][np.nonzero(co_matrix[:,0])]

array([  2,   1,   1,   1,   1,   2,   1,   1,   2,  32,   2,   1,   2,
         3,   1,   4,   1,   1,   2,  13,   2,   2,   2,   7,   1,   2,
         1,  10,   1,   1,  17,   2,   1,   4,   1,   1,   1,   1,   1,
         1,   6,   4,   1,   1,   1,   1,   1,   1,   1,  19,   1,   1,
         1,   1,   1,   3,   1,   1,   1,   2,   1,   1,   1,   1,   3,
         1,   5,  16,   3,   2,   1,   1,   3,   2,   1,   4,   1,   1,
        33, 155,   1,   1,   2,   1,   1,   1,   1,   1,   1,   1,   2,
         3,   3,   1,   2,   1,   1,   1,   1,   1,   2,   1,   1,   1,
         1,   1,   1,   1,   1,   2,   1,   4,   1,   1,   1,   1,   1,
         1,   1,   3,   1,   2,   2,   6,   1,   1,   1,   1,   1,   1,
         1,   1,   1,   1,   5,   1,   1,   1,   1,   1,   1,   1,   4,
         1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   2,   1,   1,
        26,   1,   1,   3,   1,   1,   3,   1,   2,   4,   1,   1,   1,
         1,   5,   1,   1,   3,   1,   9,   1,   1,   2,   1,   

In [29]:
# Finds cosine similarity between two vectors a and b
def cosine(a, b):
    dot = np.dot(a, b)
    norma = np.linalg.norm(a)
    normb = np.linalg.norm(b)
    return dot / (norma * normb)

Find most similar words to the target word using cosine similarity

In [55]:
target = word_to_index['book']
target

4646

In [56]:
word_vector = co_matrix[target,:]
print(word_vector.shape)
word_vector[:10]

(37011,)


array([0, 0, 2, 0, 0, 0, 0, 0, 0, 0])

In [57]:
word_vector = np.reshape(word_vector,(word_vector.size,1 ))
word_vector.shape

(37011, 1)

In [33]:
sims = np.dot(word_vector.T,co_matrix)
sims = sims[0,:]
sims

array([ 38012,   8980, 130748, ...,    971,    117,    744])

In [34]:
sims.argsort()[::-1][:10]

array([18209, 22508,  6051, 33447, 22509, 22746, 35325, 35322, 26780,
       25405])

In [35]:
index_to_word[6051]

'cc'

In [38]:
def incremental_row_norms(matrix):
    row_norms = []
    # Iterate over each row in the matrix
    for row in matrix:
        # Compute the Euclidean norm (L2 norm) of the row
        row_norm = np.sqrt(np.dot(row, row))
        row_norms.append(row_norm)
    return row_norms

In [40]:
norms = incremental_row_norms(co_matrix.T)
norms

[157.87336697492708,
 40.124805295477756,
 505.8932693760612,
 4.69041575982343,
 2.8284271247461903,
 2.8284271247461903,
 10.677078252031311,
 2.449489742783178,
 2.449489742783178,
 8.94427190999916,
 2.8284271247461903,
 4.47213595499958,
 9.643650760992955,
 2.449489742783178,
 2.8284271247461903,
 7.3484692283495345,
 2.0,
 2.8284271247461903,
 2.449489742783178,
 17.944358444926362,
 2.23606797749979,
 9.899494936611665,
 2.449489742783178,
 2.8284271247461903,
 2.449489742783178,
 2.449489742783178,
 10.954451150103322,
 2.8284271247461903,
 4.0,
 5.830951894845301,
 3.4641016151377544,
 2.0,
 2.449489742783178,
 4.898979485566356,
 4.69041575982343,
 2.449489742783178,
 26.40075756488817,
 3.4641016151377544,
 1223.6314804711426,
 333.42615374322395,
 167.91962362987834,
 21.18962010041709,
 2.8284271247461903,
 4.898979485566356,
 2.8284271247461903,
 4.0,
 4.47213595499958,
 2.8284271247461903,
 2.449489742783178,
 31.016124838541646,
 21.42428528562855,
 10.862780491200215,

In [41]:
norm_sims = np.divide(sims,norms)

In [42]:
norm_sims.argsort()[::-1][:10]

array([ 4646, 13381,  8258, 19522, 23159,  5911, 20493, 14972, 12904,
       20579])

In [43]:
index_to_word[4646]

'book'

In [46]:
index_to_word[19522]

'line'

In [58]:
word_vector = co_matrix[word_to_index['book'],:]
word_vector = np.reshape(word_vector,(1,word_vector.size))
sims = np.dot(word_vector,co_matrix)
sims = sims[0,:]
norm_sims = np.divide(sims,norms)
top10 = norm_sims.argsort()[::-1][:10]
top10

array([35325, 26780, 35324, 35326, 35322,  1210, 22746,  4129, 16440,
        6053])

In [59]:
for i in range(len(top10)):
    print(index_to_word[top10[i]])

vbn
rb
vbg
vbz
vb
according
np
beyond
hvg
cd


In [60]:
word_vector = co_matrix[word_to_index['friend'],:]
word_vector = np.reshape(word_vector,(1,word_vector.size))
sims = np.dot(word_vector,co_matrix)
sims = sims[0,:]
norm_sims = np.divide(sims,norms)
top10 = norm_sims.argsort()[::-1][:10]
top10

array([35325, 26780, 35324, 35322,  1210,  6053,  1052,  2401, 35326,
        4129])

In [61]:
for i in range(len(top10)):
    print(index_to_word[top10[i]])

vbn
rb
vbg
vb
according
cd
abn
ap
vbz
beyond
