# Vector space model

---
**Author**: Marko Bajec

**Last update**: 13.4.2019

**Description**: in this example we show how to calculate TF-IDF based document representations.  

**Required libraries** (use pip3):
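* nltk (together with the NLTK data used below: tokenizer models, POS tagger, stopwords and WordNet, all installable with `nltk.download`)
* numpy
* texttable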

---
## Document corpus
Let's say we have the following 5 sentences representing the **document corpus**:
* d<sub>1</sub>: *Romeo and Juliet.*
* d<sub>2</sub>: *Juliet: O happy dagger!*
* d<sub>3</sub>: *Romeo died by dagger.*
* d<sub>4</sub>: *“Live free or die”, that’s the New-Hampshire’s motto.*
* d<sub>5</sub>: *Did you know, New-Hampshire is in New-England.*

**Question**: How close (relevant) are the above sentences to the following query: $q=$<span style="color:blue">*died, dagger*</span>?

## Python implementation
In the Python program below we calculate how relevant each document $d_i$ in the corpus is to the given query $q$, using different relevance measures:
* Dot product
* Cosine similarity
* Okapi method


### 1. Import required libraries 
Apart from importing the required libraries, this code snippet defines two functions, <code>get_wordnet_pos</code> and <code>doc_num</code>, both required later on in the code. 

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import re
import numpy
import math
from collections import Counter
from texttable import Texttable

# helper for the lemmatizer: translate a POS tag into a WordNet POS constant
def get_wordnet_pos(treebank_tag):
    """Return the WordNet part-of-speech constant matching ``treebank_tag``.

    The tag is classified by its first letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb.  Any other tag
    (including an empty string) falls back to ``'n'``, i.e. noun.
    """
    prefix_to_constant = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_to_constant:
        if treebank_tag.startswith(prefix):
            # Looked up only on a match, so unknown tags never touch WordNet.
            return getattr(wordnet, constant_name)
    # not J, V, N or R: use the default, i.e. "n" as noun
    return 'n'

# returns the label of a corpus entry (either "d<i>" or "q")
def doc_num(i):
    """Return the display label for the corpus entry with 1-based number ``i``.

    The entry whose number equals ``len(corpus)`` (the last one) is labelled
    ``"q"``; every other number ``i`` is labelled ``"d<i>"``.
    Reads the module-level ``corpus``.
    """
    is_last_entry = (i == len(corpus))
    return "q" if is_last_entry else "d{}".format(i)

### 2. Define corpus and query
Notice that query $q$ is treated as the last document in the **corpus**. This is to ensure that the same transformations are applied to $q$ as to the other documents in the corpus. 

In [None]:
# query
query = "died, dagger"

# the five corpus documents d1..d5
documents = [
    "Romeo and Juliet.",
    "Juliet: O happy dagger!",
    "Romeo died by dagger.",
    "'Live free or die'”, that’s the New-Hampshire’s motto.",
    "Did you know, New-Hampshire is in New-England.",
]

# document corpus: the query is stored as the last entry so that it goes
# through exactly the same processing as the documents
corpus = numpy.array(documents + [query])

### 3. Processing corpus documents
In this step, several transformations are performed on the corpus documents:
* documents are tokenized
* all letters are changed to lowercase
* punctuation and stopwords are removed
* tokens are lemmatized

In [None]:
# Pre-processing pipeline:
#   tokenize -> lowercase -> drop punctuation -> drop stopwords -> lemmatize
# All corpus entries (the query is the last one) are flattened into a single
# token list; the result is used to build the vocabulary.

# tokenize
text = []
for document in corpus:
    text.extend(nltk.word_tokenize(document))

# change words to lowercase
text = [x.lower() for x in text]

# remove punctuation
nonPunct = re.compile('.*[A-Za-z0-9].*')  # must contain a letter or digit
text_filtered = [w for w in text if nonPunct.match(w)]

# remove stopwords
# The stopword list is loaded once and turned into a set, instead of being
# re-read and scanned linearly for every single token.
english_stopwords = set(stopwords.words('english'))
text_filtered = [word for word in text_filtered if word not in english_stopwords]

# lemmatize
lemmatizer = WordNetLemmatizer()
wordsPOS = nltk.pos_tag(text_filtered)
# word -> first letter of its POS tag; if a word occurs several times, the tag
# of its last occurrence is the one that is kept
wordsPOSset = {word: tag[0] for word, tag in wordsPOS}

text_lemmatized = [
    lemmatizer.lemmatize(word, pos=get_wordnet_pos(wordsPOSset[word]))
    for word in text_filtered
]

print("Tokens without punctation and stopwords")
print(text_filtered)
print(" ")
print("Lemmatized tokens, vocabulary")
print(set(text_lemmatized))

### 4. Create TF matrix
Three matrices are created:
* TF matrix,
* normalized TF matrix (using Euclidean normalization) and
* TF-IDF matrix.

#### Euclidean normalization
<span style="color:darkred">
$$\large tf_{ij} = \frac{f_{ij}}{\sqrt{f_{1j}^2+f_{2j}^2+ ... + f_{|V|j}^2}}$$
</span>

#### TF-IDF
<span style="color:darkred">
$$\large w_{ij} = tf_{ij}\cdot idf_{i} = \frac{f_{ij}}{\sqrt{f_{1j}^2+f_{2j}^2+ ... + f_{|V|j}^2}}\cdot \log \frac{N}{df_i}$$
</span>

In [None]:
# Build and print three matrices with one row per corpus entry (the query is
# the last row) and one column per vocabulary term:
#   tf      - raw term frequencies
#   tf_norm - tf with every row scaled to Euclidean length 1
#   tfidf   - tf_norm weighted by log(N / df)

# get words counts
text_counts = Counter(text_lemmatized)
dist_words = set(text_counts)
num_dist_words = len(dist_words)
corpus_length = len(corpus)

# Fixed (sorted) column order: iterating the set directly would give a
# different column order from run to run.
vocabulary = sorted(dist_words)
term_column = {term: column for column, term in enumerate(vocabulary)}

english_stopwords = set(stopwords.words('english'))


def _document_terms(document):
    """Return the lemmatized terms of a single corpus entry.

    Applies the same steps, with the same regex, lemmatizer and word -> POS
    map, that produced ``text_lemmatized``, so every returned term is part of
    the vocabulary.
    """
    tokens = [token.lower() for token in nltk.word_tokenize(document)]
    return [
        lemmatizer.lemmatize(token, pos=get_wordnet_pos(wordsPOSset[token]))
        for token in tokens
        if nonPunct.match(token) and token not in english_stopwords
    ]


# create TF matrix
# Terms are counted per document on the token level.  A substring test against
# the raw text would only record presence (never a count above 1) and would
# also match lemmas that merely occur inside longer words.
tf = numpy.zeros((corpus_length, num_dist_words))
for i, document in enumerate(corpus):
    for term, count in Counter(_document_terms(document)).items():
        tf[i, term_column[term]] = count

# normalize TF matrix with Euclidean normalization
# sqr[i] is the Euclidean length of row i; rows without any term stay all-zero
# instead of turning into NaN through a division by zero.
sqr = numpy.linalg.norm(tf, axis=1)
row_lengths = sqr[:, numpy.newaxis]
tf_norm = numpy.divide(tf, row_lengths, out=numpy.zeros_like(tf), where=row_lengths > 0)

# create TF-IDF matrix
# NOTE: `idf` keeps its original name but holds the document frequency df of
# each term (number of corpus entries containing it); the actual weights
# log(N / df) are in `idf_weights`.  N counts the query as a document, because
# the query is stored as the last corpus entry.
idf = (tf > 0).sum(axis=0).astype(float)
idf_weights = numpy.zeros(num_dist_words)
has_documents = idf > 0
idf_weights[has_documents] = numpy.log(corpus_length / idf[has_documents])

tfidf = tf_norm * idf_weights

# print results

table = Texttable(0)
table.set_cols_align(["l"] + ["r"] * num_dist_words)
table.set_cols_valign(["m"] * (num_dist_words + 1))


def _print_matrix(title, matrix):
    """Print ``title`` followed by ``matrix`` drawn as a document x term table."""
    print(title)
    table.reset()
    table.header(["document"] + vocabulary)
    for row_index, row in enumerate(matrix):
        table.add_row([doc_num(row_index + 1)] + row.tolist())
    print(table.draw() + "\n")


_print_matrix("TF matrix", tf)
_print_matrix("Normalized TF matrix", tf_norm)
_print_matrix("TF-IDF matrix", tfidf)

### 5. Calculating relevance of documents to the query
Relevance can be calculated using different measures. Here we use:
* Dot product and
* Cosine similarity or distance.

#### Dot product
<span style="color:darkred">
$$\large sim(\mathbf{d}_j,\mathbf{q}) = \big \langle \mathbf{d}_j \cdot \mathbf{q} \big \rangle$$
</span>

#### Cosine similarity
<span style="color:darkred">
$$\large cosine(\mathbf{d}_j,\mathbf{q}) = \frac{\big \langle \mathbf{d}_j \cdot \mathbf{q} \big \rangle} 
{\Bigl\| \begin{matrix} \mathbf{d}_j \end{matrix} \Bigr\| \cdot 
\Bigl\| \begin{matrix} \mathbf{q} \end{matrix} \Bigr\|} =
\frac{\sum_{i=1}^{|V|}{w_{ij} \cdot w_{iq}}}
{\sqrt{\sum_{i=1}^{|V|}{w_{ij}^2}} \cdot \sqrt{\sum_{i=1}^{|V|}{w_{iq}^2}}}
$$
</span>

In [None]:
# calculate relevance of di to q

# The query is the last row of the TF-IDF matrix, all rows before it are the
# documents d1..dN.
query_vector = tfidf[corpus_length - 1]
document_vectors = tfidf[:corpus_length - 1]

# A) using dot product (di * q)
dot_product = document_vectors.dot(query_vector)

# B) using cosine similarity
# (the variable keeps the name `cosine_distance`, but the value is the cosine
# similarity: 1 means same direction, 0 means no shared terms)
norm_product = numpy.linalg.norm(document_vectors, axis=1) * numpy.linalg.norm(query_vector)
# An all-zero document or query vector gets similarity 0 instead of NaN.
cosine_distance = numpy.divide(
    dot_product,
    norm_product,
    out=numpy.zeros_like(dot_product),
    where=norm_product > 0,
)

# C) Okapi
# TODO: Okapi ranking is not implemented.  This section used to repeat the
# cosine computation of B) unchanged, so it has been removed.

print("Document relevance based on dot product and cosine similarity")
table1 = Texttable(0)
table1.set_cols_align(["l", "r", "r"])
table1.set_cols_valign(["m", "m", "m"])

# first column holds the document label (d1, d2, ...), not a method name
table1.header(["Document", "Dot product", "Cosine similarity"])
for i in range(0, corpus_length - 1):
    table1.add_row([doc_num(i + 1), dot_product[i], cosine_distance[i]])

print(table1.draw() + "\n")
