# Calculating similarity measures between queries and sample documents  

Objectives are to demonestrate: 
- How to preprocess text and embedd textual data
-  Compare the results of textual similarity between tradditional and dep learning based methods   


*** Important consideration: You are not expected to use any particular library or any particular method; the codes below are just meant to provide you with some help so you spend most of your time on the deep learning based model. Feel free to choose your own methods. the evaluation is based on being able to obtain results regardless of which method is being used.  

# Set-up and import data 

from google.colab import files
uploaded = files.upload()

In [1]:
import json 

with open('data/sample_repository.json') as in_file:
    test_data = json.load(in_file)

titles = [item[0] for item in test_data['data']]
documents = [item[1] for item in test_data['data']]

In [2]:
import pandas as pd
df = pd.DataFrame(list(zip(titles, documents)), columns =['titles', 'documents'])
df.tail()

Unnamed: 0,titles,documents
27,botany,"Botany, also called plant science(s), plant bi..."
28,Ford Bronco,The Ford Bronco is a model line of sport utili...
29,List of fruit dishes,Fruit dishes are those that use fruit as a pri...
30,Neuro linguistic programming,Neuro linguistic programming (NLP) is a pseudo...
31,fruit serving bowl,A fruit serving bowl is a round dish or contai...


# 1. TF-IDF

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sarthakkaushik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sarthakkaushik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
query = 'vegetables'
# query  terms: 'fruits' / 'vegetables' / 'healthy foods in Canada'

vectorizer = TfidfVectorizer(stop_words=stop_words)
vectors = vectorizer.fit_transform([query] + documents)

# Calculate the word frequency, and a measure of similarity (whatever you find it to be approperiate) of the search terms with each document
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
cv_fit = cv.fit_transform(df.documents)
word_list = cv.get_feature_names()
count_list = cv_fit.toarray().sum(axis=0)

wordcount_dict = dict(zip(word_list,count_list))

'''QUERY'''
query_vec = vectorizer.transform([query])

results = linear_kernel(vectors,query_vec).reshape((-1,))

# Print Top 10 results
for i in results.argsort()[-10:][::-1]:
    print(df.iloc[i,0])

Pomegranate Bhagwa
Major Market
Pink Onions
Pomegranate Arakta
About Us
Contact Us
White Onions
Video Gallery
Food classes
Nutrition


## Repeat the same task after some preprocessing 

You are not expected to do any specific type of cleaning/standardizations but at minimum use 2 techniques (e.g. lemmatization, removing punctuations and etc.) 

In [86]:
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

nltk.download('words')
nltk.download('punkt')
nltk.download('wordnet')

def pre_processing(text):
    # Remove Punctuation
    import re
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    
    # Tokenize Words
    words = word_tokenize(text.lower())
    
    # Remove Stop Words
    words = [w for w in words if w not in stopwords.words("english")]
    
    # 
    #words = [PorterStemmer().stem(w) for w in words]
    words = [WordNetLemmatizer().lemmatize(w, pos='v') for w in words]
    
    return words

[nltk_data] Downloading package words to
[nltk_data]     /Users/sarthakkaushik/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sarthakkaushik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sarthakkaushik/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [87]:
WordNetLemmatizer().lemmatize("extremely", pos='v')

'extremely'

In [88]:
PorterStemmer().stem("extremely")

'extrem'

In [89]:
pre_processing(documents[0])

['fresh',
 'pomegranate',
 'anushka',
 'avni',
 'international',
 'bhagwa',
 'premium',
 'pomegranate',
 'variety',
 'india',
 'deep',
 'red',
 'arils',
 'please',
 'red',
 'rugged',
 'skin',
 'enhance',
 'appearance',
 'whilst',
 'promote',
 'shelf',
 'life',
 'fruit',
 'bhagwa',
 'widely',
 'know',
 'soft',
 'seed',
 'dark',
 'red',
 'color',
 'extremely',
 'delicious',
 'package',
 'net',
 'weight',
 'box',
 '2',
 '5kg',
 '3',
 '00kg',
 '3',
 '5kg',
 'detail',
 'minimum',
 'weight',
 '180gm',
 'maximum',
 'weight',
 '400gm',
 'color',
 'arils',
 'dark',
 'cherry',
 'red',
 'taste',
 'sweet',
 'fruit',
 'count',
 'carton',
 '3',
 '50',
 'kg',
 'net',
 'wt',
 '9',
 'number',
 'pack',
 'per',
 'carton',
 '350',
 '400gms']

In [33]:
# You can clean, standar a lemmatizer
# To reduces words down to their simplest 'lemma' (e.g. helpful when dealing with plurals) 


Unnamed: 0,titles,documents,word_tokens,documents_stop_rem
0,Pomegranate Bhagwa,Fresh Pomegranate from Anushka Avni Internatio...,"[fresh, pomegranate, from, anushka, avni, inte...","[fresh, pomegranate, anushka, avni, internatio..."
1,Pomegranate Arakta,Fresh Pomegranate Arakta from Anushka Avni Int...,"[fresh, pomegranate, arakta, from, anushka, av...","[fresh, pomegranate, arakta, anushka, avni, in..."
2,About Us,About Us Anushka Avni International (AAI) take...,"[about, us, anushka, avni, international, (, a...","[us, anushka, avni, international, (, aai, ), ..."
3,Contact Us,About Us Anushka Avni International (AAI) take...,"[about, us, anushka, avni, international, (, a...","[us, anushka, avni, international, (, aai, ), ..."
4,White Onions,White Onions from Anushka Avni International F...,"[white, onions, from, anushka, avni, internati...","[white, onions, anushka, avni, international, ..."


In [34]:
print(df.documents.iloc[1,])

Fresh Pomegranate Arakta from Anushka Avni International This Pomegranate are bigger in size, sweet with soft seeds, bold red arils. It also possess glossy, attractive, dark red skin. Packaging: Net weight of box 2.5kg, 3.00kg, 3.5kg. Details: Minimum Weight 180gm, maximum weight 400gm Taste: Sweet Fruit count / carton (3.50 kg net wt.) 9 Numbers packed per carton: 350-400gms 10 Numbers packed per carton :290-320gms 12 Numbers packed per carton: 275-325gms 15 Numbers packed per carton: 225-275gms Load ability: 4400 cartons per container 20 pallets with 220 cartons per pallet Load ability: 5500 cartons per container Loading with No pallets Availability: January | February | March | April | July | August | September | October | Nov | Dec READ MORE


In [29]:
print(df.documents_stop_rem.iloc[1,])

['Fresh', 'Pomegranate', 'Arakta', 'Anushka', 'Avni', 'International', 'This', 'Pomegranate', 'bigger', 'size', ',', 'sweet', 'soft', 'seeds', ',', 'bold', 'red', 'arils', '.', 'It', 'also', 'possess', 'glossy', ',', 'attractive', ',', 'dark', 'red', 'skin', '.', 'Packaging', ':', 'Net', 'weight', 'box', '2.5kg', ',', '3.00kg', ',', '3.5kg', '.', 'Details', ':', 'Minimum', 'Weight', '180gm', ',', 'maximum', 'weight', '400gm', 'Taste', ':', 'Sweet', 'Fruit', 'count', '/', 'carton', '(', '3.50', 'kg', 'net', 'wt', '.', ')', '9', 'Numbers', 'packed', 'per', 'carton', ':', '350-400gms', '10', 'Numbers', 'packed', 'per', 'carton', ':290-320gms', '12', 'Numbers', 'packed', 'per', 'carton', ':', '275-325gms', '15', 'Numbers', 'packed', 'per', 'carton', ':', '225-275gms', 'Load', 'ability', ':', '4400', 'cartons', 'per', 'container', '20', 'pallets', '220', 'cartons', 'per', 'pallet', 'Load', 'ability', ':', '5500', 'cartons', 'per', 'container', 'Loading', 'No', 'pallets', 'Availability', ':'

In [None]:
#!pip install tfidf

# 2. Semantic matching using GloVe embeddings

In [None]:
#!pip install  gensim==4.0.1 # if you decide to use the gensim library and the sample codes below, you would need gensim version >=4.0.1 to be installed 
import gensim
print(gensim.__version__)

In [None]:
import logging
import json
import logging
from re import sub
from multiprocessing import cpu_count

import numpy as np

import gensim.downloader as api
from gensim.utils import simple_preprocess
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import SoftCosineSimilarity

In [None]:
import logging

# Initialize logging.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARNING)

In [None]:
import nltk

# Import and download stopwords from NLTK.
nltk.download('stopwords')  # Download stopwords list.
stopwords = set(nltk.corpus.stopwords.words("english"))

In [None]:
def preprocess(doc):
    # Tokenize, clean up input document string
    doc = sub(r'<img[^<>]+(>|$)', " image_token ", doc)
    # you may decide to add additional steps here 
    return [token for token in simple_preprocess(doc, min_len=0, max_len=float("inf")) if token not in stopwords]

In [None]:
# Load test data
with open('data/sample_repository.json') as in_file:
    test_data = json.load(in_file)

titles = [item[0] for item in test_data['data']]
documents = [item[1] for item in test_data['data']]

In [None]:
query_s = 'Your queries here'
 


# Preprocess the documents, including the query string
corpus = [preprocess(document) for document in documents]
query = preprocess(query_s)

In [None]:
# Download and load the GloVe word vector embeddings

if 'glove' not in locals():  # only load if not already in memory
    glove = api.load("glove-wiki-gigaword-50")

similarity_index = WordEmbeddingSimilarityIndex(glove)

In [None]:
# Build the term dictionary, TF-idf model
# Keep in mind that the search query must be in the dictionary as well, in case the terms do not overlap with the documents  
dictionary = Dictionary(corpus+[query])
tfidf = TfidfModel(dictionary=dictionary)

# Create the term similarity matrix. 
# The nonzero_limit enforces sparsity by limiting the number of non-zero terms in each column. 
# For my application, I got best results by removing the default value of 100
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf)  # , nonzero_limit=None)

In [None]:
# Compute similarity measure between the query and the documents.
query_tf = tfidf[dictionary.doc2bow(query)]

index = SoftCosineSimilarity(
            tfidf[[dictionary.doc2bow(document) for document in corpus]],
            similarity_matrix)

doc_similarity_scores = index[query_tf]

In [None]:
# Output the similarity scores for top 5/10 documents and interpreat the findings and compare the results 
 

# 3. BERT
Use a bert model to create sentence embeddings and calculate the similarity between queries and documents.

In [None]:
# compare the findings  