In [1]:
# This notebook generates semantic similarity scores using the GenSim library
# Read the comment at the top of each cell to learn how to use it
# This script is based on the tutorial provided on https://radimrehurek.com/gensim/tutorial.html

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from gensim import corpora, models, similarities
import pandas as pd

In [2]:
# This cell contains common functions used to interact with GenSim
# You don't have to change anything here

import os
import glob
from collections import defaultdict
import pickle

# Keys of the dict returned by buildIndexes() and loadIndexes(). saveIndexes() and
# loadIndexes() also use each key as the base name of its '.ser' file on disk.
dictionaryKey = "dictionary"  # gensim corpora.Dictionary
corpusKey = "corpus"  # bag-of-words corpus
tfidfKey = "tfidf"  # gensim TfidfModel
lsiKey = "lsi"  # gensim LsiModel
similarityIndexKey = "sim_index"  # gensim MatrixSimilarity index
id2docNameKey = "id2docName"  # only referenced from commented-out code in this notebook
docName2IdKey = "docName2Id"  # dict: document name -> document id

def toLsi(text, aDictionary, tfidf_index, lsi_index):
    """Transform a text into its LSI vector representation.

    The text is lower-cased and split on whitespace, converted to a
    bag-of-words with ``aDictionary.doc2bow``, then passed through
    ``tfidf_index`` and ``lsi_index`` (both applied with ``[...]``).

    Returns whatever ``lsi_index[...]`` yields for the TF-IDF vector.
    """
    tokens = text.lower().split()
    bagOfWords = aDictionary.doc2bow(tokens)
    return lsi_index[tfidf_index[bagOfWords]]

def toLsi2(text, indexes_map):
    """Convenience wrapper around toLsi that pulls the models out of an indexes dict.

    ``indexes_map`` must contain the entries stored under ``dictionaryKey``,
    ``tfidfKey`` and ``lsiKey``.
    """
    dictionary = indexes_map[dictionaryKey]
    tfidfModel = indexes_map[tfidfKey]
    lsiModel = indexes_map[lsiKey]
    return toLsi(text, dictionary, tfidfModel, lsiModel)

def getSimilarity(simResult, docName, docToIdMap):
    """Return the similarity score of ``docName`` within ``simResult``.

    ``simResult`` is an iterable of ``(docId, score)`` entries and
    ``docToIdMap`` maps document names to ids. The score of the first entry
    whose id matches is returned; ``0.0`` is returned when the document name
    is unknown (or mapped to -1) or when no entry carries its id.
    """
    targetId = docToIdMap.get(docName, -1)
    if targetId == -1:
        return 0.0

    # First matching entry wins; fall back to 0.0 when the id is absent.
    return next((entry[1] for entry in simResult if entry[0] == targetId), 0.0)

def getSimilarity2(simResult, docName, indexes_map):
    """Convenience wrapper around getSimilarity.

    Looks up the document-name -> id mapping stored under ``docName2IdKey``
    in ``indexes_map`` and delegates to getSimilarity.
    """
    nameToId = indexes_map[docName2IdKey]
    return getSimilarity(simResult, docName, nameToId)

def getRanking(sortedSimResult, docName, docToIdMap):
    """Return the 1-based rank of ``docName`` within ``sortedSimResult``.

    ``sortedSimResult`` is an iterable of ``(docId, score)`` entries, already
    ordered from best to worst. The rank is the position of the first entry
    whose id matches, capped at 101. Returns 0 when the document name is
    unknown (or mapped to -1) or when no entry carries its id.
    """
    rankCap = 101
    targetId = docToIdMap.get(docName, -1)
    if targetId != -1:
        for position, entry in enumerate(sortedSimResult, start=1):
            if entry[0] == targetId:
                return min(position, rankCap)
    return 0

def getRanking2(sortedSimResult, docName, indexes_map):
    """Convenience wrapper around getRanking.

    Looks up the document-name -> id mapping stored under ``docName2IdKey``
    in ``indexes_map`` and delegates to getRanking.
    """
    nameToId = indexes_map[docName2IdKey]
    return getRanking(sortedSimResult, docName, nameToId)

def sortSimilarities(similarities):
    """Pair each score with its position and sort by descending score.

    Returns a list of ``(index, score)`` tuples; entries with equal scores
    keep their original relative order. Note that the parameter name shadows
    the ``similarities`` module imported from gensim inside this function.
    """
    indexedScores = list(enumerate(similarities))
    indexedScores.sort(key=lambda pair: -pair[1])
    return indexedScores

def buildIndexes(sourceDir):
    """Build the GenSim dictionary, corpus, TF-IDF model, LSI model and similarity index.

    Every ``*.txt`` file directly inside ``sourceDir`` is treated as one
    document: its content is split on whitespace into tokens, and tokens that
    occur only once in the whole corpus are dropped before the models are
    trained.

    Args:
        sourceDir: directory containing the ``*.txt`` document files.

    Returns:
        A dict keyed by the module-level ``*Key`` constants holding the
        dictionary, the bag-of-words corpus, the TF-IDF model, the LSI model,
        the similarity index and the document-name -> document-id mapping.
        A document's name is its file's base name up to the first ".".

    Raises:
        ValueError: if ``sourceDir`` contains no ``*.txt`` file.
    """
    documents = []
    frequency = defaultdict(int)
    docName2Id = {}

    # glob returns files in arbitrary order; sorting makes the document ids
    # reproducible from one run (and one machine) to the next.
    docPaths = sorted(glob.glob(os.path.join(sourceDir, '*.txt')))
    if not docPaths:
        raise ValueError("No *.txt documents found in " + repr(sourceDir))

    for docId, docPath in enumerate(docPaths):
        # The context manager closes the handle even if reading fails.
        with open(docPath, 'r') as docFile:
            tokens = docFile.read().split()

        for aToken in tokens:
            frequency[aToken] += 1
        documents.append(tokens)

        # os.path.basename honours the platform's path separator, unlike
        # splitting on "/". The name still stops at the first "." as before.
        docName = os.path.basename(docPath).split(".")[0]
        docName2Id[docName] = docId

    # Drop tokens that appear only once in the whole corpus. A single filtering
    # pass replaces repeated list.remove() calls, which were quadratic.
    documents = [[aToken for aToken in aDoc if frequency[aToken] > 1]
                 for aDoc in documents]

    dictionary = corpora.Dictionary(documents)

    corpus = [dictionary.doc2bow(aDoc) for aDoc in documents]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary)
    corpus_lsi = lsi[corpus_tfidf]

    sim_index = similarities.MatrixSimilarity(corpus_lsi)

    indexes = {dictionaryKey: dictionary, corpusKey: corpus, tfidfKey: tfidf, lsiKey: lsi,
               similarityIndexKey: sim_index, docName2IdKey: docName2Id}
    return indexes


def saveIndexes(indexes, targetDir):
    """Persist every entry of an indexes dict (as built by buildIndexes) to disk.

    Each entry is written to ``targetDir + <key> + '.ser'``. ``targetDir`` is
    used as a plain string prefix, so it must end with a path separator when
    it designates a directory.

    Args:
        indexes: dict keyed by the module-level ``*Key`` constants.
        targetDir: path prefix under which the ``.ser`` files are written.
    """
    indexes[dictionaryKey].save(targetDir + dictionaryKey + '.ser')
    corpora.MmCorpus.serialize(targetDir + corpusKey + '.ser', indexes[corpusKey])
    indexes[tfidfKey].save(targetDir + tfidfKey + '.ser')
    indexes[lsiKey].save(targetDir + lsiKey + '.ser')
    indexes[similarityIndexKey].save(targetDir + similarityIndexKey + '.ser')
    # Close the file deterministically so the pickled mapping is fully flushed
    # to disk before the function returns.
    with open(targetDir + docName2IdKey + ".ser", "wb") as mappingFile:
        pickle.dump(indexes[docName2IdKey], mappingFile)
    
def loadIndexes(indexDir):
    """Load the indexes previously written by saveIndexes.

    Each entry is read from ``indexDir + <key> + '.ser'``. ``indexDir`` is
    used as a plain string prefix, so it must end with a path separator when
    it designates a directory.

    Args:
        indexDir: path prefix under which the ``.ser`` files were saved.

    Returns:
        A dict keyed by the module-level ``*Key`` constants holding the
        dictionary, corpus, TF-IDF model, LSI model, similarity index and the
        document-name -> document-id mapping.
    """
    indexes = {}
    indexes[dictionaryKey] = corpora.Dictionary.load(indexDir + dictionaryKey + '.ser')
    indexes[corpusKey] = corpora.MmCorpus(indexDir + corpusKey + '.ser')
    indexes[tfidfKey] = models.TfidfModel.load(indexDir + tfidfKey + '.ser')
    indexes[lsiKey] = models.LsiModel.load(indexDir + lsiKey + '.ser')
    indexes[similarityIndexKey] = similarities.MatrixSimilarity.load(indexDir + similarityIndexKey + '.ser')
    # SECURITY: pickle.load can execute arbitrary code; only load index files
    # that you produced yourself with saveIndexes.
    with open(indexDir + docName2IdKey + '.ser', "rb") as mappingFile:
        indexes[docName2IdKey] = pickle.load(mappingFile)
    return indexes

In [3]:
# If you don't have a GenSim index yet, run this cell to build one.
# Below is the path to the HomeDepot product files. These are used to build the indexes.
# NOTE(review): this is an absolute, machine-specific path -- adjust it before running.

myIndexes = buildIndexes("/Users/taklumbo/Ucl_assignments/IRDM/HomeDepot/data/lsi_all/")



In [4]:
# This saves the indexes to disk at the location of your choice.
# The path must end with "/" because saveIndexes appends the file names directly to it.

saveIndexes(myIndexes, "/Users/taklumbo/Ucl_assignments/IRDM/HomeDepot/lsi_index_all/")

In [3]:
# This loads the indexes from disk. Set the path to the location used when saving them.
# The path must end with "/" because loadIndexes appends the file names directly to it.

newIndexes = loadIndexes("/Users/taklumbo/Ucl_assignments/IRDM/HomeDepot/lsi_index_all/")

In [11]:
# Set this to the query file provided by HomeDepot (either train.csv or test.csv).
# NOTE(review): this is an absolute, machine-specific path -- adjust it before running.

queryFile = "/Users/taklumbo/Ucl_assignments/IRDM/HomeDepot/RawData/lsi_test_all_1.csv"
queryData = pd.read_csv(queryFile)

In [12]:
# Set the path below to the file in which you want to save the generated similarity scores.
# One CSV row is written per row of queryData: query id, product id, similarity score, rank.

# The with-block guarantees the output file is flushed and closed even if a query fails midway.
with open('/Users/taklumbo/Ucl_assignments/IRDM/HomeDepot/featureFiles/test_set/lsi_all_1.csv', 'w') as file:
    file.write('queryId,productId,sim_score_all,sim_rank_all\n')

    for i in range(0, len(queryData)):
        # Columns are addressed by position: 0 = query id, 1 = product id, 3 = search term
        # (presumably the HomeDepot CSV layout -- verify against your query file).
        searchTerm = queryData.iloc[i, 3]
        queryId = queryData.iloc[i, 0]
        productId = queryData.iloc[i, 1]

        searchTermVector = toLsi2(searchTerm, newIndexes)
        if (len(searchTermVector) > 0):
            result = newIndexes[similarityIndexKey][searchTermVector]
            result = sortSimilarities(result)

            simScore = getSimilarity2(result, str(productId), newIndexes)
            ranking = getRanking2(result, str(productId), newIndexes)
            file.write(str(queryId) + "," + str(productId) + "," + str(simScore) + "," + str(ranking) + "\n")

        else:
            # Empty LSI vector for this search term: write the sentinel score -1 and rank 101.
            file.write(str(queryId) + "," + str(productId) + ",-1,101\n")

In [10]:
# Use for debugging: score and rank a single product against a single query.
# Requires newIndexes to have been loaded by the loadIndexes cell.

# Project the query into LSI space and compare it with every indexed document.
myVec = toLsi2("90 degree bracket", newIndexes)
myRes = newIndexes[similarityIndexKey][myVec]

# Name of the document to inspect (a key of the docName2Id mapping).
theDoc = "100001"
myRes = sortSimilarities(myRes)
# Similarity score of theDoc (0.0 if it is not found).
print(getSimilarity2(myRes, theDoc, newIndexes))
# Rank of theDoc, capped at 101 (0 if it is not found).
print(getRanking2(myRes, theDoc, newIndexes))


0.0505577
101
