In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from simhash import Simhash, SimhashIndex
from os import listdir
from sklearn.metrics import ndcg_score, dcg_score
import numpy as np
import pandas as pd
from preprocessing import *


def readText(path):
    """Placeholder reader.

    The ``path`` argument is accepted but not used: every call simply hands
    back a brand-new empty list.
    """
    return list()


def gainVectorizer(textData):
    """Build a default-configured ``TfidfVectorizer`` fitted on ``textData``.

    Parameters:
        textData: the collection of documents passed straight to ``fit``.

    Returns:
        The fitted vectorizer.
    """
    # fit() hands back the vectorizer itself, so the call can be chained.
    return TfidfVectorizer().fit(textData)


def gainVocabulary(vectorizer):
    """Invert ``vectorizer.vocabulary_`` into an index -> term dictionary.

    ``vocabulary_`` maps each term to its column index; the returned dict
    maps each column index back to its term.
    """
    return {index: term for term, index in vectorizer.vocabulary_.items()}


def gainDocsVectors(docs, vectorizer):
    """Return ``vectorizer.transform(docs)``, i.e. the vectorized form of ``docs``."""
    return vectorizer.transform(docs)


def gainDocsSimhashValues(docsVectors, vocabulary):
    """Compute one ``Simhash`` per row of ``docsVectors``.

    Parameters:
        docsVectors: matrix-like object exposing ``shape[0]`` and
            ``getrow(i)``; each row must expose ``indices`` and ``data``
            (presumably the sparse output of ``vectorizer.transform``).
        vocabulary: mapping from column index to token.

    Returns:
        A list of ``Simhash`` objects, one per row, in row order.
    """
    def _rowToSimhash(row):
        # Pair every non-zero column's token with its weight: (token, weight).
        tokens = [vocabulary[column] for column in row.indices]
        return Simhash(zip(tokens, row.data))

    rowCount = docsVectors.shape[0]
    return [_rowToSimhash(docsVectors.getrow(rowIndex)) for rowIndex in range(rowCount)]


def docsToSimhashValues(docs, vectorizer, vocabulary):
    """Vectorize ``docs`` and convert every resulting row into a simhash.

    Runs ``gainDocsVectors`` followed by ``gainDocsSimhashValues`` and
    returns the latter's list (one entry per document).
    """
    return gainDocsSimhashValues(gainDocsVectors(docs, vectorizer), vocabulary)


    
def gainFormattedQuery(query: str) -> list:
    """Wrap ``query`` in a one-element list so it can be processed like a batch of documents."""
    wrapped = list()
    wrapped.append(query)
    return wrapped
    
    
def queryToSimHashValue(query, vectorizer, vocabulary):
    """Compute the simhash of a single query string.

    The query is wrapped by ``gainFormattedQuery``, pushed through
    ``docsToSimhashValues``, and the first resulting value is returned.
    """
    return docsToSimhashValues(gainFormattedQuery(query), vectorizer, vocabulary)[0]
    
    
def gainSimhashValueDistance(simhashValue1, simhashValue2):
    """Return ``simhashValue1.distance(simhashValue2)``.

    The measure itself is whatever the first argument's ``distance`` method
    implements (presumably the bit-difference between two ``Simhash``
    fingerprints -- confirm against the simhash library).
    """
    separation = simhashValue1.distance(simhashValue2)
    return separation


def gainDistancOfTwoDocs(doc1, doc2, vectorizer, vocabulary):
    """Return the simhash distance between two raw documents.

    Each document is hashed with ``queryToSimHashValue`` (``doc1`` first,
    then ``doc2``) and the two hashes are compared with
    ``gainSimhashValueDistance``.
    """
    return gainSimhashValueDistance(
        queryToSimHashValue(doc1, vectorizer, vocabulary),
        queryToSimHashValue(doc2, vectorizer, vocabulary),
    )


def argsort(seq):
    """Return the positions of ``seq`` ordered by ascending element value.

    The sort is stable, so positions holding equal values keep their
    original relative order.
    """
    positions = range(len(seq))
    return sorted(positions, key=lambda position: seq[position])


def gainTopNAnswers(query, optionalDocs, vectorizer, vocabulary, n):
    """Return the ``n`` documents whose simhash is closest to the query's.

    Parameters:
        query: query string.
        optionalDocs: indexable collection of candidate documents.
        vectorizer, vocabulary: forwarded to the simhash helpers.
        n: number of answers wanted; capped at ``len(optionalDocs)``.

    Returns:
        A list of documents taken from ``optionalDocs``, nearest first.
    """
    # Never ask for more answers than there are candidates.
    limit = len(optionalDocs) if n >= len(optionalDocs) else n

    queryHash = queryToSimHashValue(query, vectorizer, vocabulary)
    candidateHashes = docsToSimhashValues(optionalDocs, vectorizer, vocabulary)
    distances = [
        gainSimhashValueDistance(queryHash, candidateHash)
        for candidateHash in candidateHashes
    ]

    ranking = argsort(distances)
    return [optionalDocs[ranking[rank]] for rank in range(limit)]

def gainTopNAnswersIndices(query, optionalDocs, vectorizer, vocabulary, n):
    """Return the positions of the ``n`` documents closest to ``query``.

    Parameters:
        query: query string.
        optionalDocs: collection of candidate documents.
        vectorizer, vocabulary: forwarded to the simhash helpers.
        n: number of positions wanted; clamped to the range
            ``0 .. len(optionalDocs)``.

    Returns:
        A list of at most ``n`` integer positions into ``optionalDocs``,
        ordered from smallest to largest simhash distance (ties keep
        document order because ``argsort`` is stable).
    """
    # Clamp n so the slice below can neither overshoot nor wrap around.
    n = max(0, min(n, len(optionalDocs)))
    if n == 0:
        # Nothing requested or nothing to rank: skip the hashing work.
        return []

    querySimhashValue = queryToSimHashValue(query, vectorizer, vocabulary)
    optionalDocsSimhashValues = docsToSimhashValues(optionalDocs, vectorizer, vocabulary)
    distances = [
        gainSimhashValueDistance(querySimhashValue, optionalDocsSimhashValue)
        for optionalDocsSimhashValue in optionalDocsSimhashValues
    ]

    # Previously the full ranking was returned and n was silently ignored.
    return argsort(distances)[:n]



In [7]:
# For sampling use: load the annotated keys, then read only those sample
# documents whose (city, country) key appears in the annotation file.
annotate_result = pd.read_csv('../data/annotate_result.csv', encoding='utf-8', header=0)
remain_city_country_list = annotate_result['city'].tolist()
# Set copy of the same keys so every membership test below is O(1).
remain_city_country_set = set(remain_city_country_list)

doc_filenames = sorted(listdir('../data/sample_docs'))
sample_docs = []
city_country_names = []
for doc_filename in doc_filenames:
    # The first two '_'-separated pieces of the file name are used as
    # (city, country).
    line = doc_filename.split('_')
    if len(line) < 2:
        # No '_' in the name, so no (city, country) pair can be derived.
        continue
    city_country = (line[0], line[1])
    # Build the key with str(tuple), the same way the NDCG lookup later in this
    # notebook matches against annotate_result['city']. Hand-concatenating
    # single quotes disagrees with that format for names containing an
    # apostrophe. NOTE(review): assumes the CSV keys were written with
    # str(tuple) -- confirm against the annotation file.
    if str(city_country) not in remain_city_country_set:
        continue
    with open('../data/sample_docs/' + doc_filename, 'r', encoding='utf-8') as doc_file:
        doc = doc_file.read()
    # Append to both lists together so they stay index-aligned.
    sample_docs.append(doc)
    city_country_names.append(city_country)

# Tokenise every sampled document; preprocess() receives the document text
# together with the lower-cased city name stored at the same position.
tokenized_corpus = [
    preprocess(str(text), city_country_names[position][0].lower())
    for position, text in enumerate(sample_docs)
]

# Convert to appropriate format for later use: one space-joined string per document.
corpus = [" ".join(docTokens) for docTokens in tokenized_corpus]
optionalDocs = corpus
vectorizer = gainVectorizer(optionalDocs)
vocabulary = gainVocabulary(vectorizer)
# Number of results retrieved per query.
n = 100
top_10_test_queries = [["waterfall"], ["silk"], ["desert"],["volcano"],["beer"],["coconut"],["seafood"]]
top_100_test_queries = [["football"], ["castle"], ["shopping"], ['monument'], ["forest"]]

# Run every test query, print its top-n cities, and score the ranking with NDCG.
# annotate_result is loaded once earlier in this cell; it does not change
# between queries, so it is not re-read here.
for q in top_100_test_queries:
    query_term = q[0]
    print("------------------{}------------------".format(query_term))

    answersIndices = gainTopNAnswersIndices(query_term, optionalDocs, vectorizer, vocabulary, n)
    # Slice instead of indexing range(n) so a ranking shorter than n cannot
    # raise IndexError.
    top_n = [city_country_names[doc_index] for doc_index in answersIndices[:max(n, 0)]]
    print([(pair[0].encode('utf-8'), pair[1].encode('utf-8'))for pair in top_n])

    # NDCG evaluation.
    # Annotated relevance of each returned city for this query, in ranked
    # (output) order. The annotation column is named after the query term.
    ranked_relevance = []
    for top_city in top_n:
        city_key = str((top_city[0], top_city[1]))
        matches = annotate_result.loc[annotate_result['city'] == city_key, query_term]
        # First match; raises IndexError if the city is missing from the CSV.
        ranked_relevance.append(matches.iloc[0])

    # y_true: annotated relevance, one row because a single query is scored.
    true_relevance = np.asarray([ranked_relevance])

    # y_score: must decrease with rank so ndcg_score evaluates the order that
    # was actually returned. A constant score makes every document tied, and
    # the metric then no longer depends on the ranking at all.
    relevance_score = np.asarray([list(range(len(top_n), 0, -1))])

    print(ndcg_score(true_relevance, relevance_score))

------------------football------------------
[(b'Chicago Heights', b'United States'), (b'Chongqing', b'China'), (b'Nerang', b'Australia'), (b'Civitavecchia', b'Italy'), (b'Troy', b'United States'), (b'Aarau', b'Switzerland'), (b'Albano Laziale', b'Italy'), (b'Draper', b'United States'), (b'Enschede', b'Netherlands'), (b'Jingmen', b'China'), (b'Khabarovsk', b'Russia'), (b'Painesville', b'United States'), (b'Poprad', b'Slovakia'), (b'Suceava', b'Romania'), (b'Tuttlingen', b'Germany'), (b'Weimar', b'Germany'), (b'Wooster', b'United States'), (b'Abbotsford', b'Canada'), (b'Benin City', b'Nigeria'), (b'Caboolture', b'Australia'), (b'Faribault', b'United States'), (b'Foster City', b'United States'), (b'Kirandul', b'India'), (b'Lake Oswego', b'United States'), (b'Lier', b'Belgium'), (b'Pasadena', b'United States'), (b'Solna', b'Sweden'), (b'Thomasville', b'United States'), (b'Warwick', b'United Kingdom'), (b'Waterloo', b'Canada'), (b'Xanten', b'Germany'), (b'Abuja', b'Nigeria'), (b'Akola', b'

[(b'Saint Albans', b'Australia'), (b'Alpharetta', b'United States'), (b'Kozan', b'Turkey'), (b'Tha Mai', b'Thailand'), (b'Timi\xc5\x9foara', b'Romania'), (b'Zeytinburnu', b'Turkey'), (b'Ry\xc5\xab\xc5\x8d', b'Japan'), (b'Salavat', b'Russia'), (b'Bhav\xc4\x81ni', b'India'), (b'Gampola', b'Sri Lanka'), (b'Izhevsk', b'Russia'), (b'Paragould', b'United States'), (b'Parsippany', b'United States'), (b'San Juan', b'Philippines'), (b'Utsunomiya', b'Japan'), (b'Antelope', b'United States'), (b'Bal\xc4\xb1kesir', b'Turkey'), (b'Ba\xc4\x8dka Palanka', b'Serbia'), (b'Brawley', b'United States'), (b'Clermont-Ferrand', b'France'), (b'Crawfordsville', b'United States'), (b'Hondarribia', b'Spain'), (b'Husum', b'Germany'), (b'Inver Grove Heights', b'United States'), (b'Iquique', b'Chile'), (b'Jacksonville Beach', b'United States'), (b'Madrid', b'Spain'), (b'Manzanillo', b'Mexico'), (b'Milas', b'Turkey'), (b'Okrika', b'Nigeria'), (b'Phultala', b'Bangladesh'), (b'Rakovski', b'Bulgaria'), (b'Saint-L\xc3\x