In [2]:
import nltk
from os import listdir
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import numpy as np
from numpy import savetxt
import pandas as pd


stop_words = set(stopwords.words('english'))
stemmer= PorterStemmer()
lemmatizer=WordNetLemmatizer()


def text_lower(text):
    return text.lower()

def remove_empty_lines(text):
    return text.replace('\\n', ' ')

def remove_ending_blocks(text):
    # target_block = ['== See also ==','== References ==','== External links ==','=== Notable people ===']
    target_block = ['== See also ==','== References ==','== External links ==']
    for target in target_block:
        start_i = text.find(target)
        if start_i != -1:
            text = text[:start_i]
        
    return text

def remove_block_title(text):
    return re.sub(r'[=]+\s[a-z\s]+\s[=]+', '', text)

def remove_extra_blanks(text):
    return re.sub(' {2,}', ' ', text)

def remove_punc(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_spec_char(text):
    return re.sub(r'[^A-Za-z0-9\s]+', '', text)

def tokenize(text):
    tokens = word_tokenize(text)
    return [i for i in tokens if not i in stop_words]

def stemming(word_list):
    return [stemmer.stem(word) for word in word_list]

def lemmatize(word_list):
    return [lemmatizer.lemmatize(word) for word in word_list]

def remove_city_name(word_list, name):
    return [word for word in word_list if word != name]

# Preprocess the sample documents:
def preprocess(doc, city_name):
    # Remove several ending blocks
    doc = remove_ending_blocks(doc)
    # Lower the letters
    doc = text_lower(doc)
    # Remove empty lines
    doc = remove_empty_lines(doc)

    # Remove block titles:
    doc = remove_block_title(doc)
    # Remove extra blank spaces:
    doc = remove_extra_blanks(doc)
    # Remove punctuations:
    doc = remove_punc(doc)
    # Remove speacial characters:
    doc = remove_spec_char(doc)

    # Tokenization:
    tokens = tokenize(doc)
    # Stemming:
    # tokens = stemming(tokens)
    # Lemmatizing:
    tokens = lemmatize(tokens)
    # Remove city names:
    tokens = remove_city_name(tokens, city_name)

    return tokens

In [12]:
from os import listdir
from sklearn.metrics import ndcg_score, dcg_score
import numpy as np
import pandas as pd
import numpy as geek 

def get_jaccard_score(tokenized_query,tokenized_corpus):
    jaccard_scores = np.array([])
    for one_doc in tokenized_corpus:
        intersection = geek.intersect1d(tokenized_query, one_doc)
        union = geek.union1d(tokenized_query, one_doc)
        jaccard_scores = np.append(jaccard_scores, float(len(intersection)) / len(union))
    return jaccard_scores

def argsort(seq):
    return sorted(range(len(seq)), key=seq.__getitem__, reverse=True)

# For sampling use: get the sampling documents
doc_filenames = sorted(listdir('../data/sample_docs'))
sample_docs = []
city_country_names = []
for i in range(len(doc_filenames)):
    line = doc_filenames[i].split('_')
    city_country_names.append((line[0], line[1]))
    with open('../data/sample_docs/' + doc_filenames[i], 'r', encoding='utf-8') as df:
        doc = df.read()
        sample_docs.append(doc)

tokenized_corpus = []

for ind1, doc in enumerate(sample_docs):
    doc = str(doc)
    name = city_country_names[ind1][0].lower()
    # print(name)
    tokens = preprocess(doc, name)
    tokenized_corpus.append(tokens)


top_10_test_queries = [["waterfall"], ["silk"], ["desert"],["volcano"],["beer"],["coconut"],["seafood"]]
top_100_test_queries = [["football"], ["castle"], ["shopping"], ['monument'], ["forest"]]


for q in top_100_test_queries:
    print("------------------{}------------------".format(q[0]))
    
    tokenized_query = q
    jaccard_scores = get_jaccard_score(tokenized_query,tokenized_corpus)
    top_n = []
    top_n.append([city_country_names[i] for i in argsort(jaccard_scores)])
    top_n = top_n[0][:100]
    # print([(pair[0].encode('utf-8'), pair[1].encode('utf-8'))for pair in top_n])

    # NDCG evaluations:
    relevance_score = [1 for i in range(100)]

    annotate_result = pd.read_csv('../data/annotate_result.csv', encoding='utf-8', header=0)

    true_relevence = []
    for top_city in top_n:
        true_relevence.append(annotate_result[annotate_result['city'] == str((top_city[0], top_city[1]))].squeeze()[q[0]])

    # Releveance scores in Ideal order 
    true_relevance = np.asarray([true_relevence]) 
    
    # Releveance scores in output order 
    relevance_score = np.asarray([relevance_score]) 

    print(ndcg_score( true_relevance, relevance_score))

------------------football------------------
0.5571741306624307
------------------castle------------------
0.6396587213360692
------------------shopping------------------
0.6396587213360692
------------------monument------------------
0.7094859686180036
------------------forest------------------
0.5571741306624307
