In [1]:
from build_dictionary import tokenizer
from get_data import GroupData, get_filename, get_data
from analysis_query import spelling_correction, AugmentedQuery
from get_simularity import Score
from dp_similarity import SentenceTransformers
from main import main
import pandas as pd
pd.set_option('display.width', 6000)

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import pickle

import nltk
# nltk.download('wordnet')
import numpy as np

# `display` is imported from the public IPython.display module: importing it from
# IPython.core.display has been deprecated since IPython 7.14.
from IPython.display import display, HTML
# Inject a CSS rule that forces the `.container` element to 85% of the page width.
display(HTML("<style>.container { width:85% !important; }</style>"))

# Outline
* Explain the components and functions of the search engine
* Demonstrate demos and test the speed of different cases
* Explain the ranking basis with an example
* Discuss some possible future extensions

# Components

### 1. Data Loader - store each blog as an object

In [2]:
class GroupData:
    """Plain record for one blog post together with its author's metadata.

    Every constructor argument is stored unchanged on the instance under the
    same name; no validation or conversion is performed.

    NOTE(review): the first cell already imports a ``GroupData`` from
    ``get_data``; this definition re-binds the notebook-level name.
    """

    def __init__(self, blog_id, user_id, gender, age, industry, astrology, date, post):
        # Identifiers of the post and of the user who wrote it.
        self.blog_id, self.user_id = blog_id, user_id
        # Author attributes as supplied by the caller.
        self.gender, self.age = gender, age
        self.industry, self.astrology = industry, astrology
        # Posting date and the text of the post itself.
        self.date, self.post = date, post

In [3]:
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# pickle files that were produced by a trusted source.
with open('./group_data_objects.pickle', 'rb') as f:
    data_lists = pickle.load(f)

# Print every attribute of the object stored at index 40 as a labelled listing.
i = data_lists[40]
labelled_fields = (
    ("Blog ID:\t\t", i.blog_id),
    ("\nUser ID:\t\t", i.user_id),
    ("\nUser's Gender:\t\t", i.gender),
    ("\nUser's Age:\t\t", i.age),
    ("\nUser's Industry:\t", i.industry),
    ("\nUser's Astrology:\t", i.astrology),
    ("\nPosting Date:\t\t", i.date),
    ("\nPosted blog:\t\t", i.post),
)
# Flatten the (label, value) pairs so print() receives the same argument
# sequence as a single long call would, followed by a trailing newline string.
print(*(piece for pair in labelled_fields for piece in pair), '\n')

Blog ID:		 40 
User ID:		 3489929 
User's Gender:		 female 
User's Age:		 25 
User's Industry:	 Student 
User's Astrology:	 Cancer 
Posting Date:		 23,July,2004 
Posted blog:		 urlLink        Why this is, I do not know. But, OK. 



### 2. Vocabulary Dictionaries

#### 1) A Tokenizer - split paragraphs into a list with all the single tokens

In [4]:
# The tokenizer is called with a list of texts; the demo passes a single blog
# and keeps the first (only) element of the result.
blog = ["Why this is, I do not know. But, OK."]
tokenized_blog = tokenizer(blog)[0]  # spaCy
print("Tonkenized blog:\n", tokenized_blog)

Tonkenized blog:
 ['why', 'this', 'be', ',', 'i', 'do', 'not', 'know', '.', 'but', ',', 'ok', '.']


#### 2) Four Dictionaries

   * Vocabulary Dictionary:  {word &rarr; word frequency in the whole dataset}

   * Word-to-ID & ID-to-Word: {word &rarr; word's unique ID; word's unique ID &rarr; word}

   * Posting Lists: {word's unique ID &rarr; (blog ID, word frequency in this blog, word frequency in the whole dataset)}

In [5]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`.

    The file is opened in a `with` block so the handle is closed as soon as the
    object has been read, instead of being left open for the kernel's lifetime.

    NOTE: pickle.load can execute arbitrary code; only load trusted files.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load the four lookup structures and print a small sample of each.
voc_dic = _load_pickle('voc_dic.pickle')
print("5 samples of vocaburary dictionary:\n", list(voc_dic.items())[:5], "\n")
voc2id = _load_pickle('voc2id.pickle')
# This sample deliberately keeps the original slice [1:6], i.e. it skips the first entry.
print("5 samples of word to id:\n", list(voc2id.items())[1:6], "\n")
id2voc = _load_pickle('id2voc.pickle')
print("5 samples of id to word:\n", list(id2voc.items())[:5], "\n")
posting_list = _load_pickle('posting_list.pickle')
# Only the first (key, postings) pair is shown, truncated to its first five postings.
first_posting_list = next(iter(posting_list.items()))
print("5 samples of posting list:\n", first_posting_list[0], first_posting_list[1][0:5],"\n")

5 samples of vocaburary dictionary:
 [('destiny', 1803), ('...', 1001878), ('hear', 36743), ('chosen', 3411), ('life', 167720)] 

5 samples of word to id:
 [('destiny', 0), ('...', 1), ('hear', 2), ('chosen', 3), ('life', 4)] 

5 samples of id to word:
 [(0, 'destiny'), (1, '...'), (2, 'hear'), (3, 'chosen'), (4, 'life')] 

5 samples of posting list:
 0 [(0, 3, 169), (1, 1, 183), (213, 1, 866), (373, 1, 201), (591, 1, 247)] 



### 3. Query Pre-processors

#### 1) A Tokenizer
  * Recognize named entities

In [6]:
# Tokenise a single query; `tokenized_query` is reused (and later modified) by
# the augmentation cells that follow.
query = ["who is not happy in New York"]
tokenized_query = tokenizer(query)[0]  # spaCy
print(tokenized_query)

['who', 'be', 'not', 'happy', 'in', 'new york']


#### 2) Spelling Correction (demonstrated in the demo)

#### 3) Augmentation

  * Motivation
       * even if some blogs do not contain the exact keywords in the query, the system can still consider them relevant based on the added augmentation data

  * Presupposition
       * even if a blog does not contain the exact keywords in the query, if it contains many words that are semantically related to the keywords, we can assume that the blog is probably relevant in some way

  * Step 1
    * if negation marker "not" or "n't" appears, get the antonyms of the token being modified
    * remove the phrase including "not" (eg. "not happy")
    * add antonyms to the query token list

In [7]:
# Build the augmentation for the tokenised query. The object returned by
# augment_query() exposes the antonyms/delete/synonyms/definition/hyponyms/
# hypernyms sets that the following cells read.
augm_q = AugmentedQuery(tokenized_query)
augment_obj = augm_q.augment_query()

In [8]:
# NOTE(review): this cell modifies `tokenized_query` in place, so it is not
# idempotent. list.remove raises ValueError for a missing item, so running the
# cell a second time fails once the tokens in delete_set are already gone.
print("Original Query:\n", tokenized_query)
print("Antonyms of 'happy':\n", augment_obj.antonyms_set)
# Drop every token that belongs to the negated phrase (first occurrence only).
for token in augment_obj.delete_set:
   tokenized_query.remove(token)
print("Deleted 'not' phrase:\n", tokenized_query)
# Append the antonyms so the revised query carries the intended meaning.
tokenized_query += list(augment_obj.antonyms_set)
print("Revised Query:\n", tokenized_query)

Original Query:
 ['who', 'be', 'not', 'happy', 'in', 'new york']
Antonyms of 'happy':
 {'unhappy'}
Deleted 'not' phrase:
 ['who', 'be', 'in', 'new york']
Revised Query:
 ['who', 'be', 'in', 'new york', 'unhappy']


  * Step 2
     * For each token (except stopwords), get its synonyms, the definitions of those synonyms, its hyponyms, and its hypernyms
     * Put them in a set as related words

In [9]:
# Show each augmentation set on its own, then merge them into one collection.
print("Synonyms:", augment_obj.synonyms_set, '\n')
print("Definition of Synonyms:", augment_obj.definition_set, '\n')
print("Hyponyms:", augment_obj.hyponyms_set, '\n')
print("Hypernyms:", augment_obj.hypernyms_set, '\n')
# The union removes duplicates across the four sets. Set iteration order is not
# guaranteed, so the order of `related_words` may differ between kernel sessions.
related_words = list(augment_obj.synonyms_set | augment_obj.definition_set | augment_obj.hyponyms_set | augment_obj.hypernyms_set)
print("Related Words:", related_words)

Synonyms: {'New York State', 'Empire State', 'New York City', 'Greater New York', 'New York', 'NY'} 

Definition of Synonyms: {'city', 'locate', 'colony', 'new', '13', 'southeastern', 'york', 'original', 'mouth', 'state', 'form', 'major', 'mid', 'large', 'river', 'center', 'cultural', 'hudson', 'financial', 'british', 'united'} 

Hyponyms: set() 

Hypernyms: set() 

Related Words: ['city', 'locate', 'New York City', 'colony', 'new', '13', 'southeastern', 'york', 'Empire State', 'NY', 'original', 'New York State', 'mouth', 'state', 'form', 'Greater New York', 'New York', 'major', 'mid', 'large', 'river', 'cultural', 'hudson', 'financial', 'center', 'british', 'united']


  * Another example to show what hypernyms and hyponyms are

<img style="display: block; margin: 0 auto;" src="https://upload.wikimedia.org/wikipedia/commons/thumb/b/b4/Hyponym_and_hypernym.svg/1200px-Hyponym_and_hypernym.svg.png" width="70%">

In [10]:
# NOTE(review): this cell rebinds `query` and `augment_obj`. Earlier cells that
# read `augment_obj` will show the 'purple' data if they are re-run afterwards.
query = ['purple']
augmented_q = AugmentedQuery(query)
augment_obj = augmented_q.augment_query()
print("Hypernyms: ", '\n', augment_obj.hypernyms_set, '\n')
print("Hyponyms: ", '\n', augment_obj.hyponyms_set)

Hypernyms:  
 {'nobility', 'color', 'colorise', 'discolor', 'colour in', 'colourise', 'noblesse', 'discolour', 'colorize', 'chromatic colour', 'colourize', 'color in', 'colour', 'spectral color', 'chromatic color', 'spectral colour'} 

Hyponyms:  
 {'lavender', 'reddish purple', 'violet', 'mauve', 'royal purple', 'reddish blue'}


### 4. Computing Similarity

#### 1) TF-IDF (first rank)
  * Score each blog that contains at least one target word 
    * create a dictionary for each query
    * use the ID of each blog containing a target word as the key
    * add up the tf-idf scores for each contained target word as the value
    * rank the blogs by score

In [11]:
# Recap of the two word lists built in the earlier cells; both must already
# exist in the kernel (`tokenized_query` after revision, `related_words`).
print("Revised Query:", tokenized_query)
print("Related Words:", related_words)

Revised Query: ['who', 'be', 'in', 'new york', 'unhappy']
Related Words: ['city', 'locate', 'New York City', 'colony', 'new', '13', 'southeastern', 'york', 'Empire State', 'NY', 'original', 'New York State', 'mouth', 'state', 'form', 'Greater New York', 'New York', 'major', 'mid', 'large', 'river', 'cultural', 'hudson', 'financial', 'center', 'british', 'united']


* The words in both 'Revised Query' and 'Related Words' are treated as target words
* The TF-IDF score of each word in 'Related Words' is multiplied by 0.2, because related words are considered less important than the query words

#### 2) Embeddings (re-rank)
  * Motivation
      * the ranking results according to tf-idf scores are influenced by the scope of the blog content covered
      * after testing, the high scoring results include exam papers and questionnaires
  * Presupposition
      * using a pre-trained language model to obtain sentence embeddings and calculating cosine similarity scores for queries and each blog may help to obtain semantically more relevant results
  * Pre-trained language model: MiniLM

# Demos

In [None]:
# Run the search demo imported from the project's `main` module. Judging by the
# recorded output it prompts for a query ("What do you search for:"), so this
# cell presumably waits for user input and cannot run unattended. TODO confirm.
main()

Query example:

 #1 New York 	(Sensitive to named entities)
 #2 I'm not happy 	(Understand adjective phrases modified by 'not')
 #3 apple slow 	(Ambiguity)
 #4 apple pie 	(Ambiguity)
 #5 Americam 	(Auto correct typo)

What do you search for:
America

Loading...
Ranking...
Ready to return results...

       Score                                               Post               Date  Blog ID  User ID  Gender Age              Industry    Astrology
0   0.720023                           What exactly is America?        25,May,2004   561014  3297447    male  24     Museums-Libraries        Libra
1   0.691705                                 God Bless America.       04,July,2004   137081   671748    male  27  Communications-Media        Aries
2   0.674030                  The world according to America...     09,August,2004   415066   894945    male  27            Technology       Cancer
3   0.674030                  The world according to America...       28,June,2004   101595   705633    mal

# Explanation

In [26]:
import requests
import json
from SPARQLWrapper import SPARQLWrapper, JSON
# Shared SPARQL client pointed at the DBpedia endpoint; get_taxonomy below uses
# it as a module-level global.
sparql = SPARQLWrapper("http://dbpedia.org/sparql")

def get_taxonomy(results,entity,hypernym_list):

    '''Collect the chain of hypernyms of a DBpedia resource.

    Starting at `entity`, the function asks the module-level `sparql` client
    for every value of the gold:hypernym property, appends all of them to
    `hypernym_list`, and then continues with the first returned hypernym.
    It stops when a resource has no hypernyms, when the sentinel string
    'null' is passed as `entity`, or when the chain leads back to a resource
    that was already queried.

    The walk is iterative with a visited set. A purely recursive walk never
    terminates (RecursionError) if the hypernym links form a cycle.

    Parameters:
        results: ignored; kept only so existing callers keep working.
        entity: IRI of the resource to start from, or 'null' to do nothing.
        hypernym_list: list that is extended in place with the hypernym IRIs.

    Returns:
        The same `hypernym_list` object that was passed in.
    '''

    visited = set()
    while entity != 'null' and entity not in visited:
        visited.add(entity)
        # NOTE(review): the IRI is spliced into the query text unescaped; an
        # entity containing '>' or whitespace would produce a malformed query.
        query = ''' 
        SELECT ?hypernyms 
        WHERE {<'''+entity+'''> <http://purl.org/linguistics/gold/hypernym> ?hypernyms .}
        '''
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        bindings = sparql.query().convert()["results"]["bindings"]
        # Record every hypernym of this level, but only follow the first one.
        hypernym_list.extend(row['hypernyms']['value'] for row in bindings)
        if not bindings:
            break
        entity = bindings[0]['hypernyms']['value']
    return hypernym_list

def get_taxonomy_of_resource(dbpedia_resource):
    '''Return the list of hypernym IRIs found for a DBpedia resource.

    Thin convenience wrapper around get_taxonomy that supplies a fresh, empty
    accumulator list.

    Parameters:
        dbpedia_resource: IRI of the DBpedia resource to start from.

    Returns:
        The list filled in by get_taxonomy (empty if nothing was found).
    '''
    # get_taxonomy assigns its own query result to `results` before reading it,
    # so no placeholder result structure has to be fabricated here.
    return get_taxonomy(None, dbpedia_resource, [])


In [27]:
# NOTE(review): DBpedia resource IRIs are case-sensitive and normally start with
# a capital letter (e.g. .../resource/Purple). The lowercase form used here is
# presumably why the recorded result is an empty list. TODO confirm.
get_taxonomy_of_resource("http://dbpedia.org/resource/purple")

[]

# Extensions