In [1]:
from build_dictionary import tokenizer
from get_data import GroupData, get_filename, get_data
from analysis_query import spelling_correction, AugmentedQuery
from get_simularity import Score
from dp_similarity import SentenceTransformers
import pandas as pd

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import pickle

import nltk
# nltk.download('wordnet')
import numpy as np

# Outline
* Explain the components and functions of the search engine
* Demonstrate demos and test the speed of different cases
* Explain the ranking basis with an example
* Discuss some possible future extensions

# Components

### 1. Data Loader - store each blog as an object

In [46]:
class GroupData:
    """Record for a single blog post together with its author's metadata.

    Every constructor argument is stored unchanged on the instance under an
    attribute of the same name.
    """

    def __init__(self, blog_id, user_id, gender, age, industry, astrology, date, post):
        # Identifiers of the post and of its author.
        self.blog_id, self.user_id = blog_id, user_id
        # Author profile fields.
        self.gender, self.age = gender, age
        self.industry, self.astrology = industry, astrology
        # Posting date and the posted content.
        self.date, self.post = date, post

In [5]:
# Load the pickled collection of blog objects.
# NOTE(review): unpickling can execute arbitrary code -- only open trusted files.
with open('./group_data_objects.pickle', 'rb') as pickle_file:
    data_lists = pickle.load(pickle_file)

# Show every field of one sample entry (index 40) next to its label.
i = data_lists[40]
labelled_fields = (
    ("Blog ID:\t\t", i.blog_id),
    ("\nUser ID:\t\t", i.user_id),
    ("\nUser's Gender:\t\t", i.gender),
    ("\nUser's Age:\t\t", i.age),
    ("\nUser's Industry:\t", i.industry),
    ("\nUser's Astrology:\t", i.astrology),
    ("\nPosting Date:\t\t", i.date),
    ("\nPosted blog:\t\t", i.post),
)
# Flatten the (label, value) pairs into one print call so the output layout
# (space-separated arguments, trailing blank line) stays the same.
print(*(part for pair in labelled_fields for part in pair), '\n')

Blog ID:		 40 
User ID:		 3489929 
User's Gender:		 female 
User's Age:		 25 
User's Industry:	 Student 
User's Astrology:	 Cancer 
Posting Date:		 23,July,2004 
Posted blog:		 urlLink        Why this is, I do not know. But, OK. 



### 2. Vocabulary Dictionaries

#### 1) A Tokenizer - split paragraphs into a list of individual tokens

In [42]:
# Run the (spaCy-based) tokenizer on a one-element batch and keep the result
# for that single blog text.
blog = ["Why this is, I do not know. But, OK."]
tokenized_batch = tokenizer(blog)
tokenized_blog = tokenized_batch[0]
print("Tonkenized blog: ", tokenized_blog)

Tonkenized blog:  ['why', 'this', 'be', ',', 'i', 'do', 'not', 'know', '.', 'but', ',', 'ok', '.']


#### 2) Four Dictionaries

   * Vocabulary Dictionary:  {word &rarr; word frequency in the whole dataset}

   * Word-to-ID & ID-to-Word: {word &rarr; word's unique ID; word's unique ID &rarr; word}

   * Posting Lists: {word's unique ID &rarr; (blog ID, word frequency in this blog, Word frequency in the whole dataset)}

In [48]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    The file is opened in a ``with`` block so its handle is closed as soon as
    loading finishes, rather than being left open until garbage collection.
    """
    # NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Word -> frequency mapping; show the first five entries.
voc_dic = _load_pickle('voc_dic.pickle')
print("5 samples of vocaburary dictionary:\n", list(voc_dic.items())[:5], "\n")

# Word -> id mapping; the slice deliberately skips the first entry
# (presumably a special token -- TODO confirm).
voc2id = _load_pickle('voc2id.pickle')
print("5 samples of word to id:\n", list(voc2id.items())[1:6], "\n")

# Id -> word mapping; show the first five entries.
id2voc = _load_pickle('id2voc.pickle')
print("5 samples of id to word:\n", list(id2voc.items())[:5], "\n")

# Posting lists: print the key of the first entry and its first five postings.
posting_list = _load_pickle('posting_list.pickle')
first_posting_list = next(iter(posting_list.items()))
print("5 samples of posting list:\n", first_posting_list[0], first_posting_list[1][0:5],"\n")

5 samples of vocaburary dictionary:
 [('destiny', 1803), ('...', 1001878), ('hear', 36743), ('chosen', 3411), ('life', 167720)] 

5 samples of word to id:
 [('destiny', 0), ('...', 1), ('hear', 2), ('chosen', 3), ('life', 4)] 

5 samples of id to word:
 [(0, 'destiny'), (1, '...'), (2, 'hear'), (3, 'chosen'), (4, 'life')] 

5 samples of posting list:
 0 [(0, 3, 169), (1, 1, 183), (213, 1, 866), (373, 1, 201), (591, 1, 247)] 



### 3. Query Pre-processors

#### 1) A Tokenizer
  * Recognize named entities

In [89]:
# Tokenize the demo query (spaCy-based tokenizer) as a one-element batch and
# keep the result for that single query.
query = ["who is not happy in New York"]
tokenized_queries = tokenizer(query)
tokenized_query = tokenized_queries[0]
print(tokenized_query)

['who', 'be', 'not', 'happy', 'in', 'new york']


#### 2) Spelling Correction (demonstrated in the demo)

#### 3) Augmentation

  * Motivation:
       * even if some blogs do not contain the exact keywords in the query, the system can still consider them to be relevant based on the added augmentation data

  * Presupposition:
       * even if a blog does not contain the exact keywords in the query, if it contains many words semantically related to the keywords, we can assume that the blog is probably relevant in some way

  * Step 1
    * if negation marker "not" or "n't" appears, get the antonyms of the token being modified
    * remove the phrase including "not" (e.g. "not happy")
    * add antonyms to the query token list

In [90]:
# Wrap the tokenized query and run the augmentation. The returned object is
# read by the following cells through its antonyms_set, delete_set,
# synonyms_set, definition_set, hyponyms_set and hypernyms_set attributes.
augm_q = AugmentedQuery(tokenized_query)
augment_obj = augm_q.augment_query()

In [91]:
# Apply the negation handling to the query in place: drop the tokens flagged
# for deletion, then append the antonyms found for the negated word.
print("Original Query:\n", tokenized_query)
print("Antonyms of 'happy':\n", augment_obj.antonyms_set)
for token in augment_obj.delete_set:
    # Guard the removal: on a re-run of this cell the token is already gone and
    # an unguarded list.remove() would raise ValueError.
    if token in tokenized_query:
        tokenized_query.remove(token)
print("Deleted 'not' phrase:\n", tokenized_query)
# Append only the antonyms that are not in the query yet, so running the cell
# again does not add duplicates.
tokenized_query += [antonym for antonym in augment_obj.antonyms_set if antonym not in tokenized_query]
print("Revised Query:\n", tokenized_query)

Original Query:
 ['who', 'be', 'not', 'happy', 'in', 'new york']
Antonyms of 'happy':
 {'unhappy'}
Deleted 'not' phrase:
 ['who', 'be', 'in', 'new york']
Revised Query:
 ['who', 'be', 'in', 'new york', 'unhappy']


  * Step 2
     * For each token (except stopwords), get its synonyms, the definitions of those synonyms, its hyponyms, and its hypernyms
     * Put them in a set as related words

In [97]:
# Print each group of augmentation words under its label.
labelled_sets = (
    ("Synonyms:", augment_obj.synonyms_set),
    ("Definition of Synonyms:", augment_obj.definition_set),
    ("Hyponyms:", augment_obj.hyponyms_set),
    ("Hypernyms:", augment_obj.hypernyms_set),
)
for label, words in labelled_sets:
    print(label, words, '\n')

# Union of the four groups, converted to a list.
related_words = list(
    augment_obj.synonyms_set
    | augment_obj.definition_set
    | augment_obj.hyponyms_set
    | augment_obj.hypernyms_set
)
print("Related Words:", related_words)

Synonyms: {'NY', 'New York State', 'New York City', 'Empire State', 'New York', 'Greater New York'} 

Definition of Synonyms: {'financial', 'river', 'major', 'united', 'form', 'original', 'large', 'new', 'southeastern', 'york', '13', 'colony', 'hudson', 'british', 'city', 'center', 'mid', 'cultural', 'state', 'mouth', 'locate'} 

Hyponyms: set() 

Hypernyms: set() 

Related Words: ['financial', 'river', 'united', 'form', 'original', 'New York City', 'large', 'new', 'Empire State', 'southeastern', 'york', '13', 'colony', 'hudson', 'british', 'city', 'center', 'New York', 'mid', 'NY', 'New York State', 'cultural', 'major', 'state', 'mouth', 'Greater New York', 'locate']


  * Another example to show what hypernyms and hyponyms are

<img style="float: center;" src="https://upload.wikimedia.org/wikipedia/commons/thumb/b/b4/Hyponym_and_hypernym.svg/1200px-Hyponym_and_hypernym.svg.png" width="70%"> 

In [37]:
# Augment a single-token query to illustrate hypernyms and hyponyms.
query = ['purple']
augmented_q = AugmentedQuery(query)
augment_obj = augmented_q.augment_query()

# Pull out the two sets first, then print each one below its heading.
hypernyms, hyponyms = augment_obj.hypernyms_set, augment_obj.hyponyms_set
print("Hypernyms: ", '\n', hypernyms, '\n')
print("Hyponyms: ", '\n', hyponyms)

Hypernyms:  
 {'colourise', 'spectral color', 'colorise', 'spectral colour', 'chromatic colour', 'colour', 'color', 'color in', 'chromatic color', 'colorize', 'nobility', 'noblesse', 'discolor', 'colourize', 'discolour', 'colour in'} 

Hyponyms:  
 {'violet', 'royal purple', 'reddish purple', 'reddish blue', 'mauve', 'lavender'}


### 4. Computing Similarity

#### 1) TF-IDF

In [95]:
# Recap the two inputs produced by the earlier cells: the revised query tokens
# and the related words.
for label, tokens in (("Revised Query:", tokenized_query), ("Related Words:", related_words)):
    print(label, tokens)

Revised Query: ['who', 'be', 'in', 'new york', 'unhappy']
Related Words: ['financial', 'river', 'united', 'form', 'original', 'New York City', 'large', 'new', 'Empire State', 'southeastern', 'york', '13', 'colony', 'hudson', 'british', 'city', 'center', 'New York', 'mid', 'NY', 'New York State', 'cultural', 'major', 'state', 'mouth', 'Greater New York', 'locate']


# Demos

# Explanation

# Extensions