In [None]:
# Authenticate the current Colab user; the 'Authenticated' message is printed once the call returns.
from google.colab import auth
auth.authenticate_user()
print('Authenticated')

Authenticated


### Challenge: Semantic Search Algorithm

Design and implement a semantic search algorithm that is able to score and rank a
set of keywords (trends) by how strongly associated they are with a given query term.
The algorithmic approach could borrow techniques from association rule mining to
analyze the co-occurrence of terms within a corpus of tweets and Reddit posts, and
should take into consideration the uniqueness of the trend and the recency of the
association. For example, the algorithm should be able to determine that the query
'iPhone' is more strongly associated with trends like 'MagSafe', '5G', and 'pacific blue'
than it is with 'Biden' or 'perfume'.

In [None]:
import pandas as pd
import numpy as np
import nltk
import spacy
import string
import re

from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

# Download the NLTK resources requested below ('punkt' and 'stopwords').
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop words as a set, for fast membership tests in remove_stopwords.
stop_words = set(stopwords.words('english'))

# Download and load spaCy's en_core_web_md pipeline; `nlp` is what the helpers use to parse text.
!python -m spacy download en_core_web_md
import en_core_web_md
nlp = en_core_web_md.load()

!pip install transformers
!pip install sentence_transformers

import tensorflow as tf
import transformers
import scipy
from sentence_transformers import SentenceTransformer

In [None]:
# Google Cloud project handed to the BigQuery client.
project_name = 'nwo-sample'

In [None]:
%time
from google.cloud import bigquery

client = bigquery.Client(project = project_name)

# Perform a query.
QUERY = ('SELECT * FROM `nwo-sample.graph.reddit` LIMIT 10000')

query_job = client.query(QUERY)  # API request
df = query_job.result().to_dataframe()

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.53 µs


## Preprocess Helper functions

In [None]:
#Preprocess
def remove_articles(text):
  """Remove the standalone articles a/an/the (lowercase or capitalized) from text.

  A run of one or more articles is removed together with the whitespace
  around it, which is replaced by a single space. An article at the very
  start of the text is dropped as well, and back-to-back articles
  ("the a cat" -> "cat") are removed in one pass. An article at the very end
  of the text (not followed by whitespace) is left untouched.
  """
  # Group 1 is the whitespace in front of the run of articles; it is empty
  # only when the run begins at the start of the text.
  pattern = r'(^|\s+)(?:(?:a|an|the|The|A|An)\s+)+'
  return re.sub(pattern, lambda m: ' ' if m.group(1) else '', text)

def remove_special_character(text):
  """Keep only ASCII letters, digits and spaces.

  Tabs, line breaks and any other non-space whitespace are first turned into
  a space, so words separated only by a line break are not fused into one
  token. Every other character outside a-z, A-Z, 0-9 and space is deleted.
  """
  # [^\S ] matches whitespace characters other than the plain space.
  text = re.sub(r'[^\S ]', ' ', text)
  return re.sub('[^a-zA-Z0-9 ]', '', text)

#Get Noun words from string
def noun_chunks(doc):
  """Return the distinct noun-chunk texts of a parsed document, in first-seen order.

  doc: object exposing a `noun_chunks` iterable whose items have a `text`
       attribute (a spaCy Doc in this notebook).
  """
  # dict.fromkeys de-duplicates while keeping insertion order, which avoids
  # re-scanning a list for every chunk; the loop variable also no longer
  # shadows the numpy alias `np`.
  return list(dict.fromkeys(chunk.text for chunk in doc.noun_chunks))

def get_nouns(text):
  """Parse text with the spaCy pipeline `nlp` and return its distinct noun-chunk texts."""
  return noun_chunks(nlp(text))

#Replace the noun chunk's space with _
def noun_chunk_space(doc):
  """Return the document's distinct noun chunks with each whitespace character replaced by '_'."""
  # Raw string: '\s' inside a plain literal is an invalid escape sequence.
  return [re.sub(r'\s', '_', chunk) for chunk in noun_chunks(doc)]

def replace_space(text):
  """Join each multi-word noun chunk in text into a single token using '_'.

  The text is parsed with spaCy, and every occurrence of a noun chunk's text
  is replaced by the same text with its whitespace turned into underscores
  (e.g. "new tricks" -> "new_tricks").
  """
  doc = nlp(text)
  # Chunks without whitespace would be replaced by themselves, so skip them.
  chunks = [chunk for chunk in noun_chunks(doc) if re.search(r'\s', chunk)]
  # Longest first: replacing a short chunk ("new products") before a longer
  # one that contains it ("finding new products") would stop the longer one
  # from matching.
  for chunk in sorted(chunks, key=len, reverse=True):
    text = text.replace(chunk, re.sub(r'\s', '_', chunk))
  return text

def remove_punct(text):
  """Delete every character listed in string.punctuation from text."""
  drop_table = str.maketrans('', '', string.punctuation)
  return text.translate(drop_table)


# English stop-word set consulted by remove_stopwords (same expression as in the setup cell).
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
  """Drop whitespace-separated tokens found in stop_words; re-join the rest with single spaces.

  Matching is exact (case-sensitive) against the entries of stop_words.
  """
  kept = []
  for token in text.split():
    if token in stop_words:
      continue
    kept.append(token)
  return ' '.join(kept)

def remove_url(text):
  """Delete every run of non-whitespace characters that starts with 'http'."""
  url_pattern = r'http\S+'
  return re.sub(url_pattern, '', text)

def preprocess(text):
  """Clean raw text while keeping its case and stop words.

  Steps, in order: strip URLs, drop everything except ASCII letters, digits
  and spaces, remove articles, strip punctuation.
  """
  # URLs have to go first: once their punctuation is stripped they turn into
  # long junk tokens ('httpsyoutube...') that can no longer be recognized.
  text = remove_url(text)
  text = remove_special_character(text)
  text = remove_articles(text)
  # Effectively a no-op after remove_special_character (no punctuation is
  # left); kept so the sequence of steps matches the original pipeline.
  text = remove_punct(text)
  return text

## Trial 1

Training Word2Vec with the texts, and find similar vocabs or noun_phrase in the text.

To find the similarity between a given query and the words or phrases in the text data, I trained gensim's Word2Vec model on the tokenized text (with the words of each noun phrase concatenated by `_`). Given a query, the function checks whether the query exists in the trained vocabulary: if it does, the function returns the top_n most similar words or noun phrases; if it does not, the function raises an error.

The advantage of this model is that it gives reasonable results when the query exists in the vocabulary. The drawback is that it cannot handle a synonym or abbreviation that does not appear in the raw text data. The model could be improved with a better word-embedding model such as Facebook's fastText, which represents a word as character n-grams rather than a single token and can therefore handle unknown words as well.

In [None]:
# Clean every post body and store the result in the 'preprocess' column.
df['preprocess'] = df['body'].apply(preprocess)

In [None]:
%%time

# Join multi-word noun chunks with '_' so that each phrase becomes a single token.
# NOTE: this overwrites 'preprocess' in place, so re-running the cell processes already-joined text.
df['preprocess'] = df['preprocess'].apply(replace_space)

CPU times: user 1min 50s, sys: 253 ms, total: 1min 50s
Wall time: 1min 50s


In [None]:
# Inspect the preprocessed text of the row labelled 0.
df['preprocess'][0]

'Unfortunately most_dermatologists dont like or understand OCM It just doesnt make sense to them and theyre training and Ive found that its really hard to teach doctor new_tricks after theyve graduated and started working But science behind OCM is confusing and shouldnt make sense  but if it IS working for you then what_harm can it do httpsyoutubesK7UufZam2UBut I wouldnt ditch derm just because she didnt understand OCM Are you oily_Acne prone Using lot of prescriptions These are reasons its best to still be under care of dermatologist while still using OCM of finding new_products You can try to find another_derm though that you feel more comfortable with Secondly like doctors there are good_estheticians and bad_ones Do your_research Ask your_friends and family ask on your_local_subreddit or facebook_page look at genuine_reviews for salon Dont just pick fanciest_looking_salon and make appointment Sadly youll never really know until you drop money and go and if its bad then youll have to

In [None]:
%%time
from gensim.models import Word2Vec

training_data = []
for sent in df['preprocess']:
  training_data.append([word.lower() for word in sent.split()])

#w2v model training
w2v_model = Word2Vec(min_count=1,
                     window=2,
                     size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20)

w2v_model.build_vocab(training_data, progress_per=10000)

w2v_model.train(training_data, total_examples=w2v_model.corpus_count, epochs=100, report_delay=1)

CPU times: user 3min 5s, sys: 576 ms, total: 3min 5s
Wall time: 1min 10s


In [None]:
def query_w2v(query, w2v_model, top_n = 10):
  '''
  Look up the vocabulary entries closest to a query in a trained Word2Vec model.

  Input: query: string, a word or phrase
        w2v_model: gensim Word2Vec trained model
        top_n: int, number of neighbours to return

  Output: list of (token, similarity) pairs for the top n most similar
          vocabs or phrases on trained Word2Vec

  The error raised by gensim (a KeyError) propagates when the normalized
  query is not part of the model's vocabulary.
  '''
  #preprocess query: lower-case, ASCII letters/digits only, words joined by '_'
  query = query.lower()
  query = remove_special_character(query)
  # Trim and collapse whitespace so ' Donald  Trump ' still maps to
  # 'donald_trump' rather than '_donald__trump_'.
  query = re.sub(r'\s+', '_', query.strip())

  #Retrieve similar words
  similar_vocabs = w2v_model.wv.most_similar(positive = [query], topn = top_n)

  return similar_vocabs

In [None]:
# Example lookup: the 10 vocabulary entries closest to 'donald_trump'.
query_w2v('Donald Trump', w2v_model, top_n = 10)

[('usprsidenten', 0.8149840235710144),
 ('casually', 0.7828455567359924),
 ('fucking_idiot', 0.7643813490867615),
 ('this_mannerbecause', 0.7606171369552612),
 ('november_traf_sie_sich_mit_dem_designierten', 0.7519022226333618),
 ('und', 0.7278372645378113),
 ('solemn_pledge', 0.6685554385185242),
 ('ironically', 0.6450556516647339),
 ('boogeyman_tales', 0.6291605234146118),
 ('nationalist_uprising', 0.6145074963569641)]

## Trial 2

Using Sentence Bert to understand context and find similarity between given texts and query.

I wanted to check whether transfer learning helps with semantic search. I believe it can help in general, but it does not help much in the case below.

I took only the noun phrases from the sentences with spaCy, then embedded all of them with the pre-trained Sentence-BERT model. Given a query, the function embeds the query and finds the most similar embedded noun phrases by cosine similarity. However, because the noun phrases are not clean enough, and because Sentence-BERT was pre-trained on full sentences rather than noun phrases, it does not produce good results.




In [None]:
# Load the pre-trained sentence-embedding model used to encode noun phrases and queries.
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [None]:
def preprocess(text):
  """Lower-case text, then apply the cleaning helpers in a fixed order.

  Order: stop-word removal, URL removal, special-character removal,
  article removal, punctuation removal.
  """
  cleaned = text.lower()
  pipeline = (
      remove_stopwords,
      remove_url,
      remove_special_character,
      remove_articles,
      remove_punct,
  )
  for step in pipeline:
    cleaned = step(cleaned)
  return cleaned

In [None]:
%%time

# Re-clean the raw bodies with the preprocess defined just above, overwriting the earlier 'preprocess' column.
df['preprocess'] = df['body'].apply(preprocess)

CPU times: user 191 ms, sys: 1.01 ms, total: 192 ms
Wall time: 191 ms


In [None]:
%%time

def get_nouns(text):
  doc = nlp(text)
  ret = noun_chunks(doc)
  return ret

df['nouns'] = df['preprocess'].apply(get_nouns)

CPU times: user 1min 28s, sys: 284 ms, total: 1min 28s
Wall time: 1min 28s


In [None]:
# Inspect the noun chunks extracted for the row labelled 0.
df['nouns'][0]

['most dermatologists',
 'ocm',
 'sense',
 'they',
 'i',
 'doctor',
 'new tricks',
 'science',
 'working harm',
 'derm',
 'ocm oily acne',
 'lot prescriptions reasons',
 'dermatologist',
 'ocm finding new products',
 'another derm',
 'doctors good estheticians',
 'bad ones research',
 'friends',
 'family',
 'local subreddit facebook page',
 'genuine reviews',
 'salon',
 'fanciest looking salon',
 'appointment',
 'drop money']

In [None]:
#only for noun_chuncks
#store all nouns in list for embedding
%%time

d_word = {}
for nouns in df['nouns']:
  for noun in nouns:
    if noun in d_word:
      pass
    else:
      d_word[noun] = model.encode(noun)

print("Total noun word & phrase in corpus: ", len(d_word))

Total noun word & phrase in corpus:  37312
CPU times: user 8min 59s, sys: 27.2 s, total: 9min 27s
Wall time: 9min 26s


In [None]:
%%time

d_index = {}
for key, value in enumerate(d_word):
  d_index[key] = value
  if key == 0:
    arr = d_word[value]
  else:
    arr = np.vstack((arr, d_word[value]))

print(arr.shape)

(37312, 768)
CPU times: user 9min 16s, sys: 14.8 s, total: 9min 31s
Wall time: 9min 31s


In [None]:
%%time
from scipy.spatial import distance

# Smoke test: cosine distance from one stored phrase embedding to every row of arr.
# The key (including its leading space) has to exist in d_word, otherwise the lookup raises KeyError.
distances = distance.cdist([d_word[' book intelligent investor']], arr, 'cosine')[0]

CPU times: user 78.5 ms, sys: 22 ms, total: 100 ms
Wall time: 100 ms


In [None]:
def get_top_n(query, d_index, arr, top_n = 10):
  '''
  Rank the embedded noun phrases by cosine distance to a query.

  Input: Query: string
        d_index: dict, index of corresponding np array (embedded sentence)
        arr: 2-D np array, one embedded noun phrase per row
        top_n: int
  OutPut: list of (noun phrase, cosine distance) tuples: top n most similar
          embedded noun phrases related to query, closest first
  '''

  emb_query = model.encode(query)
  distances = distance.cdist([emb_query], arr, 'cosine')[0]
  # Cosine *distance*: smaller means more similar, so keep the n smallest
  # values in ascending order. (A negative-index argpartition selects the n
  # largest distances, i.e. the least related phrases, and leaves them
  # unordered.)
  top_n_ind = np.argsort(distances)[:max(top_n, 0)]
  return [(d_index[int(ind)], distances[ind]) for ind in top_n_ind]

In [None]:
# Example query against the embedded noun phrases, asking for 20 results.
get_top_n('Donald Trump', d_index, arr, top_n = 20)

[('kids school etc cat shop', 0.9930142811111167),
 ('short conflict classmates friends', 0.9951881535942174),
 ('typical smallblockers', 0.9958957331807448),
 ('small variety foods', 0.9958199168314167),
 ('even small businesses', 0.995242731546684),
 ('poor innocent babies', 0.9958903279448739),
 ('weak jelly fish', 0.9973101241578203),
 ('physical abuse school nurse', 1.0032249980412395),
 ('existing small shop surroundings', 1.0039280293841895),
 ('talk school counselor depression anxiety counselor', 1.0042624749129578),
 ('really good hear kid school lunches', 1.0430745542856925),
 ('many small business owners', 1.0028514594397928),
 ('girls class', 1.031834770385056),
 ('rat crawl toilet', 0.9985858047626106),
 ('slowly sandblasted fine sand', 1.0160287910007435),
 ('cheapest whore whorehouse', 1.0029017439791874),
 ('kijiji kids violin', 1.007112975067471),
 ('feed kids lunch', 1.0164274705059746),
 ('poor girls', 1.0152680425011076),
 ('cafeteria lunch', 1.0014526682911895)]