## BONUS [Not in Chapter]: bi-gram, tri-gram and quad-gram phrase detection

**NOTE**: This notebook depends upon the Retrotech dataset. If you have any issues, please rerun the [Setting up the Retrotech Dataset](../ch04/1.setting-up-the-retrotech-dataset.ipynb) notebook.

In [1]:
import nltk
from nltk.collocations import *
import re
import pandas 
from nltk.corpus import webtext
from nltk.tokenize import RegexpTokenizer
# Download the NLTK POS-tagger model package used by nltk.pos_tag.
nltk.download('averaged_perceptron_tagger')

# Create (or reuse) the Spark session for this notebook.
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName("aips-ch6")
    .getOrCreate()
)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [3]:
# Read the Retrotech signals CSV into a DataFrame.
# (An earlier variant loaded "../data/temp/signal_sample.json" instead.)
signals_csv_path = "../../data/retrotech/signals.csv"
signal_all = pandas.read_csv(signals_csv_path)


  signal_all = pandas.read_csv("data/retrotech/signals.csv")


In [4]:
# Boolean mask selecting the rows whose signal type is 'query',
# then the subset of signals restricted to those rows.
is_query = signal_all['type'].eq('query')
signal_query = signal_all.loc[is_query]

In [5]:
# Peek at the second and third query signals (positional rows 1 and 2).
signal_query.iloc[1:3]

Unnamed: 0,query_id,user,type,target,signal_time
1,u2_1_2,u2,query,rca,2020-05-04 08:28:21.1848
2,u3_0_1,u3,query,macbook,2019-12-22 00:07:07.0152


In [6]:
#use the real signals
signals_collection="signals"
signals_opts={"zkhost": "aips-zk", "collection": signals_collection}
df = spark.read.format("solr").options(**signals_opts).load()
df.createOrReplaceTempView("signals")

### Create user-searchs table each raw represent one search query.
query_signals = spark.sql("""
  SELECT lower(trim(searches.target)) as keyword, searches.user as user 
  FROM signals as searches where searches.type='query'
  GROUP BY keyword, user""").collect() #only one signal per user per keyword to prevent spam

### Step 1: data cleaning: 

Tokenize the text into word tokens, keep only tokens longer than 2 characters, and drop tokens that consist solely of digits.

In [7]:
# Compiled once at definition time instead of building a tokenizer on every
# call; matches runs of word characters, like RegexpTokenizer(r'\w+') did.
_WORD_PATTERN = re.compile(r'\w+')


def cleaning(text):
    """Tokenize a query string into lower-cased word tokens.

    Args:
        text: The raw query. Non-string values (e.g. None or NaN coming from
            a missing query target) are treated as an empty query.

    Returns:
        A list of tokens in their original order, keeping only tokens that
        are longer than 2 characters and are not made up solely of digits.
    """
    # A missing target would otherwise raise AttributeError on .lower().
    if not isinstance(text, str):
        return []

    return [
        token
        for token in _WORD_PATTERN.findall(text.lower())
        # keep tokens longer than 2 characters and drop digit-only tokens
        if len(token) > 2 and not token.isdigit()
    ]

# Tokenize the keyword of every row returned by the Spark SQL query; each
# entry of signal_tokened is the cleaned token list for one search query.
signal_tokened = [cleaning(row["keyword"]) for row in query_signals]

# Preview the first five token lists.
signal_tokened[:5]

[['beats', 'dre'],
 ['beats', 'dre', 'headphones'],
 ['epad'],
 ['fringe'],
 ['gps', 'tracker']]

### Step 2: Find candidate bi-gram phrases based on frequency. 

Use NLTK's collocation finders to collect candidate bigram, trigram, and quadgram phrases. A frequency filter is then applied so that only n-grams with a frequency greater than or equal to 3 are kept.

In [None]:
# Minimum number of occurrences an n-gram needs to remain a candidate.
freq_threshold = 3

# Association-measure helpers, one per n-gram size.
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
quadgram_measures = nltk.collocations.QuadgramAssocMeasures()

# Collocation finders built from the tokenized queries; the input is a list
# of token lists, one per query.
finder_bi = BigramCollocationFinder.from_documents(signal_tokened)
finder_tri = TrigramCollocationFinder.from_documents(signal_tokened)
finder_quad = QuadgramCollocationFinder.from_documents(signal_tokened)

# Apply the same frequency filter to every n-gram size.
for ngram_finder in (finder_bi, finder_tri, finder_quad):
    ngram_finder.apply_freq_filter(freq_threshold)

In [None]:
# Instantiate the association-measure helpers used for scoring n-grams,
# one per n-gram size.
bigram_measures, trigram_measures, quadgram_measures = (
    nltk.collocations.BigramAssocMeasures(),
    nltk.collocations.TrigramAssocMeasures(),
    nltk.collocations.QuadgramAssocMeasures(),
)

### Step 3: Sort candidate phrases based on PMI and likelihood ratio. 

In [None]:
# Check the PMI scores: print the best-scoring candidates for each n-gram size.
# The same cap is applied to all three sizes so that the quadgram list is not
# dumped in full while the other two are truncated.
top_n = 20
for label, ngram_finder, measures in (
    ("Bigrams", finder_bi, bigram_measures),
    ("Trigrams", finder_tri, trigram_measures),
    ("Quadgrams", finder_quad, quadgram_measures),
):
    print(f"{label}: {ngram_finder.score_ngrams(measures.pmi)[:top_n]}")

Bigrams: [(('stanley', 'kubrick'), 18.078850517367755), (('viva', 'bam'), 18.078850517367755), (('anthony', 'hamilton'), 17.756922422480393), (('bangkok', 'knockout'), 17.756922422480393), (('barbara', 'streisand'), 17.756922422480393), (('bypass', 'module'), 17.756922422480393), (('cadillac', 'records'), 17.756922422480393), (('charred', 'walls'), 17.756922422480393), (('conditions', 'parole'), 17.756922422480393), (('daddy', 'yankee'), 17.756922422480393), (('darkwing', 'duck'), 17.756922422480393), (('dierks', 'bentley'), 17.756922422480393), (('drawn', 'together'), 17.756922422480393), (('due', 'date'), 17.756922422480393), (('dukes', 'hazzard'), 17.756922422480393), (('edward', 'scissorhands'), 17.756922422480393), (('ergo', 'proxy'), 17.756922422480393), (('fleet', 'foxes'), 17.756922422480393), (('greatest', 'hits'), 17.756922422480393), (('gurren', 'lagann'), 17.756922422480393)]
Trigrams: [(('airborne', 'toxic', 'event'), 33.64990639453681), (('everybody', 'loves', 'raymond'),

### Step 4: Combine the candidate lists from PMI and likelihood ratio
Only keep phrases that appear in the top 1000 of both lists.

In [None]:
# Keep only the n-grams that rank in the top `top_k` by BOTH likelihood ratio
# and PMI. Results are appended per n-gram size (bigrams, then trigrams, then
# quadgrams) in likelihood-ratio order.
top_k = 1000
intersection = []
for finder, finder_measure in [
    (finder_bi, bigram_measures),
    (finder_tri, trigram_measures),
    (finder_quad, quadgram_measures),
]:
    # Rank by PMI once per finder and keep it as a set: calling nbest() inside
    # the filter below would re-score every n-gram for each candidate.
    top_by_pmi = set(finder.nbest(finder_measure.pmi, top_k))
    intersection += [
        ngram
        for ngram in finder.nbest(finder_measure.likelihood_ratio, top_k)
        if ngram in top_by_pmi
    ]

In [None]:
# Display the n-grams that ranked in the top 1000 by both likelihood ratio and PMI.
intersection

[('puddle', 'mudd'),
 ('tim', 'mcgraw'),
 ('chemical', 'romance'),
 ('always', 'sunny'),
 ('avril', 'lavigne'),
 ('carrie', 'underwood'),
 ('smashing', 'pumpkins'),
 ('velvet', 'revolver'),
 ('pans', 'labyrinth'),
 ('various', 'artists'),
 ('bobby', 'valentino'),
 ('brad', 'paisley'),
 ('stranger', 'tides'),
 ('boardwalk', 'empire'),
 ('before', 'christmas'),
 ('its', 'always'),
 ('assassins', 'creed'),
 ('brantley', 'gilbert'),
 ('double', 'din'),
 ('criminal', 'minds'),
 ('mortal', 'kombat'),
 ('skyward', 'sword'),
 ('take', 'care'),
 ('pulp', 'fiction'),
 ('jeff', 'dunham'),
 ('cyber', 'shot'),
 ('kitchen', 'aid'),
 ('deathly', 'hallows'),
 ('david', 'guetta'),
 ('ace', 'combat'),
 ('noise', 'canceling'),
 ('french', 'door'),
 ('taylor', 'swift'),
 ('heart', 'rate'),
 ('mass', 'effect'),
 ('kung', 'panda'),
 ('wears', 'prada'),
 ('hocus', 'pocus'),
 ('mindless', 'behavior'),
 ('paranormal', 'activity'),
 ('foo', 'fighters'),
 ('gossip', 'girl'),
 ('something', 'borrowed'),
 ('ncaa',

### Step 5: Further filter the bi-grams to get noun phrases
Filter based on the POS tagging patterns JJ_NN or NN_NN.

In [None]:
# Keep the bigram candidates whose part-of-speech pattern is adjective+noun
# (JJ NN) or noun+noun (NN NN), stored as space-joined strings.
bi_gram_noun_phrases = []
for phrase_token in intersection:
    # `intersection` also holds trigrams and quadgrams; tagging and joining
    # only their first two words would emit truncated phrases, so skip them.
    if len(phrase_token) != 2:
        continue
    (first_word, first_tag), (second_word, second_tag) = nltk.pos_tag(phrase_token)
    if first_tag in ('NN', 'JJ') and second_tag == 'NN':
        bi_gram_noun_phrases.append(' '.join([first_word, second_word]))

In [None]:
# Display the phrases that passed the POS-pattern filter.
bi_gram_noun_phrases

['puddle mudd',
 'tim mcgraw',
 'chemical romance',
 'avril lavigne',
 'carrie underwood',
 'velvet revolver',
 'bobby valentino',
 'brad paisley',
 'boardwalk empire',
 'brantley gilbert',
 'double din',
 'mortal kombat',
 'skyward sword',
 'pulp fiction',
 'jeff dunham',
 'cyber shot',
 'kitchen aid',
 'david guetta',
 'ace combat',
 'noise canceling',
 'french door',
 'taylor swift',
 'heart rate',
 'mass effect',
 'kung panda',
 'hocus pocus',
 'mindless behavior',
 'paranormal activity',
 'gossip girl',
 'ncaa football',
 'sunny philadelphia',
 'randy orton',
 'rosetta stone',
 'rick ross',
 'professor layton',
 'jill scott',
 'elm street',
 'snow leopard',
 'noise cancelling',
 'leap frog',
 'flight simulator',
 'demi lovato',
 'ben hur',
 'gran turismo',
 'thermal paste',
 'display port',
 'alice cooper',
 'mylo xyloto',
 'juice pack',
 'complete saga',
 'toy story',
 'tech n9ne',
 'marvel capcom',
 'jane eyre',
 'jersey shore',
 'street fighter',
 'scooby doo',
 'rice cooker',
