# bi-gram , tri_gram and quad_gram  phrase detection

**NOTE**: This notebook depends upon the the Retrotech dataset. If you have any issues, please rerun the [Setting up the Retrotech Dataset](../ch4/1.ch4-setting-up-the-retrotech-dataset.ipynb) notebook.

In [None]:
import nltk
from nltk.collocations import *
import re
import pandas as pd
from nltk.corpus import webtext
from nltk.tokenize import RegexpTokenizer
nltk.download('averaged_perceptron_tagger')

In [None]:
#signal_sample= pd.read_json("../data/temp/signal_sample.json")

signal_all= pd.read_csv("../data/retrotech/signals.csv")


In [None]:
is_query =  signal_all['type']=='query'
signal_query  = signal_all[is_query]

In [7]:
signal_query[1:3]


Unnamed: 0,query_id,user,type,target,signal_time
1,u2_1_2,u2,query,rca,2020-05-04 08:28:21.1848
2,u3_0_1,u3,query,macbook,2019-12-22 00:07:07.0152


### Step 1: data cleaning: 

tokenize text and only keep words or digits, only keep tokens which have length greater than 2

In [8]:
def cleaning(text):
    tokens = []
    tokenizer = RegexpTokenizer(r'\w+') 
    
    tokened = tokenizer.tokenize(text.lower())
    
    for token in tokened:
        if len(token) > 2 and not token.isdigit(): #keep tokens longer than 2 characters and drop digit only tokens
            tokens.append(token)
            
    return tokens

signal_tokened = []

#for query in signal_sample["query_s"]:
for query in signal_query["target"]:
    tokens = cleaning(query)
    #print(tokens)
    signal_tokened.append(tokens)
    
signal_tokened[:5]

[['nook'], ['rca'], ['macbook'], ['antenna'], ['power', 'cord']]

### Step 2: Find candidate bi-gram phrases based on frequency. 

using nltk collocation bigram function to find candidate bigram phrases, a frequency filter is applied to only keep bigrams with frequency greater or equal to 3. 

In [19]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
#quadgram_measures = nltk.collocations.QuadgramAssocMeasures()

finder_bi = BigramCollocationFinder.from_documents(signal_tokened) #input is a list of token list
finder_tri = TrigramCollocationFinder.from_documents(signal_tokened)
#finder_quad = QuadgramCollocationFinder.from_documents(signal_tokened)

freq_threshold=3
## apply freq filter for all grams
finder_bi.apply_freq_filter(freq_threshold)
finder_tri.apply_freq_filter(freq_threshold)
#finder_quad.apply_freq_filter(freq_threshold)


In [20]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
#quadgram_measures = nltk.collocations.QuadgramAssocMeasures()


### Step 3: Sort candidate phrases based on PMI and likelihood ratio. 

In [47]:
## check PMI score
finder_bi.score_ngrams(bigram_measures.pmi)
#finder_tri.score_ngrams(trigram_measures.pmi)
#finder_quad.score_ngrams(quadgram_measures.pmi)

[(('anthony', 'hamilton'), 17.763849060363295),
 (('bypass', 'module'), 17.763849060363295),
 (('cadillac', 'records'), 17.763849060363295),
 (('charred', 'walls'), 17.763849060363295),
 (('conditions', 'parole'), 17.763849060363295),
 (('daddy', 'yankee'), 17.763849060363295),
 (('darkwing', 'duck'), 17.763849060363295),
 (('dierks', 'bentley'), 17.763849060363295),
 (('drawn', 'together'), 17.763849060363295),
 (('due', 'date'), 17.763849060363295),
 (('dukes', 'hazzard'), 17.763849060363295),
 (('edward', 'scissorhands'), 17.763849060363295),
 (('ergo', 'proxy'), 17.763849060363295),
 (('fleet', 'foxes'), 17.763849060363295),
 (('greatest', 'hits'), 17.763849060363295),
 (('gurren', 'lagann'), 17.763849060363295),
 (('haunted', 'mansion'), 17.763849060363295),
 (('ikki', 'tousen'), 17.763849060363295),
 (('jesus', 'culture'), 17.763849060363295),
 (('kate', 'voegele'), 17.763849060363295),
 (('kidz', 'bop'), 17.763849060363295),
 (('kierra', 'sheard'), 17.763849060363295),
 (('kobe'

### Step 4: combine candidate list from PMI and likelihood ratio
only keep phrases that shown in top 1000 in both lists. 

In [48]:
intersection=[]
#for finder_tup in [(finder_bi,bigram_measures),(finder_tri,trigram_measures),(finder_quad,quadgram_measures)]:
for finder_tup in [(finder_bi,bigram_measures),(finder_tri,trigram_measures)]:
    
    finder=finder_tup[0]
    finder_measure=finder_tup[1]
    intersection += [value for value in finder.nbest(finder_measure.likelihood_ratio, 900) if value in finder.nbest(finder_measure.pmi, 900)] 

In [49]:
intersection

[('criminal', 'minds'),
 ('skyward', 'sword'),
 ('take', 'care'),
 ('pulp', 'fiction'),
 ('jeff', 'dunham'),
 ('cyber', 'shot'),
 ('kitchen', 'aid'),
 ('deathly', 'hallows'),
 ('david', 'guetta'),
 ('french', 'door'),
 ('taylor', 'swift'),
 ('heart', 'rate'),
 ('kung', 'panda'),
 ('mass', 'effect'),
 ('wears', 'prada'),
 ('hocus', 'pocus'),
 ('mindless', 'behavior'),
 ('paranormal', 'activity'),
 ('foo', 'fighters'),
 ('gossip', 'girl'),
 ('something', 'borrowed'),
 ('rick', 'ross'),
 ('devil', 'wears'),
 ('pistol', 'annies'),
 ('randy', 'orton'),
 ('saints', 'row'),
 ('rosetta', 'stone'),
 ('elder', 'scrolls'),
 ('professor', 'layton'),
 ('britney', 'spears'),
 ('jill', 'scott'),
 ('doctor', 'who'),
 ('elm', 'street'),
 ('snow', 'leopard'),
 ('avenged', 'sevenfold'),
 ('willy', 'wonka'),
 ('flight', 'simulator'),
 ('demi', 'lovato'),
 ('ben', 'hur'),
 ('tiger', 'woods'),
 ('gran', 'turismo'),
 ('citizen', 'kane'),
 ('alice', 'cooper'),
 ('hank', 'williams'),
 ('spooky', 'buddies'),
 (

### Step 5: further filter bi-gram to get Noun phrases
fiter based on POS tagging patterns JJ_NN or NN_NN.

In [60]:
bi_gram_noun_phrases=[]
for phrase_token in intersection:
    POS = nltk.pos_tag(phrase_token)
    POS_first_word = POS[0][1]
    POS_second_word = POS[1][1]
    if POS_first_word in ['NN','JJ'] and POS_second_word == 'NN':
        #print("dddd")
        bi_gram_noun_phrases.append(' '.join([POS[0][0],POS[1][0]]))

In [61]:
bi_gram_noun_phrases

['skyward sword',
 'pulp fiction',
 'jeff dunham',
 'cyber shot',
 'kitchen aid',
 'david guetta',
 'french door',
 'taylor swift',
 'heart rate',
 'kung panda',
 'mass effect',
 'hocus pocus',
 'mindless behavior',
 'paranormal activity',
 'gossip girl',
 'rick ross',
 'randy orton',
 'rosetta stone',
 'professor layton',
 'jill scott',
 'elm street',
 'snow leopard',
 'flight simulator',
 'demi lovato',
 'ben hur',
 'gran turismo',
 'alice cooper',
 'thermal paste',
 'mylo xyloto',
 'tech n9ne',
 'complete saga',
 'toy story',
 'jane eyre',
 'marvel capcom',
 'street fighter',
 'jersey shore',
 'scooby doo',
 'rice cooker',
 'rockford fosgate',
 'monte carlo',
 'deep fryer',
 'alien ware',
 'cool pix',
 'jake owen',
 'definitive technology',
 'battle field',
 'american capitalist',
 'luke bryan',
 'toby keith',
 'selena gomez',
 'cobra starship',
 'burn notice',
 'jimi hendrix',
 'universe online',
 'kid cudi',
 'butch walker',
 'transform ultra',
 'fullmetal alchemist',
 'san franci