# bi-gram phrase detection

**NOTE**: This notebook depends upon the Retrotech dataset. If you have any issues, please rerun the [Setting up the Retrotech Dataset](../ch4/1.ch4-setting-up-the-retrotech-dataset.ipynb) notebook.

In [1]:
import nltk
from nltk.collocations import *
import re
import pandas as pd
from nltk.corpus import webtext
from nltk.metrics import BigramAssocMeasures
from nltk.tokenize import RegexpTokenizer
# Fetch the part-of-speech tagger model; presumably required by the nltk.pos_tag call used later in this notebook.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [2]:
# NOTE: this reads sample signal data for now; it still needs to be replaced with the actual RetroTech dataset.
# Run this notebook for details: http://localhost:8888/notebooks/ch4/1.ch4-setting-up-the-retrotech-dataset.ipynb
SIGNAL_SAMPLE_PATH = "../data/temp/signal_sample.json"
signal_sample = pd.read_json(SIGNAL_SAMPLE_PATH)

### Step 1: data cleaning: 

Tokenize each query into lowercase word tokens, keeping only tokens longer than 2 characters and dropping tokens that consist entirely of digits.

In [3]:
def cleaning(text):
    """Tokenize a query string into the lowercase word tokens worth keeping.

    The text is lowercased and split into runs of word characters. Tokens of
    2 characters or fewer, and tokens made up only of digits, are dropped.

    Args:
        text: The raw query. Anything that is not a ``str`` (for example
            ``None`` or a ``NaN`` standing in for a missing value) is treated
            as containing no tokens instead of raising.

    Returns:
        list: The kept tokens, in the order they appear in ``text``.
    """
    if not isinstance(text, str):
        return []

    # re.findall with \w+ extracts the word-character runs directly, so no
    # tokenizer object has to be rebuilt on every call.
    return [
        token
        for token in re.findall(r'\w+', text.lower())
        if len(token) > 2 and not token.isdigit()  # keep tokens longer than 2 characters and drop digit-only tokens
    ]

# Run every query in the sample through cleaning() to get one token list per query.
signal_tokened = [cleaning(raw_query) for raw_query in signal_sample["query_s"]]

# Preview the first five token lists.
signal_tokened[:5]

[['jillian', 'micheals'],
 ['pda'],
 ['sony', 'radio'],
 ['car'],
 ['pioneer', 'speakers']]

### Step 2: Find candidate bi-gram phrases based on frequency. 

Use NLTK's bigram collocation finder to find candidate bigram phrases. A frequency filter is applied to keep only bigrams with a frequency greater than or equal to 3. 

In [4]:
# Association measures used to score the candidate bigrams in the following cells.
bigram_measures = nltk.collocations.BigramAssocMeasures()

# Build the finder from the per-query token lists (a list of token lists),
# then apply the frequency filter with a threshold of 3.
MIN_BIGRAM_FREQ = 3
finder = BigramCollocationFinder.from_documents(signal_tokened)
finder.apply_freq_filter(MIN_BIGRAM_FREQ)

### Step 3: Sort candidate phrases based on PMI and likelihood ratio. 

In [5]:
# Score the candidate bigrams by pointwise mutual information (PMI) and display the scored list.
pmi_scores = finder.score_ngrams(bigram_measures.pmi)
pmi_scores

[(('before', 'christmas'), 12.418643249375899),
 (('bone', 'thugs'), 12.418643249375899),
 (('brad', 'paisley'), 12.418643249375899),
 (('carrie', 'underwood'), 12.418643249375899),
 (('deathly', 'hallows'), 12.418643249375899),
 (('hallows', 'part'), 12.418643249375899),
 (('harmon', 'kardon'), 12.418643249375899),
 (('pre', 'order'), 12.418643249375899),
 (('ryan', 'adams'), 12.418643249375899),
 (('chemical', 'romance'), 12.003605750097055),
 (('driver', 'san'), 12.003605750097055),
 (('geek', 'squad'), 12.003605750097055),
 (('grand', 'theft'), 12.003605750097055),
 (('justin', 'timberlake'), 12.003605750097055),
 (('nba', '2k12'), 12.003605750097055),
 (('rockford', 'fosgate'), 12.003605750097055),
 (('stainless', 'steel'), 12.003605750097055),
 (('vampire', 'diaries'), 12.003605750097055),
 (('velvet', 'revolver'), 12.003605750097055),
 (('walkie', 'talkie'), 12.003605750097055),
 (('young', 'jeezy'), 12.003605750097055),
 (('anti', 'virus'), 11.681677655209693),
 (('assassins', 

In [6]:
# Score the candidate bigrams by likelihood ratio and display the scored list.
likelihood_ratio_scores = finder.score_ngrams(bigram_measures.likelihood_ratio)
likelihood_ratio_scores

[(('blu', 'ray'), 709.1566468396438),
 (('star', 'wars'), 489.83534693364135),
 (('hard', 'drive'), 420.9843682241864),
 (('ipod', 'touch'), 380.85301311413014),
 (('captain', 'america'), 318.5996746854715),
 (('beats', 'dre'), 311.436629949274),
 (('lil', 'wayne'), 300.3680883388915),
 (('galaxy', 'tab'), 299.12593056529687),
 (('gears', 'war'), 275.1085160720254),
 (('lion', 'king'), 273.37878896295985),
 (('samsung', 'galaxy'), 247.01102060935628),
 (('home', 'theater'), 245.20386699737819),
 (('arkham', 'city'), 235.0330024434602),
 (('turtle', 'beach'), 220.14533822030018),
 (('guitar', 'hero'), 213.0480834794117),
 (('external', 'hard'), 208.51980885599556),
 (('htc', 'flyer'), 195.48035131054004),
 (('dvd', 'player'), 190.7246031489744),
 (('skull', 'candy'), 175.8991884948761),
 (('first', 'class'), 170.81630900734757),
 (('power', 'supply'), 170.0793217918571),
 (('harry', 'potter'), 168.0734054911679),
 (('wireless', 'router'), 167.26643824554176),
 (('macbook', 'pro'), 166.7

### Step 4: Combine the candidate lists from PMI and likelihood ratio
Only keep phrases that appear in the top 200 of both lists. 

In [7]:
# Keep the bigrams that rank in the top `top_n` by BOTH measures, preserving the
# likelihood-ratio ordering. The PMI top list is computed once, up front, and
# stored in a set so it is not re-ranked for every candidate and each membership
# check is a constant-time lookup.
top_n = 200
top_by_pmi = set(finder.nbest(bigram_measures.pmi, top_n))
intersection = [
    bigram
    for bigram in finder.nbest(bigram_measures.likelihood_ratio, top_n)
    if bigram in top_by_pmi
]

### Step 5: Further filter bi-grams to keep noun phrases
Filter based on the POS tagging patterns JJ_NN or NN_NN.

In [8]:
# Keep only the bigrams whose part-of-speech pattern looks like a noun phrase:
# adjective + noun (JJ NN) or noun + noun (NN NN).
final_list = []

# pos_tag_sents tags every bigram in a single call (each bigram is still tagged
# on its own), instead of invoking nltk.pos_tag once per phrase.
for tagged_phrase in nltk.pos_tag_sents(intersection):
    (first_word, first_tag), (second_word, second_tag) = tagged_phrase

    if first_tag in ('NN', 'JJ') and second_tag == 'NN':
        final_list.append(' '.join((first_word, second_word)))

In [9]:
# Display the bigram phrases that passed the POS-pattern filter.
final_list

['captain america',
 'lil wayne',
 'lion king',
 'home theater',
 'arkham city',
 'turtle beach',
 'guitar hero',
 'skull candy',
 'power supply',
 'harry potter',
 'sound bar',
 'pearl jam',
 'hello kitty',
 'modern warfare',
 'dragon ball',
 'virgin mobile',
 'air conditioner',
 'pink floyd',
 'play station',
 'surge protector',
 'call duty',
 'green lantern',
 'wall mount',
 'action replay',
 'surround sound',
 'sharp aquos',
 'acer iconia',
 'dsl modem',
 'kindle fire',
 'web cam',
 'power cord',
 'dead island',
 'price match',
 'jurassic park',
 'toaster oven',
 'western digital',
 'memory stick',
 'death punch',
 'finger death',
 'lady gaga',
 'nook color',
 'lap top',
 'bang theory',
 'big bang',
 'french door',
 'world warcraft',
 'batman arkham',
 'family guy',
 'boost mobile',
 'blue tooth',
 'chemical romance',
 'grand theft',
 'velvet revolver',
 'walkie talkie',
 'young jeezy',
 'otter box',
 'smart start',
 'cyber shot',
 'rock band',
 'radar detector',
 'universal remote