# Word Association Mining: Syntagmatic

This notebook implements syntagmatic word association mining. Two words have a syntagmatic relation if they can be
combined with each other, i.e. they tend to co-occur in the same sentence.

**Method**:
<br/>I use the idea of mutual information: word pairs with high co-occurrence but relatively low individual occurrence score highest. In this notebook, I work on noun & noun and adj & noun pairs. I also apply a smoothing technique so that unseen pairs do not produce zero probabilities.

In [1]:
import re
import numpy as np
import pandas as pd
import json
import pymongo
from pymongo import MongoClient
import csv

In [4]:
from __future__ import unicode_literals, print_function
import spacy
from spacy.en import English
import en_core_web_sm
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import STOPWORDS as stop

In [13]:
import math
import operator

In [2]:
'''Variable
db: database to be used
review_collection: collection to be use
'''

'''AWS connection'''

# Alternative configuration, kept commented out: it targets the
# 'nike_collections_legacy' database on port 27017 instead of the local one.
# client = MongoClient("", 27017)
# client.the_database.authenticate('','', mechanism='', source='')
# db = client['nike_collections_legacy']
# review_collection = db['nike_reviews_trial']

'''local connection'''

# Connect to the MongoDB server listening on localhost:27018.
client = MongoClient("localhost", 27018)
# NOTE(review): user, password, mechanism and source are blank here —
# presumably redacted; supply them from outside the notebook (environment
# variables / getpass) rather than hard-coding them.
# NOTE(review): Database.authenticate is deprecated in newer PyMongo
# releases — confirm against the installed version.
client.the_database.authenticate('','', mechanism='', source='')
# Every later cell reads reviews through `review_collection`.
db = client['nike_collections']
review_collection = db['dev_clean_reviews']

In [3]:
# Work on a random sample: draw 10,000 reviews and keep only the id and the
# review text of each one.
sample_pipeline = [
    {'$sample': {'size': 10000}},
    {'$project': {'_id': 1, 'review_text': 1}},
]
review_data = review_collection.aggregate(sample_pipeline)

# Materialise the cursor into a DataFrame and preview the first rows.
reviews = pd.DataFrame(list(review_data))
reviews.head()

# To run on the whole population instead of a sample, use:

# review_data = review_collection.find({})
# reviews = pd.DataFrame(list(review_data))
# reviews.head()

Unnamed: 0,_id,review_text
0,amazonR24P1S9UXJDBT8,Great No problems
1,amazonR8VRWEFCI638E,These shoes are fabulous. I ordered 1/2 size ...
2,kohls131578626,If you play disc golf this is it ! This shoe i...
3,dicks166977931,This is my second pair of these exact shoes. ...
4,amazonR33S9L65EQBL9B,Fits great.


### Sentence Parsing

In [5]:
# Load the spaCy English pipeline (en_core_web_sm). The resulting `nlp`
# callable is reused below both for sentence splitting and for the
# part-of-speech / lemma lookups.
nlp = en_core_web_sm.load()

In [6]:
# Sentence parser: normalise punctuation that trips up spaCy's sentence
# splitter, split every review into sentences, and collect the non-empty
# ones in `sentence`.
sentence = []
for review_text in reviews['review_text']:
    # Skip missing reviews. pandas may represent a missing field as None or
    # as NaN (a float), so test for "is a string" rather than "is not None".
    if not isinstance(review_text, str):
        continue

    # Punctuation clean-up before sentence splitting:
    review = review_text.replace('|', '.')      # pipes act as sentence breaks
    review = re.sub(r'[()]', ' ', review)       # parentheses -> space
    # Runs of '!' become a single full stop. (A second substitution on
    # '[ ]*![ ]*' used to follow this one; it could never match because no
    # '!' is left at that point, so it has been dropped.)
    review = re.sub(r'!+', '.', review)
    review = re.sub(r'\.\.+', '.', review)      # collapse ellipses
    review = re.sub(r'-+', '', review)          # drop hyphens / dashes

    for sen in nlp(review).sents:
        # Trim anything that is not an ASCII letter from both ends. The
        # leading class was previously written 'a-zA-z', whose 'A-z' range
        # also admits [ \ ] ^ _ and the backtick; it is now 'A-Z' on both
        # sides.
        s = re.sub(r'^[^a-zA-Z]*|[^a-zA-Z]*$', '', sen.text)
        if s != '':
            sentence.append(s)

In [7]:
# Number of segments: each parsed sentence is one segment. The smoothed
# probability estimates in the later cells all divide by (sen_num + 1).
sen_num = len(sentence)

### Frequency Count: Individual Word & Word Pair 

In [8]:
# Sentence-level frequencies:
#   tokens_text: word                -> number of sentences containing it
#   pair_text:   frozenset({w1, w2}) -> number of sentences containing both
tokens_text = {}
pair_text = {}

for sent in sentence:
    # Surface form of every noun/adjective whose lemma is not a stop word.
    candidates = [
        word.text
        for word in nlp(sent)
        if word.pos_ in ('NOUN', 'ADJ') and word.lemma_ not in stop
    ]
    # De-duplicate: we count the sentences a word appears in, not its total
    # number of occurrences.
    noun = list(set(candidates))

    for position, n in enumerate(noun):
        tokens_text[n] = tokens_text.get(n, 0) + 1
        # Pair the word with every word after it so that each unordered pair
        # is counted once per sentence.
        for y in noun[position + 1:]:
            key = frozenset((n, y))
            pair_text[key] = pair_text.get(key, 0) + 1

In [9]:
# Smoothed presence probability P(w=1) for every word seen in more than 10
# sentences (rarer words are dropped): (count + 0.5) / (sen_num + 1).
p_w1_1 = {}
for word, sentence_count in tokens_text.items():
    if sentence_count > 10:
        p_w1_1[word] = (sentence_count + 0.5) / (sen_num + 1)

# Complementary absence probability P(w=0) = 1 - P(w=1).
p_w1_0 = {word: 1 - presence for word, presence in p_w1_1.items()}

In [10]:
# Give every pair of retained words that never co-occurred an explicit count
# of 0, so that the smoothing step below assigns it a probability as well.
words = list(p_w1_1.keys())
for position, w1 in enumerate(words):
    for w2 in words[position + 1:]:
        # setdefault leaves existing co-occurrence counts untouched.
        pair_text.setdefault(frozenset((w1, w2)), 0)

In [11]:
# Smoothed joint probability P(w1=1, w2=1) for every pair:
# (co-occurrence count + 0.25) / (sen_num + 1).
p_w1_1_w2_1 = dict(
    (pair, (count + 0.25) / (sen_num + 1))
    for pair, count in pair_text.items()
)
# Containers filled in by the next cell.
p_w1_1_w2_0, p_w1_0_w2_0 = {}, {}

In [12]:
# Derive the remaining joint probabilities for every pair of retained words:
#   p_w1_1_w2_0[(x, y)]          = P(x=1, y=0) = P(x=1) - P(x=1, y=1)
#   p_w1_0_w2_0[frozenset(pair)] = P(w1=0, w2=0) = P(w1=0) - P(w1=0, w2=1)
# The ordered tuple key matters for p_w1_1_w2_0: the first element is the
# word that is present, the second the word that is absent.
for position, w1 in enumerate(words):
    pw1 = p_w1_1[w1]                      # P(w1=1)
    for w2 in words[position + 1:]:
        pw2 = p_w1_1[w2]                  # P(w2=1)
        pair = frozenset((w1, w2))
        p11 = p_w1_1_w2_1[pair]           # P(w1=1, w2=1)

        p_w1_1_w2_0[(w1, w2)] = pw1 - p11  # P(w1=1, w2=0)
        p_w1_1_w2_0[(w2, w1)] = pw2 - p11  # P(w1=0, w2=1)

        if pair not in p_w1_0_w2_0:
            # P(w1=0, w2=0)
            p_w1_0_w2_0[pair] = p_w1_0[w1] - p_w1_1_w2_0[(w2, w1)]

### MI Calculation

In [14]:
# mutual_info: key = word pair (frozenset); value = mutual information,
# computed as the KL divergence between the joint distribution of the two
# presence indicators and the product of their marginals, i.e. the sum over
# the four presence/absence combinations of p(x, y) * log2(p(x, y) / (p(x) p(y))).
mutual_info = {}
for position, w1 in enumerate(words):
    for w2 in words[position + 1:]:
        pair = frozenset((w1, w2))
        if pair in mutual_info:
            continue

        # Marginals.
        pw1_1, pw2_1 = p_w1_1[w1], p_w1_1[w2]
        pw1_0, pw2_0 = p_w1_0[w1], p_w1_0[w2]

        # Joint probabilities for the four combinations.
        p11 = p_w1_1_w2_1[pair]
        p00 = p_w1_0_w2_0[pair]
        p10 = p_w1_1_w2_0[(w1, w2)]
        p01 = p_w1_1_w2_0[(w2, w1)]

        term_11 = p11 * math.log2(p11 / (pw1_1 * pw2_1))
        term_00 = p00 * math.log2(p00 / (pw1_0 * pw2_0))
        term_10 = p10 * math.log2(p10 / (pw1_1 * pw2_0))
        term_01 = p01 * math.log2(p01 / (pw1_0 * pw2_1))

        mutual_info[pair] = term_11 + term_00 + term_10 + term_01

### Set Threshold (to filter result) 

In [16]:
# Help pick a threshold: print the MI value at the percentile that would
# keep roughly the top 100 / 150 / 200 scoring pairs.
mi_values = np.array(list(mutual_info.values()))
total_num = len(mutual_info)
print('Total Number: ', total_num)

if total_num == 0:
    # Nothing to rank; the percentile formula below would divide by zero.
    print('No word pairs available to compute thresholds.')
else:
    for top_n in (100, 150, 200):
        # Clamp at 0: with fewer than top_n pairs the rank would be negative,
        # which np.percentile rejects. Clamping reports the minimum MI
        # instead, i.e. "keep everything".
        percentile_rank = max(total_num - top_n, 0) / total_num * 100
        print('Threshold for Top {}: '.format(top_n),
              np.percentile(mi_values, percentile_rank))

In [17]:
# Minimum mutual information a pair must exceed to be kept in the result.
# Hand-picked value — presumably read off the percentile thresholds printed
# by the previous cell; re-tune it whenever the sample changes.
mi_threshold = 0.003

In [18]:
# Keep only the pairs whose MI is strictly above the threshold ...
mutual_info_filter = {}
for pair, score in mutual_info.items():
    if score > mi_threshold:
        mutual_info_filter[pair] = score

# ... and order them from the strongest association to the weakest.
mutual_info_sorted = sorted(
    mutual_info_filter.items(),
    key=lambda entry: entry[1],
    reverse=True,
)

### Generate Output & Export

In [20]:
# Turn each (pair, MI) entry into a flat record:
#   {'word_0': ..., 'word_1': ..., 'mutual_information': ...}
# The loop variables deliberately avoid the name `words`, which holds the
# vocabulary list used by the probability and MI cells; overwriting it here
# would break those cells if they were re-run.
MI = []
for pair, score in mutual_info_sorted:
    record = {}
    # A frozenset has no defined iteration order, so sort the two words to
    # make the word_0 / word_1 assignment reproducible between runs.
    for position, member in enumerate(sorted(pair)):
        record['word_' + str(position)] = member
    record['mutual_information'] = score
    MI.append(record)

In [21]:
# Export the records to CSV.
# Fall back to the expected column names when no pair passed the threshold,
# so the export still produces a header-only file instead of failing on MI[0].
fieldnames = list(MI[0].keys()) if MI else ['word_0', 'word_1', 'mutual_information']

# newline='' lets the csv module control line endings itself; without it,
# extra blank rows appear on platforms that translate '\n' on write.
with open('syntagmatic_word_association.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames)
    writer.writeheader()
    writer.writerows(MI)