# Word Association Mining: Paradigmatic

This notebook implements paradigmatic word association mining. Two words have a paradigmatic relation if they can be
substituted for each other. In other words, they belong to the same semantic or syntactic class, and replacing one with the other does not affect the understanding of the sentence. In this notebook, I work only on nouns.

**Method**:
1. Represent each noun by the list of words that appear in the same sentences as that noun (its context)
2. Convert each word list into a tf-idf vector and compute pairwise cosine similarity
3. Nouns with high context similarity likely have paradigmatic relation

In [21]:
import re
import numpy as np
import pandas as pd
import pymongo
from pymongo import MongoClient
import csv

In [22]:
from __future__ import unicode_literals, print_function
import spacy
from spacy.en import English
import en_core_web_sm
from gensim.parsing.preprocessing import STOPWORDS as stop
from gensim import corpora, models, similarities

In [23]:
from collections import Counter,defaultdict
import operator

### Get Data from MongoDB & Parse Reviews into Sentences

In [24]:
# Connection setup: binds `client`, `db` and `review_collection`, which the
# sampling cell below reads from.
'''Variable
db: database to be used
review_collection: collection to be use
'''

'''AWS connection'''

# client = MongoClient("", 27017)
# client.the_database.authenticate('','', mechanism='', source='')
# db = client['nike_collections_legacy']
# review_collection = db['nike_reviews_trial']

'''local connection'''

# NOTE(review): every authenticate() argument is an empty string here —
# presumably the real credentials were stripped before sharing. Supply them
# from the environment rather than typing them into the notebook.
# NOTE(review): Database.authenticate() was removed in PyMongo 4.0; on newer
# drivers the credentials must be passed to MongoClient instead.
client = MongoClient("localhost", 27018)
client.the_database.authenticate('','', mechanism='', source='')
db = client['nike_collections']
review_collection = db['dev_clean_reviews']

In [25]:
# Generate a sample: pull 10,000 random reviews (id and text only) instead of
# the whole collection, then load them into a DataFrame.
# NOTE(review): $sample picks documents at random, so every run works on a
# different subset and downstream results are not reproducible run-to-run.

review_data = review_collection.aggregate(
    [{'$sample': {'size':10000}},
     {'$project': {'_id': 1, 'review_text':1}}
    ])

reviews = pd.DataFrame(list(review_data))
reviews.head()

# Use the full population instead of a sample (disabled):

# review_data = review_collection.find({})
# reviews = pd.DataFrame(list(review_data))
# reviews.head()

Unnamed: 0,_id,review_text
0,amazonR3RBSGXB60GJCM,I have a really hard time finding shoes that f...
1,eastbay30301564,I purchased these for my nephew who says they ...
2,amazonRY7EM3S8LZ7RA,Cheap shoe. Don't buy it
3,amazonR2FUQ0XY7TOIDG,Runs 1/2 size large
4,amazonR2STVGL1OF7IKG,Great shoes and very comfortable


In [26]:
# Load the spaCy English pipeline. The cells below use it both to split
# reviews into sentences (.sents) and to tag tokens (.pos_, .lemma_).
nlp = en_core_web_sm.load()

In [27]:
# Sentence parser: normalise each review's punctuation, let spaCy split it
# into sentences, and keep every non-empty sentence in `sentence`.
sentence = []
# DataFrame.get() tolerates an empty sample that produced no columns at all.
for review_text in reviews.get('review_text', []):
    # Missing reviews show up as None or as float NaN once the documents are
    # in a DataFrame; neither has .replace(), so only real strings are parsed.
    if not isinstance(review_text, str):
        continue

    # Punctuation clean-up so spaCy's sentence splitter sees plain full stops.
    review = review_text.replace('|', '.')      # '|' is used as a separator
    review = re.sub(r'[()]', ' ', review)       # parentheses -> space
    review = re.sub(r'!+', '.', review)         # runs of '!' -> one full stop
    # (A later '[ ]*![ ]*' substitution was removed: no '!' can remain here.)
    review = re.sub(r'\.\.+', '.', review)      # collapse '..', '...' etc.
    review = review.replace('-', '')            # drop hyphens/dashes

    for sen in nlp(review).sents:
        # Trim everything that is not an ASCII letter from both ends.
        # The range must be A-Z: 'A-z' also spans '[', '\', ']', '^', '_' and
        # '`', which therefore survived at the start of sentences.
        s = re.sub(r'^[^a-zA-Z]*|[^a-zA-Z]*$', '', sen.text)
        if s:
            sentence.append(s)

### Sentence Tokenization

In [28]:
# Lemma of every non-stopword noun, in order of appearance across all
# parsed sentences (duplicates kept so they can be counted afterwards).
tokens_lemma = [
    token.lemma_
    for text in sentence
    for token in nlp(text)
    if token.pos_ == 'NOUN' and token.lemma_ not in stop
]

In [29]:
# Count how often each noun lemma occurs and keep only those seen more than
# 10 times; these frequent lemmas are the candidate nouns.
tokens_lemma_count = Counter(tokens_lemma)
freq_noun = {
    lemma: occurrences
    for lemma, occurrences in tokens_lemma_count.items()
    if occurrences > 10
}
candidate_noun = list(freq_noun)

### Context Acquisition
context level: sentence <br/>
context type: left, right (adjacent); general (non-adjacent)

In [32]:
# One independent word -> list-of-context-tokens map per context type.
left_context, right_context, general_context = (
    defaultdict(list) for _ in range(3)
)

In [None]:
# split word and their context 
def index_context(dictionary):
    """Split a word -> context mapping into two parallel lists.

    Returns a tuple ``(words, contexts)`` where ``words[k]`` is the k-th key
    of ``dictionary`` and ``contexts[k]`` is the value stored under it, both
    in the mapping's iteration order.
    """
    words = list(dictionary.keys())
    contexts = list(dictionary.values())
    return (words, contexts)

In [33]:
# Collect, for every occurrence of a candidate noun, the token directly to its
# left, the token directly to its right, and all remaining tokens of the
# sentence (the "general" context). Contexts are keyed by the surface form
# (word.text), while candidacy is decided on the lemma.

# Start from empty maps so that re-running this cell does not append a second
# copy of every context to what a previous run already collected.
left_context = defaultdict(list)
right_context = defaultdict(list)
general_context = defaultdict(list)

# candidate_noun is a list; a set makes the per-token membership test O(1).
candidate_lemmas = set(candidate_noun)

for sent in sentence:
    doc = nlp(sent)
    if len(doc) <= 1:
        # A one-token sentence offers no context at all.
        continue
    last = len(doc) - 1
    for word in doc:
        if word.pos_ != 'NOUN' or word.lemma_ not in candidate_lemmas:
            continue
        pos = word.i
        if pos > 0:
            # Sentence-initial nouns have no left neighbour.
            left_context[word.text].append(doc[pos - 1].text)
        if pos < last:
            # Sentence-final nouns have no right neighbour.
            right_context[word.text].append(doc[pos + 1].text)
        # Everything except the noun itself and its adjacent neighbours.
        general_context[word.text].extend(
            token.text for token in doc if abs(token.i - pos) > 1
        )

In [39]:
# Split each context map into parallel lists: position k of *_index holds the
# noun whose context tokens are at position k of *_text.
left_index,left_text = index_context(left_context)
right_index,right_text = index_context(right_context)
general_index, general_text = index_context(general_context)

In [35]:
# word dictionary: one gensim Dictionary per context type, built from that
# type's context token lists
left_dictionary = corpora.Dictionary(left_text)
right_dictionary = corpora.Dictionary(right_text)
general_dictionary = corpora.Dictionary(general_text)

In [36]:
# tf vector: bag-of-words representation of every noun's context, one corpus
# per context type, in the same order as the matching *_text list
left_corpus = [left_dictionary.doc2bow(text) for text in left_text]
right_corpus = [right_dictionary.doc2bow(text) for text in right_text]
general_corpus = [general_dictionary.doc2bow(text) for text in general_text]

In [37]:
# tf_idf models: fit one TF-IDF model per context type on its own
# bag-of-words corpus
left_tfidf = models.TfidfModel(left_corpus)
right_tfidf = models.TfidfModel(right_corpus)
general_tfidf = models.TfidfModel(general_corpus)

In [38]:
# tf_idf vector: apply each fitted model to every bag-of-words vector of its
# corpus and materialise the results as lists
left_corpus_tfidf = [left_tfidf[i] for i in left_corpus]
right_corpus_tfidf = [right_tfidf[i] for i in right_corpus]
general_corpus_tfidf = [general_tfidf[i] for i in general_corpus]

### Compute Pairwise Cosine Similarity 

In [44]:
def compute_similarity(corpus):
    """Rank every document of ``corpus`` against all documents in it.

    A gensim ``MatrixSimilarity`` index is built over ``corpus`` and then
    queried with each document in turn.

    Returns a dict that maps the position of the query document to a list of
    ``(position, similarity)`` pairs covering the whole corpus, ordered from
    most to least similar.
    """
    index = similarities.MatrixSimilarity(corpus)
    return {
        position: sorted(enumerate(index[vector]), key=lambda pair: -pair[1])
        for position, vector in enumerate(corpus)
    }

In [45]:
# Pairwise similarity rankings per context type. Keys are positions in the
# corresponding TF-IDF corpus, which follows the order of the *_index lists.
left_sim = compute_similarity(left_corpus_tfidf)
right_sim = compute_similarity(right_corpus_tfidf)
general_sim = compute_similarity(general_corpus_tfidf)

### Generate Output

In [52]:
# Rows of the final report; filled (and reset) by the assembly cell below.
output = []
# Minimum similarity for another noun to be reported as related; read by
# get_context().
threshold = 0.8

In [53]:
def get_context(context_list, index_list, word, min_similarity=None):
    """Return the words whose similarity to ``word`` reaches the cut-off.

    Parameters
    ----------
    context_list : iterable of (position, similarity) pairs
        Similarity of ``word`` to every entry of ``index_list``.
    index_list : list of str
        Words addressed by the positions in ``context_list``.
    word : str
        The query word; it is excluded from its own result.
    min_similarity : float, optional
        Inclusive lower bound on the similarity. When omitted, the
        notebook-level ``threshold`` is used, as before.

    Returns
    -------
    list of (word, similarity) tuples, most similar first.
    """
    if min_similarity is None:
        # Keep the old behaviour for existing three-argument calls.
        min_similarity = threshold

    neighbours = {}
    for pair in context_list:
        score = pair[1]
        if score >= min_similarity:
            neighbour = index_list[pair[0]]
            if neighbour != word:
                neighbours[neighbour] = score
    return sorted(neighbours.items(), key=operator.itemgetter(1), reverse=True)
    

In [72]:
# Assemble one report row per noun found in the general context: the nouns
# whose left, right and general contexts are similar enough to its own.

# Word -> row position in the left/right similarity results. The index lists
# come from dict keys, so each word appears at most once. This replaces a
# linear list.index() scan per word that was wrapped in a bare `except:`
# (which would also have hidden unrelated errors).
left_position = {w: pos for pos, w in enumerate(left_index)}
right_position = {w: pos for pos, w in enumerate(right_index)}

output = []
for i in range(len(general_sim)):
    word = general_index[i]

    # Key order defines the column order of the exported CSV.
    item = {
        'word': word,
        'left_context': [],
        'right_context': [],
        'general_context': [],
    }

    # A noun seen only at the start of sentences has no left context, and one
    # seen only at the end has no right context, so either may be missing.
    if word in left_position:
        item['left_context'] = get_context(left_sim[left_position[word]], left_index, word)

    if word in right_position:
        item['right_context'] = get_context(right_sim[right_position[word]], right_index, word)

    item['general_context'] = get_context(general_sim[i], general_index, word)

    output.append(item)

In [73]:
import csv

# Write one CSV row per noun. The context columns hold Python lists, which
# DictWriter stores as their string representation.
# Explicit column names keep the export working (header only) even when
# `output` is empty, instead of failing on output[0].
fieldnames = ['word', 'left_context', 'right_context', 'general_context']

# newline='' is required for files handed to the csv module; without it every
# row is followed by a blank line on Windows.
with open('paradigmatic_word_association.csv', 'w', encoding='utf-8', newline='') as f:
    w = csv.DictWriter(f, fieldnames=fieldnames)
    w.writeheader()
    w.writerows(output)