## Web Scraping - FLAX

In [1]:
from bs4 import BeautifulSoup, NavigableString
# from urllib2 import urlopen
from urllib.request import urlopen

import requests
from lxml import html
from xml.etree import cElementTree as ET

import xml, json
import pandas as pd

import os
import time
from time import gmtime, strftime

import spacy
nlp = spacy.load('en_core_web_lg')

import itertools
from operator import itemgetter
from itertools import groupby
from pprint import pprint

In [2]:
# Download all the files in the 'code' folder and place it in the present working directory
%cd r'D:\Education\vocabexpert\acl_bea_paper\code'
%pwd

In [4]:
# Clone the CLIPS Pattern repository into ./pattern unless that directory already exists.
if not os.path.isdir('./pattern'):
    !git clone 'https://github.com/clips/pattern.git'
    
# Note: for Python 3.5+, download the pattern folder manually from the development branch of
# the GitHub repository (https://github.com/clips/pattern.git).

In [5]:
import sys

# Full path of the directory the CLIPS-Pattern repository was cloned into
# (current directory + 'pattern'); change it to match the local checkout.
# A relative './pattern' was the earlier alternative.
MODULE = r'D:\Education\vocabexpert\acl_bea_paper\code\pattern'

# Register the checkout on the import path once, even if this cell is re-run.
if sys.path.count(MODULE) == 0:
    sys.path.append(MODULE)

In [6]:
# Notebook adaptation of the script idiom
#   import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", ".."))
# with MODULE used in place of __file__: the directory two levels above MODULE's parent
# is placed first on sys.path before pattern.en is imported.
import os, sys; sys.path.insert(0, os.path.join(os.path.dirname(MODULE), "..", ".."))

from pattern.en import article, referenced
from pattern.en import pluralize, singularize
from pattern.en import comparative, superlative
from pattern.en import conjugate, lemma, lexeme, tenses
from pattern.en import NOUN, VERB, ADJECTIVE, ADVERB

# The en module has a range of tools for word inflection:
# pluralization and singularization, comparative and superlative adjectives, verb conjugation.


In [7]:
def mapFlaxToClipsPatternsPOSTags(tag):
    """Translate a single-letter FLAX part-of-speech tag into the CLIPS Pattern constant.

    Parameters
    ----------
    tag : the FLAX tag; 'n', 'v', 'a' and 'r' are recognised.

    Returns
    -------
    VERB for 'v', ADJECTIVE for 'a', ADVERB for 'r', and NOUN for 'n' as well as
    for every other value (NOUN is the fallback).
    """
    # Compare with ==, not `in`: `tag in 'v'` is a substring test, which is true for
    # the empty string and raises TypeError for non-string tags such as NaN.
    if tag == 'v':
        return VERB
    if tag == 'a':
        return ADJECTIVE
    if tag == 'r':
        return ADVERB
    # 'n' and anything unrecognised are treated as nouns.
    return NOUN

In [8]:
def getClipsPatterns(lemma, posTag):
    """Return the inflected forms of `lemma` produced by CLIPS Pattern.

    `posTag` is a FLAX tag; it is first converted with
    mapFlaxToClipsPatternsPOSTags. The result depends on the converted tag:

    * VERB      -> whatever lexeme(lemma) returns
    * ADJECTIVE -> [lemma, comparative(lemma), superlative(lemma)]
    * otherwise -> [lemma, pluralize(lemma, pos=<converted tag>)]
    """
    patternTag = mapFlaxToClipsPatternsPOSTags(posTag)

    # Verb conjugation: the lexeme list is returned as-is.
    if patternTag == VERB:
        return lexeme(lemma)

    # Adjectives: base form followed by comparative and superlative.
    if patternTag == ADJECTIVE:
        return [lemma, comparative(lemma), superlative(lemma)]

    # Everything else: base form followed by its plural.
    return [lemma, pluralize(lemma, pos=patternTag)]

In [9]:
def mapSpacyFlaxPOSTags(tag):
    """Collapse a fine-grained tag into the single-letter FLAX tag.

    Noun tags map to 'n', verb tags to 'v', adjective tags to 'a' and adverb
    tags to 'r'. A tag outside these groups is returned unchanged.
    """
    tagGroups = (
        (('NN', 'NNS', 'NNP', 'NNPS'), 'n'),
        (('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'), 'v'),
        (('JJ', 'JJR', 'JJS'), 'a'),
        (('RB', 'RBR', 'RBS'), 'r'),
    )
    for fineTags, flaxTag in tagGroups:
        if tag in fineTags:
            return flaxTag
    # No group matched: pass the tag through untouched.
    return tag

In [10]:
def getWordForms(lemma, posTag):
    """Pair `lemma` with each form returned by getClipsPatterns(lemma, posTag).

    Returns a list of (lemma, form) tuples in the order the forms were produced.
    """
    return [(lemma, form) for form in getClipsPatterns(lemma, posTag)]

In [12]:
def _sumFlaxFrequencies(lstChecked):
    """Merge 6-tuples that share their first five fields, summing the sixth (frequency).

    Returns the merged 6-tuples sorted by summed frequency, highest first.
    """
    grouper = itemgetter(0, 1, 2, 3, 4)
    lstMerged = [key + (sum(item[5] for item in grp),)
                 for key, grp in groupby(sorted(lstChecked, key=grouper), grouper)]
    return sorted(lstMerged, key=lambda x: x[5], reverse=True)


def getFlaxCollocations(lemma, posTag, colType, colPosTag):
    """Fetch collocations of `lemma` from the FLAX collocation service (BAWE database).

    One request is sent for every word form returned by getWordForms(lemma, posTag);
    every <collo> element of each XML response yields one record in each result list.

    Parameters
    ----------
    lemma : head word lemma.
    posTag : FLAX tag of the head word; sent as `s1.wordType`.
    colType : string of the form '<head>-<type>'; the part after the first '-' is
        sent as `s1.colloType`. Raises IndexError when it contains no '-'.
    colPosTag : FLAX tag used to pick the collocate token out of each collocation.

    Returns
    -------
    (lstFlaxCols, lstFlaxColsLemPosCollocates, lstFlaxColFiltered)
        lstFlaxCols
            (lemma, head form, posTag, collocation text, colloType, frequency).
        lstFlaxColsLemPosCollocates
            (lemma, head form, posTag) + (lemma, token, tag) for the first five
            collocate tokens, padded with '' + (colloType, frequency); 20 fields.
        lstFlaxColFiltered
            (lemma, posTag, collocate lemma, collocate tag, colloType, frequency)
            with frequencies summed over identical records, highest first.
    """
    lstFlaxCols = []
    lstFlaxColsLemPosCollocates = []
    lstFlaxColChecked = []
    lstWordForms = getWordForms(lemma, posTag)
    colTypeQuery = colType.split('-')[1]

    # URL to get the requests
    strBaseURL = 'http://flax.nzdl.org/greenstone3/flax?a=pr&rt=r&ro=1&o=xml&c=collocations&s=CollocationSampleRetrieve&s1.dbName=BAWE&s1.colloType='
    query = '&s1.query='
    wordTag = '&s1.wordType='

    # A single session serves all word forms so the connection can be reused.
    with requests.Session() as session:
        for word in lstWordForms:
            requestUrl = strBaseURL + colTypeQuery + query + word[1] + wordTag + posTag
            respObj = ET.fromstring(session.get(requestUrl).text)

            # Element.iter() replaces getiterator(), which was removed in Python 3.9.
            for data in respObj.iter('collo'):
                headForm = data.get('word')
                colloType = data.get('colloType')
                fre = data.get('fre')
                strCollocates = data.find('text').text
                lstFlaxCols.append((lemma, headForm, posTag, strCollocates, colloType, fre))

                # Drop the queried head-word form, keeping only the collocate words.
                strCollocates = ' '.join(
                    w for w in strCollocates.split(' ') if w != word[1]).strip()

                doc = nlp(strCollocates)
                collocates = ()
                colFiltered = None
                for token in doc:
                    collocates += (token.lemma_, token, token.tag_)
                    tagFL = mapSpacyFlaxPOSTags(token.tag_)
                    # When several tokens carry the wanted tag, the last one wins.
                    if tagFL == colPosTag:
                        colFiltered = (token.lemma_, tagFL)

                if colFiltered is None:
                    # No token has the wanted tag: fall back to all lemmas / all tags.
                    colFiltered = (
                        ' '.join(token.lemma_ for token in doc).strip(),
                        ' '.join(mapSpacyFlaxPOSTags(token.tag_) for token in doc).strip(),
                    )

                # Exactly five (lemma, token, tag) triples: pad short ones, cut long ones.
                collocates = (collocates + ('',) * 15)[:15]

                lstFlaxColsLemPosCollocates.append(
                    (lemma, headForm, posTag) + collocates + (colloType, fre))
                lstFlaxColChecked.append(
                    (lemma, posTag, colFiltered[0], colFiltered[1], colloType, int(fre)))

    lstFlaxColFiltered = _sumFlaxFrequencies(lstFlaxColChecked)

    return lstFlaxCols, lstFlaxColsLemPosCollocates, lstFlaxColFiltered

In [14]:
# Folder in which all collocation data files will be stored; change to suit the local setup.
colFolder = r'D:\Education\vocabexpert\acl_bea_paper\eval\collocations\flax' 

# Folder holding the reference files; change to suit the local setup.
# It should contain two sub-folders, 'se_flax' and 'elia', with the reference files.
%cd D:\Education\vocabexpert\acl_bea_paper\eval\reference
%pwd


D:\Education\vocabexpert\acl_bea_paper\eval\reference


'D:\\Education\\vocabexpert\\acl_bea_paper\\eval\\reference'

In [15]:
def getWordList(wordListCSVFile):
    """Read the (Headword, POS) pairs of a reference word-list CSV.

    The CSV must have 'Headword' and 'POS' columns. Duplicate pairs are
    removed and the remaining pairs are returned as a sorted list of tuples.
    """
    frame = pd.read_csv(wordListCSVFile)
    uniquePairs = set(zip(frame['Headword'].tolist(), frame['POS'].tolist()))
    return sorted(uniquePairs)

In [16]:
# One row per reference collocation type:
# (reference file name, FLAX collocation type, FLAX tag of the collocate).
lstRefColTypesFileNames, lstRefColTypesFLAX, lstRefColPosTag = map(list, zip(
    ('n1_n2',     'n-nn',   'n'),
    ('n2_n1',     'n-nn',   'n'),
    ('n2_v1',     'n-vn',   'v'),
    ('n2_adj1',   'n-an',   'a'),
    ('v1_n2',     'v-vn',   'n'),
    ('v1_adj2',   'v-vppa', 'a'),
    ('v1_adv2',   'v-vr',   'r'),
    ('v2_adv1',   'v-rv',   'r'),
    ('adj1_n2',   'a-an',   'n'),
    ('adj2_v1',   'a-vppa', 'v'),
    ('adj2_adv1', 'a-ra',   'r'),
))

In [180]:
# Scrape FLAX for every head word of every reference collocation type and write three
# CSV files per head word (raw, per-token lemma/tag, filtered and aggregated) into
# <colFolder>/<reference file name>/. Start and end times (UTC) are printed.
print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

# Column layouts of the three CSV files.
headers = ['HeadWord_Lemma', 'HeadWord_Form', 'HeadWord_Lempos', 'Collocate_Words', 'Collo_Type', 'Frequency']
headersColLemPos = ['HeadWord_Lemma', 'HeadWord_Form', 'HeadWord_Lempos', 'ColWord_Lemma1', 'ColWord1', 'ColWord_Lempos1',
                    'ColWord_Lemma2', 'ColWord2', 'ColWord_Lempos2', 'ColWord_Lemma3', 'ColWord3', 'ColWord_Lempos3',
                    'ColWord_Lemma4', 'ColWord4', 'ColWord_Lempos4', 'ColWord_Lemma5', 'ColWord5', 'ColWord_Lempos5',
                    'Collo_Type', 'Frequency']
headersColFiltered = ['HeadWord_Lemma', 'HeadWord_Lempos', 'ColWord_Lemma', 'ColWord_Lempos', 'Collo_Type', 'Frequency']

# Use time delay (sleep) as there is a limit on the number of hits.
# Iterating the three parallel lists together avoids hard-coding their length.
for i, (colTypeFileName, colType, colPosTag) in enumerate(
        zip(lstRefColTypesFileNames, lstRefColTypesFLAX, lstRefColPosTag)):

    wordListCSVFile = './se_flax/ACL - Reference set - ' + colTypeFileName + '.csv'
    lstWords = getWordList(wordListCSVFile) #Read from word list
    print(colTypeFileName + ' : ' , str(len(lstWords)))

    colFlax = os.path.join(colFolder, colTypeFileName)

    for word in lstWords:
        print(word)
        lemma = word[0].lower()
        posTag = word[1]
        # The reference files spell the adjective tag 'adj'; FLAX expects 'a'.
        if posTag == 'adj':
            posTag = 'a'

        lstFlaxCols, lstFlaxColsLemPosCollocates, lstFlaxColFiltered = getFlaxCollocations(lemma, posTag, colType, colPosTag)

        if len(lstFlaxCols) == 0:
            print('This word doesnt have any collocation type generated! - ' + str(i) + ' ' + lemma + '_' + posTag + '_' + colTypeFileName + '+' + colType)
            continue

        # The output folder is only created once there is something to write.
        final_directory = colFlax
        os.makedirs(final_directory, exist_ok=True)

        for label, outHeaders, rows in (('Raw', headers, lstFlaxCols),
                                        ('ColWordsLemPos', headersColLemPos, lstFlaxColsLemPosCollocates),
                                        ('ColWordsFiltered', headersColFiltered, lstFlaxColFiltered)):
            fname = os.path.join(final_directory,
                                 lemma + '_' + posTag + '_' + label + '_' + colTypeFileName + '+' + colType + '.csv')
            df = pd.DataFrame(rows, columns=outHeaders)
            df.to_csv(fname, index=False)

print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

2018-04-26 12:48:39
n1_n2 :  39
('assessment', 'n')
('background', 'n')
('business', 'n')
('career', 'n')
('class', 'n')
('climate', 'n')
('conflict', 'n')
('data', 'n')
('employment', 'n')
('expert', 'n')
('field', 'n')
('gender', 'n')
('government', 'n')
('information', 'n')
('internet', 'n')
('learning', 'n')
('medium', 'n')
('minority', 'n')
('opinion', 'n')
('party', 'n')
('peace', 'n')
('pilot', 'n')
('planning', 'n')
('problem', 'n')
('research', 'n')
('risk', 'n')
('security', 'n')
('service', 'n')
('source', 'n')
('state', 'n')
('stress', 'n')
('survey', 'n')
('target', 'n')
('teaching', 'n')
('test', 'n')
('thinking', 'n')
('thought', 'n')
('transport', 'n')
('welfare', 'n')
n2_n1 :  52
('access', 'n')
('area', 'n')
('assessment', 'n')
('audience', 'n')
('change', 'n')
('consciousness', 'n')
('control', 'n')
('coverage', 'n')
('data', 'n')
('department', 'n')
('development', 'n')
('difficulty', 'n')
('effort', 'n')
('environment', 'n')
('equality', 'n')
('evidence', 'n')
('ex

('inform', 'v')
('integrate', 'v')
('intend', 'v')
('involve', 'v')
('know', 'v')
('link', 'v')
('mention', 'v')
('motivate', 'v')
('oppose', 'v')
('populate', 'v')
('read', 'v')
('realize', 'v')
This word doesnt have any collocation type generated! - 7 realize_v_v2_adv1+v-rv
('receive', 'v')
('recognize', 'v')
('reduce', 'v')
('refer', 'v')
('regard', 'v')
('relate', 'v')
('remove', 'v')
('resemble', 'v')
('root', 'v')
('select', 'v')
('share', 'v')
('structure', 'v')
('suggest', 'v')
('suit', 'v')
('think', 'v')
('tie', 'v')
('understand', 'v')
('use', 'v')
('value', 'v')
adj1_n2 :  416
('abstract', 'adj')
('academic', 'adj')
('acceptable', 'adj')
('accurate', 'adj')
('active', 'adj')
('additional', 'adj')
('administrative', 'adj')
('advanced', 'adj')
('adverse', 'adj')
('alternative', 'adj')
('ample', 'adj')
('analytical', 'adj')
('anecdotal', 'adj')
('annual', 'adj')
('appropriate', 'adj')
('armed', 'adj')
('artificial', 'adj')
('associated', 'adj')
('atomic', 'adj')
('available', 