# Extracting Collocations for ELIA using Dependency Parsing

### Import dependencies

In [1]:
import os
import pickle
from time import gmtime, strftime
import os.path
import shutil
import pandas as pd

### Load Corpus

In [2]:
# In this cell, one needs to change the following variables
# 1. colDataFolder
# 2. colEvalFolder
# 3. The path following the %cd command

# Location of the Ranked Collocates .csv files
# (the copy loop further down reads from one sub-folder per collocation type, e.g. 'VERB + ADJ')
colDataFolder = r'D:\Education\vocabexpert\elia_collocations\elia_db'

# Location of the Evaluation Results .csv files
# (the copy loop further down writes into one sub-folder per reference file stem, e.g. 'v1_adj2')
colEvalFolder = r'D:\Education\vocabexpert\elia_collocations\elia_eval'

# Change path to the folder where the reference files are stored
# (%cd and %pwd are IPython magics, so this cell only runs inside IPython/Jupyter)
%cd D:\Education\vocabexpert\acl_bea_paper\eval
%pwd

# Note: The folder where all the data files are stored should also contain a folder called 'reference'.
# Basically, this folder should then have 2 folders of 'se_flax' and 'elia' which contain the reference files.
# The copy loop further down reads './reference/elia/ACL - Reference set - <file stem>.csv' relative to the %cd folder.

D:\Education\vocabexpert\acl_bea_paper\eval


'D:\\Education\\vocabexpert\\acl_bea_paper\\eval'

### Functions

In [3]:
def chkPOSTagColType(colType, word):
    """Tell whether a word's POS tag can take part in a collocation type.

    colType -- collocation-type label, e.g. 'VERB + NOUN' or 'ADV + ADJ'.
    word    -- (lemma, posTag) pair; only the tag at index 1 is inspected.

    Returns True when the tag is one of 'NN', 'VB', 'JJ', 'RB' and the
    corresponding label ('NOUN', 'VERB', 'ADJ', 'ADV') occurs in colType;
    returns False for every other combination.
    """
    # POS tag -> label that has to appear somewhere in the collocation type
    tagLabels = (
        ('NN', 'NOUN'),
        ('VB', 'VERB'),
        ('JJ', 'ADJ'),
        ('RB', 'ADV'),
    )
    return any(word[1] == tag and label in colType for tag, label in tagLabels)

In [4]:
def getWordList(wordListCSVFile):
    """Read a reference word list and return its unique (headword, POS) pairs.

    wordListCSVFile -- path (or file-like object) of a CSV file that has the
                       columns 'Headword' and 'POS'.

    Returns a sorted list of (headword, pos) tuples with duplicates removed.
    """
    frame = pd.read_csv(wordListCSVFile)

    headwords = frame['Headword'].tolist()
    tags = frame['POS'].tolist()

    # A set drops repeated rows; sorting gives a stable, alphabetical order.
    uniquePairs = set(zip(headwords, tags))
    return sorted(uniquePairs)

In [5]:
# Each pair is (reference-set file stem, ELIA collocation type). The two lists
# built from the pairs are index-aligned: entry i of one belongs to entry i of
# the other.
lstRefColTypesFileNames, lstRefColTypesElia = (
    list(column) for column in zip(
        ('n1_n2', 'NOUN + NOUN'),
        ('n2_n1', 'NOUN + NOUN'),
        ('n2_v1', 'VERB + NOUN'),
        ('n2_adj1', 'ADJ + NOUN'),
        ('v1_n2', 'VERB + NOUN'),
        ('v1_adj2', 'VERB + ADJ'),
        ('v1_adv2', 'VERB + ADV'),
        ('v2_adv1', 'VERB + ADV'),
        ('adj1_n2', 'ADJ + NOUN'),
        ('adj2_v1', 'VERB + ADJ'),
        ('adj2_adv1', 'ADV + ADJ'),
    )
)

In [15]:
# For every reference collocation type, look up the ELIA ranked-collocate file
# of each reference headword and copy it, under a new name, into the matching
# evaluation folder. Start and end timestamps (UTC) are printed around the run.
print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

# POS spellings as they look after str.capitalize(), mapped to the tags that
# chkPOSTagColType understands. Both the short labels ('adj', 'n', ...) and
# already-tagged values ('JJ', 'NN', ...) are accepted; anything else is
# passed through unchanged and will be reported as an invalid type below.
posTagMap = {
    'Adj': 'JJ', 'Jj': 'JJ',
    'Adv': 'RB', 'Rb': 'RB',
    'N': 'NN', 'Nn': 'NN',
    'V': 'VB', 'Vb': 'VB',
}

# Walk the two index-aligned lists together instead of relying on a fixed count.
for i, (colTypeFileName, colType) in enumerate(zip(lstRefColTypesFileNames, lstRefColTypesElia)):
    wordListCSVFile = './reference/elia/ACL - Reference set - ' + colTypeFileName + '.csv'
    lstEvalWords = getWordList(wordListCSVFile)  # Read from word list
    print(colTypeFileName + ' : ' , str(len(lstEvalWords)))

    colEliaData = os.path.join(colDataFolder, colType)          # source folder
    colEliaEval = os.path.join(colEvalFolder, colTypeFileName)  # destination folder

    for word in lstEvalWords:
        print(word)
        lemma = word[0].lower()
        posTag = word[1].capitalize()
        posTag = posTagMap.get(posTag, posTag)

        formattedWord = (lemma, posTag)
        # Shared suffix of the destination file name and of the log messages.
        evalLabel = lemma + '_' + posTag + '_' + colTypeFileName + '+' + colType

        # Check if the collocation type is valid for that word
        if not chkPOSTagColType(colType, formattedWord):
            print('This word doesnt have any collocations generated as the collocation type is invalid - ' + str(i) + ' ' + evalLabel)
            continue

        if not os.path.exists(colEliaData):
            print('colEliaData folder doesnot exist')
            continue

        # Find the file, rename it and move it to the folder location for evaluation
        oldFname = os.path.join(colEliaData, lemma + '_' + posTag + '_' + colType + '.csv')
        if not os.path.isfile(oldFname):
            print('This word doesnt have any collocation file generated! - ' + str(i) + ' ' + evalLabel)
            continue

        newFname = os.path.join(colEliaEval, evalLabel + '.csv')
        # Create the destination folder only once there is something to copy.
        os.makedirs(colEliaEval, exist_ok=True)
        shutil.copy(oldFname, newFname)

print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

2018-05-21 15:06:16
v1_adj2 :  8
('become', 'v')
('consider', 'v')
('deem', 'v')
('get', 'v')
('make', 'v')
('prove', 'v')
('remain', 'v')
('seem', 'v')
2018-05-21 15:06:16
