## Evaluation for Collocations

In [1]:
import os
import pandas as pd
import time
from time import gmtime, strftime

In [2]:
# Change the working directory to the folder where all reference files are stored.
# That folder must contain two sub-folders, 'se_flax' and 'elia', holding the reference CSV files:
# the evaluation cells below open them through relative paths ('./se_flax/...' and './elia/...').
%cd D:\Education\vocabexpert\acl_bea_paper\eval\reference
%pwd

D:\Education\vocabexpert\acl_bea_paper\eval\reference


'D:\\Education\\vocabexpert\\acl_bea_paper\\eval\\reference'

In [3]:
def getWordList(wordListCSVFile):
    """Load the reference collocations from a CSV file.

    Columns are selected by position (0-based), not by header text:
    4 = headword, 5 = headword POS, 7 = collocate, 8 = collocate POS,
    3 = '# Collocates' (the per-headword top-N cut-off).

    Parameters
    ----------
    wordListCSVFile : str
        Path of the reference-set CSV, read with ``pd.read_csv``.

    Returns
    -------
    list of tuple
        Unique ``(headword, headwordPOS, collocate, collocatePOS, topCollocates)``
        tuples in ascending sorted order.
    """
    # Output order of the tuple fields, expressed as CSV column positions.
    columnOrder = (4, 5, 7, 8, 3)

    frame = pd.read_csv(wordListCSVFile)
    selected = [frame[frame.columns[pos]].tolist() for pos in columnOrder]

    # zip(*...) turns the five column lists into one tuple per CSV row;
    # the set removes repeated rows before sorting.
    uniqueRows = set(zip(*selected))
    return sorted(uniqueRows)

## Sketch Engine

In [4]:
# Internal mapping of collocation types.
# Each pair is (reference-file collocation type, Sketch Engine relation name); the two
# resulting lists are index-aligned and are looked up with the same index by the evaluation cells.
lstRefColTypesFileNames, lstRefColTypesSE = map(list, zip(
    ('n1_n2', 'modifies'),
    ('n2_n1', 'modifier'),
    ('n2_v1', 'object_of'),
    ('n2_adj1', 'modifier'),
    ('v1_n2', 'object'),
    ('v1_adj2', 'adj_comp'),
    ('v1_adv2', 'modifier'),
    ('v2_adv1', 'modifier'),
    ('adj1_n2', 'modifies'),
    ('adj2_v1', 'adj_comp_of'),
    ('adj2_adv1', 'modifier'),
))

# File format of the generated CSVs.
headersSERef = [
    'ColType_Original',
    'ColType_SketchEngine',
    'HeadWord_Lemma',
    'Headword_Pos',
    'CollocateWord_Lemma',
    'CollocateWord_Pos',
    'Pair_Exists',
    'Count',
    'Score',
]
# The test files use the same columns, minus the present/absent flag.
headersSETest = [header for header in headersSERef if header != 'Pair_Exists']

# Change path to the folder where all collocation data files are stored
colFolder = r'D:\Education\vocabexpert\acl_bea_paper\eval\collocations\sketchengine'

# Naming convention for the generated files
fnameSketchEngine = 'SE_'
fnameRef, fnameTest, fnameTop = 'ref', 'test', 'top'
fname10, fname20 = '10', '20'

# Path to the folder where all the evaluation files will be generated
compFolder = r'D:\Education\vocabexpert\acl_bea_paper\results\sketchengine'

In [5]:
print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

# Sketch Engine evaluation.
# For every collocation type this cell:
#   1. loads the reference pairs for that type (getWordList),
#   2. looks each pair up in the headword's Sketch Engine CSV ('present' / 'absent'),
#   3. collects the Sketch Engine candidates ("test" rows), overall and cut at
#      top-N / top-10 / top-20 per headword,
#   4. re-checks every reference pair against the three cut lists,
#   5. writes all eight result lists as CSV files into compFolder.
for i in range(len(lstRefColTypesFileNames)):
    colTypeFileName = lstRefColTypesFileNames[i]
    colType = lstRefColTypesSE[i]

    wordListCSVFile = './se_flax/ACL - Reference set - ' + colTypeFileName + '.csv'
    lstHeadColWords = getWordList(wordListCSVFile) # Get collocations from reference file
    print(colTypeFileName + ' : ' , str(len(lstHeadColWords)))

    colSketchEngine = os.path.join(colFolder, colTypeFileName)
    final_directory = colSketchEngine

    # e.g. 'n1_n2' -> 'n1n2'; used as the suffix of the generated file names
    fnameColType = colTypeFileName.split('_')[0] + colTypeFileName.split('_')[1]

    lstEvalSketchEngineTest = []
    lstEvalSketchEngineTestTopRanked = []
    lstEvalSketchEngineTestTop10Ranked = []
    lstEvalSketchEngineTestTop20Ranked = []

    lstEvalSketchEngineRef = []
    lstEvalSketchEngineRefTopRanked = []
    lstEvalSketchEngineRefTop10Ranked = []
    lstEvalSketchEngineRefTop20Ranked = []

    if os.path.exists(final_directory):
        for headwordCol in lstHeadColWords:
            headwordLemma = headwordCol[0]
            headwordPosTag = headwordCol[1].capitalize()
            collocateLemma = headwordCol[2]
            colPosTag = headwordCol[3].capitalize()
            # Map the reference POS labels onto the tags compared against the
            # part after '-' in the Sketch Engine lempos columns.
            if headwordPosTag == 'Adj':
                headwordPosTag = 'J'

            if colPosTag == 'Adj':
                colPosTag = 'J'
            elif colPosTag == 'Adv':
                colPosTag = 'R'

            # Per-headword cut-off for the "top" ranking ('# Collocates' column)
            topCollocate = int(headwordCol[4])

            fname = final_directory + '/' + headwordLemma + '_' + headwordPosTag + '_' + colTypeFileName + '+' + colType + '.csv'

            if os.path.isfile(fname):
                df = pd.read_csv(fname)
                lstSketchEngineCols = [tuple(x) for x in df.values]
            else:
                # FIX: a missing file means "no candidates". Previously the list was
                # left untouched here, so the test loop below either raised NameError
                # (fresh kernel) or re-used rows from a previously read file.
                lstSketchEngineCols = []

            # Reference: default to 'absent', upgrade to 'present' on the first matching row.
            # row[5] / row[6] are stored under the 'Count' / 'Score' headers.
            lstHeadwordColRef = (colTypeFileName, colType, headwordLemma, headwordPosTag, collocateLemma, colPosTag, 'absent', 0, 0)
            for row in lstSketchEngineCols:
                if row[0] == headwordLemma and row[1].split('-')[1] == headwordPosTag and row[2] == collocateLemma and row[3].split('-')[1] == colPosTag:
                    lstHeadwordColRef = (colTypeFileName, colType, headwordLemma, headwordPosTag, collocateLemma, colPosTag, 'present', row[5], row[6])
                    break

            lstEvalSketchEngineRef.append(lstHeadwordColRef)

            # Test: every candidate of this headword with the expected collocate POS.
            # The counters restart for each reference pair, but rows already collected
            # are skipped, so a candidate is ranked only the first time it is seen.
            j = 0
            k = 0
            l = 0
            for row in lstSketchEngineCols:
                if row[0] == headwordLemma and row[1].split('-')[1] == headwordPosTag and row[3].split('-')[1] == colPosTag:
                    lstHeadwordColTest = (colTypeFileName, colType, headwordLemma, headwordPosTag, row[2], colPosTag, row[5], row[6])
                    if lstHeadwordColTest not in lstEvalSketchEngineTest:
                        lstEvalSketchEngineTest.append(lstHeadwordColTest)
                        # Test - Top Candidate Ranking
                        if j < topCollocate:
                            lstEvalSketchEngineTestTopRanked.append(lstHeadwordColTest)
                            j+=1

                        # Test - Top 10 Candidate Ranking
                        if k < 10:
                            lstEvalSketchEngineTestTop10Ranked.append(lstHeadwordColTest)
                            k+=1

                        # Test - Top 20 Candidate Ranking
                        if l < 20:
                            lstEvalSketchEngineTestTop20Ranked.append(lstHeadwordColTest)
                            l+=1

    # Write the reference list and the four test lists.
    for fnameSuffix, lstRows, headers in (
        (fnameRef, lstEvalSketchEngineRef, headersSERef),
        (fnameTest, lstEvalSketchEngineTest, headersSETest),
        (fnameTest + fnameTop, lstEvalSketchEngineTestTopRanked, headersSETest),
        (fnameTest + fnameTop + fname10, lstEvalSketchEngineTestTop10Ranked, headersSETest),
        (fnameTest + fnameTop + fname20, lstEvalSketchEngineTestTop20Ranked, headersSETest),
    ):
        fname = compFolder + '/' + fnameSketchEngine + fnameSuffix + '_' + str(i+1) + '_' + fnameColType + '.csv'
        df = pd.DataFrame(lstRows, columns=headers)
        df.to_csv(fname, index=False)

    # Ref - Top Candidate Ranking: is each reference pair among the top-N / top-10 / top-20 candidates?
    for headwordCol in lstHeadColWords:
        headwordLemma = headwordCol[0]
        headwordPosTag = headwordCol[1].capitalize()
        collocateLemma = headwordCol[2]
        colPosTag = headwordCol[3].capitalize()
        if headwordPosTag == 'Adj':
            headwordPosTag = 'J'

        if colPosTag == 'Adj':
            colPosTag = 'J'
        elif colPosTag == 'Adv':
            colPosTag = 'R'

        for lstRankedTest, lstRankedRef in (
            (lstEvalSketchEngineTestTopRanked, lstEvalSketchEngineRefTopRanked),
            (lstEvalSketchEngineTestTop10Ranked, lstEvalSketchEngineRefTop10Ranked),
            (lstEvalSketchEngineTestTop20Ranked, lstEvalSketchEngineRefTop20Ranked),
        ):
            lstHeadwordColRefTop = (colTypeFileName, colType, headwordLemma, headwordPosTag, collocateLemma, colPosTag, 'absent', 0, 0)
            for row in lstRankedTest:
                # Test rows are laid out as headersSETest: lemma/POS fields at 2..5, Count/Score at 6..7
                if row[2] == headwordLemma and row[3] == headwordPosTag and row[4] == collocateLemma and row[5] == colPosTag:
                    lstHeadwordColRefTop = (colTypeFileName, colType, headwordLemma, headwordPosTag, collocateLemma, colPosTag, 'present', row[6], row[7])
                    break

            lstRankedRef.append(lstHeadwordColRefTop)

    # Write the three ranked reference lists.
    for fnameSuffix, lstRows in (
        (fnameRef + fnameTop, lstEvalSketchEngineRefTopRanked),
        (fnameRef + fnameTop + fname10, lstEvalSketchEngineRefTop10Ranked),
        (fnameRef + fnameTop + fname20, lstEvalSketchEngineRefTop20Ranked),
    ):
        fname = compFolder + '/' + fnameSketchEngine + fnameSuffix + '_' + str(i+1) + '_' + fnameColType + '.csv'
        df = pd.DataFrame(lstRows, columns=headersSERef)
        df.to_csv(fname, index=False)

print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

2018-05-18 10:24:18
n1_n2 :  62
n2_n1 :  62
n2_v1 :  306
n2_adj1 :  1769
v1_n2 :  306
v1_adj2 :  30
v1_adv2 :  29
v2_adv1 :  139
adj1_n2 :  1769
adj2_v1 :  30
adj2_adv1 :  124
2018-05-18 10:26:22


### Merge collocates listed under two Sketch Engine relations (e.g. `adj_comp` and `np_adj_comp`) into one entry, keeping the higher of the two scores.

In [6]:
print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

# Re-run the Sketch Engine evaluation for collocation types 5 and 9 (v1_adj2, adj2_v1).
# For these types a headword can have a second CSV for the 'np_'-prefixed relation; when it
# exists, both files are combined and each (headword, collocate) pair keeps its highest-'Score' row.
# The result files use the same names as the plain Sketch Engine run, so they overwrite it.
for i in [5,9]:

    colTypeFileName = lstRefColTypesFileNames[i]
    colType = lstRefColTypesSE[i]

    wordListCSVFile = './se_flax/ACL - Reference set - ' + colTypeFileName + '.csv'
    lstHeadColWords = getWordList(wordListCSVFile) # Get collocations from reference file
    print(colTypeFileName + ' : ' , str(len(lstHeadColWords)))

    colSketchEngine = os.path.join(colFolder, colTypeFileName)
    final_directory = colSketchEngine

    fnameColType = colTypeFileName.split('_')[0] + colTypeFileName.split('_')[1]

    lstEvalSEMergedRef = []
    lstEvalSEMergedRefTopRanked = []
    lstEvalSEMergedRefTop10Ranked = []
    lstEvalSEMergedRefTop20Ranked = []

    lstEvalSEMergedTest = []
    lstEvalSEMergedTestTopRanked = []
    lstEvalSEMergedTestTop10Ranked = []
    lstEvalSEMergedTestTop20Ranked = []

    # Name of the second relation whose file is merged in (falls back to colType for other indices).
    tempColType = {5: 'np_adj_comp', 9: 'np_adj_comp_of'}.get(i, colType)

    if os.path.exists(final_directory):
        for headwordCol in lstHeadColWords:
            headwordLemma = headwordCol[0]
            headwordPosTag = headwordCol[1].capitalize()
            collocateLemma = headwordCol[2]
            colPosTag = headwordCol[3].capitalize()
            if headwordPosTag == 'Adj':
                headwordPosTag = 'J'

            if colPosTag == 'Adj':
                colPosTag = 'J'
            elif colPosTag == 'Adv':
                colPosTag = 'R'

            # FIX: this cut-off was never assigned in this cell, so the ranking below
            # silently used whatever value an earlier cell had left in the kernel.
            topCollocate = int(headwordCol[4])

            fname = final_directory + '/' + headwordLemma + '_' + headwordPosTag + '_' + colTypeFileName + '+' + colType + '.csv'

            if os.path.isfile(fname):
                df = pd.read_csv(fname)
                lstSketchEngineCols = [tuple(x) for x in df.values]

                fname1 = final_directory + '/' + headwordLemma + '_' + headwordPosTag + '_' + colTypeFileName + '+' + tempColType + '.csv'

                if os.path.isfile(fname1):
                    df1 = pd.read_csv(fname1)
                    frames = [df, df1]
                    dfCombined = pd.concat(frames)
                    # Sorting by descending Score first makes drop_duplicates keep the best-scoring row per pair.
                    dfMax = dfCombined.sort_values('Score', ascending=False).drop_duplicates(['Headword_Lempos', 'CollocateWord_Lempos'])

                    lstSketchEngineCols = [tuple(x) for x in dfMax.values]
            else:
                # FIX: pairs whose headword file is missing used to be dropped from the
                # reference list altogether, although the ranked reference lists written by
                # this same cell report them as 'absent'. Treat a missing file as
                # "no candidates" so the pair is recorded as 'absent' here too.
                # NOTE(review): a lone 'np_' file (without the primary one) is still ignored — confirm intended.
                lstSketchEngineCols = []

            # Reference: default to 'absent', upgrade to 'present' on the first matching row.
            lstHeadwordColRef = (colTypeFileName, colType, headwordLemma, headwordPosTag, collocateLemma, colPosTag, 'absent', 0, 0)
            for row in lstSketchEngineCols:
                if row[0] == headwordLemma and row[1].split('-')[1] == headwordPosTag and row[2] == collocateLemma and row[3].split('-')[1] == colPosTag:
                    lstHeadwordColRef = (colTypeFileName, colType, headwordLemma, headwordPosTag, collocateLemma, colPosTag, 'present', row[5], row[6])
                    break

            lstEvalSEMergedRef.append(lstHeadwordColRef)

            # Test: candidates of this headword with the expected collocate POS;
            # rows already collected are skipped.
            j = 0
            k = 0
            l = 0
            for row in lstSketchEngineCols:
                if row[0] == headwordLemma and row[1].split('-')[1] == headwordPosTag and row[3].split('-')[1] == colPosTag:
                    lstHeadwordColTest = (colTypeFileName, colType, headwordLemma, headwordPosTag, row[2], colPosTag, row[5], row[6])
                    if lstHeadwordColTest not in lstEvalSEMergedTest:
                        lstEvalSEMergedTest.append(lstHeadwordColTest)
                        # Test - Top Candidate Ranking
                        if j < topCollocate:
                            lstEvalSEMergedTestTopRanked.append(lstHeadwordColTest)
                            j+=1

                        # Test - Top 10 Candidate Ranking
                        if k < 10:
                            lstEvalSEMergedTestTop10Ranked.append(lstHeadwordColTest)
                            k+=1

                        # Test - Top 20 Candidate Ranking
                        if l < 20:
                            lstEvalSEMergedTestTop20Ranked.append(lstHeadwordColTest)
                            l+=1

    # Write the reference list and the four test lists.
    for fnameSuffix, lstRows, headers in (
        (fnameRef, lstEvalSEMergedRef, headersSERef),
        (fnameTest, lstEvalSEMergedTest, headersSETest),
        (fnameTest + fnameTop, lstEvalSEMergedTestTopRanked, headersSETest),
        (fnameTest + fnameTop + fname10, lstEvalSEMergedTestTop10Ranked, headersSETest),
        (fnameTest + fnameTop + fname20, lstEvalSEMergedTestTop20Ranked, headersSETest),
    ):
        fname = compFolder + '/' + fnameSketchEngine + fnameSuffix + '_' + str(i+1) + '_' + fnameColType + '.csv'
        df = pd.DataFrame(lstRows, columns=headers)
        df.to_csv(fname, index=False)

    # Ref - Top Candidate Ranking: is each reference pair among the top-N / top-10 / top-20 candidates?
    for headwordCol in lstHeadColWords:
        headwordLemma = headwordCol[0]
        headwordPosTag = headwordCol[1].capitalize()
        collocateLemma = headwordCol[2]
        colPosTag = headwordCol[3].capitalize()
        if headwordPosTag == 'Adj':
            headwordPosTag = 'J'

        if colPosTag == 'Adj':
            colPosTag = 'J'
        elif colPosTag == 'Adv':
            colPosTag = 'R'

        for lstRankedTest, lstRankedRef in (
            (lstEvalSEMergedTestTopRanked, lstEvalSEMergedRefTopRanked),
            (lstEvalSEMergedTestTop10Ranked, lstEvalSEMergedRefTop10Ranked),
            (lstEvalSEMergedTestTop20Ranked, lstEvalSEMergedRefTop20Ranked),
        ):
            lstHeadwordColRefTop = (colTypeFileName, colType, headwordLemma, headwordPosTag, collocateLemma, colPosTag, 'absent', 0, 0)
            for row in lstRankedTest:
                if row[2] == headwordLemma and row[3] == headwordPosTag and row[4] == collocateLemma and row[5] == colPosTag:
                    lstHeadwordColRefTop = (colTypeFileName, colType, headwordLemma, headwordPosTag, collocateLemma, colPosTag, 'present', row[6], row[7])
                    break

            lstRankedRef.append(lstHeadwordColRefTop)

    # Write the three ranked reference lists.
    for fnameSuffix, lstRows in (
        (fnameRef + fnameTop, lstEvalSEMergedRefTopRanked),
        (fnameRef + fnameTop + fname10, lstEvalSEMergedRefTop10Ranked),
        (fnameRef + fnameTop + fname20, lstEvalSEMergedRefTop20Ranked),
    ):
        fname = compFolder + '/' + fnameSketchEngine + fnameSuffix + '_' + str(i+1) + '_' + fnameColType + '.csv'
        df = pd.DataFrame(lstRows, columns=headersSERef)
        df.to_csv(fname, index=False)


print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

2018-05-18 10:29:56
v1_adj2 :  30
adj2_v1 :  30
2018-05-18 10:29:56


## FLAX

In [7]:
# Internal mapping of collocation types.
# Each triple is (reference-file collocation type, FLAX collocation type, POS tag);
# unpacking yields three index-aligned lists.
lstRefColTypesFileNames, lstRefColTypesFLAX, lstRefColPosTag = map(list, zip(
    ('n1_n2', 'n-nn', 'n'),
    ('n2_n1', 'n-nn', 'n'),
    ('n2_v1', 'n-vn', 'v'),
    ('n2_adj1', 'n-an', 'a'),
    ('v1_n2', 'v-vn', 'n'),
    ('v1_adj2', 'v-vppa', 'a'),
    ('v1_adv2', 'v-vr', 'r'),
    ('v2_adv1', 'v-rv', 'r'),
    ('adj1_n2', 'a-an', 'n'),
    ('adj2_v1', 'a-vppa', 'v'),
    ('adj2_adv1', 'a-ra', 'r'),
))

# File format of the generated CSVs.
headersFlaxRef = [
    'ColType_Original',
    'ColType_Flax',
    'HeadWord_Lemma',
    'Headword_Pos',
    'CollocateWord_Lemma',
    'CollocateWord_Pos',
    'Pair_Exists',
    'ColType',
    'Frequency',
]
# The test files use the same columns, minus the present/absent flag.
headersFlaxTest = [header for header in headersFlaxRef if header != 'Pair_Exists']

# Change path to the folder where all collocation data files are stored
colFolder = r'D:\Education\vocabexpert\acl_bea_paper\eval\collocations\flax'

# Naming convention for the generated files
fnameFlax = 'FL_'
fnameRef, fnameTest, fnameTop = 'ref', 'test', 'top'
fname10, fname20 = '10', '20'

# Path to the folder where all the evaluation files will be generated
compFolder = r'D:\Education\vocabexpert\acl_bea_paper\results\flax'

In [8]:
print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

# FLAX evaluation.
# For each of the 11 collocation types: load the reference pairs, look each pair up in the
# headword's FLAX CSV ('present' / 'absent'), collect FLAX's candidates ("test" rows) overall
# and cut at top-N / top-10 / top-20, re-check the reference pairs against the cut lists,
# and write all eight lists as CSV files into compFolder.
for i in range(11):
    colTypeFileName = lstRefColTypesFileNames[i]
    colType = lstRefColTypesFLAX[i]
    
    wordListCSVFile = './se_flax/ACL - Reference set - ' + colTypeFileName + '.csv'
    lstHeadColWords = getWordList(wordListCSVFile) # Get collocations from reference file
    print(colTypeFileName + ' : ' , str(len(lstHeadColWords)))
    
    colFlax = os.path.join(colFolder, colTypeFileName)
    final_directory = colFlax
    
    # e.g. 'n1_n2' -> 'n1n2'; used as the suffix of the generated file names
    fnameColType = colTypeFileName.split('_')[0] + colTypeFileName.split('_')[1]
    
    lstEvalFlaxTest = []
    lstEvalFlaxTestTopRanked = []
    lstEvalFlaxTestTop10Ranked = []
    lstEvalFlaxTestTop20Ranked = []
    
    lstEvalFlaxRef = []
    lstEvalFlaxRefTopRanked = []    
    lstEvalFlaxRefTop10Ranked = []
    lstEvalFlaxRefTop20Ranked = []
    
    
    # If the directory for this collocation type is missing, nothing is evaluated and
    # the "ref" and "test" CSVs written below contain only their header row.
    if os.path.exists(final_directory):
        for headwordCol in lstHeadColWords:
            headwordLemma = headwordCol[0]
            headwordPosTag = headwordCol[1]
            collocateLemma = headwordCol[2]
            colPosTag = headwordCol[3]
            # Map the reference POS labels 'adj' / 'adv' onto the one-letter tags 'a' / 'r'
            # that are compared against the FLAX rows below.
            if headwordPosTag == 'adj':
                headwordPosTag = 'a'
                        
            if colPosTag == 'adj':
                colPosTag = 'a'
            elif colPosTag == 'adv':
                colPosTag = 'r'
            
            # Per-headword cut-off for the "top" ranking ('# Collocates' column of the reference file)
            topCollocate = int(headwordCol[4])
            
            fname = final_directory + '/' + headwordLemma + '_' + headwordPosTag + '_' + 'ColWordsFiltered' + '_' + colTypeFileName + '+' + colType + '.csv'
            
            if not os.path.isfile(fname):
                # Reference
                # No FLAX file for this headword: the pair is 'absent'; the ColType field is
                # filled with the part of colType after '-' and the frequency with 0.
                lstHeadwordColRef = (colTypeFileName, colType, headwordLemma, headwordPosTag, collocateLemma, colPosTag, 'absent', colType.split('-')[1], 0)
            else:
                df = pd.read_csv(fname)
                lstFlaxCols = [tuple(x) for x in df.values]                
                
                # Rows are compared positionally: 0 = headword lemma, 1 = headword POS,
                # 2 = collocate lemma, 3 = collocate POS; columns 4 and 5 are stored under
                # the 'ColType' and 'Frequency' headers.
                lstHeadwordColRef = ()
                for row in lstFlaxCols:
                    if row[0] == headwordLemma and row[1] == headwordPosTag and row[2] == collocateLemma and row[3] == colPosTag:
                        lstHeadwordColRef = (colTypeFileName, colType, headwordLemma, headwordPosTag, collocateLemma, colPosTag, 'present', row[4], row[5])
                        break;
                # Reference     
                # An empty tuple means no row matched the pair.
                if len(lstHeadwordColRef) == 0:
                    lstHeadwordColRef = (colTypeFileName, colType, headwordLemma, headwordPosTag, collocateLemma, colPosTag, 'absent', colType.split('-')[1], 0)

                # Rank counters restart for every reference pair; rows already in
                # lstEvalFlaxTest are skipped, so a candidate is ranked only the first time it is seen.
                j = 0
                k = 0
                l = 0
                # Test
                for row in lstFlaxCols:
                    if row[0] == headwordLemma and row[1] == headwordPosTag and row[3] == colPosTag:
                        lstHeadwordColTest = (colTypeFileName, colType, headwordLemma, headwordPosTag, row[2], colPosTag, row[4], row[5])
                        if lstHeadwordColTest not in lstEvalFlaxTest:
                            lstEvalFlaxTest.append(lstHeadwordColTest)
                            # Test -Top Candidate Ranking
                            if j < topCollocate:
                                lstEvalFlaxTestTopRanked.append(lstHeadwordColTest)
                                j+=1
                                
                            # Test - Top 10 Candidate Ranking
                            if k < 10:
                                lstEvalFlaxTestTop10Ranked.append(lstHeadwordColTest)
                                k+=1
                                
                            # Test - Top 20 Candidate Ranking
                            if l < 20:
                                lstEvalFlaxTestTop20Ranked.append(lstHeadwordColTest)
                                l+=1
                            
            # Every reference pair gets exactly one row, whether or not its file existed.
            lstEvalFlaxRef.append(lstHeadwordColRef)

    # Output files are named FL_<kind>_<1-based type index>_<type>.csv
    fname = compFolder + '/' + fnameFlax + fnameRef + '_' + str(i+1) + '_' + fnameColType + '.csv'
    df = pd.DataFrame(lstEvalFlaxRef, columns=headersFlaxRef)
    df.to_csv(fname, index=False)

    fname = compFolder + '/' + fnameFlax + fnameTest + '_' + str(i+1) + '_' + fnameColType + '.csv'
    df = pd.DataFrame(lstEvalFlaxTest, columns=headersFlaxTest)
    df.to_csv(fname, index=False)

    fname = compFolder + '/' + fnameFlax + fnameTest + fnameTop + '_' + str(i+1) + '_' + fnameColType + '.csv'
    df = pd.DataFrame(lstEvalFlaxTestTopRanked, columns=headersFlaxTest)
    df.to_csv(fname, index=False)

    fname = compFolder + '/' + fnameFlax + fnameTest + fnameTop + fname10 + '_' + str(i+1) + '_' + fnameColType + '.csv'
    df = pd.DataFrame(lstEvalFlaxTestTop10Ranked, columns=headersFlaxTest)
    df.to_csv(fname, index=False)

    fname = compFolder + '/' + fnameFlax + fnameTest + fnameTop + fname20 + '_' + str(i+1) + '_' + fnameColType + '.csv'
    df = pd.DataFrame(lstEvalFlaxTestTop20Ranked, columns=headersFlaxTest)
    df.to_csv(fname, index=False)
    
    # Ref - Top Candidate Ranking
    # Second pass over the reference pairs: is each pair among the top-N / top-10 / top-20
    # candidates collected above? Test rows follow headersFlaxTest, so the lemma/POS fields
    # sit at indices 2..5 and ColType/Frequency at 6..7.
    for headwordCol in lstHeadColWords:
        headwordLemma = headwordCol[0]
        headwordPosTag = headwordCol[1]
        collocateLemma = headwordCol[2]
        colPosTag = headwordCol[3]
        if headwordPosTag == 'adj':
            headwordPosTag = 'a'

        if colPosTag == 'adj':
            colPosTag = 'a'
        elif colPosTag == 'adv':
            colPosTag = 'r'              
        
        lstHeadwordColRefTop = ()
        for row in lstEvalFlaxTestTopRanked:
            if row[2] == headwordLemma and row[3] == headwordPosTag and row[4] == collocateLemma and row[5] == colPosTag:
                lstHeadwordColRefTop = (colTypeFileName, colType, headwordLemma, headwordPosTag, collocateLemma, colPosTag, 'present', row[6], row[7])
                break;
        
        if len(lstHeadwordColRefTop) == 0:
            lstHeadwordColRefTop = (colTypeFileName, colType, headwordLemma, headwordPosTag, collocateLemma, colPosTag, 'absent', colType.split('-')[1], 0)

        lstEvalFlaxRefTopRanked.append(lstHeadwordColRefTop)
        
        
        lstHeadwordColRefTop10 = ()
        for row in lstEvalFlaxTestTop10Ranked:
            if row[2] == headwordLemma and row[3] == headwordPosTag and row[4] == collocateLemma and row[5] == colPosTag:
                lstHeadwordColRefTop10 = (colTypeFileName, colType, headwordLemma, headwordPosTag, collocateLemma, colPosTag, 'present', row[6], row[7])
                break;

        if len(lstHeadwordColRefTop10) == 0:
            lstHeadwordColRefTop10 = (colTypeFileName, colType, headwordLemma, headwordPosTag, collocateLemma, colPosTag, 'absent', colType.split('-')[1], 0)

        lstEvalFlaxRefTop10Ranked.append(lstHeadwordColRefTop10)


        lstHeadwordColRefTop20 = ()
        for row in lstEvalFlaxTestTop20Ranked:
            if row[2] == headwordLemma and row[3] == headwordPosTag and row[4] == collocateLemma and row[5] == colPosTag:
                lstHeadwordColRefTop20 = (colTypeFileName, colType, headwordLemma, headwordPosTag, collocateLemma, colPosTag, 'present', row[6], row[7])
                break;

        if len(lstHeadwordColRefTop20) == 0:
            lstHeadwordColRefTop20 = (colTypeFileName, colType, headwordLemma, headwordPosTag, collocateLemma, colPosTag, 'absent', colType.split('-')[1], 0)

        lstEvalFlaxRefTop20Ranked.append(lstHeadwordColRefTop20)

    
    fname = compFolder + '/' + fnameFlax + fnameRef + fnameTop + '_' + str(i+1) + '_' + fnameColType + '.csv'
    df = pd.DataFrame(lstEvalFlaxRefTopRanked, columns=headersFlaxRef)
    df.to_csv(fname, index=False)

    fname = compFolder + '/' + fnameFlax + fnameRef + fnameTop + fname10 + '_' + str(i+1) + '_' + fnameColType + '.csv'
    df = pd.DataFrame(lstEvalFlaxRefTop10Ranked, columns=headersFlaxRef)
    df.to_csv(fname, index=False)

    fname = compFolder + '/' + fnameFlax + fnameRef + fnameTop + fname20 + '_' + str(i+1) + '_' + fnameColType + '.csv'
    df = pd.DataFrame(lstEvalFlaxRefTop20Ranked, columns=headersFlaxRef)
    df.to_csv(fname, index=False)


print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

2018-05-18 10:31:11
n1_n2 :  62
n2_n1 :  62
n2_v1 :  306
n2_adj1 :  1769
v1_n2 :  306
v1_adj2 :  30
v1_adv2 :  29
v2_adv1 :  139
adj1_n2 :  1769
adj2_v1 :  30
adj2_adv1 :  124
2018-05-18 10:34:48


## ELIA

In [37]:
# Internal mapping of collocation types.
# Each pair is (reference-file collocation type, ELIA collocation type); unpacking yields
# two index-aligned lists.
lstRefColTypesFileNames, lstRefColTypesElia = map(list, zip(
    ('n1_n2', 'NOUN + NOUN'),
    ('n2_n1', 'NOUN + NOUN'),
    ('n2_v1', 'VERB + NOUN'),
    ('n2_adj1', 'ADJ + NOUN'),
    ('v1_n2', 'VERB + NOUN'),
    ('v1_adj2', 'VERB + ADJ'),
    ('v1_adv2', 'VERB + ADV'),
    ('v2_adv1', 'VERB + ADV'),
    ('adj1_n2', 'ADJ + NOUN'),
    ('adj2_v1', 'VERB + ADJ'),
    ('adj2_adv1', 'ADV + ADJ'),
))

# File-format
headersEliaRef = [
    'ColType_Original',
    'ColType_Elia',
    'HeadWord_Lemma',
    'Headword_Pos',
    'CollocateWord_Lemma',
    'CollocateWord_Pos',
    'Pair_Exists',
    'Association_Measure',
]
# The test files use the same columns, minus the present/absent flag.
headersEliaTest = [header for header in headersEliaRef if header != 'Pair_Exists']

# Change path to the folder where all collocation data files are stored.
# NOTE: the ELIA evaluation cell rebinds colFolder to a metric sub-folder of this path,
# so re-run this cell before re-running that one.
colFolder = r'D:\Education\vocabexpert\acl_bea_paper\eval\collocations\elia'

# Naming convention for the generated files
fnameElia = 'EL_'
fnameRef, fnameTest, fnameTop = 'ref', 'test', 'top'
fname10, fname20 = '10', '20'

# Path to the folder where all the evaluation files will be generated
compFolder = r'D:\Education\vocabexpert\acl_bea_paper\results\elia'

In [36]:
# Build the Elia evaluation tables for every association measure listed in the loop below.
# For each collocation type this cell
#   1. loads the reference pairs through getWordList(),
#   2. marks each pair as 'present'/'absent' in the candidate file of its headword,
#   3. collects the distinct candidates per headword, plus the first topCollocate / 10 / 20 of them,
#   4. marks each reference pair as 'present'/'absent' in each of those ranked selections,
#   5. writes all tables as CSV files into compFolder.
print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

# POS labels of the reference sheets (after str.capitalize()) -> tags used by the candidate files.
# Headword labels without a mapping become '', collocate labels without a mapping stay unchanged.
headPosTagMap = {'Adj': 'JJ', 'J': 'JJ', 'R': 'RB', 'N': 'NN', 'V': 'VB'}
colPosTagMap = {'Adj': 'JJ', 'Adv': 'RB', 'N': 'NN', 'V': 'VB'}

for fnameMetric in ['twf']:
    print(fnameMetric)
    # Derive the metric folder without overwriting colFolder: reassigning colFolder here made every
    # further metric (and every re-run of this cell) nest one level deeper, e.g. .../elia/twf/twf.
    metricFolder = os.path.join(colFolder, fnameMetric)

    for i, colTypeFileName in enumerate(lstRefColTypesFileNames):
        colType = lstRefColTypesElia[i]

        wordListCSVFile = './elia/ACL - Reference set - ' + colTypeFileName + '.csv'
        lstHeadColWords = getWordList(wordListCSVFile) # Get collocations from reference file
        print(colTypeFileName + ' : ' , str(len(lstHeadColWords)))

        final_directory = os.path.join(metricFolder, colTypeFileName)
        fnameColType = colTypeFileName.split('_')[0] + colTypeFileName.split('_')[1]

        # Reference pairs with normalised POS tags:
        # (headword lemma, headword POS, collocate lemma, collocate POS, raw '# Collocates' value)
        lstRefPairs = []
        for headwordCol in lstHeadColWords:
            rawColPosTag = headwordCol[3].capitalize()
            lstRefPairs.append((headwordCol[0],
                                headPosTagMap.get(headwordCol[1].capitalize(), ''),
                                headwordCol[2],
                                colPosTagMap.get(rawColPosTag, rawColPosTag),
                                headwordCol[4]))

        lstEvalEliaTest = []
        lstEvalEliaTestTopRanked = []
        lstEvalEliaTestTop10Ranked = []
        lstEvalEliaTestTop20Ranked = []

        lstEvalEliaRef = []
        lstEvalEliaRefTopRanked = []
        lstEvalEliaRefTop10Ranked = []
        lstEvalEliaRefTop20Ranked = []

        # Same content as lstEvalEliaTest, kept as a set so the duplicate check does not scan the list.
        # Assumes the candidate tuples only hold hashable values (strings / numbers).
        setSeenTest = set()
        # Candidate file name -> parsed rows, so a file shared by several reference pairs is parsed once.
        dctParsedFiles = {}

        if os.path.exists(final_directory):
            for headwordLemma, headwordPosTag, collocateLemma, colPosTag, rawTopCollocate in lstRefPairs:
                topCollocate = int(rawTopCollocate)

                fname = final_directory + '/' + headwordLemma + '_' + headwordPosTag + '_' + colTypeFileName + '+' + colType + '.csv'

                if fname not in dctParsedFiles:
                    lstParsedRows = []
                    if os.path.isfile(fname):
                        for row in pd.read_csv(fname).values:
                            # NOTE(review): eval() executes whatever expression the CSV cell contains.
                            # If the cells are plain tuple literals, ast.literal_eval would be the safe
                            # replacement - confirm the file format before switching.
                            headwordLemPos = eval(row[0])
                            collocateLemPos = eval(row[1])
                            # (headword, headword POS, collocate form, collocate lemma, collocate POS, score)
                            lstParsedRows.append((headwordLemPos[0], headwordLemPos[1],
                                                  collocateLemPos[0], collocateLemPos[1], collocateLemPos[2],
                                                  row[2]))
                    dctParsedFiles[fname] = lstParsedRows

                # A missing file yields an empty list. Previously lstEliaCols kept the rows of the
                # last file that did exist (or was undefined on the very first iteration).
                lstEliaCols = dctParsedFiles[fname]

                # Reference: the pair counts as present when the lemma and POS match, or when the
                # reference collocate is contained in the candidate's word form.
                lstHeadwordColRef = (colTypeFileName, colType, headwordLemma, headwordPosTag, collocateLemma, colPosTag, 'absent', 0)
                for newHeadWord, newHeadWordPOS, newColWordForm, newColWordLemma, newColWordPOS, score in lstEliaCols:
                    if newHeadWord != headwordLemma or newHeadWordPOS != headwordPosTag:
                        continue
                    if (newColWordLemma == collocateLemma and colPosTag in newColWordPOS) or collocateLemma in newColWordForm:
                        lstHeadwordColRef = (colTypeFileName, colType, headwordLemma, headwordPosTag, collocateLemma, colPosTag, 'present', score)
                        break

                lstEvalEliaRef.append(lstHeadwordColRef)

                # Test: counters restart for every reference pair; candidates already collected for
                # an earlier pair are skipped before the counters are consulted.
                cntTop = 0
                cntTop10 = 0
                cntTop20 = 0
                for newHeadWord, newHeadWordPOS, newColWordForm, newColWordLemma, newColWordPOS, score in lstEliaCols:
                    if newHeadWord == headwordLemma and newHeadWordPOS == headwordPosTag and colPosTag in newColWordPOS:
                        lstHeadwordColTest = (colTypeFileName, colType, headwordLemma, headwordPosTag, newColWordLemma, colPosTag, score)
                        if lstHeadwordColTest in setSeenTest:
                            continue
                        setSeenTest.add(lstHeadwordColTest)
                        lstEvalEliaTest.append(lstHeadwordColTest)

                        # Test - Top Candidate Ranking
                        if cntTop < topCollocate:
                            lstEvalEliaTestTopRanked.append(lstHeadwordColTest)
                            cntTop += 1

                        # Test - Top 10 Candidate Ranking
                        if cntTop10 < 10:
                            lstEvalEliaTestTop10Ranked.append(lstHeadwordColTest)
                            cntTop10 += 1

                        # Test - Top 20 Candidate Ranking
                        if cntTop20 < 20:
                            lstEvalEliaTestTop20Ranked.append(lstHeadwordColTest)
                            cntTop20 += 1

            # These five tables are only written when the candidate folder of this type exists.
            fnameTypeTail = '_' + str(i+1) + '_' + fnameColType + '.csv'
            fnameMetricTail = '_' + fnameMetric + fnameTypeTail
            for fnameStem, lstRows, headers in [
                    (fnameRef + fnameTypeTail, lstEvalEliaRef, headersEliaRef),
                    (fnameTest + fnameTypeTail, lstEvalEliaTest, headersEliaTest),
                    (fnameTest + fnameTop + fnameMetricTail, lstEvalEliaTestTopRanked, headersEliaTest),
                    (fnameTest + fnameTop + fname10 + fnameMetricTail, lstEvalEliaTestTop10Ranked, headersEliaTest),
                    (fnameTest + fnameTop + fname20 + fnameMetricTail, lstEvalEliaTestTop20Ranked, headersEliaTest)]:
                pd.DataFrame(lstRows, columns=headers).to_csv(compFolder + '/' + fnameElia + fnameStem, index=False)

        # Ref - Top Candidate Ranking: mark every reference pair as present/absent in each ranked
        # selection. Runs even when the candidate folder is missing (all pairs are then 'absent').
        for lstTestRanked, lstRefRanked in [
                (lstEvalEliaTestTopRanked, lstEvalEliaRefTopRanked),
                (lstEvalEliaTestTop10Ranked, lstEvalEliaRefTop10Ranked),
                (lstEvalEliaTestTop20Ranked, lstEvalEliaRefTop20Ranked)]:
            # (headword, headword POS, collocate lemma, collocate POS) -> score of the first matching candidate
            dctFirstScore = {}
            for row in lstTestRanked:
                dctFirstScore.setdefault(row[2:6], row[6])

            for headwordLemma, headwordPosTag, collocateLemma, colPosTag, rawTopCollocate in lstRefPairs:
                pairKey = (headwordLemma, headwordPosTag, collocateLemma, colPosTag)
                if pairKey in dctFirstScore:
                    lstRefRanked.append((colTypeFileName, colType, headwordLemma, headwordPosTag, collocateLemma, colPosTag, 'present', dctFirstScore[pairKey]))
                else:
                    lstRefRanked.append((colTypeFileName, colType, headwordLemma, headwordPosTag, collocateLemma, colPosTag, 'absent', 0))

        fnameMetricTail = '_' + fnameMetric + '_' + str(i+1) + '_' + fnameColType + '.csv'
        for fnameStem, lstRows in [
                (fnameRef + fnameTop + fnameMetricTail, lstEvalEliaRefTopRanked),
                (fnameRef + fnameTop + fname10 + fnameMetricTail, lstEvalEliaRefTop10Ranked),
                (fnameRef + fnameTop + fname20 + fnameMetricTail, lstEvalEliaRefTop20Ranked)]:
            pd.DataFrame(lstRows, columns=headersEliaRef).to_csv(compFolder + '/' + fnameElia + fnameStem, index=False)

    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

2018-05-18 14:41:33
twf
n1_n2 :  62
n2_n1 :  62
n2_v1 :  306
n2_adj1 :  1769
v1_n2 :  306
v1_adj2 :  30
v1_adv2 :  29
v2_adv1 :  139
adj1_n2 :  1769
adj2_v1 :  30
adj2_adv1 :  124
2018-05-18 15:17:53


### Comparison of different association measures (AMs) with Elia as a baseline

In [59]:
# Each collocation type of the reference set paired with the Elia relation it is evaluated against.
# The two lists are index-aligned: entry i of one belongs to entry i of the other.
lstRefColTypesFileNames, lstRefColTypesElia = map(list, zip(
    ('n1_n2', 'NOUN + NOUN'),
    ('n2_n1', 'NOUN + NOUN'),
    ('n2_v1', 'VERB + NOUN'),
    ('n2_adj1', 'ADJ + NOUN'),
    ('v1_n2', 'VERB + NOUN'),
    ('v1_adj2', 'VERB + ADJ'),
    ('v1_adv2', 'VERB + ADV'),
    ('v2_adv1', 'VERB + ADV'),
    ('adj1_n2', 'ADJ + NOUN'),
    ('adj2_v1', 'VERB + ADJ'),
    ('adj2_adv1', 'ADV + ADJ'),
))

# Column layout of the generated CSV files. The test layout is the reference layout
# without the 'Pair_Exists' flag.
headersEliaRef = [
    'ColType_Original',
    'ColType_Elia',
    'HeadWord_Lemma',
    'Headword_Pos',
    'CollocateWord_Lemma',
    'CollocateWord_Pos',
    'Pair_Exists',
    'Association_Measure',
]
headersEliaTest = [header for header in headersEliaRef if header != 'Pair_Exists']

# Folder holding the collocation data files (adapt to the local setup)
colFolder = r'D:\Education\vocabexpert\emnlp_paper\eval\collocations\elia'

# Building blocks of the generated file names
fnameElia, fnameRef, fnameTest = 'EL_', 'ref', 'test'
fnameTop, fname10, fname20 = 'top', '10', '20'

# Folder that receives the generated evaluation files
compFolder = r'D:\Education\vocabexpert\emnlp_paper\results\elia'

In [60]:
# Compare association measures on the Elia data: for every measure listed in the loop below, pool
# the top-20 candidates of all collocation types and mark each reference pair as 'present'/'absent'
# in that pooled selection. Two CSV files (reference and test) are written per measure into
# compFolder. Only the top-20 ranking is produced here; no per-type files are written.
print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

# POS labels of the reference sheets (after str.capitalize()) -> tags used by the candidate files.
# Headword labels without a mapping become '', collocate labels without a mapping stay unchanged.
headPosTagMap = {'Adj': 'JJ', 'J': 'JJ', 'R': 'RB', 'N': 'NN', 'V': 'VB'}
colPosTagMap = {'Adj': 'JJ', 'Adv': 'RB', 'N': 'NN', 'V': 'VB'}

# Measures that can be listed here:
# 'deltap1', 'deltap2', 'frequency', 'logdice', 'loglikelihood', 'MI', 'MI2', 'MI3', 'salience', 'tscore', 'twf'
for fnameMetric in ['frequency']:
    print(fnameMetric)
    # Derive the metric folder without overwriting colFolder: reassigning colFolder here made every
    # further metric (and every re-run of this cell) nest one level deeper, e.g.
    # .../elia/frequency/logdice instead of .../elia/logdice.
    metricFolder = os.path.join(colFolder, fnameMetric)

    # Pooled over all collocation types of this metric
    lstEvalEliaTestTop20Ranked = []
    lstEvalEliaRefTop20Ranked = []

    for i, colTypeFileName in enumerate(lstRefColTypesFileNames):
        colType = lstRefColTypesElia[i]

        wordListCSVFile = './elia/ACL - Reference set - ' + colTypeFileName + '.csv'
        lstHeadColWords = getWordList(wordListCSVFile) # Get collocations from reference file
        print(colTypeFileName + ' : ' , str(len(lstHeadColWords)))

        final_directory = os.path.join(metricFolder, colTypeFileName)

        # Reference pairs with normalised POS tags:
        # (headword lemma, headword POS, collocate lemma, collocate POS)
        lstRefPairs = []
        for headwordCol in lstHeadColWords:
            rawColPosTag = headwordCol[3].capitalize()
            lstRefPairs.append((headwordCol[0],
                                headPosTagMap.get(headwordCol[1].capitalize(), ''),
                                headwordCol[2],
                                colPosTagMap.get(rawColPosTag, rawColPosTag)))

        # Per-type tables; they are built for inspection but not written to disk in this cell.
        lstEvalEliaTest = []
        lstEvalEliaRef = []
        lstEliaCols = []

        # Same content as lstEvalEliaTest, kept as a set so the duplicate check does not scan the list.
        # Assumes the candidate tuples only hold hashable values (strings / numbers).
        setSeenTest = set()
        # Candidate file name -> parsed rows, so a file shared by several reference pairs is parsed once.
        dctParsedFiles = {}

        if os.path.exists(final_directory):
            for headwordLemma, headwordPosTag, collocateLemma, colPosTag in lstRefPairs:
                fname = final_directory + '/' + headwordLemma + '_' + headwordPosTag + '_' + colTypeFileName + '+' + colType + '.csv'

                if fname not in dctParsedFiles:
                    lstParsedRows = []
                    if os.path.isfile(fname):
                        for row in pd.read_csv(fname).values:
                            # NOTE(review): eval() executes whatever expression the CSV cell contains.
                            # If the cells are plain tuple literals, ast.literal_eval would be the safe
                            # replacement - confirm the file format before switching.
                            headwordLemPos = eval(row[0])
                            collocateLemPos = eval(row[1])
                            # (headword, headword POS, collocate form, collocate lemma, collocate POS, score)
                            lstParsedRows.append((headwordLemPos[0], headwordLemPos[1],
                                                  collocateLemPos[0], collocateLemPos[1], collocateLemPos[2],
                                                  row[2]))
                    dctParsedFiles[fname] = lstParsedRows

                # A missing file yields an empty list. Previously lstEliaCols kept the rows of the
                # last file that did exist for this collocation type.
                lstEliaCols = dctParsedFiles[fname]

                # Reference: the pair counts as present when the lemma and POS match, or when the
                # reference collocate is contained in the candidate's word form.
                lstHeadwordColRef = (colTypeFileName, colType, headwordLemma, headwordPosTag, collocateLemma, colPosTag, 'absent', 0)
                for newHeadWord, newHeadWordPOS, newColWordForm, newColWordLemma, newColWordPOS, score in lstEliaCols:
                    if newHeadWord != headwordLemma or newHeadWordPOS != headwordPosTag:
                        continue
                    if (newColWordLemma == collocateLemma and colPosTag in newColWordPOS) or collocateLemma in newColWordForm:
                        lstHeadwordColRef = (colTypeFileName, colType, headwordLemma, headwordPosTag, collocateLemma, colPosTag, 'present', score)
                        break

                lstEvalEliaRef.append(lstHeadwordColRef)

                # Test: the counter restarts for every reference pair; candidates already collected
                # for an earlier pair are skipped before the counter is consulted.
                cntTop20 = 0
                for newHeadWord, newHeadWordPOS, newColWordForm, newColWordLemma, newColWordPOS, score in lstEliaCols:
                    if newHeadWord == headwordLemma and newHeadWordPOS == headwordPosTag and colPosTag in newColWordPOS:
                        lstHeadwordColTest = (colTypeFileName, colType, headwordLemma, headwordPosTag, newColWordLemma, colPosTag, score)
                        if lstHeadwordColTest in setSeenTest:
                            continue
                        setSeenTest.add(lstHeadwordColTest)
                        lstEvalEliaTest.append(lstHeadwordColTest)

                        # Test - Top 20 Candidate Ranking
                        if cntTop20 < 20:
                            lstEvalEliaTestTop20Ranked.append(lstHeadwordColTest)
                            cntTop20 += 1

        # Ref - Top 20 Candidate Ranking: mark every reference pair of this type as present/absent.
        # NOTE(review): the lookup covers the pooled top-20 list, i.e. also candidates gathered for
        # the collocation types processed before this one, and it does not compare the type itself.
        # Confirm that a hit from another type (e.g. n1_n2 vs. n2_n1) is meant to count.
        # (headword, headword POS, collocate lemma, collocate POS) -> score of the first matching candidate
        dctFirstScore = {}
        for row in lstEvalEliaTestTop20Ranked:
            dctFirstScore.setdefault(row[2:6], row[6])

        for headwordLemma, headwordPosTag, collocateLemma, colPosTag in lstRefPairs:
            pairKey = (headwordLemma, headwordPosTag, collocateLemma, colPosTag)
            if pairKey in dctFirstScore:
                lstEvalEliaRefTop20Ranked.append((colTypeFileName, colType, headwordLemma, headwordPosTag, collocateLemma, colPosTag, 'present', dctFirstScore[pairKey]))
            else:
                lstEvalEliaRefTop20Ranked.append((colTypeFileName, colType, headwordLemma, headwordPosTag, collocateLemma, colPosTag, 'absent', 0))

    # The number in the file name is the count of pooled collocation types. It used to be taken
    # from the loop variable that leaked out of the type loop (i+1).
    fnamePooledTail = fname20 + '_' + fnameMetric + '_' + str(len(lstRefColTypesFileNames)) + '_' + 'all' + '.csv'

    fname = compFolder + '/' + fnameElia + fnameRef + fnameTop + fnamePooledTail
    pd.DataFrame(lstEvalEliaRefTop20Ranked, columns=headersEliaRef).to_csv(fname, index=False)

    fname = compFolder + '/' + fnameElia + fnameTest + fnameTop + fnamePooledTail
    pd.DataFrame(lstEvalEliaTestTop20Ranked, columns=headersEliaTest).to_csv(fname, index=False)

    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

2018-05-19 05:50:58
frequency
n1_n2 :  62
n2_n1 :  62
n2_v1 :  306
n2_adj1 :  1769
v1_n2 :  306
v1_adj2 :  30
v1_adv2 :  29
v2_adv1 :  139
adj1_n2 :  1769
adj2_v1 :  30
adj2_adv1 :  124
2018-05-19 06:16:42
