    Use CORESET to enlarge differences among samples rather than sampling from uncertain words. 

In [None]:
# Label
# 0: Physical
# 1: Mental

# Load Environment

In [1]:
'''
To use nltk's wordnet, you should download WordNet from https://www.nltk.org/nltk_data/ and then unzip wordnet.zip
under your home directory with path as /home_path/nltk_data/corpora/

'''
from nltk.corpus import wordnet as wn
import pandas as pd
import numpy as np
import tqdm
from collections import defaultdict
import joblib
import time
import json

In [68]:
import fasttext as ft
# we use fasttext embedding vector to represent word
# As 'cc.en.300.bin' is very large, it is not included in the submission. Download URL: https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
# Check more details in website: https://fasttext.cc/docs/en/crawl-vectors.html

model=ft.load_model('source/cc.en.300.bin') 



In [2]:
adj=pd.read_csv('Data/adj.csv') # load adjectives from `high_entropy.ipynb`
adj.shape

(7292, 4)

# Build TestSet

In [3]:
testset=pd.read_csv('Data/test.csv')

# CORESET

    All labeling steps below are implemented by two persons. If the labeling results disagree, we ask a third person to arbitrate.
    
    In iteration T, the L2 distance between each valid adjective and every training sample used in iteration T-1 is computed, and for each adjective the minimum of these distances (i.e. the distance to its nearest training sample) is recorded. Then, sorting the adjectives by this nearest-sample distance in descending order, we choose the topK words and add them to the training samples of iteration T-1 to form the training samples of iteration T. This selection step is repeated several times within iteration T. 
  

In [204]:
topK=10 
repeat=12 # repeat times
batch_size=1500 # candset batch size 

In [106]:
# find samples given core words by CORESET strategy
def points2points_l2_norm(m1,m2):
    """Return the pairwise Euclidean (L2) distance matrix between two point sets.

    Args:
        m1: array-like of shape [N,D].
        m2: array-like of shape [M,D].

    Returns:
        ndarray of shape [N,M]; entry (i,j) is the L2 distance between
        m1[i] and m2[j].
    """
    m1 = np.asarray(m1)
    m2 = np.asarray(m2)

    # Broadcasting [N,1,D] - [1,M,D] yields the [N,M,D] differences directly,
    # so the inputs never have to be duplicated into two full [M,N,D] arrays
    # with np.repeat, and the difference is only computed once.
    diff = m1[:, np.newaxis, :] - m2[np.newaxis, :, :]

    return np.sqrt(np.sum(diff * diff, axis=2))  # [N,M]

def find_samples(cands,cores,model,topk,only_words=True):
    """Rank candidate words by their distance to the nearest core word.

    For every candidate the L2 distance to each core embedding is computed and
    the minimum (distance to the closest core) is kept; candidates are then
    sorted by that value in descending order.

    Args:
        cands: sequence of candidate words.
        cores: sequence of already selected words (must be non-empty).
        model: object exposing `get_word_vector(word)` that returns a 1-D
            embedding vector.
        topk: number of top-ranked candidates to return.
        only_words: if True return a list of words, otherwise a list of
            `(word, distance)` tuples.

    Returns:
        The `topk` candidates that are farthest from their nearest core.
    """
    # Nothing to rank (np.concatenate would raise on an empty list).
    if len(cands)==0:
        return []

    # get embeddings
    core_emb=np.concatenate(
        [model.get_word_vector(w).reshape((1,-1)) for w in cores],axis=0)
    # Take the embedding width from the model output instead of hard-coding 300.
    dim=core_emb.shape[1]

    cand_emb=[]
    for w in cands:
        try:
            emb=model.get_word_vector(w).reshape((1,-1))
        except Exception:
            # Only ordinary errors are swallowed (the notebook output shows this
            # firing for a `nan` entry); KeyboardInterrupt etc. still propagate.
            print("#%s# failed to get embedding. "%w)
            emb=np.zeros((1,dim))
        cand_emb.append(emb)
    cand_emb=np.concatenate(cand_emb,axis=0)

    # calculate L2 distance: [len(cands),len(cores)]
    dist=points2points_l2_norm(cand_emb,core_emb)

    # distance from each candidate to its closest core
    min_d=dist.min(axis=1)

    # farthest-from-cores first; the sort is stable, so ties keep input order
    q=sorted(zip(cands,min_d),key=lambda x: -x[1])

    if only_words:
        return [x[0] for x in q[:topk]]
    return q[:topk]

def find_samples_batch(cands,cores,model,topk,batch_size,only_words=True):
    '''
    Batched variant of `find_samples`.

    `cands` is processed in slices of `batch_size` so that only a
    [batch_size,len(cores)] distance matrix is held in memory at a time,
    instead of the full [len(cands),len(cores)] one. The top-k results of
    every slice are pooled and ranked again.

    Returns a word list, or a list of (word,score) tuples when
    `only_words` is False.
    '''
    n_batches=len(cands)//batch_size+1

    pooled=[]
    for b in range(n_batches):
        chunk=cands[b*batch_size:(b+1)*batch_size]
        # the last slice is empty when len(cands) is a multiple of batch_size
        if len(chunk)>0:
            pooled.extend(find_samples(chunk,cores,model,topk,only_words=False))

    # largest score first
    ranked=sorted(pooled,key=lambda pair: -pair[1])[:topk]

    if not only_words:
        return ranked
    return [pair[0] for pair in ranked]


# initialize candidate set
# candidate words are from all valid adjectives except testset
# NOTE: `w in testset['word']` tests the Series *index* (row labels), not the
# words themselves, so it never excluded anything. Compare against the values.
test_words=set(testset['word'])
adj_cand=[w for w in adj['word'] if w not in test_words]


## Iter1

### Setup

In [4]:
# Randomly annotate some positive and negative words for training. Proportion of samples, 1:1
# `desc`: `physical` adj, describe physical attributes
# `opin`: `mental` adj, usually relates to mental attributes

desc=['anemic','arranged','assorted','available','baked','bitter','black','blue','broken','cherry',
      'citric','corrugated','commercial','cooked','crispy','crushed','crusty','decorative','dietetic',
     'digestible','dried','drippy','edible','empty','fake']
opin=['amazing','awesome','awful','aware','bad','basic','beneficial','best','bold','bothersome','careful',
      'casual','certain','cheap','clean','clear','cold','comfortable','common','comparable','competitive',
     'complete','consistent','contributive','convenient','conventional','cool','costly','crazy']

In [5]:
# Build the iteration-1 training triples (word, definition text, label).
# The definition is the `text` column of the first `adj` row whose `word`
# matches; `.values[0]` raises IndexError if the word is absent from `adj`.
word_type='adj'
trainset=[]
        
for w in desc:
    defn=adj.loc[adj.word==w,'text'].values[0]
    trainset.append((w,defn,0)) # 0: physical
    
for w in opin:
    defn=adj.loc[adj.word==w,'text'].values[0]
    trainset.append((w,defn,1)) # 1: mental


In [6]:
# Transpose the list of (word, text, target) triples into three columns.
train=pd.DataFrame()
train['word'],train['text'],train['target']=list(zip(*(trainset)))

In [7]:
train.to_csv('Data/train.v4.1.round1.csv')

    Training is not done until iter 3. As sampling is only dependent on word embedding vector, there's no need to
    do training for iter1 and iter2. Total iteration number is 5 for comparison with other methods.

## Iter2

### Setup

In [131]:
DEBUG=False # True: do debug and unit test

In [123]:

cores=train['word'].tolist() # use words in train from last iteration as initialization

In [126]:
# detect all low_freq words and put them into cores

# This cell was run in two passes: first for rounds 0 to 19, then (with the
# values below) for rounds 20 to 79.
birth_start=20
birth_loop=60

# round index -> words selected in that round
# NOTE: re-assigning `low_freq` here discards the entries of any earlier pass.
low_freq={}
# birth run to include noise words into cores
for i in range(birth_start,birth_start+birth_loop): 
    print('%d birth_loop.'%i)
    # main action: pick the topK candidates farthest from the current cores
    _new=find_samples_batch(cands=adj_cand,cores=cores,model=model,
                       topk=topK,batch_size=batch_size, only_words=True)
    
    # update states (`cores` is extended in place, so each round sees earlier picks)
    cores+=_new
    
    low_freq[i]=_new

20 birth_loop.
#nan# failed to get embedding. 
21 birth_loop.
#nan# failed to get embedding. 
22 birth_loop.
#nan# failed to get embedding. 
23 birth_loop.
#nan# failed to get embedding. 
24 birth_loop.
#nan# failed to get embedding. 
25 birth_loop.
#nan# failed to get embedding. 
26 birth_loop.
#nan# failed to get embedding. 
27 birth_loop.
#nan# failed to get embedding. 
28 birth_loop.
#nan# failed to get embedding. 
29 birth_loop.
#nan# failed to get embedding. 
30 birth_loop.
#nan# failed to get embedding. 
31 birth_loop.
#nan# failed to get embedding. 
32 birth_loop.
#nan# failed to get embedding. 
33 birth_loop.
#nan# failed to get embedding. 
34 birth_loop.
#nan# failed to get embedding. 
35 birth_loop.
#nan# failed to get embedding. 
36 birth_loop.
#nan# failed to get embedding. 
37 birth_loop.
#nan# failed to get embedding. 
38 birth_loop.
#nan# failed to get embedding. 
39 birth_loop.
#nan# failed to get embedding. 
40 birth_loop.
#nan# failed to get embedding. 
41 birth_loop

In [125]:
low_freq

{0: ['xc', 'xl', 'xx', 'gi', '64', 'iv', 'x', '59'],
 1: ['il', '0', 'm', 'ex', 'v', 'k', 'gu', 'li'],
 2: ['u', 'c', 'd', 'i', '99', '90', '95', 'up'],
 3: ['no', 'l', '1', 'ok', 'go', 'on', '2', '3'],
 4: ['in', '50', 'hep', 'xxx', '65', '69', '73', 'sec'],
 5: ['rum', 'cod', 'otc', 'jet', 'dim', 'fey', 'neo', 'hip'],
 6: ['18', 'lax', 'gay', 'fat', 'dun', 'pet', 'nee', 'nth'],
 7: ['uric', 'ace', 'coy', 'bay', '9th', 'lit', 'pat', 'manx'],
 8: ['mid', 'shy', '170', 'fly', '140', '145', 'tan', 'mum'],
 9: ['net', '30', 'pro', 'ago', 'otic', 'ill', 'gaga', 'gamy'],
 10: ['raw', 'apt', '39', 'bum', '96', 'wee', 'pop', 'mown'],
 11: ['fab', 'dud', 'echt', 'icy', 'sly', '32', 'dank', '75'],
 12: ['ver', 'wet', '31', 'lay', 'deaf', 'pied', 'lewd', 'achy'],
 13: ['9', 'low', 'ten', 'hind', 'cut', '75th', 'sear', 'rust'],
 14: ['key', 'wan', 'oral', 'fit', '45', 'ashy', 'bated', 'rapt'],
 15: ['foul', 'won', 'thai', 'wiry', 'saudi', 'off', 'inky', 'gory'],
 16: ['drab', 'boxy', '89', 'avid'

In [127]:
low_freq

{20: ['eyed', 'waxy', 'carpal', 'rash', 'big', 'oozy', 'anti', 'old'],
 21: ['loamy', 'rosy', 'few', 'far', 'nosy', 'mint', 'jade', 'numb'],
 22: ['skim', 'cosy', 'cest', 'aged', 'cubic', 'bias', 'algal', 'oily'],
 23: ['saute', 'cyan', 'wavy', 'tall', 'bone', '98', '44', 'unfed'],
 24: ['odd', 'sage', 'gouty', 'wroth', 'limp', 'slim', '10', 'rank'],
 25: ['fetal', 'own', 'born', 'smug', 'halal', 'calm', 'wily', 'rife'],
 26: ['potty', 'holy', 'welsh', 'hazy', 'peppy', 'out', '60', 'iron'],
 27: ['airy', 'port', 'amino', 'hazel', 'damp', 'solo', 'ripe', 'snug'],
 28: ['fecal', 'oaken', 'mute', '80', 'teen', 'twin', 'bare', 'cuban'],
 29: ['parve', 'dirt', 'oval', 'loud', 'boggy', 'peaty', 'mass', 'rude'],
 30: ['rose', 'tarry', 'yogic', 'rear', 'camp', 'nasal', 'boric', 'posh'],
 31: ['moot', 'musky', '19th', 'new', 'liege', 'polar', 'lush', 'spiny'],
 32: ['tannic', 'nappy', 'ionic', 'star', 'worn', 'gelid', 'pale', 'dire'],
 33: ['edgy', 'renal', 'acid', 'tidal', 'winy', 'set', 'sole

In [220]:
# save for future backup
"""with open("Data/coreset_low_freq.json", "w") as outfile:
    json.dump(low_freq, outfile)
    
with open("Data/coreset_cores_iter2.json", "w") as outfile:
    json.dump(cores, outfile)"""

    From the above result, the words selected in rounds 0-19 (160 words) of `low_freq` are regarded as noisy words. As the embedding vectors of these noisy words are usually very far from those of normal words, they would be picked up for annotation if they were not included in `cores`, which is not desired.

In [219]:
len(cores)

278

In [135]:
# sampling states
new_samples=[] # samples extracted out in this iteration

# remove iter 20-79 words in cores, only keep words in iter1 and noise words
# NOTE: requires `low_freq` to hold every key 20..79 (KeyError otherwise);
# `list.remove` drops only the first occurrence of each word.
for i in range(20,80):
    for w in low_freq[i]:
        if w in cores:
            cores.remove(w)
        
if DEBUG: # only for debug and unit test
    d_samples={}

# Greedy selection: every repeat picks the topK candidates farthest from the
# current cores and appends them to `cores`, so later repeats see earlier picks.
# NOTE: `cores` is mutated in place; re-running this cell extends it again.
start=time.time()
for i in range(repeat): 
    print('%d repeat.'%i)
    # main action
    _new=find_samples_batch(cands=adj_cand,cores=cores,model=model,
                       topk=topK,batch_size=batch_size, only_words=True)
    
    # update states
    new_samples+=_new
    cores+=_new
    
    if DEBUG:
        d_samples[i]=_new
dur=time.time()-start
print("Runtime: %.2f sec"%dur)

# after sampling, the `new_samples` will be annotated.

0 repeat.
#nan# failed to get embedding. 
1 repeat.
#nan# failed to get embedding. 
2 repeat.
#nan# failed to get embedding. 
3 repeat.
#nan# failed to get embedding. 
4 repeat.
#nan# failed to get embedding. 
5 repeat.
#nan# failed to get embedding. 
6 repeat.
#nan# failed to get embedding. 
7 repeat.
#nan# failed to get embedding. 
Runtime: 29.55 sec


In [206]:
desc2=['eyed','waxy','carpal','rash','big','oozy','far','mint','jade',
       'skim','cubic','oily','saute','cyan','wavy','tall','bone','welsh',
      'hazy','iron','damp']

opin2=['anti','rosy','few','nosy','numb','aged','bias','wroth','own',
       'smug','calm','wily','rife','potty','holy','peppy','rude','tarry',
      'dire','lazy']


# If any word exists in testset, delete it and resample.
# NOTE: `w in testset['word']` would test the Series index, not the words,
# so the check could never fire; compare against the column values instead.
for w in desc2+opin2:
    if w in testset['word'].values:
        print(w)

desc_all=desc+desc2
opin_all=opin+opin2


# Build the iteration-2 training triples (word, definition text, label).
# NOTE(review): `wn_ext_defn` is not defined in the visible part of this
# notebook; it must already exist in the kernel for this cell to run.
word_type='adj'
trainset=[]
for w in desc_all:
    trainset.append((w,wn_ext_defn(w,word_type),0)) # 0: physical
for w in opin_all:
    trainset.append((w,wn_ext_defn(w,word_type),1)) # 1: mental
    
    
# Transpose the triples into three columns and save the round-2 training set.
train2=pd.DataFrame()
train2['word'],train2['text'],train2['target']=list(zip(*(trainset)))

train2.to_csv('Data/train.v4.2.round2.csv')

    Training is done by `train.py`. Remember to set Config.is_predict=False before training. 

### Evaluate

In [8]:
pred=joblib.load('Data/pred.v4.2.round2') # load prediction result
assert pred.shape[0]==adj.shape[0]

In [9]:
# testset performance

# word -> predicted score (`pred[i]` belongs to the i-th row of `adj`)
pred_dict={}
for i,w in enumerate(adj['word'].tolist()):
    pred_dict[w]=pred[i]


testset['pred_score']=testset['word'].apply(lambda x: pred_dict[x])
testset['pred_label']=testset['pred_score'].apply(lambda x: 1 if x>0.5 else 0)

# Counts as plain ints (label 1 = mental = "positive", 0 = physical = "negative").
pred_pos=int((testset.pred_label==1).sum())
pred_neg=int((testset.pred_label==0).sum())
true_pos=int((testset.target==1).sum())
true_neg=int((testset.target==0).sum())
hit_pos=int(((testset.pred_label==1)&(testset.target==1)).sum())
hit_neg=int(((testset.pred_label==0)&(testset.target==0)).sum())

pp=hit_pos/pred_pos   # positive precision
pr=hit_pos/true_pos   # positive recall
# Deliberately NOT named `np`: rebinding `np` to a float shadows the numpy
# module and breaks every later `np.` call in this kernel.
neg_p=hit_neg/pred_neg  # negative precision
nr=hit_neg/true_neg     # negative recall

print("positive::precision %.3f"%pp)
print("positive::recall %.3f"%pr)


print("negative::precision %.3f"%neg_p)
print("negative::recall %.3f"%nr)

print("F1: %.2f"%(2*pp*pr/(pp+pr)))

print("N F1: %.2f"%(2*neg_p*nr/(neg_p+nr)))

positive::precision 0.575
positive::recall 0.639
negative::precision 0.783
negative::recall 0.734
F1: 0.61
N F1: 0.76


## Iter3

### Setup

In [221]:
DEBUG=False # True: do debug and unit test

In [263]:
# sampling states
new_samples=[] # samples extracted out in this iteration
        
if DEBUG: # only for debug and unit test
    d_samples={}

# Greedy selection: every repeat picks the topK candidates farthest from the
# current cores and appends them to `cores`, so later repeats see earlier picks.
# NOTE: `cores` is mutated in place; re-running this cell extends it again.
start=time.time()
for i in range(repeat): 
    print('%d repeat.'%i)
    # main action
    _new=find_samples_batch(cands=adj_cand,cores=cores,model=model,
                       topk=topK,batch_size=batch_size, only_words=True)
    
    # update states
    new_samples+=_new
    cores+=_new
    
    if DEBUG:
        d_samples[i]=_new
dur=time.time()-start
print("Runtime: %.2f sec"%dur)

# after sampling, the `new_samples` will be annotated.

0 repeat.
#nan# failed to get embedding. 
1 repeat.
#nan# failed to get embedding. 
2 repeat.
#nan# failed to get embedding. 
3 repeat.
#nan# failed to get embedding. 
4 repeat.
#nan# failed to get embedding. 
5 repeat.
#nan# failed to get embedding. 
6 repeat.
#nan# failed to get embedding. 
7 repeat.
#nan# failed to get embedding. 
8 repeat.
#nan# failed to get embedding. 
9 repeat.
#nan# failed to get embedding. 
10 repeat.
#nan# failed to get embedding. 
11 repeat.
#nan# failed to get embedding. 
Runtime: 73.86 sec


In [264]:
for w in new_samples:
    print(w)

lean
nigh
grim
puny
gamey
mild
silty
mad
downy
self
prima
23
mock
vain
500th
gassy
flip
two
boss
dryer
nonfat
25
sable
texan
beady
evil
lone
raspy
blest
filmy
shut
faux
85
tied
dear
swiss
spayed
jerky
teary
oiled
bushy
tame
corned
dual
rich
chic
tonal
gummy
enteral
tippy
adagio
thin
hardy
ungreased
scaly
fatty
bent
12
balmy
fizzy
lite
size
viral
waxed
pussy
gusty
barky
wooly
olive
135th
riper
tops
91
dull
glace
fusty
pulpy
musty
sexy
2nd
tarter
firm
solar
slow
undue
scurvy
nosed
aural
mined
fetid
foamy
greek
vast
roman
nuts
uncut
51
wide
void
avian
sane
mini
63
boney
mirky
styptic
epic
mere
irish
plumb
ionian
steep
valved
atrial
itchy
vestal
fond
sour
dizzy
obese


In [423]:
desc3=['lean','nigh','silty','downy','prima','23','500th','gassy','flip',
       'dryer','nonfat','sable','texan','beady','filmy','shut','faux','tied',
      'swiss','jerky','oiled','corned','tonal','gummy','roman']

opin3=['grim','puny','gamey','mild','mad','mock','vain','evil','lone',
       'raspy','blest','dear','teary','tame','rich','chic','balmy','dull',
      'sexy','sane','itchy','fond','dizzy']


# If any word exists in testset, delete it and resample.
# NOTE: `w in testset['word']` would test the Series index, not the words,
# so the check could never fire; compare against the column values instead.
for w in desc3+opin3:
    if w in testset['word'].values:
        print(w)

desc_all=desc+desc2+desc3
opin_all=opin+opin2+opin3


# Build the iteration-3 training triples (word, definition text, label).
# NOTE(review): `wn_ext_defn` is not defined in the visible part of this
# notebook; it must already exist in the kernel for this cell to run.
word_type='adj'
trainset=[]
for w in desc_all:
    trainset.append((w,wn_ext_defn(w,word_type),0)) # 0: physical
for w in opin_all:
    trainset.append((w,wn_ext_defn(w,word_type),1)) # 1: mental
    
    
# Transpose the triples into three columns and save the round-3 training set.
train3=pd.DataFrame()
train3['word'],train3['text'],train3['target']=list(zip(*(trainset)))

train3.to_csv('Data/train.v4.3.round3.csv')

### Evaluate

In [10]:
pred=joblib.load('Data/pred.v4.3.round3') # load prediction result
assert pred.shape[0]==adj.shape[0]

# testset performance

# word -> predicted score (`pred[i]` belongs to the i-th row of `adj`)
pred_dict={}
for i,w in enumerate(adj['word'].tolist()):
    pred_dict[w]=pred[i]


testset['pred_score']=testset['word'].apply(lambda x: pred_dict[x])
testset['pred_label']=testset['pred_score'].apply(lambda x: 1 if x>0.5 else 0)

# Counts as plain ints (label 1 = mental = "positive", 0 = physical = "negative").
pred_pos=int((testset.pred_label==1).sum())
pred_neg=int((testset.pred_label==0).sum())
true_pos=int((testset.target==1).sum())
true_neg=int((testset.target==0).sum())
hit_pos=int(((testset.pred_label==1)&(testset.target==1)).sum())
hit_neg=int(((testset.pred_label==0)&(testset.target==0)).sum())

pp=hit_pos/pred_pos   # positive precision
pr=hit_pos/true_pos   # positive recall
# Deliberately NOT named `np`: rebinding `np` to a float shadows the numpy
# module and breaks every later `np.` call in this kernel.
neg_p=hit_neg/pred_neg  # negative precision
nr=hit_neg/true_neg     # negative recall

print("positive::precision %.3f"%pp)
print("positive::recall %.3f"%pr)


print("negative::precision %.3f"%neg_p)
print("negative::recall %.3f"%nr)

print("F1: %.2f"%(2*pp*pr/(pp+pr)))

print("N F1: %.2f"%(2*neg_p*nr/(neg_p+nr)))

positive::precision 0.575
positive::recall 0.639
negative::precision 0.783
negative::recall 0.734
F1: 0.61
N F1: 0.76


In [337]:
with open("Data/coreset_cores_iter3.json", "w") as outfile:
    json.dump(cores, outfile)

### RunTwoMore

    We only provide the evaluation results of another two runs here.

In [11]:
pred=joblib.load('Data/pred.v4.3.2.round3') # load prediction result
assert pred.shape[0]==adj.shape[0]

# testset performance
# word -> predicted score (`pred[i]` belongs to the i-th row of `adj`)
pred_dict={}
for i,w in enumerate(adj['word'].tolist()):
    pred_dict[w]=pred[i]


testset['pred_score']=testset['word'].apply(lambda x: pred_dict[x])
testset['pred_label']=testset['pred_score'].apply(lambda x: 1 if x>0.5 else 0)

# Counts as plain ints (label 1 = mental = "positive", 0 = physical = "negative").
pred_pos=int((testset.pred_label==1).sum())
pred_neg=int((testset.pred_label==0).sum())
true_pos=int((testset.target==1).sum())
true_neg=int((testset.target==0).sum())
hit_pos=int(((testset.pred_label==1)&(testset.target==1)).sum())
hit_neg=int(((testset.pred_label==0)&(testset.target==0)).sum())

pp=hit_pos/pred_pos   # positive precision
pr=hit_pos/true_pos   # positive recall
# Deliberately NOT named `np`: rebinding `np` to a float shadows the numpy
# module and breaks every later `np.` call in this kernel.
neg_p=hit_neg/pred_neg  # negative precision
nr=hit_neg/true_neg     # negative recall

print("positive::precision %.3f"%pp)
print("positive::recall %.3f"%pr)


print("negative::precision %.3f"%neg_p)
print("negative::recall %.3f"%nr)

print("F1: %.2f"%(2*pp*pr/(pp+pr)))

print("N F1: %.2f"%(2*neg_p*nr/(neg_p+nr)))

positive::precision 0.615
positive::recall 0.889
negative::precision 0.917
negative::recall 0.688
F1: 0.73
N F1: 0.79


In [13]:
pred=joblib.load('Data/pred.v4.3.3.round3') # load prediction result
assert pred.shape[0]==adj.shape[0]

# testset performance

# word -> predicted score (`pred[i]` belongs to the i-th row of `adj`)
pred_dict={}
for i,w in enumerate(adj['word'].tolist()):
    pred_dict[w]=pred[i]


testset['pred_score']=testset['word'].apply(lambda x: pred_dict[x])
testset['pred_label']=testset['pred_score'].apply(lambda x: 1 if x>0.5 else 0)

# Counts as plain ints (label 1 = mental = "positive", 0 = physical = "negative").
pred_pos=int((testset.pred_label==1).sum())
pred_neg=int((testset.pred_label==0).sum())
true_pos=int((testset.target==1).sum())
true_neg=int((testset.target==0).sum())
hit_pos=int(((testset.pred_label==1)&(testset.target==1)).sum())
hit_neg=int(((testset.pred_label==0)&(testset.target==0)).sum())

pp=hit_pos/pred_pos   # positive precision
pr=hit_pos/true_pos   # positive recall
# Deliberately NOT named `np`: rebinding `np` to a float shadows the numpy
# module and breaks every later `np.` call in this kernel.
neg_p=hit_neg/pred_neg  # negative precision
nr=hit_neg/true_neg     # negative recall

print("positive::precision %.3f"%pp)
print("positive::recall %.3f"%pr)


print("negative::precision %.3f"%neg_p)
print("negative::recall %.3f"%nr)

print("F1: %.2f"%(2*pp*pr/(pp+pr)))

print("N F1: %.2f"%(2*neg_p*nr/(neg_p+nr)))

positive::precision 0.667
positive::recall 0.833
negative::precision 0.891
negative::recall 0.766
F1: 0.74
N F1: 0.82


## Iter4

### Setup

In [338]:
DEBUG=False # True: do debug and unit test

# sampling states
new_samples=[] # samples extracted out in this iteration
        
if DEBUG: # only for debug and unit test
    d_samples={}

# Greedy selection: every repeat picks the topK candidates farthest from the
# current cores and appends them to `cores`, so later repeats see earlier picks.
# NOTE: `cores` is mutated in place; re-running this cell extends it again.
start=time.time()
for i in range(repeat): 
    print('%d repeat.'%i)
    # main action
    _new=find_samples_batch(cands=adj_cand,cores=cores,model=model,
                       topk=topK,batch_size=batch_size, only_words=True)
    
    # update states
    new_samples+=_new
    cores+=_new
    
    if DEBUG:
        d_samples[i]=_new
dur=time.time()-start
print("Runtime: %.2f sec"%dur)

# after sampling, the `new_samples` will be annotated.

0 repeat.
#nan# failed to get embedding. 
1 repeat.
#nan# failed to get embedding. 
2 repeat.
#nan# failed to get embedding. 
3 repeat.
#nan# failed to get embedding. 
4 repeat.
#nan# failed to get embedding. 
5 repeat.
#nan# failed to get embedding. 
6 repeat.
#nan# failed to get embedding. 
7 repeat.
#nan# failed to get embedding. 
8 repeat.
#nan# failed to get embedding. 
9 repeat.
#nan# failed to get embedding. 
10 repeat.
#nan# failed to get embedding. 
11 repeat.
#nan# failed to get embedding. 
Runtime: 98.67 sec


In [339]:
for w in new_samples:
    print(w)

pubic
mossy
shot
pure
45th
rush
smoky
torn
syrian
lucid
daft
tubed
limper
driest
zero
corked
nubby
sign
35
laid
malted
seedy
leafy
mucous
peptic
celiac
woody
sick
sapid
footed
campy
rare
agile
baggy
indie
amber
naval
webby
baltic
sandy
silken
feral
vinous
zesty
busy
88
resiny
weedy
umber
rainy
leaky
purer
flat
red
needy
septic
tiled
tiny
piggy
catty
taboo
runny
tonic
tinny
fair
lithe
gonzo
milky
tense
turbid
spinal
meet
vile
necked
soft
toned
frizzy
cuter
asian
gilled
attic
perkier
postal
vulvar
huffy
lest
slavic
frail
wavier
poor
base
bumpy
unfit
scrub
birch
nitric
lurid
icky
mated
dutch
away
sorrel
dead
coral
liver
suave
nippy
dazed
glued
civic
55
fiver
furry
safe
stale
sworn
sneak
plummy
paid
buggy


In [495]:
desc4=['mossy','shot','pure','rush','smoky','torn','tubed','limper','driest',
       'corked','nubby','laid','leafy','mucous','woody','agile','baggy','indie',
      'amber','baltic','silken','rainy']

opin4=['lucid','daft','sick','sapid','campy','rare','feral','zesty','busy',
       'taboo','gonzo','vile','necked','toned','cuter','perkier','huffy','poor',
      'unfit','lurid']


# If any word exists in testset, delete it and resample.
# NOTE: `w in testset['word']` would test the Series index, not the words,
# so the check could never fire; compare against the column values instead.
for w in desc4+opin4:
    if w in testset['word'].values:
        print(w)

desc_all=desc+desc2+desc3+desc4
opin_all=opin+opin2+opin3+opin4


# Build the iteration-4 training triples (word, definition text, label).
# NOTE(review): `wn_ext_defn` is not defined in the visible part of this
# notebook; it must already exist in the kernel for this cell to run.
word_type='adj'
trainset=[]
for w in desc_all:
    trainset.append((w,wn_ext_defn(w,word_type),0)) # 0: physical
for w in opin_all:
    trainset.append((w,wn_ext_defn(w,word_type),1)) # 1: mental
    
    
# Transpose the triples into three columns and save the round-4 training set.
train4=pd.DataFrame()
train4['word'],train4['text'],train4['target']=list(zip(*(trainset)))

train4.to_csv('Data/train.v4.4.2.round4.csv')

In [426]:
# 10 repeats give enough training samples, so roll `cores` back by the last
# 2 repeats, i.e. drop its final 2*topK words.
# NOTE: not idempotent -- every re-run of this cell drops another 2*topK words.
cores=cores[:len(cores)-2*topK]

### Evaluate

In [15]:
pred=joblib.load('Data/pred.v4.4.2.round4') # load prediction result
assert pred.shape[0]==adj.shape[0]

# testset performance

# word -> predicted score (`pred[i]` belongs to the i-th row of `adj`)
pred_dict={}
for i,w in enumerate(adj['word'].tolist()):
    pred_dict[w]=pred[i]


testset['pred_score']=testset['word'].apply(lambda x: pred_dict[x])
testset['pred_label']=testset['pred_score'].apply(lambda x: 1 if x>0.5 else 0)

# Counts as plain ints (label 1 = mental = "positive", 0 = physical = "negative").
pred_pos=int((testset.pred_label==1).sum())
pred_neg=int((testset.pred_label==0).sum())
true_pos=int((testset.target==1).sum())
true_neg=int((testset.target==0).sum())
hit_pos=int(((testset.pred_label==1)&(testset.target==1)).sum())
hit_neg=int(((testset.pred_label==0)&(testset.target==0)).sum())

pp=hit_pos/pred_pos   # positive precision
pr=hit_pos/true_pos   # positive recall
# Deliberately NOT named `np`: rebinding `np` to a float shadows the numpy
# module and breaks every later `np.` call in this kernel.
neg_p=hit_neg/pred_neg  # negative precision
nr=hit_neg/true_neg     # negative recall

print("positive::precision %.3f"%pp)
print("positive::recall %.3f"%pr)


print("negative::precision %.3f"%neg_p)
print("negative::recall %.3f"%nr)

print("F1: %.2f"%(2*pp*pr/(pp+pr)))

print("N F1: %.2f"%(2*neg_p*nr/(neg_p+nr)))

positive::precision 0.600
positive::recall 0.750
negative::precision 0.836
negative::recall 0.719
F1: 0.67
N F1: 0.77


### RunTwoMore

    We only provide the evaluation results of another two runs here.

In [16]:
pred=joblib.load('Data/pred.v4.4.3.round4') # load prediction result
assert pred.shape[0]==adj.shape[0]

# testset performance

# word -> predicted score (`pred[i]` belongs to the i-th row of `adj`)
pred_dict={}
for i,w in enumerate(adj['word'].tolist()):
    pred_dict[w]=pred[i]


testset['pred_score']=testset['word'].apply(lambda x: pred_dict[x])
testset['pred_label']=testset['pred_score'].apply(lambda x: 1 if x>0.5 else 0)

# Counts as plain ints (label 1 = mental = "positive", 0 = physical = "negative").
pred_pos=int((testset.pred_label==1).sum())
pred_neg=int((testset.pred_label==0).sum())
true_pos=int((testset.target==1).sum())
true_neg=int((testset.target==0).sum())
hit_pos=int(((testset.pred_label==1)&(testset.target==1)).sum())
hit_neg=int(((testset.pred_label==0)&(testset.target==0)).sum())

pp=hit_pos/pred_pos   # positive precision
pr=hit_pos/true_pos   # positive recall
# Deliberately NOT named `np`: rebinding `np` to a float shadows the numpy
# module and breaks every later `np.` call in this kernel.
neg_p=hit_neg/pred_neg  # negative precision
nr=hit_neg/true_neg     # negative recall

print("positive::precision %.3f"%pp)
print("positive::recall %.3f"%pr)


print("negative::precision %.3f"%neg_p)
print("negative::recall %.3f"%nr)

print("F1: %.2f"%(2*pp*pr/(pp+pr)))

print("N F1: %.2f"%(2*neg_p*nr/(neg_p+nr)))

positive::precision 0.636
positive::recall 0.778
negative::precision 0.857
negative::recall 0.750
F1: 0.70
N F1: 0.80


In [17]:
pred=joblib.load('Data/pred.v4.4.5.round4') # load prediction result
assert pred.shape[0]==adj.shape[0]

# testset performance

# word -> predicted score (`pred[i]` belongs to the i-th row of `adj`)
pred_dict={}
for i,w in enumerate(adj['word'].tolist()):
    pred_dict[w]=pred[i]


testset['pred_score']=testset['word'].apply(lambda x: pred_dict[x])
testset['pred_label']=testset['pred_score'].apply(lambda x: 1 if x>0.5 else 0)

# Counts as plain ints (label 1 = mental = "positive", 0 = physical = "negative").
pred_pos=int((testset.pred_label==1).sum())
pred_neg=int((testset.pred_label==0).sum())
true_pos=int((testset.target==1).sum())
true_neg=int((testset.target==0).sum())
hit_pos=int(((testset.pred_label==1)&(testset.target==1)).sum())
hit_neg=int(((testset.pred_label==0)&(testset.target==0)).sum())

pp=hit_pos/pred_pos   # positive precision
pr=hit_pos/true_pos   # positive recall
# Deliberately NOT named `np`: rebinding `np` to a float shadows the numpy
# module and breaks every later `np.` call in this kernel.
neg_p=hit_neg/pred_neg  # negative precision
nr=hit_neg/true_neg     # negative recall

print("positive::precision %.3f"%pp)
print("positive::recall %.3f"%pr)


print("negative::precision %.3f"%neg_p)
print("negative::recall %.3f"%nr)

print("F1: %.2f"%(2*pp*pr/(pp+pr)))

print("N F1: %.2f"%(2*neg_p*nr/(neg_p+nr)))

positive::precision 0.653
positive::recall 0.889
negative::precision 0.922
negative::recall 0.734
F1: 0.75
N F1: 0.82


## Iter5

### Setup

In [428]:
DEBUG=False # True: do debug and unit test

# sampling states
new_samples=[] # samples extracted out in this iteration
        
if DEBUG: # only for debug and unit test
    d_samples={}

# Greedy selection: every repeat picks the topK candidates farthest from the
# current cores and appends them to `cores`, so later repeats see earlier picks.
# NOTE: `cores` is mutated in place; re-running this cell extends it again.
start=time.time()
for i in range(repeat): 
    print('%d repeat.'%i)
    # main action
    _new=find_samples_batch(cands=adj_cand,cores=cores,model=model,
                       topk=topK,batch_size=batch_size, only_words=True)
    
    # update states
    new_samples+=_new
    cores+=_new
    
    if DEBUG:
        d_samples[i]=_new
dur=time.time()-start
print("Runtime: %.2f sec"%dur)

# after sampling, the `new_samples` will be annotated.

0 repeat.
#nan# failed to get embedding. 
1 repeat.
#nan# failed to get embedding. 
2 repeat.
#nan# failed to get embedding. 
3 repeat.
#nan# failed to get embedding. 
4 repeat.
#nan# failed to get embedding. 
5 repeat.
#nan# failed to get embedding. 
6 repeat.
#nan# failed to get embedding. 
7 repeat.
#nan# failed to get embedding. 
8 repeat.
#nan# failed to get embedding. 
9 repeat.
#nan# failed to get embedding. 
10 repeat.
#nan# failed to get embedding. 
11 repeat.
#nan# failed to get embedding. 
Runtime: 115.55 sec


In [429]:
# List the words sampled in this iteration, one per line, so they can be annotated.
for w in new_samples:
    print(w)

away
sorrel
dead
coral
liver
suave
nippy
dazed
glued
civic
55
fiver
furry
safe
stale
sworn
sneak
plummy
paid
buggy
whiny
shaven
beat
boozy
stout
edged
enteric
nervy
laced
tuscan
pushy
ripest
5
straw
cured
comic
pasty
aloof
wild
alien
unkept
mussy
brushy
gamier
latin
94
pudgy
wheezy
alpha
firmest
vivid
gross
zany
lame
brute
larval
humid
grave
juicy
sold
minty
aired
rowdy
keen
wispy
witty
upwind
olden
hipped
truer
floury
weak
cystic
merry
kenyan
gluey
game
misty
fossil
airier
pearly
ferric
brindle
hurt
spicer
sissy
drafty
gushy
late
sappy
moist
honey
dark
vocal
fast
perky
dusky
stern
civil
royal
horny
ebony
fishy
sent
bully
warm
high
stark
kinky
silky
sheer
palatal
matte
matted
arable
side
rudest
risen
plane
rectal


In [497]:
# Round-5 annotations chosen from the sampled words.
# Words labelled 0 (physical):
desc5=['away','sorrel','dead','coral','liver','furry','stale','paid','buggy',
       'shaven','beat','edged','laced','ripest','straw','upwind','brushy','latin',
      'gross','larval','humid','aired']

# Words labelled 1 (mental):
# NOTE(review): 'sworn' appears twice below, which yields a duplicate training row --
# confirm whether a different word was intended before changing the data.
opin5=['suave','nippy','dazed','safe','sworn','sworn','plummy','whiny','nervy',
       'pushy','comic','aloof','unkept','firmest','vivid','lame','rowdy','witty',
      'truer','merry']


# If any word exists in testset, delete it and resample.
# `w in testset['word']` would test the Series *index* (row labels), not the words
# themselves, so the check is done against a set built from the column values.
test_words=set(testset['word'])
for w in desc5+opin5:
    if w in test_words:
        print(w)

# Training words accumulated over all rounds so far.
desc_all=desc+desc2+desc3+desc4+desc5
opin_all=opin+opin2+opin3+opin4+opin5


word_type='adj'
# One (word, text, target) record per training word; the text comes from wn_ext_defn.
trainset=[]
for w in desc_all:
    trainset.append((w,wn_ext_defn(w,word_type),0)) # 0: physical
for w in opin_all:
    trainset.append((w,wn_ext_defn(w,word_type),1)) # 1: mental


# Build the frame from the records in a single call.
train5=pd.DataFrame(trainset,columns=['word','text','target'])

train5.to_csv('Data/train.v4.5.round5.csv')

### Evaluate

In [19]:
pred=joblib.load('Data/pred.v4.5.round5') # load prediction result
assert pred.shape[0]==adj.shape[0]

# testset performance

# Row i of `pred` is the score of row i of `adj` (equal lengths asserted above).
pred_dict={w:pred[i] for i,w in enumerate(adj['word'].tolist())}

testset['pred_score']=testset['word'].apply(lambda x: pred_dict[x])
testset['pred_label']=testset['pred_score'].apply(lambda x: 1 if x>0.5 else 0)

# Confusion counts, computed once and shared by all metrics below.
tp=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0] # predicted 1, labelled 1
tn=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0] # predicted 0, labelled 0
n_pred_pos=testset.loc[(testset.pred_label==1)].shape[0]
n_pred_neg=testset.loc[(testset.pred_label==0)].shape[0]
n_true_pos=testset.loc[(testset.target==1)].shape[0]
n_true_neg=testset.loc[(testset.target==0)].shape[0]

pp=tp/n_pred_pos # positive precision
pr=tp/n_true_pos # positive recall
# Named `neg_p`, not `np`: assigning to `np` would overwrite the numpy alias
# imported at the top of the notebook and break every later `np.` call.
neg_p=tn/n_pred_neg # negative precision
nr=tn/n_true_neg # negative recall

print("positive::precision %.3f"%pp)
print("positive::recall %.3f"%pr)


print("negative::precision %.3f"%neg_p)
print("negative::recall %.3f"%nr)

print("F1: %.2f"%(2*pp*pr/(pp+pr)))

print("N F1: %.2f"%(2*neg_p*nr/(neg_p+nr)))

positive::precision 0.549
positive::recall 0.778
negative::precision 0.837
negative::recall 0.641
F1: 0.64
N F1: 0.73


### RunTwoMore

    Only the evaluation results of two additional runs are reported here.

In [20]:
pred=joblib.load('Data/pred.v4.5.2.round5') # load prediction result
assert pred.shape[0]==adj.shape[0]

# testset performance

# Row i of `pred` is the score of row i of `adj` (equal lengths asserted above).
pred_dict={w:pred[i] for i,w in enumerate(adj['word'].tolist())}

testset['pred_score']=testset['word'].apply(lambda x: pred_dict[x])
testset['pred_label']=testset['pred_score'].apply(lambda x: 1 if x>0.5 else 0)

# Confusion counts, computed once and shared by all metrics below.
tp=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0] # predicted 1, labelled 1
tn=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0] # predicted 0, labelled 0
n_pred_pos=testset.loc[(testset.pred_label==1)].shape[0]
n_pred_neg=testset.loc[(testset.pred_label==0)].shape[0]
n_true_pos=testset.loc[(testset.target==1)].shape[0]
n_true_neg=testset.loc[(testset.target==0)].shape[0]

pp=tp/n_pred_pos # positive precision
pr=tp/n_true_pos # positive recall
# Named `neg_p`, not `np`: assigning to `np` would overwrite the numpy alias
# imported at the top of the notebook and break every later `np.` call.
neg_p=tn/n_pred_neg # negative precision
nr=tn/n_true_neg # negative recall

print("positive::precision %.3f"%pp)
print("positive::recall %.3f"%pr)


print("negative::precision %.3f"%neg_p)
print("negative::recall %.3f"%nr)

print("F1: %.2f"%(2*pp*pr/(pp+pr)))

print("N F1: %.2f"%(2*neg_p*nr/(neg_p+nr)))

positive::precision 0.579
positive::recall 0.917
negative::precision 0.930
negative::recall 0.625
F1: 0.71
N F1: 0.75


In [21]:
pred=joblib.load('Data/pred.v4.5.3.round5') # load prediction result
assert pred.shape[0]==adj.shape[0]

# testset performance

# Row i of `pred` is the score of row i of `adj` (equal lengths asserted above).
pred_dict={w:pred[i] for i,w in enumerate(adj['word'].tolist())}

testset['pred_score']=testset['word'].apply(lambda x: pred_dict[x])
testset['pred_label']=testset['pred_score'].apply(lambda x: 1 if x>0.5 else 0)

# Confusion counts, computed once and shared by all metrics below.
tp=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0] # predicted 1, labelled 1
tn=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0] # predicted 0, labelled 0
n_pred_pos=testset.loc[(testset.pred_label==1)].shape[0]
n_pred_neg=testset.loc[(testset.pred_label==0)].shape[0]
n_true_pos=testset.loc[(testset.target==1)].shape[0]
n_true_neg=testset.loc[(testset.target==0)].shape[0]

pp=tp/n_pred_pos # positive precision
pr=tp/n_true_pos # positive recall
# Named `neg_p`, not `np`: assigning to `np` would overwrite the numpy alias
# imported at the top of the notebook and break every later `np.` call.
neg_p=tn/n_pred_neg # negative precision
nr=tn/n_true_neg # negative recall

print("positive::precision %.3f"%pp)
print("positive::recall %.3f"%pr)


print("negative::precision %.3f"%neg_p)
print("negative::recall %.3f"%nr)

print("F1: %.2f"%(2*pp*pr/(pp+pr)))

print("N F1: %.2f"%(2*neg_p*nr/(neg_p+nr)))

positive::precision 0.651
positive::recall 0.778
negative::precision 0.860
negative::recall 0.766
F1: 0.71
N F1: 0.81


# Summary

    Iter2: Annotate 120 words, of which we pick 20-20 for training.
    Iter3: Annotate 120 words, of which we pick 20-20 for training.
    Iter4: Annotate 100 words, of which we pick 20-20 for training.
    Iter5: Annotate 80 words, of which we pick 20-20 for training.
    Annotate 80~120 words per iteration. This method needs to annotate more samples than HighEntropy.
    
    In terms of classification performance, CORESET does not beat HighEntropy, but it is better than Random.