# Config

In [None]:
# Label
# 0: Physical
# 1: Mental

In [3]:
import pandas as pd
import numpy as npy
import tqdm
from collections import defaultdict
import joblib
import time
import json
import traceback

In [4]:
workdir='./'
import sys
sys.path.append(workdir)
import os
os.chdir(workdir)

## SentiWordNet

In [27]:
sentinet=pd.read_csv('SentiWordNet3.csv',dtype={'ID':str})
sentinet=sentinet.drop(['Unnamed: 0','PosScore','NegScore'],axis=1)

In [28]:
sentinet.head()

Unnamed: 0,POS,ID,SynsetTerms,Gloss
0,a,1740,able#1,(usually followed by `to') having the necessar...
1,a,2098,unable#1,(usually followed by `to') not having the nece...
2,a,2312,dorsal#2 abaxial#1,facing away from the axis of an organ or organ...
3,a,2527,ventral#2 adaxial#1,nearest to or facing toward the axis of an org...
4,a,2730,acroscopic#1,facing or on the side toward the apex


In [None]:
assert sentinet.shape[0]==sentinet.loc[~sentinet['ID'].duplicated()].shape[0]

In [30]:
sentinet.loc[sentinet['ID']=='00001740']

Unnamed: 0,POS,ID,SynsetTerms,Gloss
0,a,1740,able#1,(usually followed by `to') having the necessar...
18156,n,1740,entity#1,that which is perceived or known or inferred t...
100271,r,1740,a_cappella#1,without musical accompaniment; they performed ...
103892,v,1740,take_a_breath#1 suspire#2 respire#3 breathe#1,"draw air into, and expel out of, the lungs; I ..."


    The original "ID" has duplicated values. We replace it with a new one.

In [31]:
sentinet=sentinet.drop(['ID'],axis=1)
sentinet['id']=[_ for _ in range(sentinet.shape[0])]

In [32]:
sentinet.head()

Unnamed: 0,POS,SynsetTerms,Gloss,id
0,a,able#1,(usually followed by `to') having the necessar...,0
1,a,unable#1,(usually followed by `to') not having the nece...,1
2,a,dorsal#2 abaxial#1,facing away from the axis of an organ or organ...,2
3,a,ventral#2 adaxial#1,nearest to or facing toward the axis of an org...,3
4,a,acroscopic#1,facing or on the side toward the apex,4


In [90]:
sentinet.to_csv('Data/SentiWordNet3_new.csv',sep='|')

In [7]:
sentinet=pd.read_csv('Data/SentiWordNet3_new.csv',sep='|')

In [8]:
adj=sentinet.loc[(sentinet.POS=='a')]

In [29]:
adj.shape

(18156, 6)

In [32]:
adv=sentinet.loc[(sentinet.POS=='r')]

In [9]:
verb=sentinet.loc[(sentinet.POS=='v')]

In [30]:
verb.shape

(13767, 5)

In [33]:
noun=sentinet.loc[(sentinet.POS=='n')]

In [37]:
sentinet['text']=sentinet['Gloss']
sentinet['target']=-1
sentinet.to_csv('Data/SentiWordNet3_new.csv',sep='|')

# TestSet

## ADJ

In [34]:
testset_a=adj.sample(n=100)

In [None]:
for _id,_w,_gloss in zip(testset_a['id'],testset_a['SynsetTerms'],testset_a['Gloss']):
    print('%d\t%s\t%s'%(_id,_w,_gloss))

In [10]:
# read annotation result
testset_adj=pd.read_csv('anno_adj.csv')[['id','words','gloss','class','subclass']]
testset_adj = testset_adj.fillna('unavailable')

In [44]:
testset_adj.head()

Unnamed: 0,id,words,gloss,class,subclass
0,17390,washingtonian#3,of or relating to the capital of the United St...,0,unavailable
1,11460,suggestive#2 significative#1 revelatory#1 indi...,(usually followed by `of') pointing out or rev...,1,unavailable
2,1133,ferny#1 ferned#1,abounding in or covered with ferns; the ferny ...,0,unavailable
3,14986,cancroid#1,of or relating to a cancroid,0,unavailable
4,2873,inefficient#2 ineffective#3,lacking the ability or skill to perform effect...,1,reasoning


## ADV

In [9]:
testset_adv=adv.sample(n=100)

In [None]:
for _id,_w,_gloss in zip(testset_adv['id'],testset_adv['SynsetTerms'],testset_adv['Gloss']):
    print('%d\t%s\t%s'%(_id,_w,_gloss))

    From above glosses, we find most of them bear a mental sense. The reason might be an adv word is used to 
    describe an action, and therefore the meaning of an adv word is related to the mentality that underlies an 
    action. No need to classify MPC.

## Verb

In [15]:
testset_v=verb.sample(n=100)

In [None]:
for _id,_w,_gloss in zip(testset_v['id'],testset_v['SynsetTerms'],testset_v['Gloss']):
    print('%d\t%s\t%s'%(_id,_w,_gloss))

In [11]:
# read annotation result
testset_v=pd.read_csv('anno_verb.csv')[['id','words','gloss','class','subclass']]

In [12]:
testset_v = testset_v.fillna('unavailable')

## Noun

In [13]:
testset_n=noun.sample(n=100)

In [None]:
for _id,_w,_gloss in zip(testset_n['id'],testset_n['SynsetTerms'],testset_n['Gloss']):
    print('%d\t%s\t%s'%(_id,_w,_gloss))

    Most noun glosses are related to a concrete entity bearing a physical concept. No need to classify MPC.

In [24]:
# get all ids in testset

test_ids=set(testset_a['id'].tolist()+testset_v['id'].tolist())

sentinet['is_test']=sentinet['id'].apply(lambda x: True if x in test_ids else False)

# Train (Adj+Verb)

        
    In each iteration, about 40 words are selected out for training. Iterations stop if performance converges.
    

In [17]:
K=20 # number of positive samples in each round and negative samples have equal number.

## Round1

### Setup

In [25]:
# use non-test and adj/verb as cand where we pick up glosses for annotation randomly
# 

cand=sentinet.loc[(~sentinet.is_test)&((sentinet.POS=='a')|(sentinet.POS=='v'))].sample(100)

In [None]:
for _id,_w,_p,_gloss in zip(cand['id'],cand['SynsetTerms'],cand['POS'],cand['Gloss']):
    print('%d\t%s\t%s\t%s'%(_id,_w,_p,_gloss))

In [None]:


"""physical1=['','','','','','','','','','',
      '','','','','','','','','',
     '','','','','','','']
mental1=['','','','','','','','','','',
      '','','','','','','','','',
     '','']
train_ids1=[]"""


In [50]:
# Randomly choose some positive(mental) and negative(physical) words for training. Proportion of samples, 1:1
# positive: class=1
# negative: class=0

train_r1=pd.read_csv('Data/train_v1_r1.csv')
train_r1['text']=train_r1['gloss']
train_r1['target']=train_r1['class']

train_ids1=set(train_r1['id'].tolist())
train_r1.to_csv('Data/train_v1_r1_ready.csv')


In [49]:
sentinet['is_train']=sentinet['id'].apply(lambda x: True if x in train_ids1 else False)

    Training is done by `train.py`. Remember setting Config.is_predict=False in `train.py` before training. 

### Evaluate

    Predict over the whole sentinet. 

In [51]:
# get the dataset for prediction
sentinet.loc[(sentinet.POS=='a')|(sentinet.POS=='v')].to_csv('Data/SentiWordNet3_new_a_v.csv',sep='|')

In [15]:
sentinet_av=sentinet.loc[(sentinet.POS=='a')|(sentinet.POS=='v')]

    Predicting is done by `train.py`. Remember setting Config.is_predict=True in `train.py`.

In [None]:
pred=npy.load('Data/pred.v1.r1.npy') # load prediction result
assert pred.shape[0]==sentinet_av.shape[0]
sentinet_av['score']=pred.flatten()
sentinet_av['uncertainty']=sentinet_av['score'].apply(lambda x: abs(x-0.5))

In [76]:
# adj

testset_adj=testset_adj.merge(sentinet_av[['id','score']],on='id')
testset_adj['pred_label']=testset_adj['score'].apply(lambda x: 1 if x>0.5 else 0)
testset_adj['target']=testset_adj['class']

testset=testset_adj

# positive::precision
print("positive::precision %.3f"%(testset_adj.loc[(testset_adj.pred_label==1)&(testset_adj.target==1)].shape[0]/testset_adj.loc[(testset_adj.pred_label==1)].shape[0]))
print("positive::recall %.3f"%(testset_adj.loc[(testset_adj.pred_label==1)&(testset_adj.target==1)].shape[0]/testset_adj.loc[(testset_adj.target==1)].shape[0]))


print("negative::precision %.3f"%(testset_adj.loc[(testset_adj.pred_label==0)&(testset_adj.target==0)].shape[0]/testset_adj.loc[(testset_adj.pred_label==0)].shape[0]))
print("negative::recall %.3f"%(testset_adj.loc[(testset_adj.pred_label==0)&(testset_adj.target==0)].shape[0]/testset_adj.loc[(testset_adj.target==0)].shape[0]))

pp=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.pred_label==1)].shape[0]
pr=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.target==1)].shape[0]

print("F1: %.2f"%(2*pp*pr/(pp+pr)))


np=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.pred_label==0)].shape[0]
nr=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.target==0)].shape[0]

print("N F1: %.2f"%(2*np*nr/(np+nr)))

positive::precision 0.586
positive::recall 0.447
negative::precision 0.704
negative::recall 0.806
F1: 0.51
N F1: 0.75


In [89]:
# verb

testset_v=testset_v.merge(sentinet_av[['id','score']],on='id')
testset_v['pred_label']=testset_v['score'].apply(lambda x: 1 if x>0.5 else 0)
testset_v['target']=testset_v['class']

testset=testset_v

# positive::precision
print("positive::precision %.3f"%(testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.pred_label==1)].shape[0]))
print("positive::recall %.3f"%(testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.target==1)].shape[0]))


print("negative::precision %.3f"%(testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.pred_label==0)].shape[0]))
print("negative::recall %.3f"%(testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.target==0)].shape[0]))

pp=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.pred_label==1)].shape[0]
pr=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.target==1)].shape[0]

print("F1: %.2f"%(2*pp*pr/(pp+pr)))


np=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.pred_label==0)].shape[0]
nr=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.target==0)].shape[0]

print("N F1: %.2f"%(2*np*nr/(np+nr)))

positive::precision 0.560
positive::recall 0.757
negative::precision 0.820
negative::recall 0.651
F1: 0.64
N F1: 0.73


### Uncertain

    Filter out already chosen glosses and sort the others by uncertainty from the whole predictions.

In [None]:
cand=sentinet_av.loc[(~sentinet_av.is_test)&(~sentinet_av.is_train)].sort_values(by='uncertainty',ascending=True)[:100]

for _id,_w,_p,_gloss,_s in zip(cand['id'],cand['SynsetTerms'],cand['POS'],cand['Gloss'],cand['score']):
    print('%d\t%s\t%s\t%s\t%.4f'%(_id,_w,_p,_gloss,_s))

## Round2

### Setup

In [96]:
# Choose training samples from those most uncertain words in round1. Proportion of samples, 1:1
# positive: class=1
# negative: class=0

train_r2=pd.read_csv('Data/train_v1_r2.csv')
#train_r2['text']=train_r2['gloss'] 
#train_r2['target']=train_r2['class']

train=pd.concat([train_r1[['id','text','target']],train_r2[['id','text','target']]])

train_ids=set(train['id'].tolist())

sentinet['is_train']=sentinet['id'].apply(lambda x: True if x in train_ids else False)

train.to_csv('Data/train_v1_r1_r2.csv')

    Training is done by `train.py`. Remember setting Config.is_predict=False in `train.py` before training. 

### Evaluate

In [None]:
# Predict over the whole sentinet. Filter out already chosen glosses and sort the others by uncertainty.

# Predicting is done by `train.py`. Remember setting Config.is_predict=True in `train.py`.

pred=npy.load('Data/pred.v1.r2.2.npy') # load prediction result
assert pred.shape[0]==sentinet_av.shape[0]
sentinet_av['score']=pred.flatten()
sentinet_av['uncertainty']=sentinet_av['score'].apply(lambda x: abs(x-0.5))

In [98]:
# adj
try:
    testset_adj=testset_adj.drop(['score'],axis=1)
except:
    pass
testset_adj=testset_adj.merge(sentinet_av[['id','score']],on='id')
testset_adj['pred_label']=testset_adj['score'].apply(lambda x: 1 if x>0.5 else 0)
testset_adj['target']=testset_adj['class']

testset=testset_adj

# positive::precision
print("positive::precision %.3f"%(testset_adj.loc[(testset_adj.pred_label==1)&(testset_adj.target==1)].shape[0]/testset_adj.loc[(testset_adj.pred_label==1)].shape[0]))
print("positive::recall %.3f"%(testset_adj.loc[(testset_adj.pred_label==1)&(testset_adj.target==1)].shape[0]/testset_adj.loc[(testset_adj.target==1)].shape[0]))


print("negative::precision %.3f"%(testset_adj.loc[(testset_adj.pred_label==0)&(testset_adj.target==0)].shape[0]/testset_adj.loc[(testset_adj.pred_label==0)].shape[0]))
print("negative::recall %.3f"%(testset_adj.loc[(testset_adj.pred_label==0)&(testset_adj.target==0)].shape[0]/testset_adj.loc[(testset_adj.target==0)].shape[0]))

pp=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.pred_label==1)].shape[0]
pr=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.target==1)].shape[0]

print("F1: %.2f"%(2*pp*pr/(pp+pr)))


np=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.pred_label==0)].shape[0]
nr=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.target==0)].shape[0]

print("N F1: %.2f"%(2*np*nr/(np+nr)))

positive::precision 0.682
positive::recall 0.789
negative::precision 0.857
negative::recall 0.774
F1: 0.73
N F1: 0.81


In [99]:
# verb
try:
    testset_v=testset_v.drop(['score'],axis=1)
except:
    pass

testset_v=testset_v.merge(sentinet_av[['id','score']],on='id')
testset_v['pred_label']=testset_v['score'].apply(lambda x: 1 if x>0.5 else 0)
testset_v['target']=testset_v['class']

testset=testset_v

# positive::precision
print("positive::precision %.3f"%(testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.pred_label==1)].shape[0]))
print("positive::recall %.3f"%(testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.target==1)].shape[0]))


print("negative::precision %.3f"%(testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.pred_label==0)].shape[0]))
print("negative::recall %.3f"%(testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.target==0)].shape[0]))

pp=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.pred_label==1)].shape[0]
pr=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.target==1)].shape[0]

print("F1: %.2f"%(2*pp*pr/(pp+pr)))


np=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.pred_label==0)].shape[0]
nr=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.target==0)].shape[0]

print("N F1: %.2f"%(2*np*nr/(np+nr)))

positive::precision 0.611
positive::recall 0.892
negative::precision 0.913
negative::recall 0.667
F1: 0.73
N F1: 0.77


### Uncertainty

In [92]:
# Filter out already chosen glosses and sort the others by uncertainty from the whole predictions.

cand=sentinet_av.loc[(~sentinet_av.is_test)&(~sentinet_av.is_train)].sort_values(by='uncertainty',ascending=True)[:100]

for _id,_w,_p,_gloss,_s in zip(cand['id'],cand['SynsetTerms'],cand['POS'],cand['Gloss'],cand['score']):
    print('%d\t%s\t%s\t%s\t%.4f'%(_id,_w,_p,_gloss,_s))



10135	processed#1	a	subjected to a special process or treatment; prepared ergot; processed cheeses are easy to spread	0.5000
105778	evangelize#2 evangelise#2	v	convert to Christianity; The missionaries evangelized the Pacific Islanders	0.5000
9958	unpolished#1	a	not carefully reworked or perfected or made smooth by polishing; dull unpolished shoes	0.5000
12378	bullate#1	a	of leaves; appearing puckered as if blistered	0.5000
8648	transmundane#1	a	existing or extending beyond the physical world; whatever of transmundane...insight...we may carry- William James	0.5000
6669	high-rise#1	a	used of buildings of many stories equipped with elevators; tall; avenues lined with high-rise apartment buildings	0.5000
10555	nonrandom#1	a	not random	0.5000
115080	deal#13	v	sell; deal hashish	0.5000
109152	station#1 send#5 post#4 place#14	v	assign to a station	0.5000
104153	impregnate#3	v	fertilize and cause to grow; the egg was impregnated	0.5000
8325	expeditionary#1	a	(used of military forces) designed

## Round3

In [101]:
# -------- SetUp -------- #

# Choose training samples from those most uncertain words in round1. Proportion of samples, 1:1
# positive: class=1
# negative: class=0

train_cur=pd.read_csv('Data/train_v1_r3.csv')

train_last=pd.read_csv('Data/train_v1_r1_r2.csv')

train=pd.concat([train_cur[['id','text','target']],train_last[['id','text','target']]])

train_ids=set(train['id'].tolist())

sentinet['is_train']=sentinet['id'].apply(lambda x: True if x in train_ids else False)

train.to_csv('Data/train_v1_r1tor3.csv')

In [None]:
# -------- Evaluate -------- #

# Predict over the whole sentinet. Filter out already chosen glosses and sort the others by uncertainty.

# Predicting is done by `train.py`. Remember setting Config.is_predict=True in `train.py`.

pred=npy.load('Data/pred.v1.r3.1.npy') # load prediction result
assert pred.shape[0]==sentinet_av.shape[0]
sentinet_av['score']=pred.flatten()
sentinet_av['uncertainty']=sentinet_av['score'].apply(lambda x: abs(x-0.5))



# -- adj -- #
print('# -- adj -- #')
try:
    testset_adj=testset_adj.drop(['score'],axis=1)
except:
    pass
testset_adj=testset_adj.merge(sentinet_av[['id','score']],on='id')
testset_adj['pred_label']=testset_adj['score'].apply(lambda x: 1 if x>0.5 else 0)
testset_adj['target']=testset_adj['class']

testset=testset_adj

# positive::precision
print("positive::precision %.3f"%(testset_adj.loc[(testset_adj.pred_label==1)&(testset_adj.target==1)].shape[0]/testset_adj.loc[(testset_adj.pred_label==1)].shape[0]))
print("positive::recall %.3f"%(testset_adj.loc[(testset_adj.pred_label==1)&(testset_adj.target==1)].shape[0]/testset_adj.loc[(testset_adj.target==1)].shape[0]))


print("negative::precision %.3f"%(testset_adj.loc[(testset_adj.pred_label==0)&(testset_adj.target==0)].shape[0]/testset_adj.loc[(testset_adj.pred_label==0)].shape[0]))
print("negative::recall %.3f"%(testset_adj.loc[(testset_adj.pred_label==0)&(testset_adj.target==0)].shape[0]/testset_adj.loc[(testset_adj.target==0)].shape[0]))

pp=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.pred_label==1)].shape[0]
pr=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.target==1)].shape[0]

print("F1: %.2f"%(2*pp*pr/(pp+pr)))


np=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.pred_label==0)].shape[0]
nr=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.target==0)].shape[0]

print("N F1: %.2f"%(2*np*nr/(np+nr)))



# -- verb -- #
print('# -- verb -- #')
try:
    testset_v=testset_v.drop(['score'],axis=1)
except:
    pass

testset_v=testset_v.merge(sentinet_av[['id','score']],on='id')
testset_v['pred_label']=testset_v['score'].apply(lambda x: 1 if x>0.5 else 0)
testset_v['target']=testset_v['class']

testset=testset_v

# positive::precision
print("positive::precision %.3f"%(testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.pred_label==1)].shape[0]))
print("positive::recall %.3f"%(testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.target==1)].shape[0]))


print("negative::precision %.3f"%(testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.pred_label==0)].shape[0]))
print("negative::recall %.3f"%(testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.target==0)].shape[0]))

pp=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.pred_label==1)].shape[0]
pr=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.target==1)].shape[0]

print("F1: %.2f"%(2*pp*pr/(pp+pr)))


np=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.pred_label==0)].shape[0]
nr=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.target==0)].shape[0]

print("N F1: %.2f"%(2*np*nr/(np+nr)))

In [104]:
# Filter out already chosen glosses and sort the others by uncertainty from the whole predictions.

cand=sentinet_av.loc[(~sentinet_av.is_test)&(~sentinet_av.is_train)].sort_values(by='uncertainty',ascending=True)[:100]

for _id,_w,_p,_gloss,_s in zip(cand['id'],cand['SynsetTerms'],cand['POS'],cand['Gloss'],cand['score']):
    print('%d\t%s\t%s\t%s\t%.4f'%(_id,_w,_p,_gloss,_s))

3363	formulaic#1	a	characterized by or in accordance with some formula	0.5000
11854	hyperopic#1 hypermetropic#1	a	abnormal ability to focus of distant objects	0.5000
113344	mill_around#2 mill_about#2 mill#1	v	move about in a confused manner	0.5000
993	easy#11	a	less in demand and therefore readily obtainable; commodities are easy this quarter	0.5000
117655	fog_up#1	v	get foggy; The windshield fogged up	0.5000
10114	delayed#1	a	not as far along as normal in development	0.5000
115770	withdraw#2 retire#2	v	withdraw from active participation; He retired from chess	0.5000
112489	symphonize#1 symphonise#1	v	play or sound together, in harmony	0.5000
8666	unneeded#1 unnecessary#1	a	not necessary	0.5000
597	derivational#1	a	characterized by inflections indicating a semantic relation between a word and its base; the morphological relation between `sing' and `singer' and `song' is derivational	0.5000
116932	rid_of#1 obviate#1 eliminate#2	v	do away with	0.5000
6104	unstinting#1 unstinted#1 unspari

## Round4

In [13]:
# -------- SetUp -------- #

# Choose training samples from those most uncertain words in round1. Proportion of samples, 1:1
# positive: class=1
# negative: class=0

train_cur=pd.read_csv('Data/train_v1_r4.csv')

train_last=pd.read_csv('Data/train_v1_r1tor3.csv')

train=pd.concat([train_cur[['id','text','target']],train_last[['id','text','target']]])

train_ids=set(train['id'].tolist())

sentinet['is_train']=sentinet['id'].apply(lambda x: True if x in train_ids else False)

train.to_csv('Data/train_v1_r1tor4.csv')

In [None]:
# -------- Evaluate -------- #

# Predict over the whole sentinet. Filter out already chosen glosses and sort the others by uncertainty.

# Predicting is done by `train.py`. Remember setting Config.is_predict=True in `train.py`.

pred=npy.load('Data/pred.v1.r4.1.npy') # load prediction result
assert pred.shape[0]==sentinet_av.shape[0]
sentinet_av['score']=pred.flatten()
sentinet_av['uncertainty']=sentinet_av['score'].apply(lambda x: abs(x-0.5))



# -- adj -- #
print('# -- adj -- #')
try:
    testset_adj=testset_adj.drop(['score'],axis=1)
except:
    pass
testset_adj=testset_adj.merge(sentinet_av[['id','score']],on='id')
testset_adj['pred_label']=testset_adj['score'].apply(lambda x: 1 if x>0.5 else 0)
testset_adj['target']=testset_adj['class']

testset=testset_adj

# positive::precision
print("positive::precision %.3f"%(testset_adj.loc[(testset_adj.pred_label==1)&(testset_adj.target==1)].shape[0]/testset_adj.loc[(testset_adj.pred_label==1)].shape[0]))
print("positive::recall %.3f"%(testset_adj.loc[(testset_adj.pred_label==1)&(testset_adj.target==1)].shape[0]/testset_adj.loc[(testset_adj.target==1)].shape[0]))


print("negative::precision %.3f"%(testset_adj.loc[(testset_adj.pred_label==0)&(testset_adj.target==0)].shape[0]/testset_adj.loc[(testset_adj.pred_label==0)].shape[0]))
print("negative::recall %.3f"%(testset_adj.loc[(testset_adj.pred_label==0)&(testset_adj.target==0)].shape[0]/testset_adj.loc[(testset_adj.target==0)].shape[0]))

pp=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.pred_label==1)].shape[0]
pr=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.target==1)].shape[0]

print("F1: %.2f"%(2*pp*pr/(pp+pr)))


np=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.pred_label==0)].shape[0]
nr=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.target==0)].shape[0]

print("N F1: %.2f"%(2*np*nr/(np+nr)))



# -- verb -- #
print('# -- verb -- #')
try:
    testset_v=testset_v.drop(['score'],axis=1)
except:
    pass

testset_v=testset_v.merge(sentinet_av[['id','score']],on='id')
testset_v['pred_label']=testset_v['score'].apply(lambda x: 1 if x>0.5 else 0)
testset_v['target']=testset_v['class']

testset=testset_v

# positive::precision
print("positive::precision %.3f"%(testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.pred_label==1)].shape[0]))
print("positive::recall %.3f"%(testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.target==1)].shape[0]))


print("negative::precision %.3f"%(testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.pred_label==0)].shape[0]))
print("negative::recall %.3f"%(testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.target==0)].shape[0]))

pp=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.pred_label==1)].shape[0]
pr=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.target==1)].shape[0]

print("F1: %.2f"%(2*pp*pr/(pp+pr)))


np=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.pred_label==0)].shape[0]
nr=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.target==0)].shape[0]

print("N F1: %.2f"%(2*np*nr/(np+nr)))

In [None]:
# -------- Evaluate -------- #

# Predict over the whole sentinet. Filter out already chosen glosses and sort the others by uncertainty.

# Predicting is done by `train.py`. Remember setting Config.is_predict=True in `train.py`.

pred=npy.load('Data/pred.v1.r4.2.npy') # load prediction result
assert pred.shape[0]==sentinet_av.shape[0]
sentinet_av['score']=pred.flatten()
sentinet_av['uncertainty']=sentinet_av['score'].apply(lambda x: abs(x-0.5))



# -- adj -- #
print('# -- adj -- #')
try:
    testset_adj=testset_adj.drop(['score'],axis=1)
except:
    pass
testset_adj=testset_adj.merge(sentinet_av[['id','score']],on='id')
testset_adj['pred_label']=testset_adj['score'].apply(lambda x: 1 if x>0.5 else 0)
testset_adj['target']=testset_adj['class']

testset=testset_adj

# positive::precision
print("positive::precision %.3f"%(testset_adj.loc[(testset_adj.pred_label==1)&(testset_adj.target==1)].shape[0]/testset_adj.loc[(testset_adj.pred_label==1)].shape[0]))
print("positive::recall %.3f"%(testset_adj.loc[(testset_adj.pred_label==1)&(testset_adj.target==1)].shape[0]/testset_adj.loc[(testset_adj.target==1)].shape[0]))


print("negative::precision %.3f"%(testset_adj.loc[(testset_adj.pred_label==0)&(testset_adj.target==0)].shape[0]/testset_adj.loc[(testset_adj.pred_label==0)].shape[0]))
print("negative::recall %.3f"%(testset_adj.loc[(testset_adj.pred_label==0)&(testset_adj.target==0)].shape[0]/testset_adj.loc[(testset_adj.target==0)].shape[0]))

pp=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.pred_label==1)].shape[0]
pr=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.target==1)].shape[0]

print("F1: %.2f"%(2*pp*pr/(pp+pr)))


np=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.pred_label==0)].shape[0]
nr=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.target==0)].shape[0]

print("N F1: %.2f"%(2*np*nr/(np+nr)))



# -- verb -- #
print('# -- verb -- #')
try:
    testset_v=testset_v.drop(['score'],axis=1)
except:
    pass

testset_v=testset_v.merge(sentinet_av[['id','score']],on='id')
testset_v['pred_label']=testset_v['score'].apply(lambda x: 1 if x>0.5 else 0)
testset_v['target']=testset_v['class']

testset=testset_v

# positive::precision
print("positive::precision %.3f"%(testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.pred_label==1)].shape[0]))
print("positive::recall %.3f"%(testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.target==1)].shape[0]))


print("negative::precision %.3f"%(testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.pred_label==0)].shape[0]))
print("negative::recall %.3f"%(testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.target==0)].shape[0]))

pp=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.pred_label==1)].shape[0]
pr=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.target==1)].shape[0]

print("F1: %.2f"%(2*pp*pr/(pp+pr)))


np=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.pred_label==0)].shape[0]
nr=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.target==0)].shape[0]

print("N F1: %.2f"%(2*np*nr/(np+nr)))

In [19]:
# Filter out already chosen glosses and sort the others by uncertainty from the whole predictions.

cand=sentinet_av.loc[(~sentinet_av.is_test)&(~sentinet_av.is_train)].sort_values(by='uncertainty',ascending=True)[:200]

for _id,_w,_p,_gloss,_s in zip(cand['id'],cand['SynsetTerms'],cand['POS'],cand['Gloss'],cand['score']):
    print('%d\t%s\t%s\t%s\t%.4f'%(_id,_w,_p,_gloss,_s))

187	active#1	a	tending to become more severe or wider in scope; active tuberculosis	0.5000
116283	wed#2 tie#7 splice#2 marry#2	v	perform a marriage ceremony; The minister married us on Saturday; We were wed the following week; The couple got spliced on Hawaii	0.5000
15657	orphic#1	a	ascribed to Orpheus or characteristic of ideas in works ascribed to Orpheus	0.5000
5572	lame#2 halting#1 halt#1 gimpy#1 game#1 crippled#1	a	disabled in the feet or legs; a crippled soldier; a game leg	0.5000
109637	procure#2 pimp#1 pander#2	v	arrange for sexual partners for others	0.5000
106287	pollute#1 foul#2 contaminate#1	v	make impure; The industrial wastes polluted the lake	0.4999
10741	strong#7	a	of verbs not having standard (or regular) inflection; `sing' is a strong verb	0.5001
10122	occasional#2 episodic#2	a	occurring or appearing at usually irregular intervals; episodic in his affections; occasional headaches	0.5001
106016	burn_out#1 blow_out#1 blow#21	v	melt, break, or become otherwise unusable; 

## Round5

In [None]:
# focus ONLY on verb

In [20]:
# -------- SetUp -------- #

# Choose training samples from those most uncertain words in round1. Proportion of samples, 1:1
# positive: class=1
# negative: class=0

train_cur=pd.read_csv('Data/train_v1_r5.csv')

train_last=pd.read_csv('Data/train_v1_r1tor4.csv')

train=pd.concat([train_cur[['id','text','target']],train_last[['id','text','target']]])

train_ids=set(train['id'].tolist())

sentinet['is_train']=sentinet['id'].apply(lambda x: True if x in train_ids else False)

train.to_csv('Data/train_v1_r1tor5.csv')

In [None]:
# -------- Evaluate -------- #

# Predict over the whole sentinet. Filter out already chosen glosses and sort the others by uncertainty.

# Predicting is done by `train.py`. Remember setting Config.is_predict=True in `train.py`.

pred=npy.load('Data/pred.v1.r5.1.npy') # load prediction result
assert pred.shape[0]==sentinet_av.shape[0]
sentinet_av['score']=pred.flatten()
sentinet_av['uncertainty']=sentinet_av['score'].apply(lambda x: abs(x-0.5))



# -- adj -- #
print('# -- adj -- #')
try:
    testset_adj=testset_adj.drop(['score'],axis=1)
except:
    pass
testset_adj=testset_adj.merge(sentinet_av[['id','score']],on='id')
testset_adj['pred_label']=testset_adj['score'].apply(lambda x: 1 if x>0.5 else 0)
testset_adj['target']=testset_adj['class']

testset=testset_adj

# positive::precision
print("positive::precision %.3f"%(testset_adj.loc[(testset_adj.pred_label==1)&(testset_adj.target==1)].shape[0]/testset_adj.loc[(testset_adj.pred_label==1)].shape[0]))
print("positive::recall %.3f"%(testset_adj.loc[(testset_adj.pred_label==1)&(testset_adj.target==1)].shape[0]/testset_adj.loc[(testset_adj.target==1)].shape[0]))


print("negative::precision %.3f"%(testset_adj.loc[(testset_adj.pred_label==0)&(testset_adj.target==0)].shape[0]/testset_adj.loc[(testset_adj.pred_label==0)].shape[0]))
print("negative::recall %.3f"%(testset_adj.loc[(testset_adj.pred_label==0)&(testset_adj.target==0)].shape[0]/testset_adj.loc[(testset_adj.target==0)].shape[0]))

pp=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.pred_label==1)].shape[0]
pr=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.target==1)].shape[0]

print("F1: %.2f"%(2*pp*pr/(pp+pr)))


np=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.pred_label==0)].shape[0]
nr=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.target==0)].shape[0]

print("N F1: %.2f"%(2*np*nr/(np+nr)))



# -- verb -- #
print('# -- verb -- #')
try:
    testset_v=testset_v.drop(['score'],axis=1)
except:
    pass

testset_v=testset_v.merge(sentinet_av[['id','score']],on='id')
testset_v['pred_label']=testset_v['score'].apply(lambda x: 1 if x>0.5 else 0)
testset_v['target']=testset_v['class']

testset=testset_v

# positive::precision
print("positive::precision %.3f"%(testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.pred_label==1)].shape[0]))
print("positive::recall %.3f"%(testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.target==1)].shape[0]))


print("negative::precision %.3f"%(testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.pred_label==0)].shape[0]))
print("negative::recall %.3f"%(testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.target==0)].shape[0]))

pp=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.pred_label==1)].shape[0]
pr=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.target==1)].shape[0]

print("F1: %.2f"%(2*pp*pr/(pp+pr)))


np=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.pred_label==0)].shape[0]
nr=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.target==0)].shape[0]

print("N F1: %.2f"%(2*np*nr/(np+nr)))

In [22]:
# Filter out already chosen glosses and sort the others by uncertainty from the whole predictions.

cand=sentinet_av.loc[(~sentinet_av.is_test)&(~sentinet_av.is_train)].sort_values(by='uncertainty',ascending=True)[:200]

for _id,_w,_p,_gloss,_s in zip(cand['id'],cand['SynsetTerms'],cand['POS'],cand['Gloss'],cand['score']):
    print('%d\t%s\t%s\t%s\t%.4f'%(_id,_w,_p,_gloss,_s))

9613	mint#1	a	as if new; in mint condition	0.5001
13162	sympathetic#4 likeable#1 likable#1 appealing#2	a	(of characters in literature or drama) evoking empathic or sympathetic feelings; the sympathetic characters in the play	0.5001
109007	hem#2	v	utter `hem' or `ahem'	0.4999
5639	unpermed#1	a	not having had a permanent wave; smooth glossy unpermed hair	0.4998
8983	spick-and-span#1 spic-and-span#1 brand-new#1 bran-new#1	a	conspicuously new; shiny brand-new shoes; a spick-and-span novelty	0.4996
109004	carry#17	v	pass on a communication; The news was carried to every village in the province	0.4995
17071	mediatorial#1	a	of or relating to a mediator or the duties of a mediator	0.4995
107074	name#9 diagnose#1	v	determine or distinguish the nature of a problem or an illness through a diagnostic analysis	0.4995
16849	baptistic#1	a	of or pertaining to or characteristic of the Baptist church; Baptistis baptismal practices	0.5005
16912	exponential#1	a	of or involving exponents; exponential growt

## Round6

In [24]:
# -------- SetUp -------- #

# Choose training samples from those most uncertain words in round1. Proportion of samples, 1:1
# positive: class=1
# negative: class=0

train_cur=pd.read_csv('Data/train_v1_r6.csv')

train_last=pd.read_csv('Data/train_v1_r1tor5.csv')

train=pd.concat([train_cur[['id','text','target']],train_last[['id','text','target']]])

train_ids=set(train['id'].tolist())

sentinet['is_train']=sentinet['id'].apply(lambda x: True if x in train_ids else False)

train.to_csv('Data/train_v1_r1tor6.csv')

In [None]:
# -------- Evaluate -------- #

# Predict over the whole sentinet. Filter out already chosen glosses and sort the others by uncertainty.

# Predicting is done by `train.py`. Remember setting Config.is_predict=True in `train.py`.

pred=npy.load('Data/pred.v1.r6.1.npy') # load prediction result
assert pred.shape[0]==sentinet_av.shape[0]
sentinet_av['score']=pred.flatten()
sentinet_av['uncertainty']=sentinet_av['score'].apply(lambda x: abs(x-0.5))



# -- adj -- #
print('# -- adj -- #')
try:
    testset_adj=testset_adj.drop(['score'],axis=1)
except:
    pass
testset_adj=testset_adj.merge(sentinet_av[['id','score']],on='id')
testset_adj['pred_label']=testset_adj['score'].apply(lambda x: 1 if x>0.5 else 0)
testset_adj['target']=testset_adj['class']

testset=testset_adj

# positive::precision
print("positive::precision %.3f"%(testset_adj.loc[(testset_adj.pred_label==1)&(testset_adj.target==1)].shape[0]/testset_adj.loc[(testset_adj.pred_label==1)].shape[0]))
print("positive::recall %.3f"%(testset_adj.loc[(testset_adj.pred_label==1)&(testset_adj.target==1)].shape[0]/testset_adj.loc[(testset_adj.target==1)].shape[0]))


print("negative::precision %.3f"%(testset_adj.loc[(testset_adj.pred_label==0)&(testset_adj.target==0)].shape[0]/testset_adj.loc[(testset_adj.pred_label==0)].shape[0]))
print("negative::recall %.3f"%(testset_adj.loc[(testset_adj.pred_label==0)&(testset_adj.target==0)].shape[0]/testset_adj.loc[(testset_adj.target==0)].shape[0]))

pp=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.pred_label==1)].shape[0]
pr=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.target==1)].shape[0]

print("F1: %.2f"%(2*pp*pr/(pp+pr)))


np=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.pred_label==0)].shape[0]
nr=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.target==0)].shape[0]

print("N F1: %.2f"%(2*np*nr/(np+nr)))



# -- verb -- #
print('# -- verb -- #')
try:
    testset_v=testset_v.drop(['score'],axis=1)
except:
    pass

testset_v=testset_v.merge(sentinet_av[['id','score']],on='id')
testset_v['pred_label']=testset_v['score'].apply(lambda x: 1 if x>0.5 else 0)
testset_v['target']=testset_v['class']

testset=testset_v

# positive::precision
print("positive::precision %.3f"%(testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.pred_label==1)].shape[0]))
print("positive::recall %.3f"%(testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.target==1)].shape[0]))


print("negative::precision %.3f"%(testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.pred_label==0)].shape[0]))
print("negative::recall %.3f"%(testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.target==0)].shape[0]))

pp=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.pred_label==1)].shape[0]
pr=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.target==1)].shape[0]

print("F1: %.2f"%(2*pp*pr/(pp+pr)))


np=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.pred_label==0)].shape[0]
nr=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.target==0)].shape[0]

print("N F1: %.2f"%(2*np*nr/(np+nr)))

In [26]:
# Filter out already chosen glosses and sort the others by uncertainty from the whole predictions.

cand=sentinet_av.loc[(~sentinet_av.is_test)&(~sentinet_av.is_train)].sort_values(by='uncertainty',ascending=True)[:200]

for _id,_w,_p,_gloss,_s in zip(cand['id'],cand['SynsetTerms'],cand['POS'],cand['Gloss'],cand['score']):
    print('%d\t%s\t%s\t%s\t%.4f'%(_id,_w,_p,_gloss,_s))

113485	motor#1 drive#2	v	travel or be transported in a vehicle; We drove to the university every morning; They motored to London for the theater	0.5001
109368	charm#3	v	protect through supernatural powers or charms	0.5002
116199	post#6	v	enter on a public list	0.5004
112167	brew#1	v	prepare by brewing; people have been brewing beer for thousands of years	0.5007
115314	give_away#3	v	formally hand over to the bridegroom in marriage; of a bride by her father	0.4993
5574	handicapped#1 disabled#1	a	incapable of functioning as a consequence of injury or illness	0.4990
108846	express#7	v	send by rapid transport or special messenger service; She expressed the letter to Florida	0.4989
14588	anacoluthic#1	a	of or related to syntactic inconsistencies of the sort known as anacoluthons	0.4989
2892	whole#1	a	including all components without exception; being one unit or constituting the full amount or extent or duration; complete; gave his whole attention; a whole wardrobe for the tropics; the whole 

## Round7

In [27]:
# -------- SetUp -------- #

# Choose training samples from those most uncertain words in round1. Proportion of samples, 1:1
# positive: class=1
# negative: class=0

train_cur=pd.read_csv('Data/train_v1_r7.csv')

train_last=pd.read_csv('Data/train_v1_r1tor6.csv')

train=pd.concat([train_cur[['id','text','target']],train_last[['id','text','target']]])

train_ids=set(train['id'].tolist())

sentinet['is_train']=sentinet['id'].apply(lambda x: True if x in train_ids else False)

train.to_csv('Data/train_v1_r1tor7.csv')

In [None]:
# -------- Evaluate -------- #

# Predict over the whole sentinet. Filter out already chosen glosses and sort the others by uncertainty.

# Predicting is done by `train.py`. Remember setting Config.is_predict=True in `train.py`.

pred=npy.load('Data/pred.v1.r7.1.npy') # load prediction result
assert pred.shape[0]==sentinet_av.shape[0]
sentinet_av['score']=pred.flatten()
sentinet_av['uncertainty']=sentinet_av['score'].apply(lambda x: abs(x-0.5))



# -- adj -- #
print('# -- adj -- #')
try:
    testset_adj=testset_adj.drop(['score'],axis=1)
except:
    pass
testset_adj=testset_adj.merge(sentinet_av[['id','score']],on='id')
testset_adj['pred_label']=testset_adj['score'].apply(lambda x: 1 if x>0.5 else 0)
testset_adj['target']=testset_adj['class']

testset=testset_adj

# positive::precision
print("positive::precision %.3f"%(testset_adj.loc[(testset_adj.pred_label==1)&(testset_adj.target==1)].shape[0]/testset_adj.loc[(testset_adj.pred_label==1)].shape[0]))
print("positive::recall %.3f"%(testset_adj.loc[(testset_adj.pred_label==1)&(testset_adj.target==1)].shape[0]/testset_adj.loc[(testset_adj.target==1)].shape[0]))


print("negative::precision %.3f"%(testset_adj.loc[(testset_adj.pred_label==0)&(testset_adj.target==0)].shape[0]/testset_adj.loc[(testset_adj.pred_label==0)].shape[0]))
print("negative::recall %.3f"%(testset_adj.loc[(testset_adj.pred_label==0)&(testset_adj.target==0)].shape[0]/testset_adj.loc[(testset_adj.target==0)].shape[0]))

pp=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.pred_label==1)].shape[0]
pr=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.target==1)].shape[0]

print("F1: %.2f"%(2*pp*pr/(pp+pr)))


np=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.pred_label==0)].shape[0]
nr=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.target==0)].shape[0]

print("N F1: %.2f"%(2*np*nr/(np+nr)))



# -- verb -- #
print('# -- verb -- #')
try:
    testset_v=testset_v.drop(['score'],axis=1)
except:
    pass

testset_v=testset_v.merge(sentinet_av[['id','score']],on='id')
testset_v['pred_label']=testset_v['score'].apply(lambda x: 1 if x>0.5 else 0)
testset_v['target']=testset_v['class']

testset=testset_v

# positive::precision
print("positive::precision %.3f"%(testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.pred_label==1)].shape[0]))
print("positive::recall %.3f"%(testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.target==1)].shape[0]))


print("negative::precision %.3f"%(testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.pred_label==0)].shape[0]))
print("negative::recall %.3f"%(testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.target==0)].shape[0]))

pp=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.pred_label==1)].shape[0]
pr=testset.loc[(testset.pred_label==1)&(testset.target==1)].shape[0]/testset.loc[(testset.target==1)].shape[0]

print("F1: %.2f"%(2*pp*pr/(pp+pr)))


np=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.pred_label==0)].shape[0]
nr=testset.loc[(testset.pred_label==0)&(testset.target==0)].shape[0]/testset.loc[(testset.target==0)].shape[0]

print("N F1: %.2f"%(2*np*nr/(np+nr)))

## Round7.2

In [29]:
# -------- SetUp -------- #

# Choose training samples from those most uncertain words in round1. Proportion of samples, 1:1
# positive: class=1
# negative: class=0

train_cur=pd.read_csv('Data/train_v1_r7_2.csv')

train_last=pd.read_csv('Data/train_v1_r1tor6.csv')

train=pd.concat([train_cur[['id','text','target']],train_last[['id','text','target']]])

train_ids=set(train['id'].tolist())

sentinet['is_train']=sentinet['id'].apply(lambda x: True if x in train_ids else False)

train.to_csv('Data/train_v1_r1tor7_2.csv')

    R7.2 evaluation is exactly the same as R7.1. R7.1 is worse, so we choose R6 as final.

# Release

In [102]:
sentinet_av.head()

Unnamed: 0.1,Unnamed: 0,POS,SynsetTerms,Gloss,id,is_test,text,target,is_train,score,uncertainty
0,0,a,able#1,(usually followed by `to') having the necessar...,0,False,(usually followed by `to') having the necessar...,-1,False,0.634957,0.134957
1,1,a,unable#1,(usually followed by `to') not having the nece...,1,False,(usually followed by `to') not having the nece...,-1,False,0.744525,0.244525
2,2,a,dorsal#2 abaxial#1,facing away from the axis of an organ or organ...,2,False,facing away from the axis of an organ or organ...,-1,False,0.5184,0.0184
3,3,a,ventral#2 adaxial#1,nearest to or facing toward the axis of an org...,3,False,nearest to or facing toward the axis of an org...,-1,False,0.552861,0.052861
4,4,a,acroscopic#1,facing or on the side toward the apex,4,False,facing or on the side toward the apex,-1,False,0.378825,0.121175


In [None]:
# Release result as a dataframe
# --------------------
# POS
# SynsetTerms
# Gloss
# ID
# Class: "physical" or "mental" or "unavailable". Class is "unavailable" if score = -1, "physical" if score < 0.5
#     "mental" if score >= 0.5.
# Score: [0,1] or -1. For POS in (n,r), score is -1. For POS in (adj,v), score belongs to [0,1]. 
#     More close to 1, more confident that it's "mental". 
#     Similarly, more close to 1, more confident that it's "physical".



In [None]:
adv['Score']=-1
adv['Class']="unavailable"
adv['ID']=adv['id']

noun['Score']=-1
noun['Class']="unavailable"
noun['ID']=noun['id']

sentinet_av['Score']=sentinet_av['score']
sentinet_av['Class']=sentinet_av['Score'].apply(lambda x: 'physical' if x<0.5 else 'mental')
sentinet_av['ID']=sentinet_av['id']

cols=['ID','POS','SynsetTerms','Gloss','Class','Score']
ment_phy_net=pd.concat([sentinet_av[cols],adv[cols],noun[cols]])

ment_phy_net.to_csv('Data/ment_phy_net_v2.csv')