# Split TWSI dataset into development and test sets.


In [1]:
import pandas
# Load the tab-separated TWSI dataset. The columns listed below are forced to
# dtype=object instead of letting pandas infer a type for them.
object_columns = {
    'predict_related': object,
    'gold_sense_ids': object,
    'predict_sense_ids': object,
}
reader = pandas.read_csv(
    'context-eval/data/Dataset-TWSI-2.csv',
    encoding="utf-8",
    delimiter="\t",
    dtype=object_columns
)
# `instances` is the working name used by the following cells.
instances = reader

## Initial statistics

In [2]:
print "Instaces: ", len(instances)
print "Senses: ", len(instances.groupby(['target', 'gold_sense_ids']))
print "Words: ", len(instances.target.unique())

Instaces:  145140
Senses:  2403
Words:  1012


## Delete instances with undefined senses (no substitutions provided)

In [3]:
undef = instances[instances.golden_related.isnull()]
print "Undef instaces: ", len(undef)
print "in ", len(undef.target.unique()), " words: ", undef.target.unique()[:10], "..."
print "Examples: "
undef[undef.target == 'block'].tail()

Undef instaces:  2477
in  88  words:  [u'application' u'baby' u'basis' u'bit' u'block' u'bronze' u'CD'
 u'challenge' u'circle' u'club'] ...
Examples: 


Unnamed: 0,context_id,target,target_pos,target_position,gold_sense_ids,predict_sense_ids,golden_related,predict_related,context
11799,1572268,block,n,6874,11,,,,Polymer scientists use thermodynamics to descr...
11800,29219101,block,n,194199,11,,,,TPF will allow certain types of function trace...
11801,57130314,block,n,612,11,,,,These blocks are mot commonly referred to as m...
11955,3540731,block,n,105111,8,,,,Played in each of the final four matches ( eig...
11956,33626434,block,n,104110,8,,,,"Paris is the only player in NCAA history , mal..."


In [4]:
instances = instances[~instances.golden_related.isnull()]
print "Instaces: ", len(instances)
print "Senses: ", len(instances.groupby(['target', 'gold_sense_ids']))
print "Words: ", len(instances.target.unique())

Instaces:  142663
Senses:  2288
Words:  1012


## Count instances for every word sense

In [5]:
# One row per (target, gold_sense_ids) pair, in order of first appearance
# (sort=False), with the number of distinct context_id values in `inst_count`.
wordsenses = (
    instances
    .groupby(['target', 'gold_sense_ids'], sort=False)['context_id']
    .apply(lambda ids: len(ids.unique()))
    .reset_index()
    .rename(columns={'context_id': 'inst_count'})
)
wordsenses.head(20)

Unnamed: 0,target,gold_sense_ids,inst_count
0,ability,1,98
1,academic,1,114
2,academic,2,56
3,academic,3,10
4,access,1,95
5,accident,1,184
6,accident,2,9
7,account,1,69
8,account,2,29
9,acid,1,188


## Find words with infrequent senses (fewer than 5 instances)

In [6]:
unfreq_senses = wordsenses[wordsenses.inst_count < 5]
print "Senses with less than 5 instances: ", len(unfreq_senses)
words_unfreq_senses = unfreq_senses.target.unique()
print "Number of words with unfrequent senses: ", len(words_unfreq_senses)
print "Examples of such words: ", words_unfreq_senses[:15]

Senses with less than 5 instances:  218
Number of words with unfrequent senses:  165
Examples of such words:  [u'acid' u'act' u'advance' u'agent' u'amount' u'article' u'baby' u'bar'
 u'basis' u'bill' u'bit' u'block' u'board' u'bottom' u'boundary']


## Remove words with infrequent senses (completely)

In [7]:
wordsenses = wordsenses[~wordsenses.target.isin(words_unfreq_senses)]
print "Number of senses after removal: ", len(wordsenses)
print "Number of words after removal: ", len(wordsenses.target.unique())
wordsenses.head(n=20)

Number of senses after removal:  1617
Number of words after removal:  847


Unnamed: 0,target,gold_sense_ids,inst_count
0,ability,1,98
1,academic,1,114
2,academic,2,56
3,academic,3,10
4,access,1,95
5,accident,1,184
6,accident,2,9
7,account,1,69
8,account,2,29
15,action,1,79


## Count senses for every word

In [8]:
# One row per target word, in order of first appearance (sort=False), with the
# number of distinct gold_sense_ids values in `sense_count`.
words = (
    wordsenses
    .groupby('target', sort=False)['gold_sense_ids']
    .apply(lambda sense_ids: len(sense_ids.unique()))
    .reset_index()
    .rename(columns={'gold_sense_ids': 'sense_count'})
)
words.head(20)

Unnamed: 0,target,sense_count
0,ability,1
1,academic,3
2,access,1
3,accident,2
4,account,2
5,action,2
6,activity,1
7,actor,1
8,actress,1
9,addition,2


## Remove words that have only one sense

In [9]:
multisemous_words = words[words.sense_count > 1]
print "Multisemous words: ", len(multisemous_words)
print "Senses in multisemous words: ", sum(multisemous_words.sense_count)
multisemous_words

Number of multisemous words:  463
Number of senses of multisemous words:  1233


Unnamed: 0,target,sense_count
1,academic,3
3,accident,2
4,account,2
5,action,2
9,addition,2
10,administration,2
11,adult,2
12,advantage,2
13,advertising,2
14,age,2


## Create and save devsets 
* **devset** -- 5 instances for each sense of multi-sense (polysemous) words
* **devset_with_mono** -- 5 instances for each sense of both multi-sense and single-sense (monosemous) words

In [30]:
devset = instances[instances.target.isin(multisemous_words.target)].groupby(['target', 'gold_sense_ids']).head(5)

print "Instances in development set (multisense only): ", len(devset)
print "Senses: ", len(devset.groupby(['target', 'gold_sense_ids']))
print "Words: ", len(devset.target.unique())
print ("Precentage of original dataset: %.2f%%" % (100*float(len(devset))/145140))
print "Examlpe: "
devset.head(10)

Instances in development set (multisense only):  6165
Senses:  1233
Words:  463
Precentage of original dataset: 4.25%
Examlpe: 


Unnamed: 0,context_id,target,target_pos,target_position,gold_sense_ids,predict_sense_ids,golden_related,predict_related,context
98,572658,academic,n,412,1,,"scholastic:21, educational:13, scholarly:9, un...",,The academic rigour of our business programs i...
99,8598564,academic,n,5563,1,,"scholastic:21, educational:13, scholarly:9, un...",,Torch Trinity is an inter - denominational the...
100,10376001,academic,n,2028,1,,"scholastic:21, educational:13, scholarly:9, un...",,Having finished the academic part the now Leut...
101,10654142,academic,n,110118,1,,"scholastic:21, educational:13, scholarly:9, un...",,IQ and the Wealth of Nations ' was not peer - ...
102,11874260,academic,n,99107,1,,"scholastic:21, educational:13, scholarly:9, un...",,"\"" The Dukes of Hazzard , Television ' s Simpl..."
212,1405871,academic,n,93101,2,,"school:3, educational:2, scholastic:2, school ...",,The KHSAA rule is that students must be under ...
213,2502562,academic,n,2432,2,,"school:3, educational:2, scholastic:2, school ...",,"During the 1985 to 1986 academic year , he lec..."
214,5169432,academic,n,255263,2,,"school:3, educational:2, scholastic:2, school ...",,A detailed announcement describing the selecti...
215,5529473,academic,n,6775,2,,"school:3, educational:2, scholastic:2, school ...",,When these new programs elevate to varsity sta...
216,13444483,academic,n,1927,2,,"school:3, educational:2, scholastic:2, school ...",,Today more than 60 academic staff work at SSEE...


In [31]:
devset_with_mono = instances[instances.target.isin(words.target)].groupby(['target', 'gold_sense_ids']).head(5)

print "Instances in development set (multisense and monosense): ", len(devset_with_mono)
print "Senses: ", len(devset_with_mono.groupby(['target', 'gold_sense_ids']))
print "Words: ", len(devset_with_mono.target.unique())
print ("Precentage of original dataset: %.2f%%" % (100*float(len(devset_with_mono))/145140))
print "Examlpe: "
devset_with_mono.head(15)

Instances in development set (multisense and monosense):  8085
Senses:  1617
Words:  847
Precentage of original dataset: 5.57%
Examlpe: 


Unnamed: 0,context_id,target,target_pos,target_position,gold_sense_ids,predict_sense_ids,golden_related,predict_related,context
0,10038908,ability,n,160169,1,,"capability:33, capacity:29, skill:19, power:15...",,"The following year , Harchester United reached..."
1,1418247,ability,n,4554,1,,"capability:33, capacity:29, skill:19, power:15...",,He has also more than once overestimated his a...
2,23647997,ability,n,9099,1,,"capability:33, capacity:29, skill:19, power:15...",,"According to this theory , if Dio existed at t..."
3,30973207,ability,n,110119,1,,"capability:33, capacity:29, skill:19, power:15...",,Their main goal was to produce an extremely st...
4,3369783,ability,n,3039,1,,"capability:33, capacity:29, skill:19, power:15...",,"Known for his astute tactical abilities , he i..."
98,572658,academic,n,412,1,,"scholastic:21, educational:13, scholarly:9, un...",,The academic rigour of our business programs i...
99,8598564,academic,n,5563,1,,"scholastic:21, educational:13, scholarly:9, un...",,Torch Trinity is an inter - denominational the...
100,10376001,academic,n,2028,1,,"scholastic:21, educational:13, scholarly:9, un...",,Having finished the academic part the now Leut...
101,10654142,academic,n,110118,1,,"scholastic:21, educational:13, scholarly:9, un...",,IQ and the Wealth of Nations ' was not peer - ...
102,11874260,academic,n,99107,1,,"scholastic:21, educational:13, scholarly:9, un...",,"\"" The Dukes of Hazzard , Television ' s Simpl..."


In [32]:
# Write both development sets as tab-separated UTF-8 files, without the index
# column and with missing values written as empty strings.
csv_options = dict(sep='\t', na_rep='', index=False, encoding="utf-8")
devset.to_csv("dev_TWSI.csv", **csv_options)
devset_with_mono.to_csv("dev_TWSI_with_mono.csv", **csv_options)

In [38]:
# Contexts of the development-set rows for sense '3' of the word "club".
# The boolean mask has to index `devset`; wrapping it in a bare list literal
# raises AttributeError because a list has no `.context` attribute.
# The sense id is compared as the string '3' because gold_sense_ids is loaded
# with dtype=object.
is_club_sense_3 = (devset.target == "club") & (devset.gold_sense_ids == '3')
devset[is_club_sense_3].context

23012    The white rappers awaken and are soon met by M...
23013    In 1984 , while recording Nina Hagen In Ekstas...
23014    The 1970 fire that destroyed four of the club ...
23015    Atlantic Palace night club &amp; casino , tax ...
23016    Some Dirty Knobs material has been recorded an...
Name: context, dtype: object