# Filtering and Sampling:
**module**: lexsub.sample 

* ### filter examples by the part of speech of their lexical unit: 
```
filter_examples(input_conll_file, output_conll_file, pos='v', syn_type = None)
```
* ### sample N examples per sentence: 
```
random_sample_perSentence(input_conll_file, output_conll_file, sample_size = 1, syn_type = None)
```
* ### sample percentage of total examples: 
```
random_sample_examples(input_conll_file, output_conll_file, sample_size = 1, syn_type = None)
```


## Filtering 

In [None]:
from sesame.dataio import read_conll
from sesame.conll09 import CoNLL09Example, CoNLL09Element
from sesame.sentence import Sentence
import sys
import lexsub.conll_helper as conll_helper
import lexsub.sample as sample


from ordered_set import OrderedSet

# Root directory of the data set; every path built in this notebook starts here.
base_data_dir = 'data/open_sesame_v1_data/fn1.7'

# Sub-directories of base_data_dir: the unfiltered input, followed by the
# output directories used by the verb and noun filters.
all_data, verbs_data, nouns_data = 'original2', 'verbs2', 'nouns2'

# CoNLL file names of the three splits; the filters reuse the same file name
# for the input and the output directory.
train_file, dev_file, test_file = (
    'fn1.7.fulltext.train.syntaxnet.conll',
    'fn1.7.dev.syntaxnet.conll',
    'fn1.7.test.syntaxnet.conll',
)


### Verb Filter
Extracts all verb lexical units.

In [None]:
# Reload lexsub.sample before importing from it, so that changes made to the
# module after its first import are picked up without restarting the kernel.
import importlib
importlib.reload(sample)
from lexsub.sample import filter_examples

input_dir = f'{base_data_dir}/{all_data}'
output_dir = f'{base_data_dir}/{verbs_data}'

# Run the verb filter (pos='v') over the train, dev and test splits in turn;
# each result is written under output_dir with the same file name.
for split_file in (train_file, dev_file, test_file):
    filter_examples(f'{input_dir}/{split_file}', f'{output_dir}/{split_file}', pos='v')


### Noun Filter
Extracts all noun lexical units.

In [None]:
# No module reload here; call importlib.reload(sample) first if lexsub.sample
# has been edited during this session.
from lexsub.sample import filter_examples

input_dir = f'{base_data_dir}/{all_data}'
output_dir = f'{base_data_dir}/{nouns_data}'

# Run the noun filter (pos='n') over the train, dev and test splits in turn;
# each result is written under output_dir with the same file name.
for split_file in (train_file, dev_file, test_file):
    filter_examples(f'{input_dir}/{split_file}', f'{output_dir}/{split_file}', pos='n')


## Sampling

### 1. Sample N examples per sentence 
- nExPerSent_verbs_randAllExps
- nExPerSent_nouns_randAllExps

In [None]:
# No module reload here; call importlib.reload(sample) first if lexsub.sample
# has been edited during this session.
from lexsub.sample import random_sample_perSentence

# NOTE(review): the input directory is the literal 'verbs', while the verb
# filter writes to verbs_data ('verbs2') -- confirm which one is intended.
train_file = 'fn1.7.fulltext.train.syntaxnet.conll'
input_dir = '{}/verbs'.format(base_data_dir)
exps_dir = 'nExPerSent_verbs_randAllExps'
exp_name = 'ExPerSent_verbs'

# One output directory per sample size. range(1, 2) currently yields only a
# sample size of 1; widen the range to sample with other sizes. The sample
# size is also passed as the seed.
for n_per_sentence in range(1, 2):
    output_dir = '{}/{}/{:02d}{}_rand01'.format(
        base_data_dir, exps_dir, n_per_sentence, exp_name
    )
    random_sample_perSentence(
        '{}/{}'.format(input_dir, train_file),
        '{}/{}'.format(output_dir, train_file),
        sample_size=n_per_sentence,
        seed=n_per_sentence,
    )



### 2. Sample a percentage of the total examples 
- nPc_verbs_randAllExps
- nPc_nouns_randAllExps

In [None]:
from lexsub.sample import random_sample_examples

# NOTE(review): the input directory is the literal 'verbs', while the verb
# filter writes to verbs_data ('verbs2') -- confirm which one is intended.
train_file = 'fn1.7.fulltext.train.syntaxnet.conll'
input_dir = '{}/verbs'.format(base_data_dir)
exps_dir = 'nPc_verbs_randAllExps'
exp_name = 'pc_verbs'

# One output directory per percentage. The percentage is converted to a
# fraction for sample_size and passed unchanged as the seed.
for pct in (10, 20, 30, 40, 50, 100):
    sample_size = pct / 100
    output_dir = '{}/{}/{:03d}{}'.format(base_data_dir, exps_dir, pct, exp_name)
    print(sample_size, output_dir)
    random_sample_examples(
        '{}/{}'.format(input_dir, train_file),
        '{}/{}'.format(output_dir, train_file),
        sample_size=sample_size,
        seed=pct,
    )
