Prepare labeled and unlabeled files for active learning experimentation. Files are prepared for specific tasks: sur(face)Seg(mentation)Gl(o)s, can(onical)Seg(mentation)Gl(o)s, etc.

Test data should not change: it is always the same 200 words.

Result:

**train/dev** input & output files

**test** input & output file: words not in training data

**selection** (pseudo unlabeled) input file

In [1]:
import random

In [2]:
def join_data(dfile):
    """Read the parallel ``.input``/``.output`` files and pair them by line.

    Args:
        dfile: Path prefix shared by both files. ``dfile + '.input'`` is read
            as the words and ``dfile + '.output'`` as the labels, one entry
            per line.

    Returns:
        A list of ``(word, label)`` tuples with surrounding whitespace
        stripped from each entry. If the two files differ in length the
        pairing stops at the shorter one (``zip`` semantics).
    """
    print(dfile)
    # Context managers close both handles deterministically; the bare
    # ``open(...)`` calls used previously never closed them.
    with open(dfile + '.input', encoding='utf8') as word_file:
        words = [word.strip() for word in word_file]
    with open(dfile + '.output', encoding='utf8') as label_file:
        labels = [label.strip() for label in label_file]
    print('Test join data:', labels[:5])
    return list(zip(words, labels))
    
    
def split_test(joined_labeled_all):
    '''Carve a test set of unique (word, label) pairs off the labeled data.

    Args:
        joined_labeled_all: Iterable of hashable (word, label) pairs; it may
            contain duplicates.

    Returns:
        A 2-tuple ``(testset, statistics)``: ``testset`` is a list holding
        the first ``TESTSIZE`` distinct pairs in input order (fewer if there
        are not that many), and ``statistics`` is a text summary giving the
        number of distinct pairs and the configured ``TESTSIZE``.
    '''
    # dict.fromkeys de-duplicates while keeping first-occurrence order.
    # Iterating a set of strings instead yields an order that changes with
    # the interpreter's hash randomisation, so the selected test items could
    # differ between runs even when the input order was identical.
    uniq_labeled_all = list(dict.fromkeys(joined_labeled_all))
    total = len(uniq_labeled_all)
    print('Labeled types: ', total)
    statistics = 'Labeled types: '+str(total) + '\n' + 'Test size: ' + str(TESTSIZE) + '\n'

    return uniq_labeled_all[:TESTSIZE], statistics


def check_overlap(testpairs, allpairs):
    """Return the pairs from ``allpairs`` that do not occur in ``testpairs``.

    Despite the name, this filters rather than checks: order and duplicates
    of the surviving ``allpairs`` entries are preserved.

    Args:
        testpairs: Collection of pairs to exclude.
        allpairs: Iterable of pairs to filter.

    Returns:
        A new list with every element of ``allpairs`` not found in
        ``testpairs``.
    """
    try:
        # Set membership is O(1) per lookup instead of a linear scan of the
        # test list for every candidate pair.
        held_out = set(testpairs)
        return [pair for pair in allpairs if pair not in held_out]
    except TypeError:
        # Unhashable pairs (e.g. lists) cannot go in a set; fall back to the
        # plain sequence scan so such inputs keep working.
        return [pair for pair in allpairs if pair not in testpairs]

def get_unlabeled(uData):
    """Return only the first element (the word) of every pair in ``uData``.

    Args:
        uData: Iterable of (word, label) pairs.

    Returns:
        A tuple of the first elements, in input order; an empty tuple when
        ``uData`` is empty.
    """
    # Transpose rows into columns; column 0 holds the words.
    columns = list(zip(*uData))
    # With no rows there are no columns at all, so indexing [0] would raise
    # IndexError; report "no words" instead.
    return columns[0] if columns else ()

def inout_file_prep(paired_set, designation='all'):
    """Build a list of the first elements of ``paired_set`` and discard it.

    NOTE(review): the computed list is neither returned nor stored and
    ``designation`` is never read, so this function currently has no
    observable effect beyond raising ``IndexError`` on empty input. It looks
    like an unfinished stub; confirm the intended behaviour before use.
    """
    columns = list(zip(*paired_set))
    _ = list(columns[0])
    
def dev_split(trainset):
    """Return the leading tenth of ``trainset`` (length floor-divided by 10)."""
    cutoff = len(trainset) // 10
    return trainset[:cutoff]

def unjoin(joined_data):
    """Split zipped (word, label) pairs back into two parallel tuples.

    Args:
        joined_data: Iterable of 2-element pairs.

    Returns:
        ``(words, labels)`` as two tuples in input order. Empty input yields
        ``((), ())`` rather than raising.
    """
    columns = list(zip(*joined_data))
    if not columns:
        # zip(*[]) produces no columns, so indexing would raise IndexError.
        # Empty datasets do occur, e.g. a selection set left with no items.
        return (), ()
    return columns[0], columns[1]
                
def write_inout(joined_dataset, filename):
    """Write zipped (word, label) pairs to paired .input/.output files.

    Both files are created under ``STOREDIR`` with ``filename`` as the base
    name. Entries are newline-separated, with no trailing newline.
    """
    words, labels = unjoin(joined_dataset)
    base_path = STOREDIR + filename
    # The input file is written first, then the output file.
    for suffix, column in (('.input', words), ('.output', labels)):
        with open(base_path + suffix, 'w', encoding='utf8') as handle:
            handle.write('\n'.join(column))

In [3]:
def main(lang,task,datafile):
    """Create the test, train and select files for one language/task pair.

    The labeled data is read, shuffled, and split into a test set plus a
    training pool. For every entry in ``TRAINSIZES`` a train file (the first
    ``size`` pool items) and a select file (the remaining pool items) are
    written. A statistics summary is written once to
    ``<lang><task><CHAR>_STATS.txt``. No dev split is produced.

    Args:
        lang: Language code used in output file names.
        task: Task suffix (e.g. ``'_infl'``) used in output file names.
        datafile: Path prefix of the labeled ``.input``/``.output`` files.
    """
    stats = lang + task + " " + CHAR + '\n'

    # Pair inputs with outputs so each (word, label) item moves as one unit.
    joined = join_data(datafile)
    random.shuffle(joined)
    print('Total vocab: ', len(joined))
    stats += '\nTotal vocab: ' + str(len(joined))+ '\n'

    # Split off the test data.
    testset, newstats = split_test(joined)
    stats += newstats
    write_inout(testset, 'test.'+lang+task+CHAR)

    # The training pool is every labeled item that is not a test pair.
    trainingpool = check_overlap(testset, joined)
    num_train = len(trainingpool)
    stats += 'Labeled (train/select) tokens: ' + str(num_train)
    stats += '\n'

    # The summary is complete at this point and does not depend on the train
    # size, so write it once here instead of rewriting it on every loop pass.
    # The encoding is explicit to match the other files this script writes.
    with open(STOREDIR+lang+task+CHAR+'_STATS.txt', 'w', encoding='utf8') as S:
        S.write(stats)

    # From here on the same steps repeat for each requested training size.
    for size in TRAINSIZES:
        # Never ask for more items than the pool holds.
        size = min(size, num_train)
        train_set = trainingpool[:size]
        # Selection data is whatever the current training set did not use.
        selection_data = trainingpool[size:]
        if len(selection_data) < 25:
            print('NO SELECT DATA for', lang)

        code = str(size) + CHAR
        write_inout(train_set, 'train.'+lang+task+code)
        write_inout(selection_data, 'select.'+lang+task+code)

In [5]:
# Directory that receives every generated train/select/test/stats file.
STOREDIR = r'C:/Users/thesa/Documents/GitHub/al_morphseg/al_trainselect/'

# Language folders and, at the same index, the base name of each data file.
LANGFOLDERS = ['btz','cho','lez'] # 'nyb',
#LANGFOLDERS = ['bdg','ntu','tau']
DATA = ['btz_L','cho_L','lez_L'] #'nyb',
#DATA = ['bdg_L','ntu_L','tau_L'] #'nyb',

# Training-set sizes to generate; main() caps each at the pool size.
TRAINSIZES = [2500,2000,1500,1000,500,100,50]
#TRAINSIZES = [10000,8000,6000,4000,2500,2000,1500,1000,500,100,50]

# All tasks: '_surSeg', '_surSegGls', '_gls', '_infl'
TASKS = ['_infl']

# Number of unique labeled pairs held out as test data.
TESTSIZE = 200

#SELECT_SIZES = [25,50,75,100,150,200]
CHAR = '' # treatment of combining characters

# Run the preparation for every language/task combination.
for lang, language in enumerate(LANGFOLDERS):
    for task in TASKS:
        if CHAR != '':
            # A non-empty CHAR switches to data stored relative to the
            # working directory.
            lfilepath = r'./' + language + '/' + DATA[lang] + task
        else:
            lfilepath = (
                r'C:/Users/thesa/OneDrive - University of Florida/Research/AL/data/'
                + language + '/' + DATA[lang] + task
            )
        main(language, task, lfilepath)

C:/Users/thesa/OneDrive - University of Florida/Research/AL/data/btz/btz_L_infl
Test join data: ['n', 'Prep', 'distrnum', 'n', 'n']
Total vocab:  3779
Labeled types:  1422
C:/Users/thesa/OneDrive - University of Florida/Research/AL/data/cho/cho_L_infl
Test join data: ['n TODAY,.THIS.DAY', 'v 1SI', 'adv', 'v 1SI', 'n TODAY,.THIS.DAY']
Total vocab:  4221
Labeled types:  595
C:/Users/thesa/OneDrive - University of Florida/Research/AL/data/lez/lez_L_infl
Test join data: ['cardnum', 'n IN EL', 'v IMPF PTCP', 'cop', 'cardnum']
Total vocab:  11210
Labeled types:  2607
