# Evaluating feature extraction

In [1]:
import sys
sys.path.insert(0, '/home/aa/git/pyseqlab_exp')
import os
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from pyseqlab.features_extraction import FOFeatureExtractor, HOFeatureExtractor, SeqsRepresenter
from pyseqlab.ho_crf_ad import HOCRFAD, HOCRFADModelRepresentation
from pyseqlab.fo_crf import FirstOrderCRF, FirstOrderCRFModelRepresentation
from pyseqlab.workflow import TrainingWorkflow
from pyseqlab.utilities import ReaderWriter, SequenceStruct, TemplateGenerator, \
                               create_directory, generate_updated_model, generate_trained_model
from pyseqlab.attributes_extraction import AttributeScaler, GenericAttributeExtractor, NERSegmentAttributeExtractor
from pyseqlab.crf_learning import Learner

# Frequently used directories:
#   current_dir -- the directory the kernel is currently running in
#   root_dir    -- absolute path of current_dir's parent directory
current_dir = os.getcwd()
root_dir = os.path.abspath(
    os.path.join(current_dir, os.pardir)
)
def trainconfig_1():
    """Build training configuration #1.

    The XY templates are produced by ``TemplateGenerator.generate_template_XY``
    for attribute ``'w'`` with the ``'1-gram'`` / ``range(0, 1)`` window and the
    ``'1-state:2-states'`` specification. The Y template is the literal
    ``{'Y': ()}`` (no template generator call).

    Returns:
        tuple: ``(templateXY, templateY, ascaler_class, filter_obj)`` where the
        attribute scaler class and the filter object are both ``None``.
    """
    y_templates = {'Y':()}
    xy_templates = {}
    generator = TemplateGenerator()
    # the generator fills ``xy_templates`` in place
    generator.generate_template_XY('w', ('1-gram', range(0, 1)), '1-state:2-states', xy_templates)
    # no attribute scaler and no filter in this configuration
    return (xy_templates, y_templates, None, None)

def trainconfig_2():
    """Build training configuration #2.

    The XY templates are produced by ``TemplateGenerator.generate_template_XY``
    for attribute ``'w'`` with the ``'1-gram'`` / ``range(0, 1)`` window and the
    ``'1-state:2-states'`` specification. The Y templates come from
    ``generate_template_Y('2-states')``.

    Returns:
        tuple: ``(templateXY, templateY, ascaler_class, filter_obj)`` where the
        attribute scaler class and the filter object are both ``None``.
    """
    generator = TemplateGenerator()
    xy_templates = {}
    # the generator fills ``xy_templates`` in place
    generator.generate_template_XY('w', ('1-gram', range(0, 1)), '1-state:2-states', xy_templates)
    y_templates = generator.generate_template_Y('2-states')
    # no attribute scaler and no filter in this configuration
    return (xy_templates, y_templates, None, None)

def trainconfig_3():
    """Build training configuration #3.

    The XY templates are produced by ``TemplateGenerator.generate_template_XY``
    for attribute ``'w'`` with the ``'1-gram'`` / ``range(0, 1)`` window and the
    ``'1-state:2-states'`` specification. The Y templates come from
    ``generate_template_Y('2-states:3-states')``.

    Returns:
        tuple: ``(templateXY, templateY, ascaler_class, filter_obj)`` where the
        attribute scaler class and the filter object are both ``None``.
    """
    generator = TemplateGenerator()
    xy_templates = {}
    # the generator fills ``xy_templates`` in place
    generator.generate_template_XY('w', ('1-gram', range(0, 1)), '1-state:2-states', xy_templates)
    y_templates = generator.generate_template_Y('2-states:3-states')
    # no attribute scaler and no filter in this configuration
    return (xy_templates, y_templates, None, None)

def trainconfig_4():
    """Build training configuration #4.

    The XY templates are produced by ``TemplateGenerator.generate_template_XY``
    for attribute ``'w'`` with the ``'1-gram'`` / ``range(0, 1)`` window and the
    ``'1-state:2-states'`` specification. The Y templates come from
    ``generate_template_Y('1-state')``.

    Returns:
        tuple: ``(templateXY, templateY, ascaler_class, filter_obj)`` where the
        attribute scaler class and the filter object are both ``None``.
    """
    generator = TemplateGenerator()
    xy_templates = {}
    # the generator fills ``xy_templates`` in place
    generator.generate_template_XY('w', ('1-gram', range(0, 1)), '1-state:2-states', xy_templates)
    y_templates = generator.generate_template_Y('1-state')
    # no attribute scaler and no filter in this configuration
    return (xy_templates, y_templates, None, None)

def load_seqs():
    """Return the two toy labelled sequences used throughout this notebook.

    Each sequence is a ``SequenceStruct`` built from a list of observations
    (one ``{'w': <token>}`` dict per position) and the matching list of labels:

    * "The dog barks" -> DT, N, V
    * "Cool dog"      -> ADJ, N

    Returns:
        list: the two ``SequenceStruct`` instances, in the order above.
    """
    toy_corpus = (
        (('The', 'dog', 'barks'), ['DT', 'N', 'V']),
        (('Cool', 'dog'), ['ADJ', 'N']),
    )
    return [
        SequenceStruct([{'w': token} for token in tokens], labels)
        for tokens, labels in toy_corpus
    ]

def train_crfs(model_type, optimization_options, dsplit_options, trainconfig):
    """Run the training workflow for one model type and one template configuration.

    Args:
        model_type: ``"HO_AD"`` selects ``HOCRFAD`` / ``HOCRFADModelRepresentation`` /
            ``HOFeatureExtractor``; ``"FO"`` selects ``FirstOrderCRF`` /
            ``FirstOrderCRFModelRepresentation`` / ``FOFeatureExtractor``.
        optimization_options: dict passed through to ``TrainingWorkflow``.
        dsplit_options: dict passed to ``seq_parsing_workflow`` together with
            the sequences returned by ``load_seqs``.
        trainconfig: zero-argument callable returning
            ``(templateXY, templateY, ascaler_class, filter_obj)``.

    Returns:
        The value returned by ``TrainingWorkflow.traineval_folds`` (called with
        ``meval=False``). Earlier versions discarded it and returned ``None``.

    Raises:
        ValueError: if ``model_type`` is neither ``"HO_AD"`` nor ``"FO"``.
    """
    if(model_type == "HO_AD"):
        crf_model = HOCRFAD
        model_repr = HOCRFADModelRepresentation
        fextractor = HOFeatureExtractor
    elif(model_type == "FO"):
        crf_model = FirstOrderCRF
        model_repr = FirstOrderCRFModelRepresentation
        fextractor = FOFeatureExtractor
    else:
        # fail early with a clear message instead of an unbound-local error further down
        raise ValueError("unknown model_type {!r}; expected 'HO_AD' or 'FO'".format(model_type))
    # ascaler_class is part of the configuration tuple but is not forwarded to the workflow here
    template_xy, template_y, ascaler_class, filter_obj = trainconfig()
    workflow_trainer = TrainingWorkflow(template_y, template_xy, model_repr,
                                        crf_model, fextractor, NERSegmentAttributeExtractor,
                                        "", optimization_options, root_dir, filter_obj)
    seqs = load_seqs()
    data_split = workflow_trainer.seq_parsing_workflow(seqs, dsplit_options)
    models_info = workflow_trainer.traineval_folds(data_split, meval=False)
    return models_info



In [11]:
# Sanity check: show the kernel's working directory (root_dir in the first cell
# is computed as the parent of this directory).
# `os` is already imported in the first cell; the re-import here is harmless.
import os
print(os.getcwd())

/home/aa/git/pyseqlab_exp/tests


In [2]:
# Options handed to train_crfs: data-split settings and optimizer settings
# (L-BFGS-B with an 'l2' regularization type and a regularization value of 0).
dsplit_options = dict(method="none")
optimization_options = dict(
    method="L-BFGS-B",
    regularization_type='l2',
    regularization_value=0,
)

# Run the higher-order ('HO_AD') workflow once per template configuration,
# printing a dashed separator after each run.
trainconfigs = [
    trainconfig_1,
    trainconfig_2,
    trainconfig_3,
    trainconfig_4,
]
for trainconfig in trainconfigs:
    train_crfs(model_type='HO_AD',
               optimization_options=optimization_options,
               dsplit_options=dsplit_options,
               trainconfig=trainconfig)
    print("-" * 40)

boundary  (2, 2)
xy_feat
{'DT|N': {'w[0]=dog': 1}, 'N': {'w[0]=dog': 1}}
y_feat
{}
xy_feat after join
{'DT|N': {'w[0]=dog': 1}, 'N': {'w[0]=dog': 1}}
boundary  (3, 3)
xy_feat
{'V': {'w[0]=barks': 1}, 'N|V': {'w[0]=barks': 1}}
y_feat
{}
xy_feat after join
{'V': {'w[0]=barks': 1}, 'N|V': {'w[0]=barks': 1}}
boundary  (1, 1)
xy_feat
{'DT': {'w[0]=The': 1}}
y_feat
{}
xy_feat after join
{'DT': {'w[0]=The': 1}}
dumping globalfeatures -- processed seqs:  1
boundary  (1, 1)
xy_feat
{'ADJ': {'w[0]=Cool': 1}}
y_feat
{}
xy_feat after join
{'ADJ': {'w[0]=Cool': 1}}
boundary  (2, 2)
xy_feat
{'ADJ|N': {'w[0]=dog': 1}, 'N': {'w[0]=dog': 1}}
y_feat
{}
xy_feat after join
{'ADJ|N': {'w[0]=dog': 1}, 'N': {'w[0]=dog': 1}}
dumping globalfeatures -- processed seqs:  2
constructing model -- processed seqs:  1
constructing model -- processed seqs:  2
identifying model active features -- processed seqs:  1
identifying model active features -- processed seqs:  2
inverted modelfeatures  {'w[0]=Cool': {1: {'ADJ'}}

KeyError: ('V', 'V')

In [4]:
# Load the global features file found under seq_1 of an earlier run's working directory.
# The path is built from root_dir (the directory train_crfs hands to TrainingWorkflow)
# instead of a machine-specific absolute path.
# NOTE(review): the run folder name looks like a timestamp of one particular run --
# update it to match the run being inspected.
gfeatures_seq1_path = os.path.join(root_dir, 'working_dir',
                                   'reference_corpus_2017_5_9-15_13_44_876325',
                                   'global_features', 'seq_1', 'globalfeatures')
gfeatures_seq1 = ReaderWriter.read_data(gfeatures_seq1_path)

In [5]:
# Inspect the global features object loaded for sequence 1 in the previous cell.
print(gfeatures_seq1)

{'DT|N': Counter({'DT|N': 1, 'w[0]=dog': 1}), 'N': Counter({'DT|N': 1, 'w[0]=dog': 1}), 'DT': Counter({'w[0]=The': 1}), 'V': Counter({'w[0]=barks': 1, 'N|V': 1}), 'N|V': Counter({'w[0]=barks': 1, 'N|V': 1})}
