In [10]:
import pandas as pd
import nltk
from nltk.corpus.reader import ConllCorpusReader
import nltk.chunk.named_entity
from nltk.classify import MaxentClassifier
import os
import fnmatch
from random import shuffle

In [11]:
def new_classifier_builder(self, train):
    """Train a maximum-entropy classifier on ``train`` using the IIS algorithm.

    Written with a ``self`` parameter so it can be installed as a method
    (``self`` itself is not used).

    Args:
        self: The object the function is bound to; unused.
        train: Training data handed straight to ``MaxentClassifier.train``.

    Returns:
        Whatever ``MaxentClassifier.train`` returns for the given options.
    """
    # Announce the algorithm so the notebook output shows which trainer ran.
    print("The algorithm IIS is being used!")
    iis_options = {
        "algorithm": "IIS",
        "gaussian_prior_sigma": 1,
        "trace": 3,
        "max_iter": 10,
    }
    return MaxentClassifier.train(train, **iis_options)

# Monkey-patch: replace the `_classifier_builder` attribute of NLTK's
# NEChunkParserTagger class with new_classifier_builder (defined above), so the
# class-level attribute now points at the IIS-based builder.
nltk.chunk.named_entity.NEChunkParserTagger._classifier_builder = new_classifier_builder

In [12]:
# Implement cross-validation (CV) by hand.
# Inside the loop: train on the training folds, then predict on the held-out fold.
# Define the scoring function using the earlier function that decides whether a label is correct.
# k-fold -> list of data splits -> chunked_sents
# If the number of samples is not divisible by k, spread the remainder across the folds (e.g. three folds of 9 samples rather than one fold of only 7).

# def filecount(dir):
#     path, dirs, files = next(os.walk(dir))
#     file_count = len(files)
#     return file_count

def filecount(dir):
    """Count the entries in ``dir`` whose names match ``eng.*`` or ``deu.*``.

    Args:
        dir: Path of the directory to inspect.

    Returns:
        int: Number of directory entries matching ``eng.*`` plus the number
        matching ``deu.*``.

    Raises:
        OSError: If ``dir`` cannot be listed (propagated from ``os.listdir``).
    """
    # List the directory once so both patterns are matched against the same
    # snapshot and the filesystem is only hit a single time.
    entries = os.listdir(dir)
    return sum(len(fnmatch.filter(entries, pattern)) for pattern in ('eng.*', 'deu.*'))

def _conll_column_types(language, dataform):
    """Return the ConllCorpusReader column layout for a language/data-form pair."""
    if dataform == 'conllpos':
        return ['words', 'chunk', 'pos']
    layouts = {
        ('deu', 'conll'): ['words', 'ignore', 'pos', 'ignore', 'chunk'],
        ('eng', 'conll'): ['words', 'pos', 'ignore', 'chunk'],
        ('deu', 'germeval'): ['ignore', 'words', 'chunk', 'ignore', 'pos'],
    }
    if (language, dataform) not in layouts:
        raise ValueError("This combination of data language and data form does not exist.")
    return layouts[(language, dataform)]


def _read_samples(datadir, language):
    """Read the non-empty sample files ``<language>.0 .. <language>.<n-1>`` from ``datadir``."""
    prefix = language + '.'
    # Count only the files of the requested language: these are the only ones
    # opened below, so files of the other language must not inflate the range.
    num_files = len(fnmatch.filter(os.listdir(datadir), prefix + '*'))
    samples = []
    for i in range(num_files):
        with open(os.path.join(datadir, prefix + str(i))) as sample_file:
            samples.append(sample_file.read())
    # Drop empty files so they do not count as samples.
    return [sample for sample in samples if sample]


def _split_into_folds(samples, k):
    """Concatenate ``samples`` into ``k`` fold strings whose sample counts differ by at most one."""
    base_size, num_extra = divmod(len(samples), k)
    folds = []
    start = 0
    for fold_index in range(k):
        # The first ``num_extra`` folds take one extra sample each.
        size = base_size + (fold_index < num_extra)
        folds.append(''.join(sample + '\n' for sample in samples[start:start + size]))
        start += size
    return folds


def cross_validation(k, language, dataform, datadir, resultsdir, seed=None):
    """Run k-fold cross-validation of NLTK's NEChunkParser and export the scores as CSV.

    The sample files ``<language>.<i>`` in ``datadir`` are read, shuffled and
    split into ``k`` folds. For every fold, the remaining folds are written to
    ``<datadir>/CV/iteration<i>.train`` and the held-out fold to
    ``<datadir>/CV/iteration<i>.test``; a chunker is trained on the former and
    evaluated on the latter.

    Args:
        k: Number of folds (at least 1).
        language: ``'eng'`` or ``'deu'``; also the file-name prefix of the samples.
        dataform: ``'conllpos'``, ``'conll'`` or ``'germeval'``; selects the
            column layout passed to ``ConllCorpusReader``.
        datadir: Directory containing the sample files; the ``CV`` working
            directory is created inside it if missing.
        resultsdir: Path of the CSV file the per-iteration results are written to.
        seed: Optional seed for the shuffle. ``None`` (default) keeps the
            original behaviour of shuffling with the global random state.

    Raises:
        ValueError: If ``language`` is unsupported, ``k`` is smaller than 1, or
            the language/dataform combination is unknown.
    """
    if language not in ('eng', 'deu'):
        raise ValueError('Language parameter incorrect.')
    if k < 1:
        raise ValueError('k must be at least 1.')
    # Validate the combination up front, before any file is read or written.
    column_types = _conll_column_types(language, dataform)

    # NOTE: the 'Precision:' label (with colon) is kept as-is so the CSV header
    # stays identical to previously exported result files.
    columnlist = ['IOB Accuracy','Precision:','Recall','F-Measure', 'True Positives', 'False Positives', 'False Negatives']

    samplels = _read_samples(datadir, language)
    if seed is None:
        shuffle(samplels)
    else:
        from random import Random
        Random(seed).shuffle(samplels)
    print('Cross validation for ' + str(len(samplels)) + ' samples has begun')

    folds = _split_into_folds(samplels, k)

    cv_dir = os.path.join(datadir, 'CV')
    os.makedirs(cv_dir, exist_ok=True)

    rows = []
    row_labels = []
    for i in range(k):
        print("Training for iteration " + str(i+1) + ' started')
        train_name = 'iteration' + str(i) + '.train'
        with open(os.path.join(cv_dir, train_name), 'w') as tr:
            # Every fold except the held-out one, in their original order.
            tr.write(''.join(folds[:i] + folds[i+1:]))
        training = ConllCorpusReader(cv_dir, train_name, column_types)
        ne_chunker = nltk.chunk.named_entity.NEChunkParser(training.chunked_sents())
        print("Training for iteration " + str(i+1) + ' completed')

        test_name = 'iteration' + str(i) + '.test'
        with open(os.path.join(cv_dir, test_name), 'w') as te:
            te.write(folds[i])
        test = ConllCorpusReader(cv_dir, test_name, column_types)
        print("Evaluation for iteration " + str(i+1) + ' started')
        results = ne_chunker.evaluate(test.chunked_sents())
        rows.append([results.accuracy(), results.precision(), results.recall(), results.f_measure(),
                     results._tp_num, results._fp_num, results._fn_num])
        row_labels.append('Iteration' + str(i + 1))
        print("Evaluation for iteration " + str(i+1) + ' completed')

    # Build the frame once: DataFrame.append no longer exists in pandas >= 2.0.
    df = pd.DataFrame(rows, columns=columnlist, index=row_labels)
    df.to_csv(resultsdir)
    print('results exported to: ' + resultsdir)
    print('Cross validation complete.')

        #comments:
        #read the files in a list 
        #put the texts to each fold together
        #train
        #predict, use the previous scoring function
        #export
    

In [99]:
# print('10-fold, document-level, Conll2003, German, nltk NEChunkParser, IIS')
# %time cross_validation(10, 'deu', 'conll', '../../data/conll2003/splitdeu', '../../results/7th.csv')

Training for iteration 1
['../../megam_i686.opt', '-nobias', '-repeat', '10', '-explicit', '-lambda', '1.00', '-tune', '-quiet', 'multiclass', '/tmp/nltk-1pdvrsdc']
Training for iteration 2
['../../megam_i686.opt', '-nobias', '-repeat', '10', '-explicit', '-lambda', '1.00', '-tune', '-quiet', 'multiclass', '/tmp/nltk-exri4mkp']
Training for iteration 3
['../../megam_i686.opt', '-nobias', '-repeat', '10', '-explicit', '-lambda', '1.00', '-tune', '-quiet', 'multiclass', '/tmp/nltk-9exchqnj']
Training for iteration 4
['../../megam_i686.opt', '-nobias', '-repeat', '10', '-explicit', '-lambda', '1.00', '-tune', '-quiet', 'multiclass', '/tmp/nltk-9ybh6fka']
Training for iteration 5
['../../megam_i686.opt', '-nobias', '-repeat', '10', '-explicit', '-lambda', '1.00', '-tune', '-quiet', 'multiclass', '/tmp/nltk-atskr0eb']
Training for iteration 6
['../../megam_i686.opt', '-nobias', '-repeat', '10', '-explicit', '-lambda', '1.00', '-tune', '-quiet', 'multiclass', '/tmp/nltk-ayf391a2']
Training f

In [None]:
# The training data file is too big (24M); the biggest one that has worked so far is 9M. For the error message, see the Discord message to Moritz.
# print('10-fold, sentence-level, LegalER coarse-grained, German, nltk NEChunkParser, IIS')
# %time cross_validation(10, 'deu', 'conllpos', '../../data/LegalER/dataset_courts/coarse-grained/split', '../../results/8th.csv')

10-fold, sentence-level, LegalER coarse-grained, German, nltk NEChunkParser, IIS
Cross validation for 66723 samples has begun
Training for iteration 1 started
The algorithm IIS is being used!
  ==> Training (10 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -2.94444        0.809
             2          -0.23688        0.889
             3          -0.17753        0.942
             4          -0.14499        0.957
             5          -0.12441        0.964
             6          -0.11016        0.969
             7          -0.09961        0.973
             8          -0.09141        0.976
             9          -0.08480        0.979
         Final          -0.07933        0.981
Training for iteration 1 completed
Evaluation for iteration 1 started
Evaluation for iteration 1 completed
Training for iteration 2 started
The algorithm IIS is being used!
  ==> Training (10 iterations)

      Iteration    

In [None]:
# print('10-fold, sentence-document-level, GermEval2014, German, nltk NEChunkParser, IIS')
# %time cross_validation(10, 'deu', 'germeval', '../../data/GermEval2014/split', '../../results/9th.csv')

10-fold, sentence-document-level, GermEval2014, German, nltk NEChunkParser, megam
Cross validation for 31300 samples has begun
Training for iteration 1 started
['../../megam_i686.opt', '-nobias', '-repeat', '10', '-explicit', '-lambda', '1.00', '-tune', '-quiet', 'multiclass', '/tmp/nltk-lmzyh_ot']
Training for iteration 1 completed
Evaluation for iteration 1 started
Evaluation for iteration 1 completed
Training for iteration 2 started
['../../megam_i686.opt', '-nobias', '-repeat', '10', '-explicit', '-lambda', '1.00', '-tune', '-quiet', 'multiclass', '/tmp/nltk-bcbp5v9j']
Training for iteration 2 completed
Evaluation for iteration 2 started
Evaluation for iteration 2 completed
Training for iteration 3 started
['../../megam_i686.opt', '-nobias', '-repeat', '10', '-explicit', '-lambda', '1.00', '-tune', '-quiet', 'multiclass', '/tmp/nltk-tgec57rm']
Training for iteration 3 completed
Evaluation for iteration 3 started
Evaluation for iteration 3 completed
Training for iteration 4 started


In [151]:
# print('10-fold, document-level, Conll2003, English, nltk NEChunkParser, IIS')
# %time cross_validation(10, 'eng', 'conll', '../../data/conll2003/spliteng', '../../results/10th.csv')

Training for iteration 1
['../../megam_i686.opt', '-nobias', '-repeat', '10', '-explicit', '-lambda', '1.00', '-tune', '-quiet', 'multiclass', '/tmp/nltk-1dedqduh']
Training for iteration 2
['../../megam_i686.opt', '-nobias', '-repeat', '10', '-explicit', '-lambda', '1.00', '-tune', '-quiet', 'multiclass', '/tmp/nltk-am8ryju1']
Training for iteration 3
['../../megam_i686.opt', '-nobias', '-repeat', '10', '-explicit', '-lambda', '1.00', '-tune', '-quiet', 'multiclass', '/tmp/nltk-yftmgkrt']
Training for iteration 4
['../../megam_i686.opt', '-nobias', '-repeat', '10', '-explicit', '-lambda', '1.00', '-tune', '-quiet', 'multiclass', '/tmp/nltk-eydgh9d_']
Training for iteration 5
['../../megam_i686.opt', '-nobias', '-repeat', '10', '-explicit', '-lambda', '1.00', '-tune', '-quiet', 'multiclass', '/tmp/nltk-n229ua_g']
Training for iteration 6
['../../megam_i686.opt', '-nobias', '-repeat', '10', '-explicit', '-lambda', '1.00', '-tune', '-quiet', 'multiclass', '/tmp/nltk-9ffgtrkt']
Training f

In [13]:
# finished
# print('8-fold, document-level, SEC-filings, English, nltk NEChunkParser, IIS')
# %time cross_validation(8, 'eng', 'conll', '../../data/SEC-filings/CONLL-format/data/split', '../../results/11th.csv')

8-fold, document-level, SEC-filings, English, nltk NEChunkParser, IIS
Cross validation for 8 samples has begun
Training for iteration 1 started
The algorithm IIS is being used!
  ==> Training (10 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -2.07944        0.961
             2          -0.06365        0.961
             3          -0.05480        0.971
             4          -0.04446        0.983
             5          -0.03736        0.989
             6          -0.03240        0.992
             7          -0.02876        0.994
             8          -0.02596        0.995
             9          -0.02372        0.996
         Final          -0.02190        0.997
Training for iteration 1 completed
Evaluation for iteration 1 started
Evaluation for iteration 1 completed
Training for iteration 2 started
The algorithm IIS is being used!
  ==> Training (10 iterations)

      Iteration    Log Likelihood 

In [3]:
# print('10-fold, sentence-document-level, WNUT17, English, nltk NEChunkParser, IIS')
# %time cross_validation(10, 'eng', 'conllpos', '../../data/WNUT17/CONLL-format/data/split', '../../results/12th.csv')

10-fold, sentence-document-level, WNUT17, English, nltk NEChunkParser, megam
Cross validation for 5690 samples has begun
Training for iteration 1 started
['../../megam_i686.opt', '-nobias', '-repeat', '10', '-explicit', '-lambda', '1.00', '-tune', '-quiet', 'multiclass', '/tmp/nltk-j2qmpmsm']
Training for iteration 1 completed
Evaluation for iteration 1 started
Evaluation for iteration 1 completed
Training for iteration 2 started
['../../megam_i686.opt', '-nobias', '-repeat', '10', '-explicit', '-lambda', '1.00', '-tune', '-quiet', 'multiclass', '/tmp/nltk-1h0yfeg_']
Training for iteration 2 completed
Evaluation for iteration 2 started
Evaluation for iteration 2 completed
Training for iteration 3 started
['../../megam_i686.opt', '-nobias', '-repeat', '10', '-explicit', '-lambda', '1.00', '-tune', '-quiet', 'multiclass', '/tmp/nltk-3wfn72s_']
Training for iteration 3 completed
Evaluation for iteration 3 started
Evaluation for iteration 3 completed
Training for iteration 4 started
['../.

In [None]:
print('10-fold, Conll2003 + GermEval2014, German, nltk NEChunkParser, IIS')
%time cross_validation(10, 'ger', 'conll', '../../data/GermEval2014/split/final', '../../results/21st.csv')