In [1]:
import codecs
import numpy as np
import pandas as pd
import scipy
import sys, codecs, json, math, time, warnings
import sklearn_crfsuite
import sklearn
import nltk
nltk.download('averaged_perceptron_tagger')
warnings.simplefilter( action='ignore', category=FutureWarning )

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\tosin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
def create_dataset( dataset_file, max_files = 50 ) :
    # load parsed ontonotes dataset
    readHandle = codecs.open( dataset_file, 'r', 'utf-8', errors = 'replace' )
    str_json = readHandle.read()
    readHandle.close()
    dict_ontonotes = json.loads( str_json )

    # make a training and test split
    list_files = list( dict_ontonotes.keys() )
    if len(list_files) > max_files :
        list_files = list_files[ :max_files ]
    nSplit = math.floor( len(list_files)*0.99 )
    list_train_files = list_files[ : nSplit ]
    list_test_files = list_files[ nSplit : ]

    # sent = (tokens, pos, IOB_label)
    list_train = []
    for str_file in list_train_files :
        for str_sent_index in dict_ontonotes[str_file] :
            # ignore sents with non-PENN POS tags
            if 'XX' in dict_ontonotes[str_file][str_sent_index]['pos'] :
                continue
            if 'VERB' in dict_ontonotes[str_file][str_sent_index]['pos'] :
                continue

            list_entry = []

            # compute IOB tags for named entities (if any)
            ne_type_last = None
            for nTokenIndex in range(len(dict_ontonotes[str_file][str_sent_index]['tokens'])) :
                strToken = dict_ontonotes[str_file][str_sent_index]['tokens'][nTokenIndex]
                strPOS = dict_ontonotes[str_file][str_sent_index]['pos'][nTokenIndex]
                ne_type = None
                if 'ne' in dict_ontonotes[str_file][str_sent_index] :
                    dict_ne = dict_ontonotes[str_file][str_sent_index]['ne']
                    if not 'parse_error' in dict_ne :
                        for str_NEIndex in dict_ne :
                            if nTokenIndex in dict_ne[str_NEIndex]['tokens'] :
                                ne_type = dict_ne[str_NEIndex]['type']
                                break
                if ne_type != None :
                    if ne_type == ne_type_last :
                        strIOB = 'I-' + ne_type
                    else :
                        strIOB = 'B-' + ne_type
                else :
                    strIOB = 'O'
                ne_type_last = ne_type
                
                list_entry.append( ( strToken, strPOS, strIOB ) )

            list_train.append( list_entry )

    list_test = []
    for str_file in list_test_files :
        for str_sent_index in dict_ontonotes[str_file] :
			# ignore sents with non-PENN POS tags
            if 'XX' in dict_ontonotes[str_file][str_sent_index]['pos'] :
                continue
            if 'VERB' in dict_ontonotes[str_file][str_sent_index]['pos'] :
                continue

            list_entry = []

            #compute IOB tags for named entities (if any)
            ne_type_last = None
            for nTokenIndex in range(len(dict_ontonotes[str_file][str_sent_index]['tokens'])) :
                strToken = dict_ontonotes[str_file][str_sent_index]['tokens'][nTokenIndex]
                strPOS = dict_ontonotes[str_file][str_sent_index]['pos'][nTokenIndex]
                ne_type = None
                if 'ne' in dict_ontonotes[str_file][str_sent_index] :
                    dict_ne = dict_ontonotes[str_file][str_sent_index]['ne']
                    if not 'parse_error' in dict_ne :
                        for str_NEIndex in dict_ne :
                            if nTokenIndex in dict_ne[str_NEIndex]['tokens'] :
                                ne_type = dict_ne[str_NEIndex]['type']
                                break
                if ne_type != None:
                    if ne_type == ne_type_last :
                        strIOB = 'I-' + ne_type
                    else :
                        strIOB = 'B-' + ne_type
                else :
                    strIOB = 'O'
                ne_type_last = ne_type
                
                list_entry.append( ( strToken, strPOS, strIOB ) )
            list_test.append( list_entry )
        return list_train, list_test

In [3]:
import re
def names(check):
    names_possibility = "((?:Dr|Mr|Mrs|Miss|Sir|Lord|Lady|King|Professor|Doctor|Madam|Gentleman|Dame)\.?\s*(?:[A-Z]\.?\s?)*(?:[A-Z][a-z0-9\-\.]+\s?)+|(?:[A-Z]\.\s?)+(?:[A-Z][a-z0-9\-\.]+\s?)+)"
    result = re.findall(names_possibility, check)
    result = [re.sub('\r|\n|\'|"|“|’', '', word.strip().strip('.').lower(), re.IGNORECASE) for word in result]
    return result

def names_match(text):
    names_possibility = "((?:Dr|Mr|Mrs|Miss|Sir|Lord|Lady|King|Professor|Doctor|Madam|Gentleman|Dame)\.?\s*(?:[A-Z]\.?\s?)*(?:[A-Z][a-z0-9\-\.]+\s?)+|(?:[A-Z]\.\s?)+(?:[A-Z][a-z0-9\-\.]+\s?)+)"
    return re.findall(names_possibility, text)

In [4]:
def preprocess_textfile(filename):
    text = ''
    for line in codecs.open(filename, "r", encoding="utf-8"):
        text += line
    sentences = nltk.sent_tokenize(text)
    word_pos_tags = [nltk.pos_tag(nltk.word_tokenize(word)) for word in sentences]
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # stemmer = nltk.stem.porter.PorterStemmer()
    word_lemmas = []
    sents = names(text)
    for sent in word_pos_tags:
        # word_lemmas.append([(stemmer.stem(word[0]), word[1]) for word in sent])
        word_lemmas.append([(lemmatizer.lemmatize(word[0]), word[1]) for word in sent])
    return word_pos_tags, sents

In [13]:
def task2_word2features(sent, i):
	word = sent[i][0]
	postag = sent[i][1]

	features = {
		'word' : word,
		'postag': postag,

		# token shape
		'word.lower()': word.lower(),
		'word.isupper()': word.isupper(),
		'word.istitle()': word.istitle(),
		'word.isdigit()': word.isdigit(),
		# token suffix
		'word.suffix': word.lower()[-3:],

		# POS prefix
		'postag[:2]': postag[:2],
	}
	if i > 0:
		word_prev = sent[i-1][0]
		postag_prev = sent[i-1][1]
		features.update({
			'-1:word.lower()': word_prev.lower(),
			'-1:postag': postag_prev,
			'-1:word.lower()': word_prev.lower(),
			'-1:word.isupper()': word_prev.isupper(),
			'-1:word.istitle()': word_prev.istitle(),
			'-1:word.isdigit()': word_prev.isdigit(),
			'-1:word.suffix': word_prev.lower()[-3:],
			'-1:postag[:2]': postag_prev[:2],
		})
	else:
		features['BOS'] = True

	if i < len(sent)-1:
		word_next = sent[i+1][0]
		postag_next = sent[i+1][1]
		features.update({
			'+1:word.lower()': word_next.lower(),
			'+1:postag': postag_next,
			'+1:word.lower()': word_next.lower(),
			'+1:word.isupper()': word_next.isupper(),
			'+1:word.istitle()': word_next.istitle(),
			'+1:word.isdigit()': word_next.isdigit(),
			'+1:word.suffix': word_next.lower()[-3:],
			'+1:postag[:2]': postag_next[:2],
		})
	else:
		features['EOS'] = True

	return features

In [6]:
def sent2features(sent, word2features_func = None):
	return [word2features_func(sent, i) for i in range(len(sent))]

def sent2labels(sent):
	return [label for token, postag, label in sent]

def sent2tokens(sent):
	return [token for token, postag, label in sent]

In [20]:
def exec_task(filebook, dataset_file, word2features_func, max_files = 20,train_crf_model_func = None,  max_iter = 50) :
    #make a dataset from english NE labelled ontonotes sents
    train_sents, test_sents = create_dataset( dataset_file, max_files = max_files )
    txt_sents, sentences = preprocess_textfile(filebook)
    
   
    #sent = names(text)
    #word_pos_tags = [nltk.pos_tag(nltk.word_tokenize(word)) for word in sent]
    

    # create feature vectors for every sent
    X_train = [sent2features(s, word2features_func = word2features_func) for s in train_sents]
    Y_train = [sent2labels(s) for s in train_sents]
    
    unsup_text = [sent2features(s, word2features_func = word2features_func) for s in txt_sents]
    #name_text = [sent2features(s, word2features_func = word2features_func) for s in word_pos_tags]

    X_test = [sent2features(s, word2features_func = word2features_func) for s in test_sents]
    Y_test = [sent2labels(s) for s in test_sents]
    

    vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(sublinear_tf=True, encoding='utf-8', decode_error='ignore', lowercase=False)
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)
    chap_vectorized = vectorizer.transform(unsup_text)

    # get the label set
    set_labels = set([])
    for data in [Y_train,Y_test] :
        for n_sent in range(len(data)) :
            for str_label in data[n_sent] :
                set_labels.add( str_label )
    labels = list( set_labels )

    # remove 'O' label as we are not usually interested in how well 'O' is predicted
    #labels = list( crf.classes_ )
    labels.remove('O')
    
    
    crf = train_crf_model_func( X_train_vectorized, Y_train, max_iter, labels)
    Y_pred = crf.predict( X_test_vectorized )
    #names_pred = crf.predict(name_text)
    result_pred = crf.predict(chap_vectorized)
    #print(len(Y_pred[1]), len(X_test[1]))
    sorted_labels = sorted(
        labels, 
        key=lambda name: (name[1:], name[0])
    )
     
    macro_scores = sklearn_crfsuite.metrics.flat_classification_report( Y_test, Y_pred, labels=sorted_labels)
    print( macro_scores )
    result = []
    accept = ['PERSON']
    for i in range(0,len(result_pred)):
        conlltags = [(word['word'], word['postag'] , tg) for tg, word in zip(result_pred[i], unsup_text[i])]
        ne_tree = nltk.chunk.conlltags2tree(conlltags)
        for subtree in ne_tree:
            if type(subtree) == nltk.tree.Tree:
                original_label = subtree.label()
                original_string = " ".join([token for token, pos in subtree.leaves()]).lower().strip()
                result.append((original_string, original_label))
    """for i in range(0,len(names_pred)):
        conlltags = [(word['word'], word['postag'] , tg) for tg, word in zip(names_pred[i], name_text[i])]
        ne_tree = nltk.chunk.conlltags2tree(conlltags)
        for subtree in ne_tree:
            if type(subtree) == nltk.tree.Tree:
                original_label = subtree.label()
                original_string = " ".join([token for token, pos in subtree.leaves()]).lower().strip()
                result.append((original_string, original_label))"""
    d = []
    for value,key in result:
        value = re.sub('\r|\n|\'|"|“|’', '', value.strip(), re.IGNORECASE)
        if value not in d:
            d.append(value)
    
    d = d + sentences
    return set(d)

In [21]:
def exec_task(filebook, dataset_file, word2features_func, max_files=10, train_crf_model_func=None, max_iter=100):
    # make a dataset from english NE labelled ontonotes sents
    train_sents, test_sents = create_dataset(dataset_file, max_files=max_files)
    txt_sents, sentences = preprocess_textfile(filebook)
    #tagged_words = [(word, tag) for word, tag in train_sents]

    """fd = nltk.FreqDist(tagged_words)
    smoothing_factor = 1  # Add-one smoothing
    lpd = LidstoneProbDist(fd, smoothing_factor)
    for word_pos_tuple in tagged_words:
        word, pos_tag = word_pos_tuple
        smoothed_prob = lpd.prob(word_pos_tuple)
        word_pos_tuple = (word,pos_tag,smoothed_prob)"""

    # create feature vectors for every sent
    X_train = [sent2features(s, word2features_func=word2features_func) for s in train_sents]
    Y_train = [sent2labels(s) for s in train_sents]
    unsup_text = [sent2features(s, word2features_func=word2features_func) for s in txt_sents]



    X_test = [sent2features(s, word2features_func=word2features_func) for s in test_sents]
    Y_test = [sent2labels(s) for s in test_sents]

    vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(sublinear_tf=True, encoding='utf-8', decode_error='ignore', lowercase=False) 
    X_train_vectorized = vectorizer.fit_transform(X_train)
    X_test_vectorized = vectorizer.transform(X_test)
    chap_vectorized = vectorizer.transform(unsup_text)
    
    # get the label set
    set_labels = set([])
    for data in [Y_train, Y_test]:
        for n_sent in range(len(data)):
            for str_label in data[n_sent]:
                set_labels.add(str_label)
    labels = list(set_labels)

    # remove 'O' label as we are not usually interested in how well 'O' is predicted
    # labels = list( crf.classes_ )
    labels.remove('O')
    
    crf = train_crf_model_func( X_train_vectorized, Y_train, max_iter, labels)
    Y_pred = crf.predict( X_test_vectorized )
    #names_pred = crf.predict(name_text)
    result_pred = crf.predict(chap_vectorized)
    #print(len(Y_pred[1]), len(X_test[1]))
    sorted_labels = sorted(
        labels, 
        key=lambda name: (name[1:], name[0])
    )

    # macro_scores = sklearn_crfsuite.metrics.flat_classification_report( Y_test, Y_pred, labels=sorted_labels)
    # print( macro_scores )
    result = []
    # print(len(X_test[1]), len(Y_pred[1]))
    for i in range(0, len(result_pred)):
        conlltags = [(word['word'], word['postag'], tg) for tg, word in zip(result_pred[i], unsup_text[i])]
        ne_tree = nltk.chunk.conlltags2tree(conlltags)
        for subtree in ne_tree:
            if type(subtree) == nltk.tree.Tree:
                original_label = subtree.label()
                original_string = " ".join([token for token, pos in subtree.leaves()]).lower().strip()
                result.append((original_string, original_label))
    d = {}
    for value, key in result:
        value = re.sub('[\r|\n|\'|"|“|’|\t]+', '', value.strip(), re.IGNORECASE)
        if key in d and value not in d[key] or key not in d:
            d.setdefault(key, []).append(value)
            
    for value in sentences:
        value = re.sub('[\r|\n|\'|"|“|’|\t]+', '', value.strip(), re.IGNORECASE)
        if 'PERSON' in d and value not in d['PERSON'] or 'PERSON' not in d:
            d.setdefault('PERSON', []).append(value)
    return d

In [22]:
from sklearn.metrics import make_scorer
from sklearn_crfsuite import metrics
def task5_train_crf_model( X_train, Y_train, max_iter, labels ) :
    crf = sklearn_crfsuite.CRF(
      algorithm='lbfgs',
      max_iterations=100,
      all_possible_transitions=True
  )
    params_space = {
      'c1': scipy.stats.expon(scale=0.5),
      'c2': scipy.stats.expon(scale=0.05),
  }

  # use the same metric for evaluation
    f1_scorer = make_scorer(sklearn_crfsuite.metrics.flat_f1_score,
                          average='weighted', labels=labels)

  # search
    rs = sklearn.model_selection.RandomizedSearchCV(crf, params_space,
                          cv=3,
                          verbose=1,
                          n_jobs=-1,
                          n_iter=10,
                          scoring=f1_scorer)
    rs.fit(X_train, Y_train)
    print('best params:', rs.best_params_)
    print('best CV score:', rs.best_score_)
    print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))
    return rs

In [23]:
def task3_train_crf_model( X_train, Y_train, max_iter, labels ) :
	# train the basic CRF model
	crf = sklearn_crfsuite.CRF(
		algorithm='lbfgs',
		c1=0.15632504647140685,
		c2=0.043957202811694684,
		max_iterations=max_iter,
		all_possible_transitions=True,
	)
	crf.fit(X_train, Y_train)
	return crf

In [24]:
file_book = 'C:\\Users\\tosin\\Documents\\Comp3225_coursework\\comp3225_example_package\\eval_chapter.txt'
ontonotes = 'C:\\Users\\tosin\\Documents\\Comp3225_coursework\\comp3225_example_package\\ontonotes_parsed.json'
exec_task(file_book, ontonotes, word2features_func = task2_word2features,train_crf_model_func = task3_train_crf_model)

TypeError: expected string or bytes-like object

In [12]:
text = 'Rachel B. Smith'
for line in codecs.open(file_book, "r", encoding="utf-8"):
    text += line
print(set(names(text)))

{'mrs.creakle', 'mr. mell', 'mr. creakle', 'j. steerforth', 'mrs. mell', 'mrs. creakle', 'j. steerforth. steerforth', 'mr. sharp', 'mr. mell. mr. mell', 'mr.creakle', 'b. smith', 'miss creakle'}
