In [1]:
import codecs
import numpy as np
import pandas as pd
import scipy
import sys, codecs, json, math, time, warnings
import sklearn_crfsuite
import sklearn
import nltk
nltk.download('averaged_perceptron_tagger')
warnings.simplefilter( action='ignore', category=FutureWarning )

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\tosin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
file_book = 'C:\\Users\\tosin\\Documents\\Comp3225_coursework\\books_nlp\\book_1.txt'
ontonotes = 'C:\\Users\\tosin\\Documents\\Comp3225_coursework\\comp3225_example_package\\ontonotes_parsed.json'

In [34]:
def preprocess_textfile(filename):
    text = ''
    for line in codecs.open(file_book, "r", encoding="utf-8"):
        text += line
    sentences = nltk.sent_tokenize(text)
    word_pos_tags = [nltk.pos_tag(nltk.word_tokenize(word)) for word in sentences]
    lemmatizer = nltk.stem.WordNetLemmatizer()
    stemmer = nltk.stem.porter.PorterStemmer()
    word_lemmas = []
    for sent in word_pos_tags:
        word_lemmas.append([(stemmer.stem(word[0]), word[1]) for word in sent])
       # word_lemmas.append([(lemmatizer.lemmatize(word[0]), word[1]) for word in sent])
    return word_pos_tags, word_pos_tags

In [36]:
def task2_word2features(sent, i):
	word = sent[i][0]
	postag = sent[i][1]

	features = {
		'word' : word,
		'postag': postag,

		# token shape
		'word.lower()': word.lower(),
		'word.isupper()': word.isupper(),
		'word.istitle()': word.istitle(),
		'word.isdigit()': word.isdigit(),

		# token suffix
		'word.suffix': word.lower()[-3:],

		# POS prefix
		'postag[:2]': postag[:2],
	}
	if i > 0:
		word_prev = sent[i-1][0]
		postag_prev = sent[i-1][1]
		features.update({
			'-1:word.lower()': word_prev.lower(),
			'-1:postag': postag_prev,
			'-1:word.lower()': word_prev.lower(),
			'-1:word.isupper()': word_prev.isupper(),
			'-1:word.istitle()': word_prev.istitle(),
			'-1:word.isdigit()': word_prev.isdigit(),
			'-1:word.suffix': word_prev.lower()[-3:],
			'-1:postag[:2]': postag_prev[:2],
		})
	else:
		features['BOS'] = True

	if i < len(sent)-1:
		word_next = sent[i+1][0]
		postag_next = sent[i+1][1]
		features.update({
			'+1:word.lower()': word_next.lower(),
			'+1:postag': postag_next,
			'+1:word.lower()': word_next.lower(),
			'+1:word.isupper()': word_next.isupper(),
			'+1:word.istitle()': word_next.istitle(),
			'+1:word.isdigit()': word_next.isdigit(),
			'+1:word.suffix': word_next.lower()[-3:],
			'+1:postag[:2]': postag_next[:2],
		})
	else:
		features['EOS'] = True

	return features

In [37]:
def task1_word2features(sent, i):

	word = sent[i][0]
	postag = sent[i][1]

	features = {
		# basic features - token and POS tag
		'word' : word,
		'postag': postag,
	}
	if i > 0:
		# features for previous word (context)
		word_prev = sent[i-1][0]
		postag_prev = sent[i-1][1]
		features.update({
			'-1:word.lower()': word_prev.lower(),
			'-1:postag': postag_prev,
		})
	else:
		features['BOS'] = True

	if i < len(sent)-1:
		# features for next word (context)
		word_next = sent[i+1][0]
		postag_next = sent[i+1][1]
		features.update({
			'+1:word.lower()': word_next.lower(),
			'+1:postag': postag_next,
		})
	else:
		features['EOS'] = True

	return features

In [38]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
    return features

In [39]:
def sent2features(sent, word2features_func = None):
	return [word2features_func(sent, i) for i in range(len(sent))]

def sent2labels(sent):
	return [label for token, postag, label in sent]

def sent2tokens(sent):
	return [token for token, postag, label in sent]



In [1]:
def exec_task(filebook, dataset_file, word2features_func, max_files = 20,train_crf_model_func = None,  max_iter = 50) :
    #make a dataset from english NE labelled ontonotes sents
    train_sents, test_sents = create_dataset( dataset_file, max_files = max_files )
    txt_sents, sentences = preprocess_textfile(file_book)
    

    # create feature vectors for every sent
    X_train = [sent2features(s, word2features_func = word2features_func) for s in train_sents]
    Y_train = [sent2labels(s) for s in train_sents]
    unsup_text = [sent2features(s, word2features_func = word2features_func) for s in txt_sents]

    X_test = [sent2features(s, word2features_func = word2features_func) for s in test_sents]
    Y_test = [sent2labels(s) for s in test_sents]

    # get the label set
    set_labels = set([])
    for data in [Y_train,Y_test] :
        for n_sent in range(len(data)) :
            for str_label in data[n_sent] :
                set_labels.add( str_label )
    labels = list( set_labels )

    # remove 'O' label as we are not usually interested in how well 'O' is predicted
    #labels = list( crf.classes_ )
    labels.remove('O')
    

    crf = train_crf_model_func( X_train, Y_train, max_iter, labels)
    Y_pred = crf.predict( X_test )
    result_pred = crf.predict(unsup_text)
    #print(len(Y_pred[1]), len(X_test[1]))
    sorted_labels = sorted(
        labels, 
        key=lambda name: (name[1:], name[0])
    )
     
    macro_scores = sklearn_crfsuite.metrics.flat_classification_report( Y_test, Y_pred, labels=sorted_labels)
    print( macro_scores )
    result = []
    accept = ['DATE','CARDINAL','ORDINAL','NORP']
    #print(len(X_test[1]), len(Y_pred[1]))
    for i in range(0,len(Y_pred)):
        conlltags = [(word['word'], word['postag'] , tg) for tg, word in zip(result_pred[i], unsup_text[i])]
        ne_tree = nltk.chunk.conlltags2tree(conlltags)
        for subtree in ne_tree:
            if type(subtree) == nltk.tree.Tree:
                original_label = subtree.label()
                original_string = " ".join([token for token, pos in subtree.leaves()]).lower().strip()
                result.append((original_string, original_label))
    d = {}
    for value,key in result:
        d.setdefault(key, []).append(value)
    return d
   


In [2]:
from sklearn.metrics import make_scorer
from sklearn_crfsuite import metrics
def task1_train_crf_model( X_train, Y_train, max_iter, labels ) :
    crf = sklearn_crfsuite.CRF(
      algorithm='lbfgs',
      max_iterations=100,
      all_possible_transitions=True
  )
    params_space = {
      'c1': scipy.stats.expon(scale=0.05),
      'c2': scipy.stats.expon(scale=0.5),
  }
    
  # use the same metric for evaluation
    f1_scorer = make_scorer(sklearn_crfsuite.metrics.flat_f1_score,
                          average='weighted', labels=labels)
  # search
    rs = sklearn.model_selection.RandomizedSearchCV(crf, params_space,
                          cv=3,
                          verbose=1,
                          n_jobs=-1,
                          n_iter=30,
                          scoring=f1_scorer)
    rs.fit(X_train, Y_train)
    print('best params:', rs.best_params_)
    print('best CV score:', rs.best_score_)
    print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))
    return rs

In [3]:
def task3_train_crf_model( X_train, Y_train, max_iter, labels ) :
	# train the basic CRF model
	crf = sklearn_crfsuite.CRF(
		algorithm='lbfgs',
		c1=0.1,
		c2=0.1,
		max_iterations=max_iter,
		all_possible_transitions=True,
	)
	crf.fit(X_train, Y_train)
	return crf

In [4]:
def create_dataset( dataset_file, max_files = 50 ) :
	# load parsed ontonotes dataset
	readHandle = codecs.open( dataset_file, 'r', 'utf-8', errors = 'replace' )
	str_json = readHandle.read()
	readHandle.close()
	dict_ontonotes = json.loads( str_json )
    
	# make a training and test split
	list_files = list( dict_ontonotes.keys() )
	if len(list_files) > max_files :
		list_files = list_files[ :max_files ]
	nSplit = math.floor( len(list_files)*0.9 )
	list_train_files = list_files[ : nSplit ]
	list_test_files = list_files[ nSplit : ]

	# sent = (tokens, pos, IOB_label)
	list_train = []
	for str_file in list_train_files :
		for str_sent_index in dict_ontonotes[str_file] :
			# ignore sents with non-PENN POS tags
			if 'XX' in dict_ontonotes[str_file][str_sent_index]['pos'] :
				continue
			if 'VERB' in dict_ontonotes[str_file][str_sent_index]['pos'] :
				continue

			list_entry = []

			# compute IOB tags for named entities (if any)
			ne_type_last = None
			for nTokenIndex in range(len(dict_ontonotes[str_file][str_sent_index]['tokens'])) :
				strToken = dict_ontonotes[str_file][str_sent_index]['tokens'][nTokenIndex]
				strPOS = dict_ontonotes[str_file][str_sent_index]['pos'][nTokenIndex]
				ne_type = None
				if 'ne' in dict_ontonotes[str_file][str_sent_index] :
					dict_ne = dict_ontonotes[str_file][str_sent_index]['ne']
					if not 'parse_error' in dict_ne :
						for str_NEIndex in dict_ne :
							if nTokenIndex in dict_ne[str_NEIndex]['tokens'] :
								ne_type = dict_ne[str_NEIndex]['type']
								break
				if ne_type != None :
					if ne_type == ne_type_last :
						strIOB = 'I-' + ne_type
					else :
						strIOB = 'B-' + ne_type
				else :
					strIOB = 'O'
				ne_type_last = ne_type

				list_entry.append( ( strToken, strPOS, strIOB ) )

			list_train.append( list_entry )

	list_test = []
	for str_file in list_test_files :
		for str_sent_index in dict_ontonotes[str_file] :
			# ignore sents with non-PENN POS tags
			if 'XX' in dict_ontonotes[str_file][str_sent_index]['pos'] :
				continue
			if 'VERB' in dict_ontonotes[str_file][str_sent_index]['pos'] :
				continue

			list_entry = []

			# compute IOB tags for named entities (if any)
			ne_type_last = None
			for nTokenIndex in range(len(dict_ontonotes[str_file][str_sent_index]['tokens'])) :
				strToken = dict_ontonotes[str_file][str_sent_index]['tokens'][nTokenIndex]
				strPOS = dict_ontonotes[str_file][str_sent_index]['pos'][nTokenIndex]
				ne_type = None
				if 'ne' in dict_ontonotes[str_file][str_sent_index] :
					dict_ne = dict_ontonotes[str_file][str_sent_index]['ne']
					if not 'parse_error' in dict_ne :
						for str_NEIndex in dict_ne :
							if nTokenIndex in dict_ne[str_NEIndex]['tokens'] :
								ne_type = dict_ne[str_NEIndex]['type']
								break
				if ne_type != None :
					if ne_type == ne_type_last :
						strIOB = 'I-' + ne_type
					else :
						strIOB = 'B-' + ne_type
				else :
					strIOB = 'O'
				ne_type_last = ne_type

				list_entry.append( ( strToken, strPOS, strIOB ) )

			list_test.append( list_entry )

	return list_train, list_test

In [5]:
def task2_word2features(sent, i):
	#print(sent)
	word = sent[i][0]
	postag = sent[i][1]

	features = {
		'word' : word,
		'postag': postag,

		# token shape
		'word.lower()': word.lower(),
		'word.isupper()': word.isupper(),
		'word.istitle()': word.istitle(),
		'word.isdigit()': word.isdigit(),

		# token suffix
		'word.suffix': word.lower()[-3:],

		# POS prefix
		'postag[:2]': postag[:2],
	}
	if i > 0:
		word_prev = sent[i-1][0]
		postag_prev = sent[i-1][1]
		features.update({
			'-1:word.lower()': word_prev.lower(),
			'-1:postag': postag_prev,
			'-1:word.lower()': word_prev.lower(),
			'-1:word.isupper()': word_prev.isupper(),
			'-1:word.istitle()': word_prev.istitle(),
			'-1:word.isdigit()': word_prev.isdigit(),
			'-1:word.suffix': word_prev.lower()[-3:],
			'-1:postag[:2]': postag_prev[:2],
		})
	else:
		features['BOS'] = True

	if i < len(sent)-1:
		word_next = sent[i+1][0]
		postag_next = sent[i+1][1]
		features.update({
			'+1:word.lower()': word_next.lower(),
			'+1:postag': postag_next,
			'+1:word.lower()': word_next.lower(),
			'+1:word.isupper()': word_next.isupper(),
			'+1:word.istitle()': word_next.istitle(),
			'+1:word.isdigit()': word_next.isdigit(),
			'+1:word.suffix': word_next.lower()[-3:],
			'+1:postag[:2]': postag_next[:2],
		})
	else:
		features['EOS'] = True

	return features

In [6]:
def task5_train_crf_model( X_train, Y_train, max_iter, labels ) :
    crf = sklearn_crfsuite.CRF(
      algorithm='lbfgs',
      max_iterations=100,
      all_possible_transitions=True
  )
    params_space = {
      'c1': scipy.stats.expon(scale=0.5),
      'c2': scipy.stats.expon(scale=0.05),
  }

  # use the same metric for evaluation
    f1_scorer = make_scorer(sklearn_crfsuite.metrics.flat_f1_score,
                          average='weighted', labels=labels)

  # search
    rs = sklearn.model_selection.RandomizedSearchCV(crf, params_space,
                          cv=3,
                          verbose=1,
                          n_jobs=-1,
                          n_iter=10,
                          scoring=f1_scorer)
    rs.fit(X_train, Y_train)
    print('best params:', rs.best_params_)
    print('best CV score:', rs.best_score_)
    print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))
    return rs

In [7]:
exec_task(file_book, ontonotes, word2features_func = task2_word2features,train_crf_model_func = task5_train_crf_model)

NameError: name 'file_book' is not defined

In [45]:
best params: {'c1': 0.22341151833692405, 'c2': 0.061981414682358016}
    best params: {'c1': 0.005782789658969212, 'c2': 0.17842452852345164}

SyntaxError: invalid syntax (<ipython-input-45-aa6405c6ee26>, line 1)