## PREPARING THE DATA

In [1]:
# =========== IMPORT UTILS ===========
from utils.utils import *

In [2]:
# =========== IMPORT DATASETS ===========
train = pd.read_csv("data/train_data_ner.csv")
test = pd.read_csv("data/test_data_ner.csv")
tiny_test = pd.read_csv("data/tiny_test.csv")

In [3]:
# =========== NAN VALUES ===========
# examples
print(test.loc[test.sentence_id==20056])
# print(test.loc[test.sentence_id==46902])

# change nan's to string
train['words'] = train['words'].astype(str)
test['words'] = test['words'].astype(str)
tiny_test['words'] = tiny_test['words'].astype(str)

# train['words'] = np.where(train['words'].isna(), "none", train['words'])
# test['words'] = np.where(test['words'].isna(), "none", test['words'])
# tiny_test['words'] = np.where(tiny_test['words'].isna(), "none", tiny_test['words'])

# fix
print(test.loc[test.sentence_id==20056])

       sentence_id     words tags
85436        20056       NaN    O
85437        20056        of    O
85438        20056     those    O
85439        20056  released    O
85440        20056       was    O
85441        20056    guilty    O
85442        20056        of    O
85443        20056   violent    O
85444        20056    crimes    O
85445        20056         .    O
       sentence_id     words tags
85436        20056       nan    O
85437        20056        of    O
85438        20056     those    O
85439        20056  released    O
85440        20056       was    O
85441        20056    guilty    O
85442        20056        of    O
85443        20056   violent    O
85444        20056    crimes    O
85445        20056         .    O


In [4]:
# =========== CONVERT DATASETS ===========

# Tiny dataset from the assignment pdf
# X_tiny, y_tiny = tiny_test()
X_tiny, y_tiny = transform_data_sentence_tag(tiny_test)

# Test, train data
X_train, y_train = transform_data_sentence_tag(train)
X_test, y_test = transform_data_sentence_tag(test)

Creating: 100%|██████████| 13/13 [00:00<00:00, 2740.82sentence/s]
Creating: 100%|██████████| 38366/38366 [00:18<00:00, 2073.69sentence/s]
Creating: 100%|██████████| 38367/38367 [00:18<00:00, 2057.47sentence/s]


In [5]:
# =========== CREATE DICTIONARIES WITH UNIQUE INDEX  ===========

# To store values efficiently (INTEGERS/WORDS and INTEGERS/TAGS)
# x attribute: list of words (integer words)
# y attribute: list of tags (integer tags)
# Then we need to keep a mapping from integers to words and from integers to tags.

word_dict, tag_dict = create_corpus(X_train, y_train)

In [6]:
# =========== CREATE SEQUENCE LIST  ===========

# Next we will help ourselves with the nlp_hmms lecture and create a SequenceList object (without cython)
sequence_list = create_sequence_list(X_train, y_train, word_dict, tag_dict)

Creating Sequence List: 100%|██████████| 38366/38366 [02:17<00:00, 278.33it/s]


In [7]:
# =========== CREATE TAGS  ===========

train_tag_pos = [[tag_dict[i] for i in tag] for tag in y_train]
y_train_true = [tag for array in train_tag_pos for tag in array]

test_tag_pos = [[tag_dict[i] for i in tag] for tag in y_test]
y_test_true = [tag for array in test_tag_pos for tag in array]

tiny_tag_pos = [[tag_dict[i] for i in tag] for tag in y_tiny]
y_tiny_true = [tag for array in tiny_tag_pos for tag in array]

## MODELS

In [8]:
# =========== MODELS IMPORT  ===========

# =========== MODELS - Default Features ===========
feature_mapper = IDFeatures(sequence_list)
feature_mapper.build_features()

structured_perceptron = StructuredPerceptron(word_dict, tag_dict, feature_mapper)
structured_perceptron.load_model(dir='fitted_models/default_features_model_wo_cython')

# =========== PREDICTION AND EVALUATION TEST SET ===========

y_test_pred = []

for i in tqdm(range(len(X_test)), desc="Predicting tags", unit="sequence"):
    predicted_tag = structured_perceptron.predict_tags_given_words(X_test[i])
    y_test_pred.append(predicted_tag)

y_test_pred = [np.ndarray.tolist(array) for array in y_test_pred]
y_test_pred = np.concatenate(y_test_pred).ravel().tolist()

print(f1_score_weighted(y_test_true, y_test_pred))
print(accuracy(y_test_true, y_test_pred))
plot_confusion_matrix(y_test_true, y_test_pred, tag_dict)

# =========== PREDICTION AND EVALUATION TINY SET ===========

y_tiny_test_pred = []

for i in tqdm(range(len(X_tiny)), desc="Predicting tags", unit="sequence"):
    predicted_tag = structured_perceptron.predict_tags_given_words(X_tiny[i])
    y_tiny_test_pred.append(predicted_tag)

y_tiny_test_pred = [np.ndarray.tolist(array) for array in y_tiny_test_pred]
y_tiny_test_pred = np.concatenate(y_tiny_test_pred).ravel().tolist()

print(f1_score_weighted(y_tiny_true, y_tiny_test_pred))
print(accuracy(y_tiny_true, y_tiny_test_pred))
plot_confusion_matrix(y_tiny_true, y_tiny_test_pred, tag_dict)


In [12]:
# =========== MODELS - Added Features ===========
from skseq.sequences import extended_feature
import skseq.sequences.structured_perceptron as spc
extra_mapping_feature = extended_feature.ExtendedFeatures(sequence_list) 
extra_mapping_feature.build_features()

structured_perceptron_extraf = spc.StructuredPerceptron(word_dict, tag_dict, extra_mapping_feature)
structured_perceptron_extraf.load_model(dir='fitted_models/extra_features_model_wo_cython')

# =========== PREDICTION AND EVALUATION TEST SET ===========

y_test_pred_1 = []

for i in tqdm(range(len(X_test)), desc="Predicting tags", unit="sequence"):
    predicted_tag = structured_perceptron_extraf.predict_tags_given_words(X_test[i])
    y_test_pred_1.append(predicted_tag)

y_test_pred_1 = [np.ndarray.tolist(array) for array in y_test_pred_1]
y_test_pred_1 = np.concatenate(y_test_pred_1).ravel().tolist()

print(f1_score_weighted(y_test_true, y_test_pred_1))
print(accuracy(y_test_true, y_test_pred_1))
plot_confusion_matrix(y_test_true, y_test_pred_1, tag_dict)

# =========== PREDICTION AND EVALUATION TINY SET ===========

y_tiny_test_pred_1 = []

for i in tqdm(range(len(X_tiny)), desc="Predicting tags", unit="sequence"):
    predicted_tag = structured_perceptron_extraf.predict_tags_given_words(X_tiny[i])
    y_tiny_test_pred_1.append(predicted_tag)

y_tiny_test_pred_1 = [np.ndarray.tolist(array) for array in y_tiny_test_pred_1]
y_tiny_test_pred_1 = np.concatenate(y_tiny_test_pred_1).ravel().tolist()

print(f1_score_weighted(y_tiny_true, y_tiny_test_pred_1))
print(accuracy(y_tiny_true, y_tiny_test_pred_1))
plot_confusion_matrix(y_tiny_true, y_tiny_test_pred_1, tag_dict)


FileNotFoundError: [Errno 2] No such file or directory: 'fitted_models/extra_features_model_wo_cythonparameters.txt'