In [2]:
# =========== IMPORT LIBRARIES ===========
from utils.utils import *
import pickle
from skseq import structured_perceptron_c
from skseq.structured_perceptron import StructuredPerceptron

In [20]:
def load_and_transform_data(file_path):
    """Read one CSV split and convert it with transform_data_sentence_tag.

    The 'words' column is cast to ``str`` before conversion, so values that
    pandas parsed as non-strings (e.g. missing values or numbers) become
    plain text.

    Args:
        file_path: Path of the CSV file to read; it must contain a
            'words' column.

    Returns:
        The two-element result of ``transform_data_sentence_tag`` as an
        ``(X, y)`` tuple.
    """
    # Build the string-typed frame in one expression instead of mutating
    # the freshly read frame in place.
    frame = pd.read_csv(file_path).assign(
        words=lambda raw: raw['words'].astype(str)
    )
    sentences, tags = transform_data_sentence_tag(frame)
    return sentences, tags

# Source CSV for each data split, keyed by split name.
data_files = {
    "train": "data/train_data_ner.csv",
    "test": "data/test_data_ner.csv",
    "tiny": "data/tiny_test.csv"
}

# True  -> rebuild every artifact from the CSV files and refresh the pickles.
# False -> reuse the pickles written by a previous run.
save_models = True

if save_models:
    data_dict = {}

    # Load and transform each split, keyed by the pickle filename it is
    # stored under (X_<split>.pkl / y_<split>.pkl).
    for key, file_path in data_files.items():
        X, y = load_and_transform_data(file_path)
        data_dict[f"X_{key}.pkl"] = X
        data_dict[f"y_{key}.pkl"] = y

    for filename, data in data_dict.items():
        save_p(filename, data)

    # Expose the training split under the same names the cached branch
    # defines, so later cells can rely on them in either mode.
    X_train = data_dict["X_train.pkl"]
    y_train = data_dict["y_train.pkl"]

    word_dict, tag_dict = create_corpus(X_train, y_train)

    # The sequence list has to be built before it can be pickled; saving it
    # without building it first raised NameError on a fresh kernel.
    sequence_list = create_sequence_list(X_train, y_train, word_dict, tag_dict)
    save_p("sequence_list.pkl", sequence_list)
else:
    X_train = load_p("X_train.pkl")
    y_train = load_p("y_train.pkl")

    word_dict, tag_dict = create_corpus(X_train, y_train)

    # Reuse the cached sequence list rather than rebuilding it and then
    # discarding the result.
    # NOTE(review): this assumes create_corpus yields the same dictionaries
    # that were used when sequence_list.pkl was written - confirm.
    sequence_list = load_p("sequence_list.pkl")

In [4]:
# =========== MODELS - DEFAULT FEATURES ===========

# =========== GET FEATURES ===========
mapping_feature = IDFeatures(sequence_list)
mapping_feature.build_features()

# =========== MODEL PARAMETERS ===========
epochs = 15
structured_perceptron = StructuredPerceptron(word_dict, tag_dict, mapping_feature)
structured_perceptron.num_epochs = 5

# =========== MODEL TRAIN ===========
%time structured_perceptron.fit(mapping_feature.dataset, epochs)



Epoch: 0 Accuracy: 0.893815
Epoch: 1 Accuracy: 0.931674
Epoch: 2 Accuracy: 0.940913
Epoch: 3 Accuracy: 0.946175
Epoch: 4 Accuracy: 0.950018
Epoch: 5 Accuracy: 0.952577
Epoch: 6 Accuracy: 0.954425


KeyboardInterrupt: 

In [10]:
# =========== MODEL SAVE ===========
# Persist the default-features perceptron object through the save_p helper.
# The commented-out call below is an alternative that uses the model's own
# save_model() with a fitted_models/ path; it is left disabled.
#structured_perceptron.save_model("fitted_models/default_features_model_wo_cython.pkl")

save_p("default_features_model_wo_cython.pkl", structured_perceptron)


In [18]:
# # =========== MODELS - ADDED FEATURES ===========

from skseq.sequences import extended_feature


# =========== GET FEATURES ===========
extra_mapping_feature = extended_feature.ExtendedFeatures(sequence_list) 
extra_mapping_feature.build_features()

# =========== MODEL PARAMETERS ===========
epochs = 15
structured_perceptron_extra = StructuredPerceptron(word_dict, tag_dict, extra_mapping_feature)
structured_perceptron_extra.num_epochs = 5
%time 
structured_perceptron_extra.fit(extra_mapping_feature.dataset, epochs)



CPU times: total: 0 ns
Wall time: 0 ns
Epoch: 0 Accuracy: 0.932172
Epoch: 1 Accuracy: 0.946864
Epoch: 2 Accuracy: 0.950514
Epoch: 3 Accuracy: 0.953234
Epoch: 4 Accuracy: 0.955109
Epoch: 5 Accuracy: 0.956550
Epoch: 6 Accuracy: 0.957786
Epoch: 7 Accuracy: 0.958589
Epoch: 8 Accuracy: 0.959384
Epoch: 9 Accuracy: 0.960120
Epoch: 10 Accuracy: 0.960703
Epoch: 11 Accuracy: 0.961394
Epoch: 12 Accuracy: 0.961745
Epoch: 13 Accuracy: 0.962609
Epoch: 14 Accuracy: 0.962600


In [23]:
# =========== MODEL SAVE ===========
# Persist the extended-features perceptron object through the save_p helper.
save_p("extra_features_model_wo_cython.pkl",structured_perceptron_extra)
