# 3. Component_extraction_CRF

To run and evaluate the CRF models.

NB: Before running this notebook, download the GloVe files 'glove.6B.50d.txt' and 'glove.6B.100d.txt' and place them in the same directory as this notebook.

In [None]:
# Run this cell only once: it installs gensim and converts the GloVe embedding files
# (50 and 100 dimensions) to word2vec text format for the CRF models.
# The two input files must already be present next to this notebook.
# NOTE(review): gensim documents `glove2word2vec` as deprecated since 4.0 in favour of
# `KeyedVectors.load_word2vec_format(..., no_header=True)` — confirm against the installed version.
%pip install gensim
!python3 -m gensim.scripts.glove2word2vec --input glove.6B.50d.txt --output glove.6B.50d.w2vformat.txt
!python3 -m gensim.scripts.glove2word2vec --input glove.6B.100d.txt --output glove.6B.100d.w2vformat.txt

# Train and run CRF models

In [1]:
import sklearn

from sklearn import metrics
from sklearn.metrics import classification_report

from CRF import CRF, FeaturesCRF, Features2CRF, Features3CRF, Features4CRF, Features5CRF, Features6CRF, Features7CRF
from CRF import Features8CRF, Features9CRF, Features10CRF, EmbeddingCRF, Embedding2CRF, Features2Embedding2CRF

In [2]:
# Split used as evaluation data throughout the notebook.
eval_data_set = 'dev'

# Preprocessed BIO files: for each corpus the evaluation split first, then the train split.
preprocessed_data = [
    file_name
    for corpus in ('polnear', 'parc3', 'merged')
    for file_name in (f'{corpus}_with_BIO_{eval_data_set}.csv', f'{corpus}_with_BIO_train.csv')
]

# File-name suffixes that the feature-based cells append to the base file names.
filename_addition = '_extra_data.csv'
removed_filename_addition = f'_removed{filename_addition}'

In [None]:
# Baseline CRF, PolNeAR train file -> PolNeAR `eval_data_set` file.
# NOTE(review): the input names carry no '.csv' suffix here, unlike the embedding cells —
# presumably CRF handles that; confirm in CRF.py.
crf = CRF()
crf.train_and_run_crf_model(
    'polnear_with_BIO_train',
    f'polnear_with_BIO_{eval_data_set}',
    f'CRF_out_BIO_baseline-{eval_data_set}_polnear-polnear.csv',
)

In [None]:
# Baseline CRF, PARC 3.0 train file -> PARC 3.0 `eval_data_set` file.
# NOTE(review): the input names carry no '.csv' suffix here, unlike the embedding cells —
# presumably CRF handles that; confirm in CRF.py.
crf = CRF()
crf.train_and_run_crf_model(
    'parc3_with_BIO_train',
    f'parc3_with_BIO_{eval_data_set}',
    f'CRF_out_BIO_baseline-{eval_data_set}_parc3-parc3.csv',
)

In [3]:
# Baseline CRF, merged train file -> merged `eval_data_set` file.
# NOTE(review): the input names carry no '.csv' suffix here, unlike the embedding cells —
# presumably CRF handles that; confirm in CRF.py.
crf = CRF()
crf.train_and_run_crf_model(
    'merged_with_BIO_train',
    f'merged_with_BIO_{eval_data_set}',
    f'CRF_out_BIO_baseline-{eval_data_set}_merged-merged.csv',
)

['B-CONTENT', 'I-CONTENT', 'B-SOURCE', 'B-CUE', 'I-SOURCE', 'I-CUE']




In [None]:
# Features CRF on PolNeAR, using the '_extra_data.csv' variants of the train and eval files.
crf = FeaturesCRF()
crf.train_and_run_crf_model(
    f'polnear_with_BIO_train{filename_addition}',
    f'polnear_with_BIO_{eval_data_set}{filename_addition}',
    f'CRF_out_BIO_features-{eval_data_set}_polnear-polnear.csv',
)

In [None]:
# Features2 CRF on PolNeAR, using the '_extra_data.csv' variants of the train and eval files.
crf = Features2CRF()
crf.train_and_run_crf_model(
    f'polnear_with_BIO_train{filename_addition}',
    f'polnear_with_BIO_{eval_data_set}{filename_addition}',
    f'CRF_out_BIO_features2-{eval_data_set}_polnear-polnear.csv',
)

In [None]:
# Features2 CRF on PARC 3.0, using the '_extra_data.csv' variants of the train and eval files.
crf = Features2CRF()
crf.train_and_run_crf_model(
    f'parc3_with_BIO_train{filename_addition}',
    f'parc3_with_BIO_{eval_data_set}{filename_addition}',
    f'CRF_out_BIO_features2-{eval_data_set}_parc3-parc3.csv',
)

In [5]:
# Features2 CRF on the merged data, using the '_extra_data.csv' variants of the train and eval files.
crf = Features2CRF()
crf.train_and_run_crf_model(
    f'merged_with_BIO_train{filename_addition}',
    f'merged_with_BIO_{eval_data_set}{filename_addition}',
    f'CRF_out_BIO_features2-{eval_data_set}_merged-merged.csv',
)

['B-CONTENT', 'I-CONTENT', 'B-SOURCE', 'B-CUE', 'I-SOURCE', 'I-CUE']


In [6]:
# Features2 CRF on the merged data; the train file is the '_removed' variant
# (unlabeled sentences removed), the eval file is the regular '_extra_data.csv' one.
crf = Features2CRF()
crf.train_and_run_crf_model(
    f'merged_with_BIO_train{removed_filename_addition}',
    f'merged_with_BIO_{eval_data_set}{filename_addition}',
    f'CRF_out_BIO_features2-{eval_data_set}-removed_merged-merged.csv',
)

['B-CONTENT', 'I-CONTENT', 'B-SOURCE', 'B-CUE', 'I-SOURCE', 'I-CUE']


In [None]:
# Features5 CRF on the merged data, using the '_extra_data.csv' variants of the train and eval files.
crf = Features5CRF()
crf.train_and_run_crf_model(
    f'merged_with_BIO_train{filename_addition}',
    f'merged_with_BIO_{eval_data_set}{filename_addition}',
    f'CRF_out_BIO_features5-{eval_data_set}_merged-merged.csv',
)

In [None]:
# Features6 CRF on the merged data, using the '_extra_data.csv' variants of the train and eval files.
crf = Features6CRF()
crf.train_and_run_crf_model(
    f'merged_with_BIO_train{filename_addition}',
    f'merged_with_BIO_{eval_data_set}{filename_addition}',
    f'CRF_out_BIO_features6-{eval_data_set}_merged-merged.csv',
)

In [None]:
# Features7 CRF on the merged data; the train file is the '_removed' variant
# (unlabeled sentences removed), the eval file is the regular '_extra_data.csv' one.
crf = Features7CRF()
crf.train_and_run_crf_model(
    f'merged_with_BIO_train{removed_filename_addition}',
    f'merged_with_BIO_{eval_data_set}{filename_addition}',
    f'CRF_out_BIO_features7-{eval_data_set}-removed_merged-merged.csv',
)

In [3]:
# Features8 CRF on the merged data, using the '_extra_data.csv' variants of the train and eval files.
crf = Features8CRF()
crf.train_and_run_crf_model(
    f'merged_with_BIO_train{filename_addition}',
    f'merged_with_BIO_{eval_data_set}{filename_addition}',
    f'CRF_out_BIO_features8-{eval_data_set}_merged-merged.csv',
)

['B-CONTENT', 'I-CONTENT', 'B-SOURCE', 'B-CUE', 'I-SOURCE', 'I-CUE']




In [4]:
# Features8 CRF on the merged data; the train file is the '_removed' variant
# (unlabeled sentences removed), the eval file is the regular '_extra_data.csv' one.
crf = Features8CRF()
crf.train_and_run_crf_model(
    f'merged_with_BIO_train{removed_filename_addition}',
    f'merged_with_BIO_{eval_data_set}{filename_addition}',
    f'CRF_out_BIO_features8-{eval_data_set}-removed_merged-merged.csv',
)

['B-CONTENT', 'I-CONTENT', 'B-SOURCE', 'B-CUE', 'I-SOURCE', 'I-CUE']


In [None]:
# Features9 CRF on the merged data; the train file is the '_removed' variant
# (unlabeled sentences removed), the eval file is the regular '_extra_data.csv' one.
crf = Features9CRF()
crf.train_and_run_crf_model(
    f'merged_with_BIO_train{removed_filename_addition}',
    f'merged_with_BIO_{eval_data_set}{filename_addition}',
    f'CRF_out_BIO_features9-{eval_data_set}-removed_merged-merged.csv',
)

In [7]:
# Features10 CRF on the merged data, using the '_extra_data.csv' variants of the train and eval files.
crf = Features10CRF()
crf.train_and_run_crf_model(
    f'merged_with_BIO_train{filename_addition}',
    f'merged_with_BIO_{eval_data_set}{filename_addition}',
    f'CRF_out_BIO_features10-{eval_data_set}_merged-merged.csv',
)

['B-CONTENT', 'I-CONTENT', 'B-SOURCE', 'B-CUE', 'I-SOURCE', 'I-CUE']


In [8]:
# Features10 CRF on the merged data; the train file is the '_removed' variant
# (unlabeled sentences removed), the eval file is the regular '_extra_data.csv' one.
crf = Features10CRF()
crf.train_and_run_crf_model(
    f'merged_with_BIO_train{removed_filename_addition}',
    f'merged_with_BIO_{eval_data_set}{filename_addition}',
    f'CRF_out_BIO_features10-{eval_data_set}-removed_merged-merged.csv',
)

['B-CONTENT', 'I-CONTENT', 'B-SOURCE', 'B-CUE', 'I-SOURCE', 'I-CUE']


In [None]:
# Features2 CRF on PolNeAR; the train file is the '_removed' variant
# (unlabeled sentences removed), the eval file is the regular '_extra_data.csv' one.
crf = Features2CRF()
crf.train_and_run_crf_model(
    f'polnear_with_BIO_train{removed_filename_addition}',
    f'polnear_with_BIO_{eval_data_set}{filename_addition}',
    f'CRF_out_BIO_features2-{eval_data_set}-removed_polnear-polnear.csv',
)

In [None]:
# Features3 CRF on PolNeAR, using the '_extra_data.csv' variants of the train and eval files.
crf = Features3CRF()
crf.train_and_run_crf_model(
    f'polnear_with_BIO_train{filename_addition}',
    f'polnear_with_BIO_{eval_data_set}{filename_addition}',
    f'CRF_out_BIO_features3-{eval_data_set}_polnear-polnear.csv',
)

In [None]:
# Features4 CRF on PolNeAR, using the '_extra_data.csv' variants of the train and eval files.
crf = Features4CRF()
crf.train_and_run_crf_model(
    f'polnear_with_BIO_train{filename_addition}',
    f'polnear_with_BIO_{eval_data_set}{filename_addition}',
    f'CRF_out_BIO_features4-{eval_data_set}_polnear-polnear.csv',
)

In [None]:
# Word-embedding CRF (50 dimensions) on PolNeAR: plain BIO train file -> `eval_data_set` file.
crf = EmbeddingCRF(50)
crf.train_and_run_crf_model(
    'polnear_with_BIO_train.csv',
    f'polnear_with_BIO_{eval_data_set}.csv',
    f'CRF_out_BIO_embedding50-{eval_data_set}_polnear-polnear.csv',
)

In [None]:
# Word-embedding CRF (50 dimensions) on the merged data: plain BIO train file -> `eval_data_set` file.
crf = EmbeddingCRF(50)
crf.train_and_run_crf_model(
    'merged_with_BIO_train.csv',
    f'merged_with_BIO_{eval_data_set}.csv',
    f'CRF_out_BIO_embedding50-{eval_data_set}_merged-merged.csv',
)

In [None]:
# Word-embedding2 CRF (50 dimensions) on the merged data: plain BIO train file -> `eval_data_set` file.
crf = Embedding2CRF(50)
crf.train_and_run_crf_model(
    'merged_with_BIO_train.csv',
    f'merged_with_BIO_{eval_data_set}.csv',
    f'CRF_out_BIO_embedding2-50-{eval_data_set}_merged-merged.csv',
)

In [None]:
# Combined features2 + word-embedding2 CRF (50 dimensions) on PolNeAR,
# using the '_extra_data.csv' variants of the train and eval files.
crf = Features2Embedding2CRF(50)
crf.train_and_run_crf_model(
    f'polnear_with_BIO_train{filename_addition}',
    f'polnear_with_BIO_{eval_data_set}{filename_addition}',
    f'CRF_out_BIO_features2-embedding2-50-{eval_data_set}_polnear-polnear.csv',
)

In [None]:
# Combined features2 + word-embedding2 CRF (50 dimensions) on the merged data,
# using the '_extra_data.csv' variants of the train and eval files.
crf = Features2Embedding2CRF(50)
crf.train_and_run_crf_model(
    f'merged_with_BIO_train{filename_addition}',
    f'merged_with_BIO_{eval_data_set}{filename_addition}',
    f'CRF_out_BIO_features2-embedding2-50-{eval_data_set}_merged-merged.csv',
)

In [None]:
# Word-embedding CRF (50 dimensions) on PARC 3.0: plain BIO train file -> `eval_data_set` file.
# The eval file follows the `parc3_with_BIO_{eval_data_set}.csv` naming listed in
# `preprocessed_data`; an extra `dev_` infix would expand to `parc3_with_BIO_dev_dev.csv`
# when eval_data_set == 'dev', a name used nowhere else in this notebook.
crf = EmbeddingCRF(50)
crf.train_and_run_crf_model(
    'parc3_with_BIO_train.csv',
    f'parc3_with_BIO_{eval_data_set}.csv',
    f'CRF_out_BIO_embedding50-{eval_data_set}_parc3-parc3.csv',
)

In [None]:
# Word-embedding CRF (100 dimensions) on PARC 3.0: plain BIO train file -> `eval_data_set` file.
# The eval file follows the `parc3_with_BIO_{eval_data_set}.csv` naming listed in
# `preprocessed_data`; an extra `dev_` infix would expand to `parc3_with_BIO_dev_dev.csv`
# when eval_data_set == 'dev', a name used nowhere else in this notebook.
crf = EmbeddingCRF(100)
crf.train_and_run_crf_model(
    'parc3_with_BIO_train.csv',
    f'parc3_with_BIO_{eval_data_set}.csv',
    f'CRF_out_BIO_embedding100-{eval_data_set}_parc3-parc3.csv',
)

In [None]:
# Word-embedding CRF (100 dimensions) on PolNeAR: plain BIO train file -> `eval_data_set` file.
crf = EmbeddingCRF(100)
crf.train_and_run_crf_model(
    'polnear_with_BIO_train.csv',
    f'polnear_with_BIO_{eval_data_set}.csv',
    f'CRF_out_BIO_embedding100-{eval_data_set}_polnear-polnear.csv',
)

# Example embedding features
# {'bias': 1.0, 'token': 'mark', 'v0': 0.26736, 'v1': 0.18049, 'v2': -0.314, 'v3': -0.58327, 'v4': -0.23609, 'v5': -0.34614, 'v6': -0.1654, 'v7': -0.32759, 
# 'v8': -1.479, 'v9': -0.31961, 'v10': -0.27241, 'v11': -0.30492, 'v12': 0.44354, 'v13': 0.10112, 'v14': 0.33955, 'v15': -0.41747, 'v16': 0.44942, 
# 'v17': -0.035098, 'v18': -1.1154, 'v19': 0.46818, 'v20': 0.84366, 'v21': 0.040854, 'v22': 0.30347, 'v23': 0.50844, 'v24': 1.1786, 'v25': 0.034765, 
# 'v26': 0.16056, 'v27': -0.63163, 'v28': 0.33688, 'v29': 0.14631, 'v30': -0.42875, 'v31': 0.2888, 'v32': 0.00062319, 'v33': 0.87847, 'v34': 0.31677, 
# 'v35': -0.68571, 'v36': -0.057479, 'v37': 0.23021, 'v38': -0.04874, 'v39': 0.16405, 'v40': 0.2489, 'v41': -0.77387, 'v42': 0.39342, 'v43': 0.70581, 
# 'v44': -0.61186, 'v45': -0.25623, 'v46': 0.14624, 'v47': -0.78911, 'v48': -0.071785, 'v49': -1.2844, 'v50': 1.1634, 'v51': -0.28015, 'v52': 0.032799, 
# 'v53': 0.45011, 'v54': -0.20363, 'v55': -2.4238, 'v56': -0.69491, 'v57': 0.64105, 'v58': 0.99553, 'v59': 0.049622, 'v60': 0.043249, 'v61': 0.499, 
# 'v62': 0.019314, 'v63': 0.070149, 'v64': -0.036043, 'v65': -0.60366, 'v66': 0.57625, 'v67': 0.64738, 'v68': 0.4857, 'v69': -0.088133, 'v70': -0.0072429,
# 'v71': 0.5134, 'v72': -0.99625, 'v73': -0.60808, 'v74': 1.0536, 'v75': -0.49266, 'v76': -0.26202, 'v77': -0.006042, 'v78': -0.25483, 'v79': 0.5442, 
# 'v80': 0.40837, 'v81': 0.11222, 'v82': 0.14556, 'v83': -0.68029, 'v84': -0.58866, 'v85': -0.097334, 'v86': -0.19981, 'v87': -0.4306, 'v88': 0.8099, 
# 'v89': 0.1809, 'v90': -0.30745, 'v91': 0.7425, 'v92': 0.079075, 'v93': 0.52299, 'v94': -0.6159, 'v95': -0.20503, 'v96': 0.41337, 'v97': 0.070152, 
# 'v98': -0.66364, 'v99': -0.4607, 'BOS': True}

In [None]:
# Cross-corpus evaluation: train on PolNeAR, evaluate on PARC 3.0.
# Unlabeled sentences are not removed from the training file.
# NOTE(review): the output name has no `eval_data_set` part, unlike the same-corpus runs.
crf = Features2CRF()
crf.train_and_run_crf_model(
    f'polnear_with_BIO_train{filename_addition}',
    f'parc3_with_BIO_{eval_data_set}{filename_addition}',
    'CRF_out_BIO_features2_polnear-parc3.csv',
)

In [None]:
# Cross-corpus evaluation: train on PARC 3.0, evaluate on PolNeAR.
# Unlabeled sentences are not removed from the training file.
# NOTE(review): the output name has no `eval_data_set` part, unlike the same-corpus runs.
crf = Features2CRF()
crf.train_and_run_crf_model(
    f'parc3_with_BIO_train{filename_addition}',
    f'polnear_with_BIO_{eval_data_set}{filename_addition}',
    'CRF_out_BIO_features2_parc3-polnear.csv',
)

In [None]:
# Cross-corpus evaluation: train on PolNeAR, evaluate on PARC 3.0.
# The training file is the '_removed' variant (unlabeled sentences removed).
# NOTE(review): the output name has no `eval_data_set` part, unlike the same-corpus runs.
crf = Features2CRF()
crf.train_and_run_crf_model(
    f'polnear_with_BIO_train{removed_filename_addition}',
    f'parc3_with_BIO_{eval_data_set}{filename_addition}',
    'CRF_out_BIO_features2-removed_polnear-parc3.csv',
)

In [None]:
# Cross-corpus evaluation: train on PARC 3.0, evaluate on PolNeAR.
# The training file is the '_removed' variant (unlabeled sentences removed), matching
# the '-removed' tag in the output name.
# NOTE(review): the output name has no `eval_data_set` part, unlike the same-corpus runs.
crf = Features2CRF()
crf.train_and_run_crf_model(
    'parc3_with_BIO_train' + removed_filename_addition,
    f'polnear_with_BIO_{eval_data_set}' + filename_addition,
    'CRF_out_BIO_features2-removed_parc3-polnear.csv',
)