In [1]:
import random
import time
import pandas as pd
from pathlib import Path
from corpus import Corpus

# Input / output locations, built from path components so they resolve the
# same way on every operating system (a backslash is only a separator on Windows).
DATA_PATH = Path('data') / 'raw' / 'labels.parquet'
SAVE_PATH = Path('data') / 'dataset.parquet'

# random.seed() returns None, so keep the seed value itself and seed the
# generator in a separate statement.
SEED = 0
random.seed(SEED)

# When True, the corpus is built from a random subset of the documents.
DOWNSAMPLE = False

# Switches for the optional feature-loading steps further down.
config = {
    'load_embeddings': True,
    'load_CoNNL': True,
    'load_sentiments': True
}

# Load Corpus

In [2]:
# Keep only the 'Word' column of the labels table; its index has a 'doc' level.
dataset = pd.read_parquet(DATA_PATH)['Word']
doc_ids = dataset.index.get_level_values('doc')

if DOWNSAMPLE:
    n_docs = 25
    # random.sample() needs a sequence: passing a set was deprecated in
    # Python 3.9 and raises TypeError from 3.11. A list in first-appearance
    # order also makes the draw repeatable for a given seed.
    unique_doc_ids = doc_ids.unique().tolist()
    # Never ask for more documents than exist (random.sample would raise ValueError).
    sample_ids = random.sample(unique_doc_ids, min(n_docs, len(unique_doc_ids)))
    data_ds = dataset[doc_ids.isin(sample_ids)].copy()

    dataset = data_ds

corpus = Corpus(dataset)

Indexing sentences for 4993 documents.


# Annotation

Add more general features using functionalities built into `Corpus`.

In [3]:
# Index-position information plus the stemmed form of each token.
corpus.process('add_locators', 'add_stem')

# Binary marker columns for token attributes.
corpus.process('mark_')

Added columns ['add_locators', 'add_stem']
<corpus.Corpus object at 0x000001F7213935C0>
                         token  sent_id  sent_loc      stem  is_upper  \
doc      sent word                                                      
10036953 1    1              [        1         1         [     False   
              2         Triple        1         2     tripl     False   
              3        therapy        1         3   therapi     False   
              4       regimens        1         4   regimen     False   
              5      involving        1         5    involv     False   
...                        ...      ...       ...       ...       ...   
9989713  15   21    randomised       15        21  randomis     False   
              22             ,       15        22         ,     False   
              23      clinical       15        23    clinic     False   
              24        trials       15        24     trial     False   
              25             .      

Generate `fasttext` embeddings for the data. This step is very memory-intensive; skip it (set `config['load_embeddings']` to `False`) if your system cannot handle it. Once the embeddings have been created, the function can load them from a specified filepath.

In [4]:
# Optional step, switched on or off in the config dict.
# To point at a specific file: filepath='path\\to\\file.parquet'
if config['load_embeddings']:
    corpus.process('load_embeddings')

Loading FastText embeddings from data\features\ft_embeds.parquet.
Added columns ['load_embeddings']


Generate or load the CoNLL data parsed by `stanfordnlp`.

In [5]:
if config['load_CoNNL']:
    corpus.load_CoNLL(filepath='data\\features\\conll.parquet')
    # One 'par_'-prefixed name per CoNLL field, handed to parse_deprel.
    conll_fields = ('token', 'lemma', 'deprel', 'upos', 'xpos')
    parental_features = [f'par_{field}' for field in conll_fields]
    corpus.parse_deprel(parental_features)

Loading CoNLL data from data\features\conll.parquet.


In [6]:
# NOTE(review): `x` is not referenced by any later cell shown here — confirm
# whether this cell is still needed.
# Tab-separated file without a header row; empty strings are kept as-is
# rather than converted to NaN.
x = pd.read_csv(
    'data\\features\\conll.csv',
    sep='\t',
    header=None,
    keep_default_na=False,
    na_values=None,
)
# Give the raw rows the corpus index; pandas raises if the row counts differ.
x.index = corpus.df.index

Run a sentiment analysis over the sentences using TextBlob and add the polarity and subjectivity of each token's sentence as a feature.

In [7]:
# Optional step, switched on or off in the config dict.
if config['load_sentiments']:
    corpus.load_sentiments()

Running TextBlob sentiment analysis over 1354239 instances.


Generate a column for TF-IDF from the lookup table generated during initialization.

In [8]:
corpus.add_tfidf()

# add_pos_stem() needs the 'xpos' column; presumably it comes from the optional
# CoNLL step, so fail with a hint instead of a bare AssertionError.
assert 'xpos' in corpus.df.columns, (
    "column 'xpos' is missing; enable config['load_CoNNL'] and re-run the CoNLL cell"
)
corpus.add_pos_stem() # add a stemmed version of the CoNLL XPOS (basic grammar info, i.e. NN, VB)

Perform some cleaning on the columns and produce some lagged columns that give information about a word's neighbours. Missing values are imputed by various procedures, and shifts are performed on a document level.

In [10]:
# Lagged copies of selected columns, shifted within each document.
window = 2
fill_tag = '_' # placeholder used where a lagged string value is missing

# NOTE(review): `lag_features` and `par_cols` are not used in this cell, and
# `lag_features` lists 'form' where the call below uses 'token' — confirm intent.
lag_features = ['is_int', 'is_dec', 'first_word', 'last_word', 'form', 'lemma', 'upos', 'xpos', 'deprel', 'tfidf']
par_cols = corpus.df.columns[corpus.df.columns.str.startswith('par')]

# Each group gets a fill value matching its kind of data.
flag_cols = ['is_int', 'is_dec', 'first_word', 'last_word']
text_cols = ['token', 'lemma', 'upos', 'xpos', 'deprel']
score_cols = ['tfidf']

corpus.lag_cols(flag_cols, window, level='doc', fill_value=False)
corpus.lag_cols(text_cols, window, level='doc', fill_value=fill_tag)
corpus.lag_cols(score_cols, window, level='doc', fill_value=0.0)

<corpus.Corpus at 0x1f7213935c0>

# Downsample & Save

We can save the data to various formats directly through `Corpus`. The exported feature table includes the labels in the index (`df.reset_index()['token']`)

In [11]:
print('features in the final set: \n', list(corpus.df.columns))

# Column labels themselves must not be missing (the original check).
assert not corpus.df.columns.isna().any(), 'a column label is missing (NaN)'

# `columns.isna()` only inspects the labels, so also check the values:
# no cell of the feature table may be missing.
na_counts = corpus.df.isna().sum()
assert na_counts.sum() == 0, (
    f'missing values remain in columns: {list(na_counts[na_counts > 0].index)}'
)

features in the final set: 
 ['token', 'sent_id', 'sent_loc', 'stem', 'is_upper', 'is_lower', 'is_title', 'near_cap', 'first_sent', 'last_sent', 'first_word', 'last_word', 'is_int', 'is_dec', 'punctuation', 'stopword', 'PMFT_1', 'PMFT_2', 'PMFT_3', 'PMFT_4', 'PMFT_5', 'PMFT_6', 'PMFT_7', 'PMFT_8', 'PMFT_9', 'PMFT_10', 'PMFT_11', 'PMFT_12', 'PMFT_13', 'PMFT_14', 'PMFT_15', 'PMFT_16', 'PMFT_17', 'PMFT_18', 'PMFT_19', 'PMFT_20', 'PMFT_21', 'PMFT_22', 'PMFT_23', 'PMFT_24', 'PMFT_25', 'PMFT_26', 'PMFT_27', 'PMFT_28', 'PMFT_29', 'PMFT_30', 'PMFT_31', 'PMFT_32', 'PMFT_33', 'PMFT_34', 'PMFT_35', 'PMFT_36', 'PMFT_37', 'PMFT_38', 'PMFT_39', 'PMFT_40', 'PMFT_41', 'PMFT_42', 'PMFT_43', 'PMFT_44', 'PMFT_45', 'PMFT_46', 'PMFT_47', 'PMFT_48', 'PMFT_49', 'PMFT_50', 'PMFT_51', 'PMFT_52', 'PMFT_53', 'PMFT_54', 'PMFT_55', 'PMFT_56', 'PMFT_57', 'PMFT_58', 'PMFT_59', 'PMFT_60', 'PMFT_61', 'PMFT_62', 'PMFT_63', 'PMFT_64', 'PMFT_65', 'PMFT_66', 'PMFT_67', 'PMFT_68', 'PMFT_69', 'PMFT_70', 'PMFT_71', 'PMFT_72'

In [13]:
# Preview the first 30 rows of the feature table.
corpus.df.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,token,sent_id,sent_loc,stem,is_upper,is_lower,is_title,near_cap,first_sent,last_sent,...,XPOS_LAG1,XPOS_LAG2,DEPREL_LAG-2,DEPREL_LAG-1,DEPREL_LAG1,DEPREL_LAG2,TFIDF_LAG-2,TFIDF_LAG-1,TFIDF_LAG1,TFIDF_LAG2
doc,sent,word,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
10036953,1,1,[,1,1,[,False,False,False,True,True,False,...,_,_,compound,compound,_,_,0.007214,0.0,0.0,0.0
10036953,1,2,Triple,1,2,tripl,False,False,True,True,True,False,...,-LRB-,_,root,compound,punct,_,0.001188,0.007214,0.0,0.0
10036953,1,3,therapy,1,3,therapi,False,True,False,True,True,False,...,NN,-LRB-,acl,root,compound,punct,0.000386,0.001188,0.0,0.0
10036953,1,4,regimens,1,4,regimen,False,True,False,True,True,False,...,NN,NN,compound,acl,compound,compound,0.0,0.000386,0.007214,0.0
10036953,1,5,involving,1,5,involv,False,True,False,True,True,False,...,NNS,NN,obj,compound,root,compound,9e-06,0.0,0.001188,0.007214
10036953,1,6,H2,1,6,h2,True,False,True,True,True,False,...,VBG,NNS,case,obj,acl,root,0.0,9e-06,0.000386,0.001188
10036953,1,7,blockaders,1,7,blockad,False,True,False,True,True,False,...,NN,VBG,obl,case,compound,acl,0.007214,0.0,0.0,0.000386
10036953,1,8,for,1,8,for,False,True,False,True,True,False,...,NNS,NN,case,obl,obj,compound,0.0,0.007214,9e-06,0.0
10036953,1,9,therapy,1,9,therapi,False,True,False,True,True,False,...,IN,NNS,compound,case,case,obj,0.0,0.0,0.0,9e-06
10036953,1,10,of,1,10,of,False,True,False,True,True,False,...,NN,IN,compound,compound,obl,case,0.000882,0.0,0.007214,0.0


In [14]:
# The output file name embeds `corpus.n_docs`.
out_file = f'data\\dataset_{corpus.n_docs}.parquet'
corpus.save(out_file)