In [1]:
import os
# Expose only the GPU with index 2 to CUDA (check free devices with `nvidia-smi`).
# NOTE(review): presumably this must run before flair/torch initialise CUDA,
# which would explain why it sits in its own cell ahead of the other imports — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [2]:
import os
import json
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from flair.tokenization import SegtokSentenceSplitter
from flair.data import Sentence
from flair.models import SequenceTagger
from nltk.stem import WordNetLemmatizer
from flair.embeddings import FlairEmbeddings, StackedEmbeddings, ELMoEmbeddings
import warnings
warnings.filterwarnings('ignore')

In [3]:
import logging

# Only surface errors from flair; lower-severity records are dropped.
logger = logging.getLogger('flair')
logger.setLevel(level=logging.ERROR)

# Attach the stream handler exactly once. Handlers live on the process-wide
# logger object, so re-running this cell would otherwise add one more handler
# per run and every record would then be emitted once per handler.
_HANDLER_NAME = 'notebook-flair-stream'
fh = next((h for h in logger.handlers if h.get_name() == _HANDLER_NAME), None)
if fh is None:
    fh = logging.StreamHandler()
    fh.set_name(_HANDLER_NAME)
    logger.addHandler(fh)

In [4]:
# Root of the local data directory; every dataset lives in its own sub-directory.
PREFIX = "../data/"
PA_PATH = f"{PREFIX}sap2017-connotation-frames-power-agency/"  # power/agency table
J_PATH = f"{PREFIX}pungas2017-plaintext-jokes/"  # NOTE(review): not used in the cells shown here
W_PATH = f"{PREFIX}wang2018-wiki-dataset/"  # wiki_person.json

# read power_agency

In [5]:
# Load the power/agency table; later cells read its `lemma` and `prep` columns.
power_agency = pd.read_csv(PA_PATH + "agency_power_prepro.csv", sep=',')

# general tools

In [6]:
# Shared NLP helpers used by the cells below.
splitter = SegtokSentenceSplitter()  # NOTE(review): not referenced in the cells shown here
lemmatizer = WordNetLemmatizer()  # lemmatises verb tokens inside preprocess_wiki
pos_tagger = SequenceTagger.load("upos-fast")  # its 'pos' tag is compared against 'VERB' in preprocess_wiki

# wiki

@inproceedings{wang-etal-2018-describing,
    title = "Describing a Knowledge Base",
    author = "Wang, Qingyun  and
      Pan, Xiaoman  and
      Huang, Lifu  and
      Zhang, Boliang  and
      Jiang, Zhiying  and
      Ji, Heng  and
      Knight, Kevin",
    booktitle = "Proceedings of the 11th International Conference on Natural Language Generation",
    month = nov,
    year = "2018",
    address = "Tilburg University, The Netherlands",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/W18-6502",
    doi = "10.18653/v1/W18-6502",
    pages = "10--21",
}

Get data at https://github.com/EagleW/Describing_a_Knowledge_Base

## get sentences

In [7]:
# Collect the 'TEXT' field of every record in the line-delimited JSON file.
w_texts = []

wiki_person_path = W_PATH + 'wiki_person.json'

# The progress-bar total is the file size in bytes, so the file is read in
# binary mode: len(line) is then a byte count as well. In text mode it counts
# decoded characters and drifts from the total on multi-byte characters or
# translated line endings. json.loads accepts bytes directly.
with tqdm(total=os.path.getsize(wiki_person_path)) as pbar:
    with open(wiki_person_path, 'rb') as f:
        for line in f:
            record = json.loads(line)

            # keep only records that carry a non-empty 'TEXT' entry
            if 'TEXT' in record and len(record['TEXT']) > 0:
                w_texts.append(record['TEXT'])

            pbar.update(len(line))

100%|██████████| 1045758208/1045758208 [00:35<00:00, 29170197.77it/s]


In [8]:
# Flatten two levels: every inner sequence of string pieces becomes one
# space-joined, stripped string (presumably one sentence per inner sequence).
# This rebinds `w_texts`, so the cell is meant to run once per load.
w_texts = [
    " ".join(pieces).strip()
    for entry in w_texts
    for pieces in entry
]

In [11]:
# Wrap the strings in a single-column frame so `progress_apply` can be used on them later.
# This rebinds `w_texts` from a list to a DataFrame.
w_texts = pd.DataFrame(w_texts, columns=['text'])

In [12]:
# number of rows (one per flattened text)
len(w_texts)

4097687

In [13]:
# peek at the first rows
w_texts.head()

Unnamed: 0,text
0,"""Weird Al"" Yankovic "" ""Weird Al"" Yankovic "" ""W..."
1,"yankovic was born in Downey, California and ra..."
2,al 's first accordion lesson which sparked his...
3,a door-to-door salesman traveling through Lynw...
4,yankovic claims the reason his parents chose a...


In [15]:
# (lemma, prep) pairs of all power_agency rows that specify a preposition.
# Stored as a set: preprocess_wiki tests membership for every matching verb,
# and a set lookup does not have to scan all pairs the way a list does.
verbprep_rows = power_agency.loc[power_agency.prep.notna(), ['lemma', 'prep']]
pa_verbprep = set(verbprep_rows.itertuples(index=False, name=None))

In [16]:
# Distinct lemmas of power_agency; preprocess_wiki uses it to skip verbs that are not in the table.
pa_verblemma = set(power_agency.lemma)

In [17]:
# Cap on how many sentences preprocess_wiki stores per power_agency row.
upper_bound = 300

In [43]:
# Name of the power_agency column that collects the matched sentences.
col = 'wsents'

In [44]:
# Give every row its own empty list (a fresh list per row, never a shared one).
power_agency[col] = [[] for _ in range(len(power_agency))]

In [45]:
def preprocess_wiki(sentence, col):
    """Tag one sentence and file it under every power_agency verb it contains.

    The text is wrapped in a flair ``Sentence`` and POS-tagged with the
    module-level ``pos_tagger``. Every token tagged ``VERB`` receives a
    ``lemma`` tag produced by ``lemmatizer``. For each verb whose lemma is in
    ``pa_verblemma`` the matching ``power_agency`` row is looked up:

    * if ``(lemma, text of the next token)`` is in ``pa_verbprep``, the row
      with that lemma and that preposition;
    * otherwise (including a verb in final position) the row with that lemma
      and no preposition.

    When exactly one row matches and it holds fewer than ``upper_bound``
    sentences, the tagged ``Sentence`` is appended to that row's list in
    column ``col``. A sentence is stored at most once per row, even if the
    same verb occurs in it several times. Lookups that match more than one
    row are printed and skipped.

    Args:
        sentence: Raw sentence text.
        col: Name of the ``power_agency`` column holding the per-row lists.

    Returns:
        None. The function works by mutating ``power_agency`` in place.
    """
    sent = Sentence(sentence)
    if len(sent) == 0:
        return

    pos_tagger.predict(sent)

    # Lemmatise all verbs up front so the stored Sentence carries a lemma tag
    # on every verb, not only on the ones that match power_agency.
    for token in sent:
        if token.get_tag('pos').value == 'VERB':
            token.add_tag('lemma', lemmatizer.lemmatize(token.text, pos='v'))

    last_tid = len(sent) - 1
    for tid, token in enumerate(sent):
        if token.get_tag('pos').value != 'VERB':
            continue
        lemma = token.get_tag('lemma').value
        if lemma not in pa_verblemma:
            continue

        # Prefer the verb+preposition row when the next token completes a
        # known pair; otherwise fall back to the row without a preposition.
        prep = None
        if tid < last_tid:
            next_text = sent[tid + 1].text
            if (lemma, next_text) in pa_verbprep:
                prep = next_text

        if prep is None:
            prep_match = power_agency.prep.isna()
        else:
            prep_match = power_agency.prep == prep
        index = power_agency[(power_agency.lemma == lemma) & prep_match].index

        if len(index) == 1:
            result = power_agency.loc[index[0], col]

            # Sentences are processed one at a time, so a repeat of the same
            # verb can only collide with the most recently stored entry.
            already_stored = len(result) > 0 and result[-1] is sent
            if len(result) < upper_bound and not already_stored:
                result.append(sent)
                power_agency.at[index[0], col] = result
        elif len(index) > 1:
            # ambiguous table entry: report it instead of guessing a row
            print(sent)
            print(index)

In [46]:
# preprocess_wiki has no return value, so the new 'sent' column only holds None;
# the useful result is the side effect on power_agency[col].
w_texts['sent'] = w_texts['text'].progress_apply(lambda s: preprocess_wiki(s, col)) # run by python script

100%|██████████| 100/100 [00:02<00:00, 35.91it/s]


In [26]:
# Summary statistics of how many sentences were collected per power_agency row.
wsents_per_row = power_agency.wsents.apply(len)
print(wsents_per_row.describe())

count    2144.000000
mean        1.162780
std         5.165583
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max       165.000000
Name: wsents, dtype: float64


In [None]:
# Persist the table including the objects collected in the 'wsents' column.
# NOTE(review): those are flair Sentence objects, so unpickling presumably needs a
# compatible flair version installed — confirm before sharing the file.
power_agency.to_pickle(PA_PATH + 'power_agency_wikisents.pkl')