In [3]:
import os
# nvidia-smi
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

In [4]:
import os
import json
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from flair.tokenization import SegtokSentenceSplitter
from flair.data import Sentence
from flair.models import SequenceTagger
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')

In [5]:
import logging
logger = logging.getLogger('flair')
logger.setLevel(level=logging.ERROR)
fh = logging.StreamHandler()
logger.addHandler(fh)

In [6]:
PREFIX = "../data/"
PA_PATH = PREFIX + "sap2017-connotation-frames-power-agency/"
J_PATH = PREFIX + "pungas2017-plaintext-jokes/"
W_PATH = PREFIX + "wang2018-wiki-dataset/"

# read power_agency

In [7]:
power_agency = pd.read_csv(PA_PATH + "agency_power_prepro.csv", sep=',')

# general tools

In [8]:
splitter = SegtokSentenceSplitter()
lemmatizer = WordNetLemmatizer()
pos_tagger = SequenceTagger.load("upos-fast")

# jokes

@misc{pungas,
        title={A dataset of English plaintext jokes.},
        url={https://github.com/taivop/joke-dataset},
        author={Pungas, Taivo},
        year={2017},
        publisher = {GitHub},
        journal = {GitHub repository}
}

Get data at https://github.com/taivop/joke-dataset

## get sentences

In [7]:
def read_jokes():
    filenames = ['reddit_jokes.json', 'stupidstuff.json', 'wocka.json']
    tags = ['body', 'title']
    
    result = []
    for filename in filenames:
        print(filename)
        with open(J_PATH + filename, 'r') as f: 
            result.extend(json.loads(f.read()))
    return result

In [8]:
jokes = read_jokes()

reddit_jokes.json
stupidstuff.json
wocka.json


In [9]:
bodies = [j['body'] for j in jokes]
titles = [j['title'] for j in jokes if 'title' in j]
j_texts = bodies + titles
del jokes, bodies, titles

In [10]:
j_texts = pd.DataFrame([j for j in j_texts if j.strip()], columns=['text'])

In [11]:
len(j_texts)

408163

In [12]:
def preprocess_jokes(text):
    result = []
    sentences = splitter.split(text)
    if len(sentences) > 0:
        for sent in sentences:
            if len(sent) > 0:
                pos_tagger.predict(sent)
                for token in sent:
                    if token.get_tag('pos').value == 'VERB':
                        token.add_tag('lemma', lemmatizer.lemmatize(token.text, pos='v'))
                result.append(sent)
    return result

In [None]:
j_texts['sent'] = j_texts['text'].progress_apply(preprocess_jokes)

In [None]:
j_texts['sent'].to_pickle(J_PATH + 'j_sents.pkl')

## filter sentences

In [9]:
sents = pd.read_pickle(J_PATH + 'j_sents.pkl')

In [10]:
power_agency.head()

Unnamed: 0,verb,agency,power,verb_prep,prep,lemma
0,abandons,agency_pos,power_agent,abandons,,abandon
1,abolishes,agency_pos,power_agent,abolishes,,abolish
2,absorbs,agency_pos,power_agent,absorbs,,absorb
3,abuses,agency_pos,power_agent,abuses,,abuse
4,accelerates,agency_pos,power_agent,accelerates,,accelerate


In [11]:
pa_verbprep = power_agency[power_agency.prep.notna()][['lemma', 'prep']]
pa_verbprep = list(pa_verbprep.itertuples(index=False, name=None))

In [12]:
pa_verblemma = set(power_agency.lemma)

In [13]:
upper_bound = 300

In [14]:
def add_to_verb(sents, col):
    for sent in sents:  
        for tid in range(len(sent)):
            token = sent[tid]
            if token.get_tag('pos').value == 'VERB' and token.get_tag('lemma').value in pa_verblemma:
                # verb from pa -> has prep?
                if tid+1 < len(sent):
                    next_token = sent[tid+1]
                    
                    if (token.get_tag('lemma').value, next_token.text) in pa_verbprep:
                        # has prep
                        index = power_agency[(power_agency.lemma == token.get_tag('lemma').value) & (power_agency.prep == next_token.text)].index
                    else:
                        index = power_agency[(power_agency.lemma == token.get_tag('lemma').value) & (~power_agency.prep.notna())].index
                else:
                    index = power_agency[(power_agency.lemma == token.get_tag('lemma').value) & (~power_agency.prep.notna())].index
                    
                # add to power_agency    
                if len(index) == 1:
                    result = power_agency.loc[index[0], col]
                    
                    if len(result) < upper_bound:
                        result.append(sent)
                        power_agency.at[index[0], col] = result
                else:
                    if len(index) > 1:
                        print(sent)
                        print(index)

In [15]:
col = 'jsents'

In [16]:
power_agency[col] = [list() for x in range(len(power_agency.index))]

In [17]:
sents.progress_apply(lambda s: add_to_verb(s, col))

100%|██████████| 408163/408163 [30:51<00:00, 220.48it/s]  


0         None
1         None
2         None
3         None
4         None
          ... 
408158    None
408159    None
408160    None
408161    None
408162    None
Name: sent, Length: 408163, dtype: object

In [18]:
power_agency.jsents.apply(len).describe()

count    2144.000000
mean      115.518190
std       113.048178
min         0.000000
25%        22.000000
50%        61.000000
75%       230.000000
max       300.000000
Name: jsents, dtype: float64

In [23]:
power_agency.to_pickle(PA_PATH + 'power_agency_jokesents.pkl')