In [1]:
import os
# nvidia-smi
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

In [38]:
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
from flair.tokenization import SegtokSentenceSplitter
from flair.data import Sentence
from flair.models import SequenceTagger
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import warnings
warnings.filterwarnings('ignore')

In [3]:
import logging
# Restrict the 'flair' logger to messages of level ERROR and above.
logger = logging.getLogger('flair')
logger.setLevel(level=logging.ERROR)
# Attach an additional stream handler to the 'flair' logger.
# NOTE(review): every re-run of this cell creates and attaches one more
# StreamHandler, so messages may be emitted several times — confirm this is intended.
fh = logging.StreamHandler()
logger.addHandler(fh)

In [4]:
PREFIX = "../data/"
PA_PATH = PREFIX + "sap2017-connotation-frames-power-agency/"
S_PATH = PREFIX + "kiesel2017-webis-simple-sentences-17/"

# read power_agency

In [5]:
power_agency = pd.read_csv(PA_PATH + "agency_power_prepro.csv", sep=',')

# general tools

In [6]:
splitter = SegtokSentenceSplitter()
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
pos_tagger = SequenceTagger.load("upos-fast")

# get sentences

@InProceedings{kiesel:2017a,
  author =                {Johannes Kiesel and Benno Stein and Stefan Lucks},
  booktitle =             {24th Annual Network and Distributed System Security Symposium (NDSS 2017)},
  doi =                   {10.14722/ndss.2017.23077},
  ids =                   {stein:2017a},
  month =                 feb,
  numpages =              13,
  publisher =             {Association for Computational Linguistics},
  site =                  {San Diego, CA, USA},
  title =                 {{A Large-scale Analysis of the Mnemonic Password Advice}},
  year =                  2017
}

The corpus can be downloaded from https://zenodo.org/record/205950#.Yi8qtHrMJhE

In [7]:
# Load the test split of the corpus into a one-column frame named 'text',
# using the newline character as the field delimiter (presumably one sentence per line).
# NOTE(review): recent pandas releases raise a ValueError when "\n" is passed as
# sep/delimiter to read_csv — confirm the pandas version this notebook is pinned to.
test = pd.read_csv(S_PATH + 'webis-simple-sentences-17-corpus-test.txt', delimiter = "\n", names=['text'])

In [8]:
test.head()

Unnamed: 0,text
0,This way you covert the best part of the old f...
1,"In 2004, the city was awarded European Fortres..."
2,The Sint Jans Cathedral is one of the most pro...
3,"It will take years to restore the full church,..."
4,The only time the crowd can get a bit rough is...


In [9]:
len(test)

37208441

In [7]:
wikisents = pd.read_pickle(PA_PATH + 'power_agency_wikisents.pkl')

In [8]:
jokesents = pd.read_pickle(PA_PATH + 'power_agency_jokesents.pkl')

In [9]:
sents = jokesents.merge(wikisents)

In [10]:
# Combine the per-verb joke and wiki sentence collections into a single
# 'sents' column, then discard the two source columns.
sents = (
    sents
    .assign(sents=sents['jsents'] + sents['wsents'])
    .drop(['jsents', 'wsents'], axis=1)
)

# determine missing verbs

## filter out overly long sentences

In [11]:
sents.sents.apply(lambda x: sum([len(y) for y in x])/len(x) if len(x) > 0 else 0).describe()

count    2144.000000
mean       29.351446
std         5.546669
min         0.000000
25%        26.142615
50%        28.971759
75%        32.720113
max        80.000000
Name: sents, dtype: float64

In [12]:
sents.sents.apply(lambda x: sum([len(y) for y in x])).sum()

15879663

In [13]:
def filter_long_sents(row, max_len=50):
    """Keep only the sentences of ``row`` whose length does not exceed ``max_len``.

    Args:
        row: Iterable of sentences; every element must support ``len()``.
        max_len: Largest ``len()`` a sentence may have and still be kept
            (inclusive). Defaults to 50, the previously hard-coded cutoff.

    Returns:
        A new list with the sentences that passed the length check, in their
        original order.
    """
    return [s for s in row if len(s) <= max_len]

In [14]:
sents['sents'] = sents.sents.progress_apply(filter_long_sents)

100%|██████████| 2144/2144 [00:00<00:00, 4032.29it/s]


In [15]:
sents.sents.apply(lambda x: sum([len(y) for y in x])).sum()

12423502

In [16]:
sents.sents.apply(len).describe()

count    2144.000000
mean      232.080224
std       158.137586
min         0.000000
25%        73.750000
50%       237.000000
75%       370.250000
max       492.000000
Name: sents, dtype: float64

## filter dataset

In [17]:
goal_n = 250

In [18]:
# How many sentences each row still lacks to reach goal_n (0 once the goal is met).
sents['missing'] = (goal_n - sents.sents.apply(len)).clip(lower=0)

In [19]:
sum(sents['missing'])

165751

### convert all test texts to flair Sentence objects

In [None]:
test = test[test.text.apply(type) == str] # three were NaN

In [None]:
test = test[test.text.apply(len) >= 5]

In [None]:
len(test)

In [None]:
test['sent'] = test.text.apply(Sentence)

### filter by stems

In [21]:
sents['stem'] = sents.verb.progress_apply(stemmer.stem)

100%|██████████| 2144/2144 [00:00<00:00, 24544.76it/s]


In [22]:
sents[sents.missing > 0]

Unnamed: 0,verb,agency,power,verb_prep,prep,lemma,sents,missing,stem
1,abolishes,agency_pos,power_agent,abolishes,,abolish,"[(Token: 1 is, Token: 2 the, Token: 3 fact, To...",35,abolish
4,accelerates,agency_pos,power_agent,accelerates,,accelerate,"[(Token: 1 I, Token: 2 accelerated, Token: 3 t...",26,acceler
7,accommodates,agency_pos,power_equal,accommodates,,accommodate,"[(Token: 1 Vagina, Token: 2 -, Token: 3 "", Tok...",1,accommod
14,aches,agency_neg,,aches,,ache,"[(Token: 1 He, Token: 2 says, Token: 3 to, Tok...",188,ach
17,acquaints,agency_pos,power_agent,acquaints,,acquaint,"[(Token: 1 They, Token: 2 get, Token: 3 acquai...",6,acquaint
...,...,...,...,...,...,...,...,...,...
2139,yelps,agency_pos,,yelps,,yelp,"[(Token: 1 She, Token: 2 limps, Token: 3 on, T...",210,yelp
2140,yields,agency_equal,power_agent,yields,,yield,"[(Token: 1 After, Token: 2 gathering, Token: 3...",40,yield
2141,zaps,agency_pos,power_agent,zaps,,zap,"[(Token: 1 I, Token: 2 must, Token: 3 admit, T...",234,zap
2142,zips,agency_pos,,zips,,zip,"[(Token: 1 As, Token: 2 the, Token: 3 man, Tok...",146,zip


In [None]:
def filter_long_test_sents(row, max_len=50):
    """Return ``row`` if it is short enough, otherwise ``None``.

    Args:
        row: A single sentence; it must support ``len()``.
        max_len: Largest ``len()`` the sentence may have and still be kept
            (inclusive). Defaults to 50, the previously hard-coded cutoff.

    Returns:
        ``row`` itself when ``len(row) <= max_len``, else ``None``.
    """
    if len(row) <= max_len:
        return row
    # Explicit return: the original branch contained a bare ``None`` expression,
    # which only worked because a function falls through to None anyway.
    return None

In [None]:
test['sent'] = test.sent.progress_apply(filter_long_test_sents)

In [23]:
missing_stems = set(sents[sents.missing > 0].stem)

In [None]:
def pre_filter_sentences(sent):
    """Cheap substring pre-filter for candidate sentences.

    Args:
        sent: A sequence of tokens exposing a ``text`` attribute, or a falsy
            value (e.g. ``None``) for sentences that were filtered out earlier.

    Returns:
        ``True`` if any stem from the module-level ``missing_stems`` occurs as
        a substring of any token text, ``False`` otherwise (including for
        falsy or empty input).
    """
    # Nothing to inspect for missing / empty sentences.
    if not sent or len(sent) == 0:
        return False
    return any(
        stem in token.text
        for token in sent
        for stem in missing_stems
    )

In [None]:
test['candidate'] = test['sent'].progress_apply(pre_filter_sentences)

In [None]:
test['candidate'] = test['candidate'].apply(lambda x: False if type(x) == float else x) # nan -> False

In [68]:
# Keep only the rows flagged as candidates. The explicit copy makes `candidates`
# an independent frame, so columns added to it later are not written to a slice
# of `test` (which triggers pandas' SettingWithCopyWarning).
candidates = test[test['candidate']].copy()

100%|██████████| 100000/100000 [00:30<00:00, 3330.45it/s]


In [68]:
print('# Candidates:', len(candidates))

100%|██████████| 100000/100000 [00:30<00:00, 3330.45it/s]


# assign Webis candidate sentences to power_agency verbs

In [75]:
# (lemma, prep) pairs of every power_agency row that has a preposition.
# Built as a set (like pa_verblemma) because it is only used for membership
# tests, which would otherwise scan a list linearly on every lookup.
pa_verbprep = set(
    power_agency[power_agency.prep.notna()][['lemma', 'prep']]
    .itertuples(index=False, name=None)
)

In [76]:
pa_verblemma = set(power_agency.lemma)

In [11]:
def preprocess_webis(sent, col):
    """Attach ``sent`` to every matching verb row of ``power_agency``.

    The sentence is POS-tagged with the module-level ``pos_tagger``; each token
    tagged ``VERB`` receives a ``lemma`` tag from the module-level
    ``lemmatizer``. For every verb whose lemma is in ``pa_verblemma`` the
    matching ``power_agency`` row is looked up: the row with the same lemma and
    the following token as ``prep`` if that (lemma, token) pair is in
    ``pa_verbprep``, otherwise the row with the same lemma and no ``prep``.
    If exactly one row matches and its list in column ``col`` holds fewer than
    ``upper_bound`` entries, ``sent`` is appended to that list.

    Args:
        sent: Sentence to process; must support ``len()``, iteration and
            integer indexing over tokens.
        col: Name of the ``power_agency`` column holding the per-row lists.

    Returns:
        None. The function works by mutating ``power_agency`` (and tagging
        the tokens of ``sent``).
    """
    if len(sent) == 0:
        return

    pos_tagger.predict(sent)

    # First pass: give every verb token a lemma tag.
    for tok in sent:
        if tok.get_tag('pos').value == 'VERB':
            tok.add_tag('lemma', lemmatizer.lemmatize(tok.text, pos='v'))

    # Second pass: map each known verb to its power_agency row.
    n_tokens = len(sent)
    for pos, tok in enumerate(sent):
        if tok.get_tag('pos').value != 'VERB':
            continue
        lemma = tok.get_tag('lemma').value
        if lemma not in pa_verblemma:
            continue

        same_lemma = power_agency.lemma == lemma
        # Token directly after the verb, if there is one.
        follower = sent[pos + 1].text if pos + 1 < n_tokens else None
        if follower is not None and (lemma, follower) in pa_verbprep:
            # Verb + preposition entry.
            mask = same_lemma & (power_agency.prep == follower)
        else:
            # Plain verb entry (no preposition).
            mask = same_lemma & (~power_agency.prep.notna())
        index = power_agency[mask].index

        if len(index) == 1:
            row_label = index[0]
            collected = power_agency.loc[row_label, col]
            # Stop collecting once the row holds upper_bound sentences.
            if len(collected) < upper_bound:
                collected.append(sent)
                power_agency.at[row_label, col] = collected
        elif len(index) > 1:
            # Ambiguous match: report it instead of guessing a row.
            print(sent)
            print(index)

In [12]:
upper_bound = 250

In [13]:
col = 'ssents'

In [14]:
# Give every row its own fresh empty list in column `col`
# (a comprehension, so the rows do not share one list object).
power_agency[col] = [[] for _ in range(len(power_agency))]

In [None]:
candidates['test'] = candidates['sent'].progress_apply(lambda s: preprocess_webis(s, col)) # run by script

In [17]:
power_agency.to_pickle(PA_PATH + 'power_agency_webissents.pkl')