In [1]:
import os
# Pin this kernel to a single GPU (device ids can be listed with `nvidia-smi`).
# NOTE(review): this is set before the flair imports in the next cell, presumably
# so that only device 4 is ever visible to CUDA — confirm the id on the target machine.
os.environ['CUDA_VISIBLE_DEVICES'] = '4'

In [2]:
import os
import json
import random
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from flair.tokenization import SegtokSentenceSplitter
from flair.data import Sentence
from flair.models import SequenceTagger
from nltk.stem import WordNetLemmatizer
from flair.embeddings import FlairEmbeddings, StackedEmbeddings, ELMoEmbeddings
import warnings
warnings.filterwarnings('ignore')

In [3]:
import logging

# Only surface errors from flair's logger; anything below ERROR is dropped.
logger = logging.getLogger('flair')
logger.setLevel(level=logging.ERROR)

# Attach the stream handler exactly once. Loggers are process-wide singletons,
# so re-running this cell would otherwise stack one more handler per run and
# every record would be printed several times.
_HANDLER_NAME = 'notebook-stream'
fh = next((h for h in logger.handlers if h.name == _HANDLER_NAME), None)
if fh is None:
    fh = logging.StreamHandler()
    fh.name = _HANDLER_NAME
    logger.addHandler(fh)

In [4]:
# Root of the local data checkout; every dataset directory hangs off it.
PREFIX = "../data/"

# One directory per source corpus. The trailing slash is kept so that file
# names can be appended with plain string concatenation.
PA_PATH = f"{PREFIX}sap2017-connotation-frames-power-agency/"
J_PATH = f"{PREFIX}pungas2017-plaintext-jokes/"
W_PATH = f"{PREFIX}wang2018-wiki-dataset/"
S_PATH = f"{PREFIX}kiesel2017-webis-simple-sentences-17/"

# Read the power/agency sentence data

In [5]:
# Load the pickled `wikisents` frame from the power/agency directory
# (presumably the source of the `wsents` column used below — confirm).
# NOTE(review): unpickling can execute arbitrary code; only load files produced by this project.
wikisents = pd.read_pickle(PA_PATH + 'power_agency_wikisents.pkl')

In [6]:
# Load the pickled `jokesents` frame from the power/agency directory
# (presumably the source of the `jsents` column used below — confirm).
# NOTE(review): unpickling can execute arbitrary code; only load files produced by this project.
jokesents = pd.read_pickle(PA_PATH + 'power_agency_jokesents.pkl')

In [7]:
# Load the pickled `webissents` frame from the power/agency directory
# (presumably the source of the `ssents` column used below — confirm).
# NOTE(review): unpickling can execute arbitrary code; only load files produced by this project.
webissents = pd.read_pickle(PA_PATH + 'power_agency_webissents.pkl')

In [8]:
# Combine the three per-source frames into one. merge() is called without
# `on`/`how`, so pandas falls back to its defaults (inner join on the columns
# the frames have in common).
sents = jokesents.merge(wikisents).merge(webissents)
# The individual frames are not used again; release the references.
del jokesents, wikisents, webissents

In [9]:
# Build one combined `sents` column per row by adding the three source columns
# left to right: jokes first, then wiki, then webis.
sents['sents'] = sents['jsents'] + sents['wsents'] + sents['ssents']

In [10]:
sum(sents.jsents.apply(len)), sum(sents.wsents.apply(len)), sum(sents.ssents.apply(len)), (sum(sents.jsents.apply(len)) + sum(sents.wsents.apply(len)) + sum(sents.ssents.apply(len)))

(247671, 369790, 527953, 1145414)

In [11]:
# The per-source columns are now redundant with the combined `sents` column.
sents = sents.drop(columns=['jsents', 'wsents', 'ssents'])

In [12]:
sum(sents.sents.apply(len))

1145414

In [13]:
len(sents)

2144

In [14]:
sents.sents.apply(len).describe()

count    2144.000000
mean      534.241604
std       211.780758
min        17.000000
25%       330.000000
50%       559.000000
75%       700.000000
max       850.000000
Name: sents, dtype: float64

# Filter out overly long sentences

In [15]:
# One row per element of the `sents` lists (the other columns are repeated),
# so that individual lengths can be inspected. `s` is only looked at in the
# next few cells and then deleted; `sents` itself is not changed here.
s = sents.explode('sents')

In [16]:
s.describe()

Unnamed: 0,verb,agency,power,verb_prep,prep,lemma,sents
count,1145414,1144363,964919,1145414,26709,1145414,1145414
unique,2107,3,3,2144,14,2107,943436
top,speaks,agency_pos,power_agent,runs,to,talk,"(Token: 1 Caress, Token: 2 ,, Token: 3 praise,..."
freq,1700,898400,660097,850,9837,1700,42


In [17]:
s.sents.apply(len).describe()

count    1.145414e+06
mean     2.377096e+01
std      1.576609e+01
min      1.000000e+00
25%      1.400000e+01
50%      2.000000e+01
75%      2.900000e+01
max      9.970000e+02
Name: sents, dtype: float64

In [18]:
# Keep only rows whose element has length <= 75. The same cut-off is applied
# to `sents` itself by filter_long_sents further down.
s = s[s.sents.apply(len) <= 75]

In [19]:
s.sents.apply(len).describe()

count    1.133706e+06
mean     2.292405e+01
std      1.241553e+01
min      1.000000e+00
25%      1.400000e+01
50%      2.000000e+01
75%      2.900000e+01
max      7.500000e+01
Name: sents, dtype: float64

In [24]:
# Release the exploded exploratory copy; only `sents` is used from here on.
del s

In [20]:
sum(sents.sents.apply(len))

1145414

In [21]:
def filter_long_sents(row, max_len=75):
    """Return the entries of ``row`` whose length does not exceed ``max_len``.

    Args:
        row: Iterable of sized items (anything supporting ``len``), e.g. the
            list stored in one cell of the ``sents`` column.
        max_len: Largest length that is still kept (inclusive). Defaults to
            75, the cut-off that was previously hard-coded.

    Returns:
        A new list with the kept entries in their original order; ``row``
        itself is not modified.
    """
    return [sent for sent in row if len(sent) <= max_len]

In [22]:
# Apply the length filter to every row's list; progress_apply is the
# tqdm-enabled variant of apply registered by tqdm.pandas().
sents['sents'] = sents['sents'].progress_apply(filter_long_sents)

100%|██████████| 2144/2144 [00:01<00:00, 1427.48it/s]


In [23]:
sum(sents.sents.apply(len))

1133706

# Balance the dataset

In [25]:
sents.sents.apply(len).describe()

count    2144.000000
mean      528.780784
std       208.547094
min        17.000000
25%       327.000000
50%       550.000000
75%       692.000000
max       850.000000
Name: sents, dtype: float64

In [26]:
# List length per row, computed once instead of twice per candidate threshold.
sent_counts = sents.sents.apply(len)
# For each candidate minimum i, print how many rows fall below it and how
# many would be kept (length >= i).
for i in range(0, 251, 5):
    below = int((sent_counts < i).sum())
    print(i, ':', below, len(sent_counts) - below)

0 : 0 2144
5 : 0 2144
10 : 0 2144
15 : 0 2144
20 : 2 2142
25 : 3 2141
30 : 3 2141
35 : 5 2139
40 : 5 2139
45 : 6 2138
50 : 6 2138
55 : 7 2137
60 : 7 2137
65 : 8 2136
70 : 11 2133
75 : 13 2131
80 : 14 2130
85 : 15 2129
90 : 15 2129
95 : 16 2128
100 : 17 2127
105 : 18 2126
110 : 20 2124
115 : 21 2123
120 : 22 2122
125 : 26 2118
130 : 26 2118
135 : 29 2115
140 : 29 2115
145 : 29 2115
150 : 30 2114
155 : 31 2113
160 : 34 2110
165 : 37 2107
170 : 38 2106
175 : 40 2104
180 : 42 2102
185 : 43 2101
190 : 43 2101
195 : 45 2099
200 : 48 2096
205 : 49 2095
210 : 52 2092
215 : 53 2091
220 : 55 2089
225 : 59 2085
230 : 60 2084
235 : 64 2080
240 : 65 2079
245 : 66 2078
250 : 67 2077


In [31]:
# Fraction of rows that a minimum of 200 sentences would discard:
# 48 rows are below 200 (see the threshold table above) out of 2144 rows in total.
48 / 2144

0.022388059701492536

In [32]:
# Target number of sentences per row: rows with fewer are dropped and the
# remaining rows are randomly downsampled to exactly this many in the cells below.
n = 200

In [33]:
# Filter out verbs with too few sentences (fewer than n).
# .copy() makes the result an independent frame, so the column assignment in
# the sampling cell below does not write into a slice of the previous frame
# (the pattern pandas flags with SettingWithCopyWarning).
sents = sents[sents.sents.apply(len) >= n].copy()

In [34]:
# Downsample a sentence list to a fixed size.
def filter_too_many(slist, k=None, rng=None):
    """Return ``k`` entries drawn from ``slist`` without replacement.

    Args:
        slist: Sequence to sample from.
        k: Number of entries to draw. When omitted, the notebook-level ``n``
            is used, which is what the one-argument call has always done.
        rng: Optional ``random.Random`` instance (or any object exposing a
            compatible ``sample`` method). Pass a seeded instance for a
            reproducible draw; when omitted, the shared module-level
            ``random`` generator is used.

    Returns:
        A new list of length ``k`` in the order chosen by the sampler.

    Raises:
        ValueError: If ``k`` is negative or larger than ``len(slist)``
            (raised by ``random.sample``).
    """
    size = n if k is None else k
    sampler = random if rng is None else rng
    return sampler.sample(slist, size)

In [35]:
# Seed the shared RNG right before sampling so that re-running the notebook
# selects the same subsets (filter_too_many draws from the module-level
# `random` generator when called with a single argument).
random.seed(42)
sents['sents'] = sents.sents.progress_apply(filter_too_many)

100%|██████████| 2096/2096 [00:00<00:00, 4466.63it/s]


In [39]:
# Renumber the rows after the filtering above. Without drop=True the previous
# index is kept as an extra column.
# NOTE(review): because of that, running this cell again adds yet another
# column — confirm whether the saved old-index column is needed downstream.
sents = sents.reset_index()

In [40]:
sents.sents.apply(len).describe()

count    2096.0
mean      200.0
std         0.0
min       200.0
25%       200.0
50%       200.0
75%       200.0
max       200.0
Name: sents, dtype: float64

In [41]:
# Persist the balanced frame (n sentences per remaining row) in the same
# directory the input pickles were read from.
sents.to_pickle(PA_PATH + 'power_agency_sents.pkl')