# Create data subset

### Explore structure of the Hansen dataset selected below ('snes' or 'pomt')

In [1]:
import mwparserfromhell
import numpy as np
import pandas as pd
import pickle
import re
import spacy
import string
import time
from pytorch_pretrained_bert.tokenization import BertTokenizer

In [2]:
# SPECIFY RUN MODE
# When True, only the first 100 rows of main_data are extracted for processing
# (see the "Extract claim only" cell); when False, every row is used.

test_flag = False # True or False

In [3]:
### SPECIFY DATASET
# The name chosen here is used as both the sub-directory and the file prefix
# when loading, and is embedded in every output path of this notebook.

HANSEN_DATASET = 'pomt' # 'snes' OR 'pomt'

print('Using HANSEN_DATASET: {}'.format(HANSEN_DATASET))

Using HANSEN_DATASET: pomt


In [4]:
def load_data(dataset):
    """Load the four files that make up one Hansen dataset.

    All files are read from ``data/hansen_data/<dataset>/``:
    ``<dataset>.tsv``, ``<dataset>_snippets.tsv``, ``<dataset>_labels.pkl``
    and ``<dataset>_index_split.pkl``.

    Args:
        dataset: Dataset name, used both as the sub-directory and as the
            file-name prefix (e.g. 'snes' or 'pomt').

    Returns:
        Tuple ``(main_data, snippets_data, label_order, splits)``: the two
        TSV files as header-less DataFrames, followed by the two unpickled
        objects.
    """
    path = "data/hansen_data/" + dataset + "/"
    main_data = pd.read_csv(path + dataset + ".tsv", sep="\t", header=None)
    snippets_data = pd.read_csv(path + dataset + "_snippets.tsv", sep="\t", header=None, encoding='utf-8') # \xa0 is latin_1

    # NOTE: pickle.load can execute arbitrary code; only load dataset files from a trusted source.
    # Context managers close the handles deterministically instead of leaking them.
    with open(path + dataset + "_labels.pkl", "rb") as labels_file:
        label_order = pickle.load(labels_file)
    with open(path + dataset + "_index_split.pkl", "rb") as splits_file:
        splits = pickle.load(splits_file)

    return main_data, snippets_data, label_order, splits

In [5]:
# Load the selected dataset; the cells shown below work with main_data only.
main_data, snippets_data, label_order, splits = load_data(HANSEN_DATASET)

In [6]:
# Size and first rows of the main table.
print('main_data:\n')
print('shape: {}'.format(main_data.shape))
main_data.head()

main_data:

shape: (13581, 13)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,pomt-03627,Six out of 10 of the highest unemployment rate...,half-true,/ohio/statements/2013/may/06/chris-redfern/ohi...,When a couple of Statehouse Republicans prepar...,,Chris Redfern,,,,2013-05-06T06:00:00,2013-04-30,['None']
1,pomt-09611,No Democratic campaign for (Fla.) governor has...,true,/florida/statements/2010/jan/15/alex-sink/flor...,Florida's leading Republican candidate for gov...,,Alex Sink,,,,2010-01-15T13:59:00,2010-01-06,['None']
2,pomt-06704,Says Ron Paul insisted FEMA should be shut down.,true,/texas/statements/2011/sep/03/maureen-dowd/mau...,Commenting on the federal response to Hurrican...,,Maureen Dowd,,,,2011-09-03T06:00:00,2011-08-30,['None']
3,pomt-15232,"Says Hillary Clinton has ""been in office and i...",mostly false,/truth-o-meter/statements/2015/aug/07/marco-ru...,"Sen. Marco Rubio, R-Fla., says the electorate ...",,Marco Rubio,,,,2015-08-07T00:55:38,2015-08-06,['None']
4,pomt-02638,"In 1916, the U.S. government predicted that by...",false,/truth-o-meter/statements/2014/jan/17/facebook...,With the recent legalization of recreational m...,,Facebook posts,,,,2014-01-17T14:19:04,2014-01-16,['United_States']


In [7]:
# Show the first row in full, then its claim text (column 1) untruncated.
first_row = main_data.iloc[0]
print('Sample row:\n')
print(str(first_row) + '\n')
print('Claim: {}\n'.format(main_data.iloc[0, 1]))

Sample row:

0                                            pomt-03627
1     Six out of 10 of the highest unemployment rate...
2                                             half-true
3     /ohio/statements/2013/may/06/chris-redfern/ohi...
4     When a couple of Statehouse Republicans prepar...
5                                                  None
6                                         Chris Redfern
7                                                  None
8                                                  None
9                                                  None
10                                  2013-05-06T06:00:00
11                                           2013-04-30
12                                             ['None']
Name: 0, dtype: object

Claim: Six out of 10 of the highest unemployment rates are also in so-called right to work states.



In [8]:
# Correct fatal error in pomt main_data
# NOTE(review): the row is addressed by position (699), not by claim id -
# confirm it still points at the intended claim if the source TSV ever changes.

if HANSEN_DATASET == 'pomt':
    
    # Overwrite the claim text (column 1) of that row with a hand-written replacement.
    main_data.iloc[699,1] = 'Georgia has the most restrictive ballot access laws in the country.'

### Extract claim only

In [9]:
# Test mode keeps only the first 100 rows; otherwise every row is kept.
row_selector = slice(None, 100) if test_flag == True else slice(None)

# Keep columns 0 and 1 only (the id and the claim text in the preview above).
extract_df = main_data.iloc[row_selector, [0, 1]]

In [10]:
# Destination of the two-column (id, claim) extract.
claim_extract_path = 'data/ac295_{}_claim_extract.tsv'.format(HANSEN_DATASET)
print('claim_extract_path: {}'.format(claim_extract_path))

claim_extract_path: data/ac295_pomt_claim_extract.tsv


In [11]:
# Write the extract as a header-less, index-less, tab-separated UTF-8 file.
extract_df.to_csv(
    claim_extract_path,
    sep='\t',
    encoding='utf-8',
    index=False,
    header=None,
)

# Format New Text for Neutralization

Adapted from https://github.com/rpryzant/neutralizing-bias/tree/master/harvest

In [12]:
### Adapted from harvest/gen_data_from_crawl.py

def clean_wikitext(token_list):
    """Strip wiki markup and other non-prose artifacts from one sentence.

    Args:
        token_list: The sentence as a list of string tokens; they are joined
            with single spaces before cleaning.

    Returns:
        The cleaned text on a single line: no tabs or newlines, runs of
        spaces collapsed, both ends stripped. The result is '' when the
        parsed text starts with '?' or '|', contains no word character, or
        contains 'retrieved on' (case-insensitive).
    """
    x = ' '.join(token_list)

    # ascii only (set lookup; avoids shadowing `x` inside a lambda)
    printable_chars = set(string.printable)
    x = ''.join(ch for ch in x if ch in printable_chars)

    # preemptively remove <ref>'s (including uncompleted)
    x = x.strip()
    #x = rm_refs(x)
    # collapse multispaces
    x = re.sub(r'[ ]+', ' ', x)

    parse = mwparserfromhell.parse(x)
    plaintext = parse.strip_code()
    #plaintext = rm_refs(plaintext) # get refs again? some things missed
    # collapse multispaces
    plaintext = re.sub(r'[ ]+', ' ', plaintext)
    # parse again to hit complicated nested wikicode like 21055249
    parse = mwparserfromhell.parse(plaintext)
    plaintext = parse.strip_code()

    # ignore lines starting with ? or | (likely table artifacts)
    # NOTE(review): the comment inherited from the upstream script said "! or |",
    # but the code has always tested '?'. Behavior is left unchanged here -
    # confirm which prefix is actually intended.
    if plaintext.startswith('?') or plaintext.startswith('|'):
        plaintext = ''

    # ignore lines without text, e.g. ( , , , , ) or ]]
    if not re.search(r'\w', plaintext):
        plaintext = ''

    # parse AGAIN again to hit remaining links e.g. 377258469
    plaintext = plaintext.replace('[ ', '[').replace(' ]', ']')
    parse = mwparserfromhell.parse(plaintext)
    plaintext = parse.strip_code()

    # at this point just rm all brackets
    plaintext = plaintext.replace(']', '').replace('[', '')
    # rm URLs (anything starting with "http" up to the next whitespace)
    plaintext = re.sub(r'http\S+', '', plaintext)
    # rm parens with nothing in them, e.g. (; )
    plaintext = re.sub(r'\([^\w]*\)', '', plaintext)
    # rm remaining <del>, <ins> (valid tags should already have been parsed)
    plaintext = re.sub(r'<\/?(del|ins)([-\w=" <>]+)?>', '', plaintext)
    # rm stars
    plaintext = plaintext.replace('*', '')
    # rm table fragments
    plaintext = re.sub(r'(right[ ]?\||left[ ]?\||thumb[ ]?\||frame[ ]?\||\d+px[ ]?\|)', '', plaintext)
    # ignore timestamp sentences
    if 'retrieved on' in plaintext.lower():
        plaintext = ''
    # misc html missed
    plaintext = plaintext.replace('<blockquote>', '')

    # remove tabs and newlines - delimiters used by neutralizing bias
    # Pryzant made an error here and did not assign the modified text back to the original variable
    plaintext = plaintext.replace('\t', ' ')
    plaintext = plaintext.replace('\n', ' ')
    plaintext = plaintext.replace('\r', '')
    # collapse multispaces (again again)
    plaintext = re.sub(r'[ ]+', ' ', plaintext).strip()

    return plaintext

In [13]:
# Shared tokenizer instance for 'bert-base-uncased', created once and reused by tokenize().
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize(s):
    """Tokenize the stripped string `s` and return the tokens joined by single spaces."""
    stripped = s.strip()
    return ' '.join(tokenizer.tokenize(stripped))

11/30/2021 23:41:05 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at C:\Users\xsrpc21b\.pytorch_pretrained_bert\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [14]:
### From harvest/add_tags.py

NLP = spacy.load("en_core_web_sm")

def get_pos_dep(text):
    """Label every token of a tokenized sentence with spaCy POS and dependency tags.

    Tokens prefixed with '##' are merged onto the preceding token to rebuild
    whole words, the words are analysed with spaCy, and each word's tags are
    repeated once for every token the word was built from.

    Args:
        text: Whitespace-separated tokens (e.g. the output of `tokenize`).

    Returns:
        A ``(pos, dep)`` pair of space-joined strings holding one tag per
        input token, or ``(None, None)`` when spaCy does not produce exactly
        one token per rebuilt word.
    """
    toks = text.split()

    def words_from_toks(toks):
        """Rebuild words from tokens; also return, per word, the indices of its tokens."""
        words = []
        word_indices = []
        for i, tok in enumerate(toks):
            # Only merge when there is a previous word to extend: a leading
            # '##' token used to raise IndexError on words[-1]; it is now
            # treated as a word of its own.
            if tok.startswith('##') and words:
                words[-1] += tok.replace('##', '')
                word_indices[-1].append(i)
            else:
                words.append(tok)
                word_indices.append([i])
        return words, word_indices

    out_pos, out_dep = [], []
    words, word_indices = words_from_toks(toks)
    analysis = NLP(' '.join(words))

    # spaCy split the text differently from our word list, so tags cannot be aligned.
    if len(analysis) != len(words):
        return None, None

    for analysis_tok, idx in zip(analysis, word_indices):
        # Repeat the word-level tag for each token that makes up the word.
        out_pos += [analysis_tok.pos_] * len(idx)
        out_dep += [analysis_tok.dep_] * len(idx)

    assert len(out_pos) == len(out_dep) == len(toks)

    return ' '.join(out_pos), ' '.join(out_dep)

### Format claim extract for processing by neutralization model

In [15]:
# Pipeline

def process_text(original_text):
    """Clean one raw claim and return it as a space-joined token string.

    Args:
        original_text: The raw claim text.

    Returns:
        The result of `tokenize` applied to the `clean_wikitext` output.
    """
    # Double quotes are dropped outright.
    original_text = original_text.replace('\"', '')

    # Tabs, newlines and carriage returns become spaces. Deleting them (as
    # before) glued the neighbouring words together, e.g. "foo\nbar" ->
    # "foobar"; the split() below collapses any resulting run of whitespace.
    for separator in ('\t', '\n', '\r'):
        original_text = original_text.replace(separator, ' ')

    cleaned_text = clean_wikitext(original_text.split()) # clean_wikitext takes sentence as list of tokens split on whitespace
    tokenized_text = tokenize(cleaned_text)

    return tokenized_text

In [16]:
# Read the extract back with every value as a string and name its two columns.
df = (
    pd.read_csv(claim_extract_path, sep='\t', header=None, encoding='utf-8', dtype=str)
    .rename(columns={0: 'id', 1: 'original_text'})
)

In [17]:
# Run the cleaning + tokenization pipeline on every claim.
df['tokenized_text'] = df['original_text'].apply(process_text)

In [18]:
# Compare the raw and tokenized form of the first claim.
print('Sample data:\n')
print(f"original_text: {df.loc[0, 'original_text']}\n")
print(f"tokenized_text: {df.loc[0, 'tokenized_text']}\n")

Sample data:

original_text: Six out of 10 of the highest unemployment rates are also in so-called right to work states.

tokenized_text: six out of 10 of the highest unemployment rates are also in so - called right to work states .



In [19]:
# Tagging
# Each element of pos_dep is the (pos, dep) pair returned by get_pos_dep;
# the pair is (None, None) for rows where tagging could not be aligned.

pos_dep = df['tokenized_text'].apply(get_pos_dep) # takes several minutes to run

In [20]:
# Transpose the per-row (pos, dep) pairs into two parallel sequences,
# one entry per row of df.
pre_pos, pre_dep = zip(*pos_dep)

# Attach the tag strings as new columns.
df['pre_pos'] = pre_pos
df['pre_dep'] = pre_dep

In [21]:
# Constant filler value for the placeholder column.
null = 'na'
df['placeholder'] = null

# Column layout of the formatted file; 'placeholder' is listed twice, so the
# output carries two filler columns.
formatted_columns = [
    'id',
    'tokenized_text',
    'placeholder',
    'original_text',
    'placeholder',
    'pre_pos',
    'pre_dep',
]
output_df = df.loc[:, formatted_columns]

In [22]:
# Destination of the formatted input file (note: no file extension).
claim_formatted_path = 'data/ac295_{}_claim_formatted'.format(HANSEN_DATASET)
print('claim_formatted_path: {}'.format(claim_formatted_path))

claim_formatted_path: data/ac295_pomt_claim_formatted


In [23]:
# Write the formatted claims as a header-less, index-less, tab-separated UTF-8 file.
output_df.to_csv(
    claim_formatted_path,
    sep='\t',
    encoding='utf-8',
    index=False,
    header=None,
)

# Run neutralization
**main.py**

In [24]:
# File the inference script writes its output to.
claim_neutralized_path = 'data/ac295_{}_claim_neutralized.tsv'.format(HANSEN_DATASET)
print('claim_neutralized_path: {}'.format(claim_neutralized_path))

claim_neutralized_path: data/ac295_pomt_claim_neutralized.tsv


In [25]:
# Whole-second Unix timestamp, so each run gets its own working directory.
timestamp = '{:d}'.format(int(time.time()))

working_dir_path = 'data/ac295_{}_temp_{}'.format(HANSEN_DATASET, timestamp)
print('working_dir_path: {}'.format(working_dir_path))

working_dir_path: data/ac295_pomt_temp_1638333738


In [26]:
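# max test_batch_size 32 -- NOTE(review): the command below passes --test_batch_size 64; confirm which value is intended
# the output pickle file does not contain the results
# src/seq2seq/utils.py has been modified for the required output format: original text, neutralized text, list of changes
# src/shared/data.py, src/joint/utils.py & src/seq2seq/utils.py have been modified to log skipped errors
### MUST BE RUN as batch size 1 to maintain the correct order; with a larger batch size (64 is used below)
### the output rows come back out of input order, so they must be matched to the input via the index column of the output file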

%run lib/inference.py \
       --extra_features_top --pre_enrich --activation_hidden \
       --test_batch_size 64 --bert_full_embeddings --debias_weight 1.3 --token_softmax \
       --pointer_generator --coverage \
       --working_dir $working_dir_path \
       --test $claim_formatted_path \
       --checkpoint model/neutralization_model.ckpt \
       --inference_output $claim_neutralized_path

11/30/2021 23:42:19 - INFO - pytorch_pretrained_bert.file_utils -   https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt not found in cache, downloading to C:\Users\xsrpc21b\AppData\Local\Temp\tmpy55a251g
100%|██████████████████████████████████████████████████████████████████████| 231508/231508 [00:00<00:00, 1126328.93B/s]
11/30/2021 23:42:20 - INFO - pytorch_pretrained_bert.file_utils -   copying C:\Users\xsrpc21b\AppData\Local\Temp\tmpy55a251g to cache at data/ac295_pomt_temp_1638333738/cache\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
11/30/2021 23:42:20 - INFO - pytorch_pretrained_bert.file_utils -   creating metadata file for data/ac295_pomt_temp_1638333738/cache\26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
11/30/2021 23:42:20 - INFO - pytorch_pretrained_bert.file_utils -   removing tem

SKIPPED  3
[['post_error', 'i: 9090', 'revid: pomt-00178'], ['post_error', 'i: 9633', 'revid: pomt-03506'], ['post_error', 'i: 11483', 'revid: pomt-11907']]
END skipped_list


11/30/2021 23:42:22 - INFO - pytorch_pretrained_bert.file_utils -   https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz not found in cache, downloading to C:\Users\xsrpc21b\AppData\Local\Temp\tmpihex2s1w
100%|███████████████████████████████████████████████████████████████| 407873900/407873900 [00:14<00:00, 27928220.79B/s]
11/30/2021 23:42:37 - INFO - pytorch_pretrained_bert.file_utils -   copying C:\Users\xsrpc21b\AppData\Local\Temp\tmpihex2s1w to cache at data/ac295_pomt_temp_1638333738/cache\9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
11/30/2021 23:42:37 - INFO - pytorch_pretrained_bert.file_utils -   creating metadata file for data/ac295_pomt_temp_1638333738/cache\9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
11/30/2021 23:42:37 - INFO - pytorch_pretrained_bert.file_utils -   removing temp f

LOADING FROM model/neutralization_model.ckpt


  0%|                                                                                          | 0/213 [00:00<?, ?it/s]

...DONE


100%|████████████████████████████████████████████████████████████████████████████████| 213/213 [25:55<00:00,  7.30s/it]


eval/bleu 0 0
eval/true_hits 0.0 0


### Load neutralized data

In [27]:
# Load the header-less, tab-separated file written by the inference run.
neutralized_df = pd.read_csv(
    claim_neutralized_path,
    sep='\t',
    encoding='utf-8',
    header=None,
)

In [28]:
neutralized_df

Unnamed: 0,0,1,2,3
0,44,says a federal judge in san antonio issued a r...,says a federal judge in san antonio issued a r...,[]
1,33,says the dick ##ey amendment prevents the cent...,says the dick ##ey amendment prevents the cent...,[]
2,54,"the non ##partisan cb ##o , congressional budg...","the cb ##o , congressional budget office , has...",[]
3,49,says hillary clinton ' s plan would bring in 6...,says hillary clinton ' s plan would bring in 6...,[]
4,46,if you are a gun store owner and you decide to...,if you are a gun store owner and you decide to...,[]
...,...,...,...,...
13573,13571,most of the ( affordable care act ) has alread...,most of the ( affordable care act ) has in som...,[]
13574,13578,says ron des ##ant ##is has not revealed $ 145...,says ron des ##ant ##is has not claimed $ 145 ...,[['claimed']]
13575,13577,"my 7 - step plan creates 700 , 000 jobs in 7 y...","my 7 - step plan creates 700 , 000 jobs in 7 y...",[]
13576,13580,we borrow money from the chinese to buy oil fr...,we borrow money from the chinese to buy oil fr...,[]


In [29]:
# Inspect one output row: column 0 is the index value, column 1 the input text, column 2 the neutralized text.
i = 40

print('i: {} index: {}:'.format(i, neutralized_df.iloc[i, 0]))
print('Original Text: {}\n'.format(neutralized_df.iloc[i, 1]))
print('Neutrlzd Text: {}\n'.format(neutralized_df.iloc[i, 2]))

i: 40 index: 34:
Original Text: bump stocks are already illegal in the state of new york . you can ' t buy one .

Neutrlzd Text: bump stocks are already illegal in the state of new york . you can ' t buy one .



In [30]:
# Spot-check a few rows. A fixed seed makes the same rows appear on every
# run (Restart & Run All reproduces the output), and the sample size is
# capped so the cell also works when fewer than five rows are available.
sample_rng = np.random.RandomState(0)
sample_size = min(5, len(neutralized_df))

for i in sample_rng.choice(np.arange(len(neutralized_df)), size=sample_size, replace=False):

    print(f'i: {i} index: {neutralized_df.iloc[i,0]}:\n')
    print(f'Original Text: {neutralized_df.iloc[i,1]}\n')
    print(f'Neutrlzd Text: {neutralized_df.iloc[i,2]}\n')
    print(f'{"*"*30}\n')

i: 5816 index: 5775:

Original Text: the washington d . c . - area never has a recession .

Neutrlzd Text: the washington d . c . - area has a recession .

******************************

i: 2222 index: 2230:

Original Text: the state bureaucracy jane norton managed grew by $ 43 million in just three years .

Neutrlzd Text: the state bureaucracy jane norton managed grew by $ 43 million in three years .

******************************

i: 10885 index: 10928:

Original Text: says i worked with our mayor and other commissioners to help save the school year when a budget crisis in salem threatened to cut four weeks of the school year , including negotiating with teachers to work for free .

Neutrlzd Text: says i worked with mayor and other commissioners to help save the school year when a budget crisis in salem threatened to cut four weeks of the school year , including negotiating with teachers to work for free .

******************************

i: 3369 index: 3382:

Original Text: the on

### Apply neutralized data to main_data

In [31]:
# Work on an independent (deep) copy so main_data itself is left untouched.
modified_df = main_data.copy(deep=True)
modified_df.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,pomt-03627,Six out of 10 of the highest unemployment rate...,half-true,/ohio/statements/2013/may/06/chris-redfern/ohi...,When a couple of Statehouse Republicans prepar...,,Chris Redfern,,,,2013-05-06T06:00:00,2013-04-30,['None']


In [32]:
# Column 0 of the output holds the target row position and column 2 the
# neutralized text; write each text into column 1 of the matching row.
# Rows absent from the output keep their original text.
for target_index, neutralized_text in zip(neutralized_df[0], neutralized_df[2]):
    modified_df.iloc[target_index, 1] = neutralized_text

In [33]:
modified_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,pomt-03627,six out of 10 of the highest unemployment rate...,half-true,/ohio/statements/2013/may/06/chris-redfern/ohi...,When a couple of Statehouse Republicans prepar...,,Chris Redfern,,,,2013-05-06T06:00:00,2013-04-30,['None']
1,pomt-09611,no democratic campaign for ( fl ##a . ) govern...,true,/florida/statements/2010/jan/15/alex-sink/flor...,Florida's leading Republican candidate for gov...,,Alex Sink,,,,2010-01-15T13:59:00,2010-01-06,['None']
2,pomt-06704,ron paul says fe ##ma should be shut down .,true,/texas/statements/2011/sep/03/maureen-dowd/mau...,Commenting on the federal response to Hurrican...,,Maureen Dowd,,,,2011-09-03T06:00:00,2011-08-30,['None']
3,pomt-15232,hillary clinton has been in office and in gove...,mostly false,/truth-o-meter/statements/2015/aug/07/marco-ru...,"Sen. Marco Rubio, R-Fla., says the electorate ...",,Marco Rubio,,,,2015-08-07T00:55:38,2015-08-06,['None']
4,pomt-02638,"in 1916 , the u . s . government stated that b...",false,/truth-o-meter/statements/2014/jan/17/facebook...,With the recent legalization of recreational m...,,Facebook posts,,,,2014-01-17T14:19:04,2014-01-16,['United_States']
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13576,pomt-10818,obama skip ##s out on condemning attacks on ge...,true,/truth-o-meter/statements/2007/sep/21/republic...,Political candidates and parties take plenty o...,,Republican National Committee,,,,2007-09-21T00:00:00,2007-09-21,['None']
13577,pomt-08924,"my 7 - step plan creates 700 , 000 jobs in 7 y...",mostly true,/florida/statements/2010/jul/26/rick-scott/ric...,"Rick Scott, a Republican running for governor,...",,Rick Scott,,,,2010-07-26T18:25:47,2010-07-25,['None']
13578,pomt-00146,says ron des ##ant ##is has not claimed $ 145 ...,half-true,/florida/statements/2018/oct/24/andrew-gillum/...,Democratic Tallahassee Mayor Andrew Gillum and...,,Andrew Gillum,,,,2018-10-24T19:51:42,2018-10-24,['None']
13579,pomt-10701,"i spend a lot of time in iowa , and believe it...",false,/truth-o-meter/statements/2007/nov/29/tom-tanc...,Illegal immigration is one of the key targets ...,,Tom Tancredo,,,,2007-11-29T00:00:00,2007-11-20,['Iowa']


In [34]:
# Destination of the main table with the claim column replaced.
main_neutralized_path = 'data/ac295_{}_main_neutralized.tsv'.format(HANSEN_DATASET)
print('main_neutralized_path: {}'.format(main_neutralized_path))

main_neutralized_path: data/ac295_pomt_main_neutralized.tsv


In [35]:
# Write the modified table as a header-less, index-less, tab-separated UTF-8 file.
modified_df.to_csv(
    main_neutralized_path,
    sep='\t',
    encoding='utf-8',
    header=None,
    index=False,
)