In [1]:
import sys
import os
# Extend the module search path with the sibling ../src-py/ directory
# (presumably where the project helpers imported in the next cell live — verify).
sys.path.append('../src-py/')
# Expose only the GPU with index 1 to CUDA. Keep this ahead of the cells that
# import torch: the variable has to be in place before CUDA is initialised.
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [2]:
from ca_utils import *

2022-05-13 12:09:03,390 loading file ../../../data-ceph/arguana/arg-generation/claim-target-tagger/model/final-model.pt
2022-05-13 12:09:14,866 SequenceTagger predicts: Dictionary with 5 tags: O, S-CT, B-CT, E-CT, I-CT


In [3]:
import pandas as pd
from tqdm import tqdm
# Attach tqdm's progress-reporting variants (e.g. ``progress_apply``) to pandas.
tqdm.pandas()

# Never truncate long text columns when a frame is displayed.
pd.options.display.max_colwidth = None

In [4]:
import torch

In [5]:
# Show which device torch would pick: CUDA when available, otherwise the CPU.
# The value is only displayed as the cell result; nothing is assigned.
torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device(type='cuda')

In [50]:
# Decoding options handed to generate_conclusion in the cells below
# (presumably forwarded to the model's generate call — verify in ca_utils):
# sampling with top_p, a max_length of 200 and ten returned sequences per input.
gen_kwargs = dict(
    do_sample=True,
    max_length=200,
    top_p=0.90,
    num_return_sequences=10,
)

### Generate conclusions for the comments in the training and validation datasets:

In [114]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``;
    an empty ``lst`` produces no slices at all.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))
        
def get_best_conclusion(stance_scores):
    """Pick the best counter-conclusion from one set of generated candidates.

    Args:
        stance_scores: A pair ``(candidates, stance_output)``. ``candidates``
            is the sequence of generated conclusions; ``stance_output[1]`` and
            ``stance_output[2]`` are sequences aligned with it that hold the
            predicted stance label and the score used for ranking (presumably
            the classifier's confidence — verify against ``get_stance_scores``).

    Returns:
        The candidate with the highest score among those labelled ``1``
        (treated here as countering the argument's conclusion). When no
        candidate carries that label, the highest-scoring candidate overall is
        returned so every input still receives a conclusion.

    Raises:
        IndexError: If the candidate set is empty.
    """
    res = list(zip(stance_scores[0], stance_scores[1][1], stance_scores[1][2]))
    if not res:
        raise IndexError('get_best_conclusion received an empty candidate set')

    # Keep only the candidates that look like countering the argument's conclusion.
    filtered_res = [x for x in res if x[1] == 1]
    if len(res) > len(filtered_res):
        print('Filtered out {} of {}'.format(len(res) - len(filtered_res), len(res)))

    # Rank the *filtered* candidates. Ranking the unfiltered list would let a
    # confident non-countering candidate win and make the filter a no-op.
    pool = filtered_res if filtered_res else res
    # Pick the one the model is most sure of; ties resolve to the earliest candidate.
    return max(pool, key=lambda x: x[2])[0]

def generate_counter_conclusions(arg_conclusions, counters):
    """Generate one counter-conclusion for every counter-argument.

    Several candidate conclusions are generated per counter, each candidate is
    scored for its stance towards the original argument's conclusion, and the
    best candidate per counter is kept.

    Args:
        arg_conclusions: Conclusions of the original arguments, aligned
            one-to-one with ``counters``.
        counters: Counter-argument texts to generate conclusions for.

    Returns:
        A list with one selected conclusion per counter, in input order.

    Raises:
        ValueError: If the generator does not return the expected number of
            candidates, which would silently misalign candidates and counters.
    """
    # Candidates per counter come from the shared generation settings rather
    # than a second hard-coded literal that has to be kept in sync by hand.
    n_candidates = gen_kwargs.get("num_return_sequences", 1)

    # Generate the conclusions; the result is treated as a flat list holding
    # n_candidates consecutive entries per counter.
    conclusions = generate_conclusion(counters, gen_kwargs, batch_size=8)
    expected = len(counters) * n_candidates
    if len(conclusions) != expected:
        raise ValueError(
            'Expected {} generated conclusions ({} per counter), got {}'.format(
                expected, n_candidates, len(conclusions)))

    # Remove duplicate conclusions so we don't have to process them twice.
    # dict.fromkeys keeps first-seen order, so results are reproducible.
    candidate_sets = [list(dict.fromkeys(group)) for group in chunks(conclusions, n_candidates)]

    # Compute stances of generated conclusions towards the original argument's conclusion.
    stance_scores = [
        (candidates, get_stance_scores([arg_conclusion] * len(candidates), candidates))
        for arg_conclusion, candidates in zip(arg_conclusions, candidate_sets)
    ]

    # Get the best conclusion of the candidates for every counter.
    return [get_best_conclusion(conc_set) for conc_set in stance_scores]

In [115]:
# Load the training frame and the sampled validation frame from the shared data directory.
# NOTE(review): read_pickle can execute arbitrary code while loading — only use it on trusted files.
train_df = pd.read_pickle('../../../data-ceph/arguana/arg-generation/multi-taks-counter-argument-generation/reddit_data/conclusion_and_ca_generation/preprocessed_train_conclusion_all.pkl')
valid_df = pd.read_pickle('../../../data-ceph/arguana/arg-generation/multi-taks-counter-argument-generation/reddit_data/conclusion_and_ca_generation/sample_valid_conclusion_all.pkl')

In [None]:
# One counter-conclusion per training row: the post titles act as the argument
# conclusions and the 'counter' column supplies the counter-arguments.
counter_conclusions = generate_counter_conclusions(
    arg_conclusions=train_df['title'].tolist(),
    counters=train_df['counter'].tolist(),
)

  0%|          | 0/26 [00:00<?, ?ba/s]

In [None]:
# Store the result under the singular 'counter_conclusion' name — the column
# that the preview cell and the validation cells of this notebook use.
train_df['counter_conclusion'] = counter_conclusions

In [126]:
# Preview the first rows: the post title next to its selected counter-conclusion.
train_df[['title', 'counter_conclusion']].head()

Unnamed: 0,title,counter_conclusion
0,I believe that churches and other religious institutions should have to pay taxes.,"Tax exemptions are a privilege and should not automatically be granted to any organisation, including churches."
22,I don't believe it is acceptable to attack the person that your SO cheated on you with.,"I believe that physical violence against a cheating partner is no different than other types of violence,"
38,"There is no viable alternative to capitalism, though the current system can be altered to serve people's needs.","Capitalism is not the ideal system, there are alternatives."
50,"Given the option to draft either a QB with elite passing AND elite running skills or a QB with elite passing skills ONLY, NFL teams should almost always draft the QB with elite passing skills only.",Bben roethlesberger is overrated
54,I don't think promiscuity is a bad thing. .,There are dangers to casual sex. (NSFW)


In [None]:
# Persist the augmented training frame.
# NOTE(review): this is the same path train_df was loaded from, so the input file is overwritten.
train_df.to_pickle('../../../data-ceph/arguana/arg-generation/multi-taks-counter-argument-generation/reddit_data/conclusion_and_ca_generation/preprocessed_train_conclusion_all.pkl')

##### On the validation set:

In [None]:
# Same procedure for the validation frame; note that this rebinds
# counter_conclusions, which previously held the training results.
counter_conclusions = generate_counter_conclusions(
    arg_conclusions=valid_df['title'].tolist(),
    counters=valid_df['counter'].tolist(),
)
valid_df['counter_conclusion'] = counter_conclusions

In [None]:
# Persist the augmented validation frame.
# NOTE(review): this is the same path valid_df was loaded from, so the input file is overwritten.
valid_df.to_pickle('../../../data-ceph/arguana/arg-generation/multi-taks-counter-argument-generation/reddit_data/conclusion_and_ca_generation/sample_valid_conclusion_all.pkl')

In [125]:
# Presumably a manual marker that the long-running cells above have finished —
# NOTE(review): looks like leftover scratch output; consider removing the cell.
print('donee')

donee


In [127]:
# Preview the first rows: the post title next to its selected counter-conclusion.
valid_df[['title', 'counter_conclusion']].head()

Unnamed: 0,title,counter_conclusion
358034,It is fine to let teenagers/kids play video games that feature violence.,A parent should be allowed to let their child play video games.
309893,The content on subreddits such as r/uncensorednews aren't bad enough to justify banning them,It was right to ban the pineapple on pizza.
409946,"""Positive Discrimination""/""Affirmative Action"" is immoral and has no place in society.","It is very difficult to tell from aggregate statistics \(for example, a police department's arrest records\) whether racial profiling is driving a racial disparity in arrest statistics."
375438,Watching TV is good for you,i think watching tv is a waste of time.
391572,The Muslim community (and Islam) need a social reformation,i don't think we need a reformation for muslims.


### Generate conclusions for the posts in the sampled test and validation datasets:

In [8]:
# Load the sampled test frame.
# NOTE(review): read_pickle can execute arbitrary code while loading — only use it on trusted files.
test_df = pd.read_pickle('../../../data-ceph/arguana/arg-generation/multi-taks-counter-argument-generation/reddit_data/conclusion_and_ca_generation/sample_test_conclusion_all.pkl')

In [9]:
# Generate one conclusion per unique post and join it back onto the frame by post_id.
# Rows sharing a post_id collapse to a single entry (the last occurrence wins),
# so each post is sent through the generator only once.
post_dict = list(dict(zip(test_df.post_id, test_df.post)).items())
post_ids = tuple(post_id for post_id, _ in post_dict)  # also works for an empty frame
posts = tuple(post for _, post in post_dict)

# The shared gen_kwargs may ask for several sequences per input. The generator's
# flat result would then no longer line up one-to-one with post_ids, and zipping
# them would attach conclusions to the wrong posts. Request a single sequence here.
single_gen_kwargs = {**gen_kwargs, "num_return_sequences": 1}
pred_conclusions = generate_conclusion(posts, single_gen_kwargs, batch_size=8)
if len(pred_conclusions) != len(post_ids):
    raise ValueError(
        'Expected {} generated conclusions, got {}'.format(len(post_ids), len(pred_conclusions)))

post_pred_conclusions = dict(zip(post_ids, pred_conclusions))
test_df['bart_conclusion'] = test_df.post_id.apply(lambda x: post_pred_conclusions[x])

  0%|          | 0/2 [00:00<?, ?ba/s]

  next_indices = next_tokens // vocab_size


In [10]:
# Persist the test frame with its new 'bart_conclusion' column under a separate
# '_preprocessed' file name, leaving the loaded input file untouched.
test_df.to_pickle('../../../data-ceph/arguana/arg-generation/multi-taks-counter-argument-generation/reddit_data/conclusion_and_ca_generation/sample_test_conclusion_all_preprocessed.pkl')

In [11]:
# Same pipeline for the sampled validation frame.
# NOTE(review): the name test_df is reused here although it now holds validation data.
# NOTE(review): read_pickle can execute arbitrary code while loading — only use it on trusted files.
test_df = pd.read_pickle('../../../data-ceph/arguana/arg-generation/multi-taks-counter-argument-generation/reddit_data/conclusion_and_ca_generation/sample_valid_conclusion_all.pkl')

# Rows sharing a post_id collapse to a single entry (the last occurrence wins),
# so each post is sent through the generator only once.
post_dict = list(dict(zip(test_df.post_id, test_df.post)).items())
post_ids = tuple(post_id for post_id, _ in post_dict)  # also works for an empty frame
posts = tuple(post for _, post in post_dict)

# The shared gen_kwargs may ask for several sequences per input. The generator's
# flat result would then no longer line up one-to-one with post_ids, and zipping
# them would attach conclusions to the wrong posts. Request a single sequence here.
single_gen_kwargs = {**gen_kwargs, "num_return_sequences": 1}
pred_conclusions = generate_conclusion(posts, single_gen_kwargs, batch_size=8)
if len(pred_conclusions) != len(post_ids):
    raise ValueError(
        'Expected {} generated conclusions, got {}'.format(len(post_ids), len(pred_conclusions)))

post_pred_conclusions = dict(zip(post_ids, pred_conclusions))
test_df['bart_conclusion'] = test_df.post_id.apply(lambda x: post_pred_conclusions[x])
test_df.to_pickle('../../../data-ceph/arguana/arg-generation/multi-taks-counter-argument-generation/reddit_data/conclusion_and_ca_generation/sample_valid_conclusion_all_preprocessed.pkl')

  0%|          | 0/2 [00:00<?, ?ba/s]

  next_indices = next_tokens // vocab_size
