In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [2]:
import rouge

In [3]:
evaluator = rouge.Rouge(metrics=['rouge-l', 'rouge-w'],
                           max_n=4,
                           limit_length=True,
                           length_limit=100,
                           length_limit_type='words',
                           alpha=0.5, # Default F1_score
                           weight_factor=1.2,
                           stemming=True)

In [4]:
import json
import pandas as pd
import numpy as np
import itertools
import spacy
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import tabulate

from scipy import sparse
from fast_pagerank import pagerank
from fast_pagerank import pagerank_power


pd.set_option('display.max_colwidth', None)

In [5]:
from sentence_transformers import SentenceTransformer, InputExample, LoggingHandler, losses, models, util
import torch

### Prepare data:

In [6]:
data_path = '/mnt/ceph/storage/data-in-progress/data-research/arguana/arg-generation'

In [7]:
train_args = pd.read_pickle('../../data/valid_args_with_scores.pkl')

In [8]:
train_topic_with_scored_kps = pd.read_pickle('../../data/valid_topic_keypoints_with_scores.pkl')

In [9]:
train_topics = train_args.groupby(['topic', 'stance']).agg({
    'sents_with_scores': lambda x: set([item for items in x for item in items]),
    'gt-kps' : lambda x: set([item for items in x for item in items]),
    #'gt-cand-kps': lambda x: set([item for items in x for item in items])
}).reset_index()

In [10]:
train_topics.columns

Index(['topic', 'stance', 'sents_with_scores', 'gt-kps'], dtype='object')

In [11]:
#Add the ground-truth key-points into the original set of sentences to allow for an optimal scenario.
train_topics['sents_with_scores'] = train_topics.apply(lambda row: list(row['sents_with_scores']) + train_topic_with_scored_kps[(train_topic_with_scored_kps.topic == row['topic']) & (train_topic_with_scored_kps.stance == row['stance'])]['scored_kps'].tolist()[0], axis=1)

### Load Models:

In [12]:
models_list = [
     data_path + '/keypoint-analysis-sharedtask/final-experiment/roberta-large-final-model-fold-0-2023-07-03_14-50-42',
     data_path + '/keypoint-analysis-sharedtask/final-experiment/roberta-large-final-model-fold-1-2023-07-03_15-07-56',
     data_path + '/keypoint-analysis-sharedtask/final-experiment/roberta-large-final-model-fold-2-2023-07-03_15-22-51',
     data_path + '/keypoint-analysis-sharedtask/final-experiment/roberta-large-final-model-fold-3-2023-07-03_15-37-30',
     data_path + '/keypoint-analysis-sharedtask/final-experiment/roberta-large-final-model-fold-4-2023-07-03_15-52-27',
]
models = [SentenceTransformer(x) for x in models_list]

In [101]:
def gen_match_matrix(model, sents, topic, min_match_score=0):
    sents1 = [topic + ' <SEP> ' + x for x in sents]
    sents1_embeddings = model.encode(sents1)
    sents_embeddings  = model.encode(sents)
    
    sim_matrix = cosine_similarity(sents1_embeddings, sents1_embeddings)
    super_threshold_indices = sim_matrix < min_match_score
    sim_matrix[super_threshold_indices] = 0
    #print(sim_matrix)
    #print('--------;')
    return sim_matrix

def get_avg_match_matrix(models, sents, topic, min_match_score=0):
    sim_matrices = [gen_match_matrix(model, sents, topic, min_match_score) for model in models]
    #print(sim_matrices)
    final_matrix = np.mean(sim_matrices, axis=0)
    return final_matrix

# def evalute_topic_by_rouge(row, top_k=5):
#     ranked_sents = [[x[0]] for x in row['ranked_sents']][:top_k]
#     gt_kps = [[x] for x in row['gt-kps']]

#     r_l = []
#     for a,b in itertools.product(gt_kps, ranked_sents):
#         #print(a, b)
#         scores = evaluator.get_scores(a, b)
#         r_l.append(scores['rouge-l']['f'])
#     return np.mean(r_l)

def evalute_topic_by_rouge(row, top_k=5):
    ranked_sents = [x[0] for x in row['ranked_sents']][:top_k]
    gt_kps = [x for x in row['gt-kps']]
    
    #print(ranked_sents)
    #print(gt_kps)
    
    scores = evaluator.get_scores(ranked_sents, [gt_kps] * len(ranked_sents))
    return scores['rouge-l']['f']

def evalute_topic(row):
    ranked_sents = [x[0] for x in row['ranked_sents']]
    #ranked_sents = [x for x in row['filtered_sents']]
    gt_kps_ranks = []
    for real_kp in row['gt-kps']:
        if real_kp in ranked_sents:
            gt_kps_ranks.append(ranked_sents.index(real_kp)+1)
        else:
            print('kp is not there...')
            gt_kps_ranks.append(len(ranked_sents)+1)
            
    #print(gt_kps_ranks)
    return 1/(sum(gt_kps_ranks)/len(gt_kps_ranks)) if len(gt_kps_ranks) > 0 else 0

def apply_page_rank(row, p=0.85, min_quality_score=0.5, min_match_score=0.5, min_len=5, max_len=20, arg_score_clm='sents_with_scores'):
    cand_sents = [x for x in row[arg_score_clm] if x[1] > min_quality_score and len(x[0].split()) < max_len and len(x[0].split()) > min_len]
    if len(cand_sents) == 0:
        return []
        print('empty')
    cands, cands_qualities = zip(*cand_sents)
    #print(cands)
    #print(cands_qualities)
    cands_qualities = [float(i)/sum(cands_qualities) for i in cands_qualities]
    
    cands_matching_mat = get_avg_match_matrix(models, cands, row['topic'], min_match_score=min_match_score)
    pr=pagerank(sparse.csr_matrix(np.array(cands_matching_mat)), personalize=np.array(cands_qualities), p=p) #it looks like modifying the initial probability doesn't help
    #pr=pagerank(cands_matching_mat, p=p)
    ranked_candidates = list(zip(cands, pr))
    return sorted(ranked_candidates, key=lambda x: -x[1])    

def filter_ranked_list(row, model, min_match=0.8):
    ranked_sents = [x[0] for x in row['ranked_sents']]
    filtered_sents = []
    while len(filtered_sents) < 6:
        filtered_sents = []
        for i, s in enumerate(ranked_sents):
            if len(filtered_sents) == 0:
                filtered_sents.append(s)
                #print('FFFF ' , s)
            else:
                matching_scores = get_avg_match_matrix(models, [s]+filtered_sents, row['topic'])
                max_sim = np.max(matching_scores[0][1:])
                #print(matching_scores)
                if max_sim < min_match:
                    #print('SSSS ' , s)
                    filtered_sents.append(s)
                    if len(filtered_sents) > 4:
                        return filtered_sents

        #min_match = min_match + 0.1
        #print(min_match)

    return filtered_sents

### Testing PageRank with Argumentative Quality Scores:

The scores already computed in sentences_with_scores column, computed via project debater api in another notebook.

In [25]:
scores = []
for min_match_score in [0.0, 0.2, 0.4, 0.8, 1.0]:
    for p in [0.0, 0.2, 0.4, 0.8, 1.0]:
        for min_quality_score in [0.0, 0.2, 0.4, 0.8, 1.0]:
                train_topics['ranked_sents'] = train_topics.apply(lambda row: apply_page_rank(row, p=p, min_quality_score=min_quality_score, min_match_score=min_match_score), axis=1)
                train_topics['scores'] = train_topics.apply(lambda row: evalute_topic_by_rouge(row), axis=1)
                score = np.mean(train_topics.scores.tolist())
                scores.append([p, min_quality_score, min_match_score, score])



In [26]:
scores_df = pd.DataFrame(scores, columns=['p', 'min-quality-score', 'min-match-score', 'score'])

In [27]:
scores_df.sort_values('score', ascending=False)

Unnamed: 0,p,min-quality-score,min-match-score,score
38,0.4,0.8,0.2,0.266590
33,0.2,0.8,0.2,0.266200
8,0.2,0.8,0.0,0.265192
35,0.4,0.0,0.2,0.264783
12,0.4,0.4,0.0,0.264783
...,...,...,...,...
54,0.0,1.0,0.4,0.192913
14,0.4,1.0,0.0,0.192836
64,0.4,1.0,0.4,0.192836
39,0.4,1.0,0.2,0.192836


In [28]:
scores_df.to_csv('../../data/scores_df.csv')

In [23]:
# We wanted to also fine-tune for the best filter_score threshold but it takes for ever
# for filter_score in [0.0, 0.2, 0.4, 0.8, 1.0]:
#     train_topics['ranked_sents'] = train_topics.apply(lambda row: filter_ranked_list(row, models, min_match=filter_score), axis=1)
#     train_topics['scores'] = train_topics.apply(lambda row: evalute_topic_by_rouge(row), axis=1)
#     score = np.mean(train_topics.scores.tolist())
#     scores.append([p, min_quality_score, min_match_score, score])

done


### Using Lexicon for scoring Argumentation:

In [84]:
discourse_markers = ["for example", "such as", "for instance", "in the case of", "as revealed by",
"illustrated by",
"because", "so", "therefore", "thus", "consequently", "hence", "similariy",
"likewise",
"as with",
"like", "equally", "in the same way", "first", "second ",
"third,", "finally", "next", "meanwhile", "after", "then", "subsequently",
"above all",
"in particular", "especially", "significantly", "indeed", "notably", "but", "however",
"although",
"unless", "except", "apart from", "as long as", "if", "whereas", "instead of",
"alternatively", "otherwise", "unlike", "on the other hand", "conversely"]

In [30]:
def get_sentences_lexicon_scores(sentences):
    lexicon_file = open('../../data/ClaimLexicon.txt')
    lexicon_contents = lexicon_file.read()
    lexicon_claims = lexicon_contents.split(', ')
    score_sentences = []
    for sentence in sentences:
        value = 1.0
        for marker in discourse_markers:
            if marker in sentence.lower():
                value += 1            
        if any(claim_ind in sentence.lower() for claim_ind in lexicon_claims):  
            value += 1
        score_sentences.append(value)
    return score_sentences

In [31]:
#modify sentence_scores to be taken from the claim lecixon approach
train_topics['sents_with_lexicon_scores'] = train_topics['sents_with_scores'].apply(lambda sents: list(zip([x[0] for x in sents], get_sentences_lexicon_scores([x[0] for x in sents]))))

In [32]:
scores = []
for min_match_score in [0.0, 0.2, 0.4, 0.8, 1.0]:
    for p in [0.0, 0.2, 0.4, 0.8, 1.0]:
        for min_quality_score in [0.0, 0.2, 0.4, 0.8, 1.0]:
            train_topics['ranked_sents'] = train_topics.apply(lambda row: apply_page_rank(row, p=p, min_quality_score=min_quality_score, min_match_score=min_match_score, arg_score_clm='sents_with_lexicon_scores'), axis=1)
            train_topics['scores'] = train_topics.apply(lambda row: evalute_topic_by_rouge(row), axis=1)
            score = np.mean(train_topics.scores.tolist())
            scores.append([p, min_quality_score, min_match_score, score])



In [33]:
lexicon_scores_df = pd.DataFrame(scores, columns=['p', 'min-quality-score', 'min-match-score', 'score'])
lexicon_scores_df.sort_values('score', ascending=False)

Unnamed: 0,p,min-quality-score,min-match-score,score
48,1.0,0.8,0.2,0.248726
47,1.0,0.4,0.2,0.248726
46,1.0,0.2,0.2,0.248726
45,1.0,0.0,0.2,0.248726
49,1.0,1.0,0.2,0.238248
...,...,...,...,...
64,0.4,1.0,0.4,0.198647
60,0.4,0.0,0.4,0.198647
61,0.4,0.2,0.4,0.198647
63,0.4,0.8,0.4,0.198647


In [34]:
lexicon_scores_df.to_csv('../../data/lexicon_scores_df.csv')

-------------

### Apply on test:

Now evaluate the best model found on validation set compared to ChatGPT key points

In [102]:
evaluator = rouge.Rouge(metrics=['rouge-l', 'rouge-w', 'rouge-n'],
                           max_n=4,
                           limit_length=True,
                           length_limit=100,
                           length_limit_type='words',
                           alpha=0.5, # Default F1_score
                           weight_factor=1.2,
                           stemming=True)

In [103]:
chat_gpt_key_points = json.load(open('../../data/chatgpt-keypoints.json'))
chat_gpt_key_points = {tuple(key.split('-')): value for key, value in chat_gpt_key_points.items()}

In [104]:
test_kps = pd.read_csv('../../KPA_2021_shared_task/test_data/key_points_test.csv')

In [105]:
test_args = pd.read_pickle('../../data/test_args_with_scores.pkl')

#### Adding our approach key-points:

In [106]:
test_topics = test_args.groupby(['topic', 'stance']).agg({
    'sents_with_scores': lambda x: set([item for items in x for item in items])
}).reset_index()

In [107]:
test_topics['ranked_sents'] = test_topics.apply(lambda row: apply_page_rank(row, p=0.4, min_quality_score=0.8, min_match_score=0.2), axis=1)

In [108]:
test_topics['our-key-points-w-filtering'] = test_topics.apply(lambda row: filter_ranked_list(row, models, min_match=0.8), axis=1)
test_topics['our-key-points-wo-filtering'] = test_topics['ranked_sents'].apply(lambda kp: [x[0] for x in kp[:5]])

In [109]:
#adding gt-kps
test_topics['gt-kps']= test_topics.apply(lambda row: test_kps[(test_kps.topic==row['topic']) & (test_kps.stance==row['stance'])]['key_point'].tolist(), axis=1)

In [110]:
[len(x) for x in test_topics['gt-kps'].tolist()]

[4, 5, 5, 5, 7, 7]

#### Adding ChatGPT key-points:

In [111]:
import re
test_topics['chat-gpt-kp'] = test_topics.apply(lambda row: chat_gpt_key_points[tuple([row['topic'], 'pro' if row['stance'] == 1 else 'con'])][2], axis=1)
test_topics['chat-gpt-kp'] = test_topics['chat-gpt-kp'].apply(lambda x : re.findall(r'\d\.\s[^.]+', x))

#### Evaluate the two sets of key-points:

In [112]:
test_topics.columns

Index(['topic', 'stance', 'sents_with_scores', 'ranked_sents',
       'our-key-points-w-filtering', 'our-key-points-wo-filtering', 'gt-kps',
       'chat-gpt-kp'],
      dtype='object')

In [113]:
scores = []
for idx, row in test_topics.iterrows():
    chatgpt_score = evaluator.get_scores(row['chat-gpt-kp'], [row['gt-kps']] * len(row['chat-gpt-kp']))
    our_score_filtering = evaluator.get_scores(row['our-key-points-w-filtering'], [row['gt-kps']] * len(row['our-key-points-w-filtering']))
    our_score_no_filtering = evaluator.get_scores(row['our-key-points-wo-filtering'], [row['gt-kps']] * len(row['our-key-points-wo-filtering']))
    scores.append([row['topic'], row['stance'],
        round(chatgpt_score['rouge-l']['f'],2), round(chatgpt_score['rouge-2']['f'],2),
        round(our_score_no_filtering['rouge-l']['f'],2), round(our_score_no_filtering['rouge-2']['f'],2),
        round(our_score_filtering['rouge-l']['f'],2), round(our_score_filtering['rouge-2']['f'],2)])


In [114]:
scores.append(['all', '', 
               np.mean([s[2] for s in scores]), np.mean([s[3] for s in scores]), np.mean([s[4] for s in scores]), np.mean([s[5] for s in scores]), np.mean([s[6] for s in scores]), np.mean([s[7] for s in scores])])

In [115]:
print(tabulate.tabulate(scores, headers=['topic', 'stance', 'chatgpt-r-l','chatgpt-r-2', 'nofilter-r-l', 'nofilter-r-2', 'filter-r-l', 'filter-r-2']))

topic                                                         stance      chatgpt-r-l    chatgpt-r-2    nofilter-r-l    nofilter-r-2    filter-r-l    filter-r-2
------------------------------------------------------------  --------  -------------  -------------  --------------  --------------  ------------  ------------
Routine child vaccinations should be mandatory                -1             0.24              0.03         0.24               0.03       0.23         0.02
Routine child vaccinations should be mandatory                1              0.26              0.06         0.34               0.11       0.31         0.12
Social media platforms should be regulated by the government  -1             0.32              0.11         0.32               0.11       0.33         0.1
Social media platforms should be regulated by the government  1              0.3               0.1          0.35               0.06       0.28         0.03
The USA is a good country to live in                   

In [118]:
test_topics.to_csv('../../data/generated_key_points.csv')

In [6]:
test_topics = pd.read_csv('../../data/generated_key_points.csv')

In [7]:
test_topics[['topic', 'stance', 'gt-kps', 'our-key-points-w-filtering', 'chat-gpt-kp']].head(n=6)

Unnamed: 0,topic,stance,gt-kps,our-key-points-w-filtering,chat-gpt-kp
0,Routine child vaccinations should be mandatory,-1,"['Routine child vaccinations, or their side effects, are dangerous', 'Mandatory vaccination contradicts basic rights', 'The parents and not the state should decide', 'Routine child vaccinations are not necessary to keep children healthy']","[""child vaccinations shouldn't be mandatory because some children can get sick or badly affected by vaccinations."", 'Vaccines contain heavy metals, they are toxic to children.', 'The child population has a low degree of vulnerability, so vaccination is not urgent yet', 'Vaccines are not mandatory in our country, they constitute a health recommendation, followed by the majority of families.', 'A vaccine that has not been sufficiently tested and without knowledge of side effects is not recommended for children']","['1. Child vaccination should not be mandatory because of potential side effects and unknown risks', '2. Parents should have the right to decide whether to vaccinate their children or not', '3. Vaccines may not be necessary for children as they have stronger immune systems and may not be susceptible to the diseases', '4. Mandatory vaccination goes against personal freedom and violates parental rights', '5. Vaccines can have harmful side effects and may not be effective']"
1,Routine child vaccinations should be mandatory,1,"['Routine child vaccinations are effective', 'Child vaccination saves lives', 'Routine child vaccinations are necessary to protect others', 'Routine child vaccinations should be mandatory to prevent virus/disease spreading', 'Children should not suffer from preventable diseases']","['Routine child vaccination should be mandatory so that the general population can thrive by being free of deadly diseases.', 'child vaccinations should be mandatory to provide decent health care to all.', 'child vaccinations should be mandatory so our children will be safe and protected.', 'Each parent must compulsorily vaccinate their children with routine vaccinations, health risks cannot be taken in any way', 'Infant vaccination should be mandatory so we avoid infant mortality']","['1. Routine child vaccinations should be mandatory to prevent the spread of diseases and protect children from deadly illnesses', '2. Mandatory vaccinations ensure the health and safety of the general population, including vulnerable individuals such as infants and the elderly', '3. Vaccinations save lives and prevent the unnecessary suffering caused by preventable diseases', '4. Vaccinations are a crucial public health intervention that has been proven to be safe and effective in preventing diseases', '5. Mandatory vaccinations in children help eradicate diseases and protect future generations from infectious diseases']"
2,Social media platforms should be regulated by the government,-1,"['Social media regulation is not effective', 'Social media regulation harms privacy', 'Social media regulation harm freedom of speech and other democratic rights', 'The government should not intervene in the affairs of a private company', 'Social media regulation can lead to political abuses by the government']","['Regulation by government of social media platforms would be detrimental to free speech around the world.', 'Government regulation of social media would be harmful to democracy', 'Social media platforms should not be regulated by the government because it is an invasion of privacy', ""Social media platforms shouldn't be regulated by the government because they can't control everything posted on social media."", ""Social media platforms mustn't be regulated by the government because some posts can be hidden from the government""]","['1. Government regulation of social media platforms would be harmful to democracy and freedom of speech', '2. Social media platforms should not be regulated by the government as it would infringe on privacy and freedom of expression', '3. Social media platforms are private companies and should not be controlled by the government', '4. Regulating social media platforms would limit the ability to foster free speech and innovation', '5. Government regulation of social media platforms could lead to abuse of power and invasion of privacy']"
3,Social media platforms should be regulated by the government,1,"['Social media regulation protects the younger audiences', 'Social media regulation is necessary to deal with fake news', 'Social media regulation is beneficial to society at large', 'Social media regulation can help to deal with negative content', 'Social media regulation is required to deal with malicious users']","['social platforms must be regulated by governments to avoid hate crimes as well as political disinformation', 'social networks being monitored by the government, would cause less negative impact on society', 'Social media platforms should be regulated by the government as they are a threat to democracy.', ""Social media platforms should be regulated by the government to make sure everyone's posts meet the community standards."", 'Social networks have become an extremely important means of communication, which results in the government regulating and legislating it.']","['1. Social media platforms should be regulated by the government to prevent the spread of fake news and hoaxes', '2. Regulation is necessary to protect society from crimes, hate speech, and harmful content on social media platforms', '3. Government control of social media can help ensure online security and prevent the dissemination of false information', '4. Regulating social media platforms is important to protect the well-being of individuals, especially children, and to prevent the promotion of dangerous activities and ideologies', '5. The government should regulate social media platforms to maintain control over publications that incite hatred, violence, and']"
4,The USA is a good country to live in,-1,"['The US has unfair health and education policies', 'The US has a problematic/divisive political system', 'The US has high taxation/high costs of living', 'The US is xenophobic/racist', 'The US has inequality/poverty', 'The US is unsafe', 'The US has a negative culture']","["" The poorest in society don't have access to either good health care or an adequate benefits system."", 'The USA is not a good place to live in because of the wide variance between rich and poor.', 'The USA is not a good place to live, as it still treats women as inferior beings.', 'in the USA the health system is very expensive and discriminates against the poor population', 'The United States has long been a hotbed of racism against minority cultures']","['1. High crime rates and lack of safety\n2', '3. High tax rates and expensive cost of living\n4', '5. Political divisions and social unrest']"
5,The USA is a good country to live in,1,"['The US offers great opportunities for individuals', 'The US has freedoms/ democratic rights', 'The US has a great environment/nature', 'The US is a powerful country', 'The US has a good economy/high standard of living', 'The US has a good health and education systems ', 'The US has great people/culture']","['The USA provides many benefits to their citizens that no other country has.', 'The USA is a good country to live in as we have so many freedoms compared to other countries.', 'The USA is a good country to live in because everyone has an equal opportunity to succeed.', 'The USA is a great place to live as everybody is able to prosper.', 'The United States is a land of opportunities where we can develop in different environments']","['1. The United States is a country of opportunities and the American dream', '2. The United States has a stable economy and good job opportunities', '3. The United States offers a high quality of life and good healthcare', '4. The United States is a diverse country with a multicultural society', '5. The United States has strong democratic values and freedoms']"


In [None]:
#Manual assessment of key points
# arg - ours-nofilter - ours-filter - chatgpt
# 0 -> 3 - 4 - 5
# 1 -> 3 - 4 - 5
# 2 -> 2 - 4 - 5
# 3 -> 2 - 3 - 5
# 4 -> 3 - 4 - 3
# 5 -> 4 - 4 - 5