# **Initialization**

## **Imports and Installations**

In [None]:
# !pip install tweet-preprocessor
!pip install transformers
# !pip install sentence-transformers
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.19.1-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 34.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 60.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 70.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 4.0 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed huggingface-hub-0.6.0 p

In [None]:
import pandas as pd
import requests
from urllib.parse import urlparse
import string
from transformers import T5TokenizerFast, T5ForConditionalGeneration, BartForConditionalGeneration, BartTokenizerFast, PegasusForConditionalGeneration, PegasusTokenizerFast
import csv
import random
import torch

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so that
# the `.to(device)` calls in later cells still work on CPU-only machines.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed both torch and the stdlib `random` module with the same fixed value.
torch.manual_seed(0)
random.seed(0)

In [None]:
# CSV files that the per-file loops below read one after another.
filenames = [
  'summ_data_eng_eng_raw.csv',
  'summ_data_eng_preprocessed.csv',
  'summ_data_eng_preprocessed_emoji_replaced.csv',
  'summ_data_eng_preprocessed_hashtag_mention_removed.csv',
  'summ_data_eng_preprocessed_hashtag_mention_run_removed.csv',
  'summ_data_eng_preprocessed_hashtag_mention_run_removed_mention_replaced.csv',
  'summ_data_eng_preprocessed_hashtag_removed.csv',
  'summ_data_eng_preprocessed_mention_removed.csv',
]

## **Util functions**

In [None]:
def _netloc_and_path(url):
  """Split `url` into (netloc, path), dropping one trailing '/' from the path."""
  parsed = urlparse(url)
  path = parsed.path
  # endswith() also copes with an empty path (e.g. 'https://site.org'), where
  # indexing path[-1] would raise IndexError.
  if path.endswith('/'):
    path = path[:-1]
  return parsed.netloc, path


def retrieval_eval(query, gold_urls, api_key='<your_key>', timeout=30):
  """Score how well each query retrieves its gold URL from the fact-check search API.

  Each query string is sent to the `claims:search` endpoint of
  factchecktools.googleapis.com (page size 50). The returned claims are
  scanned in order, and the first one whose first claim-review URL has the
  same netloc and path (ignoring one trailing '/') as the gold URL is a hit.

  Args:
    query: sequence of query strings.
    gold_urls: sequence of gold URLs, aligned with `query` by position.
    api_key: value sent as the `key` request parameter.
    timeout: seconds passed to `requests.get` so a stalled request cannot
      block the evaluation indefinitely.

  Returns:
    dict with 'retrieval_1', 'retrieval_5', 'retrieval_10', 'retrieval_20'
    and 'retrieval_50' (percentage of `len(query)` with a hit within that
    many results; 'retrieval_50' counts every hit) and 'mrr' (mean reciprocal
    rank over the queries that were actually processed). All values are 0
    for an empty `query`.

  Note:
    The first failing request prints a message and stops the evaluation.
    Queries that were not reached still count in the retrieval_* denominators
    but are absent from the MRR average.
  """
  cutoffs = (1, 5, 10, 20)
  hits_at = {cutoff: 0 for cutoff in cutoffs}
  hits_any = 0
  reciprocal_ranks = []

  for idx, cand in enumerate(query):
    gold_key = _netloc_and_path(gold_urls[idx])

    params = {
        'query': cand.strip(),
        'pageSize': '50',
        'key': api_key,
    }
    try:
      response_dict = requests.get(
          'https://factchecktools.googleapis.com/v1alpha1/claims:search',
          params=params, timeout=timeout).json()
    except Exception:
      print("Google API is not working")
      break

    reciprocal_rank = 0
    if 'claims' in response_dict:
      # `rank` is 1-based and uses its own name so it cannot be confused
      # with the outer query index.
      for rank, claim in enumerate(response_dict['claims'], start=1):
        if _netloc_and_path(claim['claimReview'][0]['url']) == gold_key:
          reciprocal_rank = 1 / rank
          for cutoff in cutoffs:
            if rank <= cutoff:
              hits_at[cutoff] += 1
          hits_any += 1
          break
    reciprocal_ranks.append(reciprocal_rank)

  total = len(query)

  def percentage(count):
    # Avoid ZeroDivisionError when there are no queries at all.
    return (count * 100) / total if total else 0.0

  mrr = sum(reciprocal_ranks) / len(reciprocal_ranks) if reciprocal_ranks else 0

  return {
      'retrieval_1': percentage(hits_at[1]),
      'retrieval_5': percentage(hits_at[5]),
      'retrieval_10': percentage(hits_at[10]),
      'retrieval_20': percentage(hits_at[20]),
      'retrieval_50': percentage(hits_any),
      'mrr': mrr,
      }

In [None]:
def truncate_tweets(tweets, k):
  """Cut every tweet down to its first `k` whitespace-separated words.

  The list is modified in place (each entry is replaced by its words joined
  with single spaces) and the same list object is returned.
  """
  for pos in range(len(tweets)):
    words = tweets[pos].split()
    tweets[pos] = " ".join(words[:k])
  return tweets

# **Gold Summary Retrieval (Skyline)**

In [None]:
# Skyline: query the retrieval evaluation with the reviewed claim text itself.
summ_data_raw = pd.read_csv('summ_data_eng_eng_raw.csv', header=0, index_col=0)
gold_claims = list(summ_data_raw['claim_reviewed'])
gold_evidence_urls = list(summ_data_raw['evidence_url'])
gold_scores = retrieval_eval(gold_claims, gold_evidence_urls)
print("Gold Retrieval = {}".format(gold_scores))

Gold Retrieval = {'retrieval_1': 54.14462081128748, 'retrieval_5': 67.90123456790124, 'retrieval_10': 68.43033509700176, 'retrieval_20': 68.43033509700176, 'retrieval_50': 68.43033509700176, 'mrr': 0.5997718428935357}


In [None]:
# Same skyline evaluation, read from the "shows_removed" variant of the raw file.
summ_data_raw_shows_removed = pd.read_csv('summ_data_eng_eng_raw_shows_removed.csv', header=0, index_col=0)
gold_claims_shows_removed = list(summ_data_raw_shows_removed['claim_reviewed'])
gold_urls_shows_removed = list(summ_data_raw_shows_removed['evidence_url'])
gold_scores_shows_removed = retrieval_eval(gold_claims_shows_removed, gold_urls_shows_removed)
print("Gold Retrieval = {}".format(gold_scores_shows_removed))

Gold Retrieval = {'retrieval_1': 55.026455026455025, 'retrieval_5': 68.60670194003528, 'retrieval_10': 69.1358024691358, 'retrieval_20': 69.31216931216932, 'retrieval_50': 69.31216931216932, 'mrr': 0.6052253823340034}


# **Vanilla Summarizer**

In [None]:
# "Vanilla" baseline: the unmodified tweet text is used directly as the query.
for file in filenames:
  summ_data = pd.read_csv(file, header=0, index_col=0)
  vanilla_queries = list(summ_data['tweet'])
  vanilla_gold_urls = list(summ_data['evidence_url'])
  vanilla_scores = retrieval_eval(vanilla_queries, vanilla_gold_urls)
  print("{} Vanilla Retrieval = {}".format(file, vanilla_scores))

summ_data_eng_eng_raw.csv Vanilla Retrieval = {'retrieval_1': 7.936507936507937, 'retrieval_5': 9.523809523809524, 'retrieval_10': 9.700176366843033, 'retrieval_20': 9.700176366843033, 'retrieval_50': 9.700176366843033, 'mrr': 0.08700764256319812}
summ_data_eng_preprocessed.csv Vanilla Retrieval = {'retrieval_1': 10.405643738977073, 'retrieval_5': 11.99294532627866, 'retrieval_10': 12.16931216931217, 'retrieval_20': 12.16931216931217, 'retrieval_50': 12.16931216931217, 'mrr': 0.1105820105820106}
summ_data_eng_preprocessed_emoji_replaced.csv Vanilla Retrieval = {'retrieval_1': 9.347442680776014, 'retrieval_5': 10.582010582010582, 'retrieval_10': 10.758377425044092, 'retrieval_20': 10.758377425044092, 'retrieval_50': 10.758377425044092, 'mrr': 0.09876543209876543}
summ_data_eng_preprocessed_hashtag_removed.csv Vanilla Retrieval = {'retrieval_1': 11.11111111111111, 'retrieval_5': 12.698412698412698, 'retrieval_10': 12.874779541446209, 'retrieval_20': 12.874779541446209, 'retrieval_50': 12

# **Truncate 11 Summarizer**

In [None]:
# Truncation baseline: keep only the first k words of every tweet and write
# the result to one text file per input CSV, one tweet per line.
k = 11
for file in filenames:
  summ_data = pd.read_csv(file, header=0, index_col=0)
  truncated_tweets = truncate_tweets(list(summ_data['tweet']),k)
  # The encoding is given explicitly so that non-ASCII tweet text is written
  # the same way on every platform instead of depending on the locale default.
  with open('truncate11_{}_summaries.txt'.format(file), 'w', encoding='utf-8') as f:
    for item in truncated_tweets:
        f.write("%s\n" % item)

  # print("{} Truncate 11 Retrieval = {}%".format(file, retrieval_eval(truncated_tweets, list(summ_data['evidence_url']))))

# **T5 OOB Summarizer**

In [None]:
# Generation length bounds handed to the summarizer function below.
generate_min_len, generate_max_len = 5, 15

# Load the T5-base model (moved to `device`) and its fast tokenizer.
t5_checkpoint = 't5-base'
model = T5ForConditionalGeneration.from_pretrained(t5_checkpoint)
model = model.to(device)
tokenizer = T5TokenizerFast.from_pretrained(t5_checkpoint)

# Last expression of the cell: displays the model's parameter count.
model.num_parameters()

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


222903552

In [None]:
def t5_summarizer(df, model, min_len, max_len):
  """Generate one summary per row of `df['tweet']` with `model`.

  Each tweet is stripped, has its newline characters removed, and is prefixed
  with "summarize: " before being encoded and passed to `model.generate`
  (beam search with 6 beams, no repeated bigrams, early stopping).

  Args:
    df: DataFrame with a 'tweet' column of strings.
    model: object exposing `generate(input_ids, ...)`.
    min_len: `min_length` passed to `generate`.
    max_len: `max_length` passed to `generate`.

  Returns:
    List of decoded summaries, in row order.

  NOTE(review): reads the notebook-level `tokenizer` and `device` rather than
  taking them as arguments, so it uses whatever those names currently refer to.
  NOTE(review): newlines are replaced by the empty string, which joins the
  words on either side of a line break - confirm this is intended.
  """
  generation_options = dict(
      num_beams=6,
      no_repeat_ngram_size=2,
      min_length=min_len,
      max_length=max_len,
      early_stopping=True,
  )

  def summarize_one(tweet):
    prompt = "summarize: " + tweet.strip().replace("\n", "")
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    generated = model.generate(input_ids, **generation_options)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

  return [summarize_one(row['tweet']) for _, row in df.iterrows()]

In [None]:
# Summarize every input CSV with the currently loaded T5 model and write the
# summaries to one text file per CSV, one summary per line.
for file in filenames:
  summ_data = pd.read_csv(file, header=0, index_col=0)
  t5_summaries = t5_summarizer(summ_data, model, generate_min_len, generate_max_len)

  # The encoding is given explicitly so that non-ASCII text in the summaries
  # is written the same way on every platform instead of depending on the
  # locale default.
  with open('t5_base_oob_{}_summaries.txt'.format(file), 'w', encoding='utf-8') as f:
    for item in t5_summaries:
        f.write("%s\n" % item)

  # print("{} T5 Retrieval = {}%".format(file, retrieval_eval(t5_summaries, list(summ_data['evidence_url']))))

# **DistilBART OOB Summarizer**

In [None]:
# Generation length bounds handed to the summarizer function below.
generate_min_len, generate_max_len = 5, 15

# Load the DistilBART CNN checkpoint (moved to `device`) and its fast
# tokenizer. This rebinds the notebook-level `model` and `tokenizer` names.
distilbart_checkpoint = 'sshleifer/distilbart-cnn-12-6'
model = BartForConditionalGeneration.from_pretrained(distilbart_checkpoint)
model = model.to(device)
tokenizer = BartTokenizerFast.from_pretrained(distilbart_checkpoint)

# Last expression of the cell: displays the model's parameter count.
model.num_parameters()

Downloading:   0%|          | 0.00/1.76k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.14G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

305510400

In [None]:
def distilbart_summarizer(df, model, min_len, max_len):
  """Generate one summary per row of `df['tweet']` with `model`.

  Each tweet is stripped and has its newline characters removed before being
  encoded and passed to `model.generate` (beam search with 6 beams, no
  repeated bigrams, early stopping).

  Args:
    df: DataFrame with a 'tweet' column of strings.
    model: object exposing `generate(input_ids, ...)`.
    min_len: `min_length` passed to `generate`.
    max_len: `max_length` passed to `generate`.

  Returns:
    List of decoded summaries, in row order.

  NOTE(review): reads the notebook-level `tokenizer` and `device` rather than
  taking them as arguments, so it uses whatever those names currently refer to.
  """
  generation_options = dict(
      num_beams=6,
      no_repeat_ngram_size=2,
      min_length=min_len,
      max_length=max_len,
      early_stopping=True,
  )

  def summarize_one(tweet):
    cleaned = tweet.strip().replace("\n", "")
    input_ids = tokenizer.encode(cleaned, return_tensors="pt").to(device)
    generated = model.generate(input_ids, **generation_options)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

  return [summarize_one(row['tweet']) for _, row in df.iterrows()]

In [None]:
# Summarize every input CSV with the currently loaded DistilBART model and
# write the summaries to one text file per CSV, one summary per line.
for file in filenames:
  summ_data = pd.read_csv(file, header=0, index_col=0)
  distilbart_summaries = distilbart_summarizer(summ_data, model, generate_min_len, generate_max_len)
  # The encoding is given explicitly so that non-ASCII text in the summaries
  # is written the same way on every platform instead of depending on the
  # locale default.
  with open('distilbart_oob_{}_summaries.txt'.format(file), 'w', encoding='utf-8') as f:
    for item in distilbart_summaries:
        f.write("%s\n" % item)
  # print("{} DistilBART Retrieval = {}%".format(file, retrieval_eval(distilbart_summaries, list(summ_data['evidence_url']))))

# **Pegasus OOB Summarizer**

In [None]:
# Generation length bounds handed to the summarizer function below.
generate_min_len, generate_max_len = 5, 15

# Load the distilled Pegasus CNN checkpoint (moved to `device`) and its fast
# tokenizer. This rebinds the notebook-level `model` and `tokenizer` names.
pegasus_checkpoint = 'sshleifer/distill-pegasus-cnn-16-4'
model = PegasusForConditionalGeneration.from_pretrained(pegasus_checkpoint)
model = model.to(device)
tokenizer = PegasusTokenizerFast.from_pretrained(pegasus_checkpoint)

# Last expression of the cell: displays the model's parameter count.
model.num_parameters()

Downloading:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.38G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

369236992

In [None]:
def pegasus_summarizer(df, model, min_len, max_len):
  """Generate one summary per row of `df['tweet']` with `model`.

  Each tweet is stripped and has its newline characters removed before being
  encoded and passed to `model.generate` (beam search with 6 beams, no
  repeated bigrams, early stopping).

  Args:
    df: DataFrame with a 'tweet' column of strings.
    model: object exposing `generate(input_ids, ...)`.
    min_len: `min_length` passed to `generate`.
    max_len: `max_length` passed to `generate`.

  Returns:
    List of decoded summaries, in row order.

  NOTE(review): reads the notebook-level `tokenizer` and `device` rather than
  taking them as arguments, so it uses whatever those names currently refer to.
  """
  generation_options = dict(
      num_beams=6,
      no_repeat_ngram_size=2,
      min_length=min_len,
      max_length=max_len,
      early_stopping=True,
  )

  def summarize_one(tweet):
    cleaned = tweet.strip().replace("\n", "")
    input_ids = tokenizer.encode(cleaned, return_tensors="pt").to(device)
    generated = model.generate(input_ids, **generation_options)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

  return [summarize_one(row['tweet']) for _, row in df.iterrows()]

In [None]:
# Summarize every input CSV with the currently loaded Pegasus model and write
# the summaries to one text file per CSV, one summary per line.
for file in filenames:
  summ_data = pd.read_csv(file, header=0, index_col=0)
  pegasus_summaries = pegasus_summarizer(summ_data, model, generate_min_len, generate_max_len)
  # The encoding is given explicitly so that non-ASCII text in the summaries
  # is written the same way on every platform instead of depending on the
  # locale default.
  with open('pegasus_oob_{}_summaries.txt'.format(file), 'w', encoding='utf-8') as f:
    for item in pegasus_summaries:
        f.write("%s\n" % item)
  # print("{} Pegasus Retrieval = {}%".format(file, retrieval_eval(pegasus_summaries, list(summ_data['evidence_url']))))

In [None]:
# Load T5-large onto `device` and display its parameter count.
# NOTE(review): this rebinds `model` but does not reload `tokenizer`, which
# keeps whatever value an earlier cell assigned to it.
t5_large_checkpoint = 't5-large'
model = T5ForConditionalGeneration.from_pretrained(t5_large_checkpoint)
model = model.to(device)
model.num_parameters()

737668096

In [None]:
# Load BART-large-CNN onto `device` and display its parameter count.
# NOTE(review): this rebinds `model` but does not reload `tokenizer`, which
# keeps whatever value an earlier cell assigned to it.
bart_large_checkpoint = 'facebook/bart-large-cnn'
model = BartForConditionalGeneration.from_pretrained(bart_large_checkpoint)
model = model.to(device)
model.num_parameters()

Downloading:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.51G [00:00<?, ?B/s]

406290432