Instalando o módulo newspaper3k para ajudar a fazer o parsing dos documentos:

In [None]:
!pip3 install newspaper3k

In [10]:
queries, corpus = [], []

Baixando o dataset disponibilizado pelo paper "Re-ranking Web Search Results for Better Fact-Checking: A Preliminary Study"

In [None]:
!wget https://raw.githubusercontent.com/tassilucas/ir-project/master/dataset/data.jsonl

Processando o jsonl para um objeto python:

In [12]:
import json

data = []

with open("data.jsonl") as f:
  data = [json.loads(line) for line in f]

In [None]:
y_target = {}

labels = {'Non-relevant': 1, 'Relevant but not useful': 2, 'Relevant and useful': 3, 'Relevant and very useful': 4}

for c in data:
  y_target[c['claim']] = []
  for r in c['results']:
    print("URL: {} / {} -> {}".format(r['url'], r['judgment'], labels[r['judgment']]))
    y_target[c['claim']].append(labels[r['judgment']])
  print('======================================')

Obtendo os conteudos disponibilizados nas URLs com ajuda de Article fornecido por newspaper3k:

In [None]:
from newspaper import Article

texts = {}

for d in data:
  dict_name = d['claim']
  texts[dict_name] = []
  for res in d['results']:
    url = res['url']
    article = Article(url)
    article.download()
    try:
      article.parse()
      # print("{} -> {}".format(url, article.meta_description))
    except:
      # print("$ Failt to retrieve {}".format(url))

    texts[dict_name].append(article.meta_description + " " + article.text)

  print('--------------------')

Documentos obtidos:

In [None]:
texts

Agora, calculando os valores das features. Todos são normalizados e são encontrados no intervalo de [0, 1].

Encontrando valor da feature de similaridade de cosseno com TF-IDF entre queries e documentos:

In [16]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def cos_similarity(texts):
  df2 = pd.DataFrame()
  similarities = []

  for query, docs in texts.items():
    df = \
        pd.DataFrame({'text' : docs})

    vectorizer = TfidfVectorizer()

    tfidf = vectorizer.fit_transform(df['text'].values.astype('U'))
    words = vectorizer.get_feature_names()

    queryTFIDF = TfidfVectorizer().fit(words)

    queryTFIDF = queryTFIDF.transform([query])

    similarities.append(cosine_similarity(queryTFIDF, tfidf).flatten())

    similarities_dict = {}

  z = 0
  for query in texts.keys():
    similarities_dict[query] = similarities[z]
    z += 1
    
  return similarities_dict

Encontrando os valores da feature de fontes oficiais dos documentos:

In [17]:
def find_official_sources():
  official_sources = {}

  for d in data:
    name = d['claim']
    official_sources[name] = []
    for res in d['results']:
      if ".gov" in res['url']:
        print("{} is a government/official source.".format(res['url']))
        official_sources[name].append(1)
      else:
        official_sources[name].append(0)
  
  return official_sources

Encontrando os valores da feature de contagem de números (estatisticas) dos documentos:

In [18]:
import re

def find_number_count(texts):
  number_count = {}

  for k, v in texts.items():
    number_count[k] = []
    for doc in v:
      res = re.findall('[0-9]+', doc)
      number_count[k].append(len(res)*0.1)

  # Colocando valores entre [0, 1]
  for k, v in number_count.items():
    number_count[k] = [float(i)/sum(v) for i in v]
  
  return number_count

Obtendo os valores das features de frequência de exclamações e interrogações:

In [19]:
def find_freq_relations(texts):
  freq_relations = {}

  for k, v in texts.items():
    freq_relations[k] = []
    for doc in v:
      freq_exclamation = doc.count('!') * -1
      freq_interrogation = doc.count('?') * -1
      freq_relations[k].append(freq_exclamation + freq_interrogation)

  minVal = min(freq_relations[k])
  maxVal = max(freq_relations[k])
  maxRange = 1
  minRange = 0

  # Normalizando valores:
  for k, v in freq_relations.items():
    freq_relations[k] = [(((float(value) - minVal) / float((maxVal- minVal))) *  float((maxRange - minRange)) + minRange) for value in v]
  
  return freq_relations

Aqui, é instalado a biblioteca textacy para realizar parsing eficiente de citações em um documento e então utilizar essas informações para encontrar os valores da feature de citações nos documentos:

In [None]:
!pip install textacy

In [None]:
!python -m spacy download en_core_web_sm

In [22]:
import textacy

def find_quote_count(texts):
  claim_dict = {}

  for k, v in texts.items():

    claim_dict[k] = []
    for t in v:
      doc = textacy.make_spacy_doc(t, lang='en_core_web_sm')
      quote_count_doc = 0
      try:
        res = textacy.extract.triples.direct_quotations(doc)
        for x in list(res):
          quote_count_doc += 1
      except:
        claim_dict[k].append(0)
        continue
      
      claim_dict[k].append(quote_count_doc)
  
  # Colocando valores entre [0, 1]
  for k, v in claim_dict.items():
    claim_dict[k] = [float(i)/sum(v) for i in v]
    
  return claim_dict

Agora, um código (meio bagunçado) para criar um dataframe com todos estes dados recolhidos durante o notebook:

In [None]:
def create_lines(claim, columns):
  new_array = []
  for line_ref in range(0, len(columns[1])):
    line = []
    line.append(claim)
    for col in range(0, len(columns)):
      line.append(columns[col][line_ref])
    line_ref += 1
    new_array.append(line)

  return new_array

last = 0
res = 0

similarities_dict = cos_similarity(texts)
official_sources = find_official_sources()
number_count = find_number_count(texts)
freq_relations = find_freq_relations(texts)
claim_dict = find_quote_count(texts)

for i, claim_reference_str in enumerate(texts.keys()):
  data_columns = []
  data_columns.append(similarities_dict[claim_reference_str])
  data_columns.append(official_sources[claim_reference_str])
  data_columns.append(number_count[claim_reference_str])
  data_columns.append(freq_relations[claim_reference_str])
  data_columns.append(claim_dict[claim_reference_str])
  data_columns.append(y_target[claim_reference_str])
  data = create_lines(claim_reference_str, data_columns)
  new_frame = pd.DataFrame(data, columns = ['Claim', 'Cosine Similarity (TF-ID)', 'Official Resource', 'Numbers', 'Exclamation/Interrogation', 'Quotes', 'Label'])
  
  if i != 0:
    res = pd.concat([last, new_frame])
    last = res
  else:
    res = new_frame
    last = new_frame

In [25]:
res

Unnamed: 0,Claim,Cosine Similarity (TF-ID),Official Resource,Numbers,Exclamation/Interrogation,Quotes,Label
0,de beers is fighting lab-made diamonds with bl...,0.152596,0,0.049020,1.0,0.000000,2
1,de beers is fighting lab-made diamonds with bl...,0.275272,0,0.000000,1.0,0.000000,3
2,de beers is fighting lab-made diamonds with bl...,0.235300,0,0.039216,0.8,0.333333,3
3,de beers is fighting lab-made diamonds with bl...,0.000000,0,0.000000,1.0,0.000000,2
4,de beers is fighting lab-made diamonds with bl...,0.000000,0,0.000000,1.0,0.000000,1
...,...,...,...,...,...,...,...
15,the federal government pays $120 million in re...,0.412638,0,0.241176,1.0,0.000000,1
16,the federal government pays $120 million in re...,0.000000,1,0.000000,1.0,0.000000,1
17,the federal government pays $120 million in re...,0.000000,1,0.000000,1.0,0.000000,4
18,the federal government pays $120 million in re...,0.000000,1,0.000000,1.0,0.000000,1


Separando agora os resultados obtidos em dois, um para treino (80% do dataset) e outro para teste (20% do dataset) de maneira randomica:

In [26]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(res, test_size=0.2, random_state=25)

Dataframe de treino:

In [27]:
train_data

Unnamed: 0,Claim,Cosine Similarity (TF-ID),Official Resource,Numbers,Exclamation/Interrogation,Quotes,Label
4,de beers is fighting lab-made diamonds with bl...,0.000000,0,0.000000,1.0,0.000000,1
16,immigrants are causing a rise in anti-semitism...,0.000000,1,0.000000,1.0,0.000000,4
1,e-commerce sales in UK increased by 8 billions...,0.208680,0,0.058904,0.8,0.000000,2
11,Israel caused flooding in Gaza by opening rive...,0.233446,0,0.010563,1.0,0.032258,3
4,NAFTA has negatively impacted the US economy,0.268711,0,0.026316,1.0,0.000000,2
...,...,...,...,...,...,...,...
16,around 83% of students with a student loan are...,0.000000,1,0.000000,1.0,0.000000,2
0,knives kill more people in the US than guns do,0.417868,0,0.013376,1.0,0.000000,2
4,The U.S. has the highest corporate tax rate in...,0.609595,0,0.010920,1.0,0.156250,3
1,knives kill more people in the US than guns do,0.000000,0,0.000000,1.0,0.000000,3


Criando um arquivo formatado adequadamente do dataset train_data para servir como input na ferramenta SVM-Rank:

In [28]:
qid = 1

with open('train.dat', 'w') as td:
  for query in claim_dict.keys():
    d = train_data.loc[train_data['Claim'] == query]
    for index, row in d.iterrows():
      td.write("{} qid:{} 1:{:.1f} 2:{} 3:{:.1f} 4:{:.1f} 5:{:.1f}\n".format(row['Label'], qid, row['Cosine Similarity (TF-ID)'], row['Official Resource'],
                                                                            row['Numbers'], row['Exclamation/Interrogation'], row['Quotes']))
    qid += 1

In [29]:
!cat train.dat

1 qid:1 1:0.0 2:0 3:0.0 4:1.0 5:0.0
1 qid:1 1:0.0 2:0 3:0.0 4:0.8 5:0.0
1 qid:1 1:0.0 2:0 3:0.0 4:1.0 5:0.0
1 qid:1 1:0.0 2:0 3:0.0 4:1.0 5:0.0
1 qid:1 1:0.0 2:0 3:0.0 4:1.0 5:0.0
1 qid:1 1:0.0 2:0 3:0.0 4:1.0 5:0.0
3 qid:1 1:0.2 2:0 3:0.1 4:1.0 5:0.7
3 qid:1 1:0.3 2:0 3:0.0 4:1.0 5:0.0
1 qid:1 1:0.2 2:0 3:0.1 4:1.0 5:0.0
2 qid:1 1:0.0 2:0 3:0.0 4:1.0 5:0.0
3 qid:1 1:0.2 2:0 3:0.1 4:0.8 5:0.0
2 qid:1 1:0.2 2:0 3:0.0 4:1.0 5:0.0
3 qid:1 1:0.2 2:0 3:0.1 4:0.6 5:0.0
3 qid:1 1:0.0 2:0 3:0.0 4:1.0 5:0.0
3 qid:1 1:0.2 2:0 3:0.3 4:0.8 5:0.0
3 qid:1 1:0.0 2:0 3:0.0 4:1.0 5:0.0
1 qid:1 1:0.0 2:0 3:0.0 4:0.8 5:0.0
1 qid:1 1:0.0 2:0 3:0.0 4:1.0 5:0.0
3 qid:1 1:0.2 2:0 3:0.0 4:0.8 5:0.3
2 qid:2 1:0.3 2:0 3:0.0 4:1.0 5:0.0
2 qid:2 1:0.0 2:0 3:0.0 4:0.8 5:0.0
4 qid:2 1:0.3 2:0 3:0.2 4:-0.8 5:0.0
2 qid:2 1:0.3 2:0 3:0.0 4:1.0 5:0.0
4 qid:2 1:0.0 2:0 3:0.0 4:1.0 5:0.0
1 qid:2 1:0.3 2:0 3:0.0 4:1.0 5:0.0
3 qid:2 1:0.3 2:0 3:0.0 4:0.6 5:0.0
3 qid:2 1:0.0 2:1 3:0.0 4:1.0 5:0.0
2 qid:2 1:0.3 2:0 3:0.0 4:1

In [30]:
test_data

Unnamed: 0,Claim,Cosine Similarity (TF-ID),Official Resource,Numbers,Exclamation/Interrogation,Quotes,Label
18,58 published papers say that climate change is...,0.232130,0,0.011976,1.0,0.000000,2
12,58 published papers say that climate change is...,0.172450,0,0.180562,1.0,0.363636,1
15,immigrants are causing a rise in anti-semitism...,0.000000,0,0.000000,1.0,0.000000,3
3,trump signed an order allowing veterans to get...,0.421312,0,0.030965,1.0,0.111111,3
19,knives kill more people in the US than guns do,0.000000,0,0.000000,1.0,0.000000,2
...,...,...,...,...,...,...,...
6,around 83% of students with a student loan are...,0.000000,0,0.000000,1.0,0.000000,2
17,Buses were lining up to drop protesters in ant...,0.303784,0,0.035831,0.4,0.128205,2
5,the members of Decapitated face prison sentenc...,0.298850,0,0.009777,1.0,0.000000,2
5,knives kill more people in the US than guns do,0.395622,0,0.036943,0.8,0.230769,1


Criando um arquivo formatado adequadamente do dataset test_data para servir como input na ferramenta SVM-Rank:

In [34]:
qid = 1

with open("test.dat", "w") as test_file:
  for query in claim_dict.keys():
    d = test_data.loc[test_data['Claim'] == query]
    for index, row in d.iterrows():
      test_file.write("{} qid:{} 1:{:.1f} 2:{} 3:{:.1f} 4:{:.1f} 5:{:.1f}\n".format(row['Label'], qid, row['Cosine Similarity (TF-ID)'], row['Official Resource'],
                                                                            row['Numbers'], row['Exclamation/Interrogation'], row['Quotes']))
    qid += 1

Agora, um csv com o link e label (classificação de relevância) dos documentos retornados pelo Google da query/claim "COVID-19 vaccines cause infertility":

In [39]:
!cat covid-infertility.csv

https://www.muhealth.org/our-stories/does-covid-19-vaccine-affect-fertility-heres-what-experts-say,2
https://www.bu.edu/articles/2022/covid-vaccines-infertility/,3
https://www.sciencedaily.com/releases/2022/01/220120135142.htm,2
https://www.cdc.gov/coronavirus/2019-ncov/vaccines/planning-for-pregnancy.html,2
https://www.nih.gov/news-events/news-releases/covid-19-vaccination-does-not-reduce-chances-conception-study-suggests,3
https://www.health.gov.au/initiatives-and-programs/covid-19-vaccines/is-it-true/is-it-true-do-covid-19-vaccines-cause-infertility,2
https://portal.ct.gov/vaccine-portal/Vaccine-Knowledge-Base/Articles/Does-The-Vaccine-Cause-Infertility?language=en_US,2
https://womenshealthresearch.ubc.ca/blog/covid-19-vaccines-and-infertility-fact-or-fiction,2
https://www.nbcnews.com/health/sexual-health/covid-vaccine-doesnt-cause-infertility-disease-might-rcna2868,3
https://www.unicef.org/montenegro/en/stories/vaccine-against-covid-19-does-not-cause-sterility,2
https://www.chop.ed

Criando um .dat formatado adequadamente para ser utilizado na ferramente SVM-Rank:

In [40]:
import csv
from newspaper import Article

art_dict = {}
test_texts = {}

cos_sim = 0
official_resource = 0
nc = 0
ff = 0
fqc = 0

with open('covid-infertility.csv') as csv_file:
    claim = 'COVID-19 vaccines cause infertility'
    test_texts[claim] = []
    csv_reader = csv.reader(csv_file, delimiter=',')

    labels = []
    for row in csv_reader:
      article = Article(row[0])
      article.download()
      article.parse()
      test_texts[claim].append(article.meta_description + ' ' + article.text)
      labels.append(int(row[1]))

    cos_sim = cos_similarity(test_texts)
    official_resource = 0
    nc = find_number_count(test_texts)
    ff = find_freq_relations(test_texts)
    fqc = find_quote_count(test_texts)

    with open('search_engine.dat', 'w') as se:
      for col in range(0, 16):
        se.write("{} qid:{} 1:{:.1f} 2:{} 3:{:.1f} 4:{:.1f} 5:{:.1f}\n".format(labels[col], 1, cos_sim[claim][col], official_resource, nc[claim][col], ff[claim][col], fqc[claim][col]))



In [41]:
!cat search_engine.dat

2 qid:1 1:0.2 2:0 3:0.0 4:1.0 5:0.3
3 qid:1 1:0.2 2:0 3:0.1 4:1.0 5:0.0
2 qid:1 1:0.2 2:0 3:0.0 4:1.0 5:0.0
2 qid:1 1:0.3 2:0 3:0.1 4:1.0 5:0.0
3 qid:1 1:0.1 2:0 3:0.1 4:1.0 5:0.1
2 qid:1 1:0.4 2:0 3:0.0 4:0.9 5:0.0
2 qid:1 1:0.3 2:0 3:0.0 4:0.9 5:0.0
2 qid:1 1:0.2 2:0 3:0.1 4:0.9 5:0.0
3 qid:1 1:0.2 2:0 3:0.0 4:1.0 5:0.0
2 qid:1 1:0.2 2:0 3:0.0 4:0.9 5:0.1
2 qid:1 1:0.1 2:0 3:0.0 4:0.6 5:0.0
3 qid:1 1:0.1 2:0 3:0.0 4:0.9 5:0.1
4 qid:1 1:0.0 2:0 3:0.3 4:1.0 5:0.0
2 qid:1 1:0.3 2:0 3:0.1 4:0.9 5:0.2
2 qid:1 1:0.1 2:0 3:0.0 4:0.0 5:0.0
3 qid:1 1:0.1 2:0 3:0.1 4:0.8 5:0.2
