In this notebooks, we will refine the autocast dataset for mech benchmarking use case
- why? there dead urls, paywalls, etc.
- the refining would filter out and keep only "working" URLs

In [1]:
import pickle
import requests
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Optional

from IPython.display import display, HTML, Markdown

pd.set_option('display.precision', 5)

#### Understand Autocast

In [2]:
# load all questions
path = "../data/autocast/autocast_questions.json"
df = pd.read_json(path)
print(df.shape)

# filter out non-true/false questions
df = df[df["qtype"] == "t/f"].reset_index(drop=True)
print(df.shape)

# make sure answers is not None
df = df[df["answer"].notnull()].reset_index(drop=True)
print(df.shape)

# make sure source_links is not []
df = df[df["source_links"].map(len) > 0].reset_index(drop=True)
print(df.shape)

(6532, 14)
(3225, 14)
(2003, 14)
(1403, 14)


In [3]:
# number of links per question
df['num_links'] = df['source_links'].apply(lambda x: len(x))

In [4]:
# quantiles
df['num_links'].quantile([0.25, 0.5, .75, .9, .95, .99])

0.25      3.0
0.50      5.0
0.75     30.0
0.90     99.0
0.95    178.7
0.99    441.6
Name: num_links, dtype: float64

In [5]:
# Value counts of num_links
df['num_links'].value_counts().sort_index()[:20]

num_links
1     170
2     167
3     156
4     150
5     107
6      71
7      48
8      62
9      27
10     15
11     14
12      7
13      5
14      2
15      4
16      3
17      1
18      4
19      2
20      2
Name: count, dtype: int64

#### Extract source_links

In [6]:
def extract_text(
    html: str,
    num_words: Optional[int],
) -> str:
    """Extract text from a single HTML document"""
    soup = BeautifulSoup(html, "html.parser")
    for script in soup(["script", "style"]):
        script.extract()
    text = soup.get_text()

    return text

def get_html(url: str) -> Optional[str]:
    """Get the HTML of a single URL"""
    filter_words = [
        "facebook",
        "twitter",
        "youtube",
        "instagram",
        "pinterest",
        "linkedin",
        "bloomberg",
    ]

    if any([word in url.lower() for word in filter_words]):
        return {
            "url": url,
            "error": True,
            "error_message": "filtered",
            "text": None,
        }
    
    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()
        html = response.text
        text = extract_text(html, num_words=1000)
        return {
            "url": url,
            "error": False,
            "error_message": None,
            "text": text,
        }
    
    except Exception as e:
        return {
            "url": url,
            "error": True,
            "error_message": str(e),
            "text": None,
        }

In [7]:
all_questions = [q for questions in df["source_links"] for q in questions if q not in ["", None]]
all_questions = list(set(all_questions))
len(all_questions)

47357

In [19]:
NUM_WORKERS = 10
retrieved_docs = {}

# use concurrent.futures to speed up the process; use tqdm to track progress
with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
    futures = []
    for url in tqdm(all_questions):
        future = executor.submit(get_html, url)
        futures.append(future)
    
    for future in tqdm(as_completed(futures), total=len(futures)):
        result = future.result()
        retrieved_docs[result["url"]] = result

100%|██████████| 47357/47357 [00:00<00:00, 50477.20it/s]
  soup = BeautifulSoup(html, "html.parser")
  k = self.parse_starttag(i)
100%|██████████| 47357/47357 [2:21:33<00:00,  5.58it/s]  


In [20]:
with open("../data/autocast/retrieved_docs.pkl", "wb") as f:
    pickle.dump(retrieved_docs, f)

#### Analyse extracted source_links

In [6]:
with open("../data/autocast/retrieved_docs.pkl", "rb") as f:
    retrieved_docs = pickle.load(f)

In [7]:
docs_df = pd.DataFrame(retrieved_docs).T.reset_index(drop=True)
docs_df.shape

(47357, 4)

In [8]:
docs_df_error = docs_df[docs_df["error"] == True].reset_index(drop=True)
docs_df_error.shape

(19261, 4)

In [9]:
docs_df_no_error = docs_df[docs_df["error"] == False].reset_index(drop=True)
docs_df_no_error.shape

(28096, 4)

In [10]:
def count_no_error_links(row):
    count = 0
    for link in row["source_links"]:
        if link in docs_df_no_error["url"].values:
            count += 1

    return count

df["num_no_error_links"] = df.apply(count_no_error_links, axis=1)

In [11]:
df["num_no_error_links"].value_counts().sort_index()[3:].sum()

899

In [12]:
df["num_no_error_links"].value_counts().sort_index()[3:20]

num_no_error_links
3     149
4     135
5      83
6      51
7      41
8      30
9      11
10     10
11      5
12      4
13      2
14     10
15      5
16     10
17      8
18     12
19      7
Name: count, dtype: int64

In [13]:
df["num_no_error_links"].value_counts().sort_index()[5:20].sum()

289

In [14]:
docs_df_no_error['num_words'] = docs_df_no_error['text'].apply(lambda x: len(x.split()))

In [15]:
docs_df_no_error['num_words'].quantile([0.25, 0.50, .75, .9, .95, .99])

0.25      647.75
0.50     1095.00
0.75     1836.00
0.90     4334.50
0.95    10097.25
0.99    39763.75
Name: num_words, dtype: float64

In [16]:
# count number of source links that are more than n words
def count_num_words(row, n):
    count = 0
    for link in row["source_links"]:
        if link in docs_df_no_error["url"].values:
            num_words = docs_df_no_error[docs_df_no_error["url"] == link]["num_words"].values[0]
            if num_words > n:
                count += 1

    return count

df["num_words_1000"] = df.apply(lambda x: count_num_words(x, 1000), axis=1)

In [17]:
df["num_words_1000"].value_counts().sort_index()[3:].sum()

685

In [18]:
df["num_words_1000"].value_counts().sort_index()[5:].sum()

472

### Making final dataset

we filter out the following
- non 200 codes
- urls with certain keywords that were identified during manual checking
- and finally, filter links that had less than 1000 words
- the final dataset will have a minimum of 5 and maximum of 20 source_links 

In [19]:
final_df = df.copy()

In [20]:
# keep only questions with at least 5 links and more than 1000 words
final_df = final_df[final_df["num_words_1000"] >= 5].reset_index(drop=True)

In [21]:
# source_links remove links that are not in docs_df_no_error and keep only links that have more than 1000 words
def filter_source_links(row):
    links = []
    for link in row["source_links"]:
        if link in docs_df_no_error["url"].values:
            num_words = docs_df_no_error[docs_df_no_error["url"] == link]["num_words"].values[0]
            if num_words > 1000:
                links.append(link)

    return links

final_df["source_links"] = final_df.apply(filter_source_links, axis=1)

In [22]:
# verify that all links are in docs_df_no_error and have more than 1000 words
for links in final_df["source_links"]:
    for link in links:
        assert link in docs_df_no_error["url"].values
        assert docs_df_no_error[docs_df_no_error["url"] == link]["num_words"].values[0] > 1000

In [23]:
# from final_df remove added columns
final_df = final_df.drop(columns=["num_links", "num_no_error_links", "num_words_1000"])

# convert to json
final_df.to_json("autocast_questions_filtered.json", orient="records")

In [27]:
# from docs_df_no_error keep only urls that are in final_df
docs_df_no_error = docs_df_no_error[docs_df_no_error["url"].isin(final_df["source_links"].explode().unique())].reset_index(drop=True)

# drop error, error_message, num_words
docs_df_no_error = docs_df_no_error.drop(columns=["error", "error_message", "num_words"])


In [28]:
# save as pickle
with open("autocast_questions_filtered.pkl", "wb") as f:
    pickle.dump(docs_df_no_error, f)

### Verify saved files

In [29]:
import json

with open("autocast_questions_filtered.json", "r") as f:
    data = json.load(f)

In [31]:
data[1]

{'question': 'Will the Export-Import Bank of the United States be re-authorized before 1 January 2016?',
 'id': 'G5',
 'background': "The Export-Import Bank's authorization expired on 1 July, but proponents of the bank are working to get it re-authorized (http://www.nytimes.com/2015/07/01/business/international/though-charter-is-expiring-export-import-bank-will-keep-its-doors-open.html , http://www.nytimes.com/2015/07/06/us/politics/us-export-import-bank-teetering-on-edge.html , http://thehill.com/policy/finance/247953-house-gop-draws-first-in-ex-im-showdown ). Legislation re-authorizing the bank must be signed into law by the President before taking effect.",
 'publish_time': 1441116141242,
 'close_time': '2015-12-04 14:00:25+00:00',
 'tags': ['Economic Policy', 'US Politics', 'US Policy'],
 'source_links': ['http://www.hartfordbusiness.com/article/20151005/NEWS01/310029963',
  'http://auburnpub.com/blogs/eye_on_ny/schumer-highway-bill-will-include-export-import-bank-reauthorization/a

In [36]:
with open("autocast_questions_filtered.pkl", "rb") as f:
    docs = pickle.load(f)

In [37]:
docs

Unnamed: 0,url,text
0,https://www.snopes.com/fact-check/ancestry-dna...,\n\n\n\n\n\nCan Ancestry.com Take Ownership of...
1,http://www.scmp.com/week-asia/article/2018057/...,Opinion | Why no one wants to rule Thailand ot...
2,https://en.wikipedia.org/wiki/Martin_Leach_(ex...,\n\n\n\nMartin Leach (executive) - Wikipedia\n...
3,http://www.theguardian.com/politics/2016/jan/2...,\n\n\n\n\nHolding EU vote during migration cri...
4,https://www.gjopen.com/faq#question8,\n\n\n\n\n\n\n\n\n\n Good Judgment® Ope...
...,...,...
14736,https://www.ft.com/content/9d5a5085-40d9-3d13-...,\nVolkswagen announces $11.8bn electric vehicl...
14737,https://www.khaleejtimes.com/uae/expo-2020-dub...,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nExpo 2020 Duba...
14738,https://en.wikipedia.org/wiki/Viktor_Orb%C3%A1n,\n\n\n\nViktor Orbán - Wikipedia\n\n\n\n\n\n\n...
14739,http://www.washingtonexaminer.com/senate-likel...,"�����JFIF��,,����xExif��MM�*�������..."
