In [3]:
import pickle
import requests
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Optional

from IPython.display import display, HTML, Markdown

# Show floats with five decimal places in rendered frames and series
pd.options.display.precision = 5

#### Understand Autocast

In [4]:
# Read the complete Autocast question set
path = "../benchmark/data/autocast/autocast_questions.json"
df = pd.read_json(path)
print(df.shape)

# Keep only the true/false questions
df = df.loc[df["qtype"].eq("t/f")].reset_index(drop=True)
print(df.shape)

# Drop questions that have no recorded answer
df = df.loc[df["answer"].notna()].reset_index(drop=True)
print(df.shape)

# Drop questions whose source_links list is empty
df = df.loc[df["source_links"].map(len).gt(0)].reset_index(drop=True)
print(df.shape)

(6532, 14)
(3225, 14)
(2003, 14)
(1403, 14)


In [5]:
# Number of source links attached to each question
df['num_links'] = df['source_links'].map(len)

In [6]:
# Summary statistics of links per question: min, max, mean, median
df['num_links'].aggregate(['min', 'max', 'mean', 'median'])

min          1.00000
max       2088.00000
mean        36.64362
median       5.00000
Name: num_links, dtype: float64

In [7]:
# Quantiles of links per question (quartiles plus the upper tail)
df['num_links'].quantile(q=[0.25, 0.50, 0.75, 0.90, 0.95, 0.99])

0.25      3.0
0.50      5.0
0.75     30.0
0.90     99.0
0.95    178.7
0.99    441.6
Name: num_links, dtype: float64

In [8]:
# How many questions have each link count (the 20 smallest link counts)
df['num_links'].value_counts().sort_index().head(20)

num_links
1     170
2     167
3     156
4     150
5     107
6      71
7      48
8      62
9      27
10     15
11     14
12      7
13      5
14      2
15      4
16      3
17      1
18      4
19      2
20      2
Name: count, dtype: int64

#### Extract source_links

In [17]:
def extract_text(
    html: str,
    num_words: Optional[int],
) -> str:
    """Return the text content of one HTML document.

    All ``<script>`` and ``<style>`` elements are removed from the parsed
    tree before the remaining text is collected.

    NOTE(review): ``num_words`` is accepted but never used in this function,
    so the complete text is always returned. Confirm whether truncation to
    ``num_words`` words was intended before relying on it.
    """
    parsed = BeautifulSoup(html, "html.parser")
    # Script and style bodies are not page text; detach them from the tree.
    for tag in parsed.find_all(["script", "style"]):
        tag.extract()
    return parsed.get_text()

def get_html(url: str) -> dict:
    """Fetch one URL and return a result record with its extracted text.

    Despite the name, the return value is not raw HTML: it is a dict with
    the keys ``url``, ``error``, ``error_message`` and ``text``.

    * URLs containing any of the filtered site names (case-insensitive
      substring match on the whole URL) are not requested; the record has
      ``error=True`` and ``error_message="filtered"``.
    * On a successful request, ``error`` is False and ``text`` holds the
      output of ``extract_text`` for the response body.
    * Any exception raised while requesting or parsing is caught and
      reported as ``error=True`` with ``error_message=str(exception)``,
      so this function does not raise for fetch failures.

    Args:
        url: The address to fetch.

    Returns:
        The result record described above; ``text`` is None on any error.
    """
    filter_words = (
        "facebook",
        "twitter",
        "youtube",
        "instagram",
        "pinterest",
        "linkedin",
        "bloomberg",
    )

    # Lower-case once instead of once per filter word.
    lowered_url = url.lower()
    if any(word in lowered_url for word in filter_words):
        return {
            "url": url,
            "error": True,
            "error_message": "filtered",
            "text": None,
        }

    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()
        html = response.text
        text = extract_text(html, num_words=1000)
        return {
            "url": url,
            "error": False,
            "error_message": None,
            "text": text,
        }

    # Deliberately broad: one bad URL must not abort the whole batch, so every
    # failure is turned into an error record for later inspection.
    except Exception as e:
        return {
            "url": url,
            "error": True,
            "error_message": str(e),
            "text": None,
        }

In [18]:
# Collect every distinct, non-empty source link across all questions
# (the variable holds URLs, despite its name)
unique_links = {
    link
    for links in df["source_links"]
    for link in links
    if link not in ("", None)
}
all_questions = list(unique_links)
len(all_questions)

47357

In [19]:
NUM_WORKERS = 10
retrieved_docs = {}

# Fetch all links on a thread pool; the first bar tracks task submission,
# the second tracks completed fetches
with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
    futures = [executor.submit(get_html, url) for url in tqdm(all_questions)]

    for finished in tqdm(as_completed(futures), total=len(futures)):
        doc = finished.result()
        # Index each result record by the URL it was fetched from
        retrieved_docs[doc["url"]] = doc

100%|██████████| 47357/47357 [00:00<00:00, 50477.20it/s]
  soup = BeautifulSoup(html, "html.parser")
  k = self.parse_starttag(i)
100%|██████████| 47357/47357 [2:21:33<00:00,  5.58it/s]  


In [20]:
# Persist the fetch results so the analysis below can run without re-fetching
with open("retrieved_docs.pkl", mode="wb") as pickle_file:
    pickle.dump(retrieved_docs, pickle_file)

#### Analyse extracted source_links

In [9]:
# Display the retrieved document at position `ind` (debugging helper, currently disabled)

# ind = 34
# for i, (k, v) in enumerate(retrieved_docs.items()):
#     if ind == i:
#         print(f"URL: {k}")
#         print(f"Error: {v['error']}")
#         print(f"Error message: {v['error_message']}")
#         display(HTML(v['text']))

In [10]:
# Reload the cached fetch results. pickle can execute arbitrary code on load,
# so only open a file that this notebook wrote itself.
with open("retrieved_docs.pkl", mode="rb") as pickle_file:
    retrieved_docs = pickle.load(pickle_file)

In [11]:
# One row per fetched URL. Build the frame from the result records directly:
# going through a dict-of-dicts frame and transposing it creates one column
# per URL first and leaves every column object-typed, including the boolean
# `error` flag. Each record already carries its own "url" field, so the dict
# keys are not needed.
docs_df = pd.DataFrame(list(retrieved_docs.values()))
docs_df.shape

(47357, 4)

In [12]:
# Records whose fetch failed or was filtered out
docs_df_error = docs_df.loc[docs_df["error"].eq(True)].reset_index(drop=True)
docs_df_error.shape

(19261, 4)

In [13]:
# Records that were fetched and parsed successfully
docs_df_no_error = docs_df.loc[docs_df["error"].eq(False)].reset_index(drop=True)
docs_df_no_error.shape

(28096, 4)

In [14]:
def count_no_error_links(row, valid_urls=None):
    """Count the entries of ``row["source_links"]`` that were fetched without error.

    Repeated links in the row are counted once per occurrence.

    Args:
        row: A mapping or Series with a ``"source_links"`` iterable of URLs.
        valid_urls: Optional collection of successfully fetched URLs. When
            omitted, it is built from ``docs_df_no_error["url"]``. Pass a
            precomputed ``set`` when calling this for many rows to avoid
            rebuilding it each time.

    Returns:
        The number of links in the row that appear in ``valid_urls``.
    """
    if valid_urls is None:
        # Hash-based membership instead of scanning the URL array once per link.
        valid_urls = set(docs_df_no_error["url"])

    return sum(1 for link in row["source_links"] if link in valid_urls)

# Per question: how many of its source links were fetched successfully
df["num_no_error_links"] = df.apply(count_no_error_links, axis="columns")

In [15]:
# Number of questions with at least 3 successfully fetched links.
# Compare the values directly: slicing the sorted value_counts() by position
# only means ">= 3" when 0, 1 and 2 each occur as a value.
(df["num_no_error_links"] >= 3).sum()

899

In [16]:
# Question counts for 3 to 19 successfully fetched links. Slice by label
# (inclusive on both ends) so the rows shown do not depend on which smaller
# values happen to be present in the index.
df["num_no_error_links"].value_counts().sort_index().loc[3:19]

num_no_error_links
3     149
4     135
5      83
6      51
7      41
8      30
9      11
10     10
11      5
12      4
13      2
14     10
15      5
16     10
17      8
18     12
19      7
Name: count, dtype: int64

In [17]:
# Number of questions with between 5 and 19 successfully fetched links
# (both bounds inclusive), computed from the values rather than from
# positions in the sorted value_counts() index.
df["num_no_error_links"].between(5, 19).sum()

289

In [18]:
# Whitespace-delimited word count of each successfully fetched document
docs_df_no_error['num_words'] = [
    len(text.split()) for text in docs_df_no_error['text']
]

In [19]:
# Quantiles of document length in words (quartiles plus the upper tail)
docs_df_no_error['num_words'].quantile(q=[0.25, 0.50, 0.75, 0.90, 0.95, 0.99])

0.25      647.75
0.50     1095.00
0.75     1836.00
0.90     4334.50
0.95    10097.25
0.99    39763.75
Name: num_words, dtype: float64

In [20]:
# count number of source links that are more than n words
def count_num_words(row, n, word_counts=None):
    """Count the entries of ``row["source_links"]`` whose document has more than ``n`` words.

    Links that are not present in the word-count mapping (for example,
    links that were not fetched successfully) are ignored. Repeated links
    in the row are counted once per occurrence.

    Args:
        row: A mapping or Series with a ``"source_links"`` iterable of URLs.
        n: Word-count threshold; a document counts only if it is strictly
            longer than this.
        word_counts: Optional mapping from URL to word count. When omitted,
            it is built from the ``url`` and ``num_words`` columns of
            ``docs_df_no_error``. Pass a precomputed dict when calling this
            for many rows to avoid rebuilding it each time.

    Returns:
        The number of links in the row whose word count exceeds ``n``.
    """
    if word_counts is None:
        word_counts = {}
        urls = docs_df_no_error["url"]
        counts = docs_df_no_error["num_words"]
        for url, num_words in zip(urls, counts):
            # Keep the first row for a URL, as the previous
            # `.values[0]` lookup did.
            word_counts.setdefault(url, num_words)

    # One dict lookup per link instead of two scans over the whole frame.
    return sum(
        1
        for link in row["source_links"]
        if link in word_counts and word_counts[link] > n
    )

# Per question: how many of its fetched source documents exceed 1000 words
df["num_words_1000"] = df.apply(count_num_words, axis=1, args=(1000,))

In [21]:
# Number of questions with at least 3 source documents longer than 1000 words.
# Compare the values directly: slicing the sorted value_counts() by position
# only means ">= 3" when 0, 1 and 2 each occur as a value.
(df["num_words_1000"] >= 3).sum()

685

In [22]:
# Number of questions with at least 5 source documents longer than 1000 words.
# Compare the values directly: slicing the sorted value_counts() by position
# only means ">= 5" when every value from 0 to 4 occurs.
(df["num_words_1000"] >= 5).sum()

472