In [1]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import twitter_functions as twf
import utils

import pandas as pd
import json
from tqdm import tqdm
from tqdm.contrib.concurrent import thread_map
import os
from os.path import join
from datetime import datetime, timedelta
import random
from copy import deepcopy

In [3]:
ACADEMIC = True
MAX_TWEETS = 500000
MAX_TWEETS_PER_QUERY = 100

domains = pd.read_csv("../resources/domains/clean/domain_list_clean.csv")["url"].unique()
twitter_dateformat = "%Y-%m-%dT%H:%M:%SZ"

params = {"granularity": "hour"}
if ACADEMIC:
    params["end_time"] = datetime(2021, 10, 1).strftime(twitter_dateformat)
    params["start_time"] = datetime(2021, 4, 1).strftime(twitter_dateformat)
else:
    params["end_time"] = (datetime.now() - timedelta(days=1)).strftime(twitter_dateformat)

with open("api-keys/jana.json") as f:
    api_keys = json.load(f)

data_dir = '../resources/twitter'
tweet_dst = join(data_dir, 'tweet-counts')
minimum_query = "lang:en -is:retweet"

config = {
    "bearer_token": api_keys["bearer-token"],
    "params": params
}

# Query Counts and Collect Parquet

In [None]:
i = 0
for url in tqdm(domains):
    config["params"]["query"] = f'{minimum_query} url:"{url}"'
    next_token = None
    while True:
        tweet_file_name = f"{i}.json"
        i += 1
        config["write_path"] = join(tweet_dst, tweet_file_name)

        twf.validate_config(config)

        json_response = twf.count_tweets(config, all=ACADEMIC, next_token=next_token)
        json_response["url"] = url
        with open(config["write_path"], "w") as f:
            json.dump(json_response, f)
        if 'meta' in json_response and 'next_token' in json_response['meta']:
            next_token = json_response['meta']['next_token']
        else:
            break

In [None]:
responses = []
for filename in os.listdir(tweet_dst):
    if not filename.startswith("error-"):
        with open(join(tweet_dst, filename)) as f:
            resp = json.load(f)
        if "data" in resp:
            responses.append(resp) 
tweet_counts = pd.DataFrame(responses).drop(columns="meta").explode("data", ignore_index=True)
tweet_counts = pd.concat([tweet_counts.drop(columns="data"), pd.DataFrame(tweet_counts["data"].values.tolist())], axis=1)
tweet_counts["start"] = pd.to_datetime(tweet_counts["start"])
tweet_counts["end"] = pd.to_datetime(tweet_counts["end"])
print(tweet_counts.shape)
tweet_counts.head()

In [None]:
tweet_counts.to_parquet(join(data_dir, "tweet-counts.parquet.gzip"), index=False, compression="gzip")

# Form Partitions of 500 (resp. 100) Tweets
- Limitations: 
  - max. 1024 characters per query
  - max. 500 (resp. 100) tweets per query
  - max. 300 queries per 15 min.
  - get equal data distribution regarding labels (accuracy, transparency, reliability)
- Form sets of URL $U_1, \dots, U_n$, for $u_1, u_2, \dots \in U_i$
  - $u_1, u_2, \dots$ share same accuracy and transparency score => Sets are distinct
  - Query `lang:en -is:retweet (url:"u_1" OR url:"u_2" OR ...)` is max. 1024 character
- Build timeranges
  - Let $h_1, \dots, h_m$ be hourly timestamps, with $h_1$ = 2021-04-01 00:00 and $h_m$ = 2021-10-01 00:00
  - Let $c(U_i, h_s, h_e)$ be the number of tweets for URLs $U_i$ in time range $[h_s, h_e)$
  - For each $U_i$ form time frames $T_{i,j} = [h_s, h_e)$, s.t. 
    - $c(U_i, h_s, h_e) \geq 500$
    - $c(U_i, h_s, h_{e-1}) < 500$
    - "Smallest time frame containing at least 500 tweets"
- A final query can be constructed using one $U_i$ and one $T_{i,j}$ and always returns 500 tweets

In [4]:
tweet_counts = pd.read_parquet(join(data_dir, "tweet-counts.parquet.gzip"))
domain_labels = pd.read_csv("../resources/domains/clean/domain_list_clean.csv")
tweet_counts = tweet_counts.merge(domain_labels, on="url")
print(tweet_counts.shape)
tweet_counts.head()

(43548109, 10)


Unnamed: 0,url,end,start,tweet_count,label,source,last_update,accuracy,transparency,type
0,science.energy.gov,2021-05-30 01:00:00+00:00,2021-05-30 00:00:00+00:00,0,pro science,media_bias_fact_check,2021-05-17,5,3,reliable
1,science.energy.gov,2021-05-30 02:00:00+00:00,2021-05-30 01:00:00+00:00,0,pro science,media_bias_fact_check,2021-05-17,5,3,reliable
2,science.energy.gov,2021-05-30 03:00:00+00:00,2021-05-30 02:00:00+00:00,0,pro science,media_bias_fact_check,2021-05-17,5,3,reliable
3,science.energy.gov,2021-05-30 04:00:00+00:00,2021-05-30 03:00:00+00:00,0,pro science,media_bias_fact_check,2021-05-17,5,3,reliable
4,science.energy.gov,2021-05-30 05:00:00+00:00,2021-05-30 04:00:00+00:00,0,pro science,media_bias_fact_check,2021-05-17,5,3,reliable


In [5]:
check_status_codes = False

if check_status_codes:
    status_codes = thread_map(utils.check_url_status, domain_labels["url"].unique(), max_workers=os.cpu_count() * 4)
    status_codes = pd.DataFrame(status_codes)
    status_codes.to_csv("../resources/domains/clean/domain_status_codes.csv", index=False)

In [6]:
status_codes = pd.read_csv("../resources/domains/clean/domain_status_codes.csv")
print(status_codes.shape, status_codes.query("status_code == 200").shape)
status_codes.head()

(9971, 4) (7957, 4)


Unnamed: 0,url,full_url,status_code,redirected
0,notiziepericolose.blogspot.it,https://notiziepericolose.blogspot.it,200,False
1,ilmattoquotidiano.it,https://ilmattoquotidiano.it,200,False
2,ilmessaggio.it,https://ilmessaggio.it,200,False
3,ilfattoquotidaino.it,http://ilfattoquotidaino.it,200,False
4,ilquotidaino.wordpress.com,https://ilquotidaino.wordpress.com,200,False


In [7]:
# Filter by active URLs
tweet_counts = tweet_counts.merge(status_codes.query("status_code == 200")["url"], on="url")
tweet_counts.shape

(34767043, 10)

In [8]:
def to_query_str(urls):
    minimum_query = "lang:en -is:retweet"
    url_clause = " OR ".join([f'url:"{url}"' for url in urls])
    return f"{minimum_query} ({url_clause})"

def get_url_ranges(all_urls):
    url_ranges = [[all_urls[0]]]
    for url in all_urls[1:]:
        tmp_query = url_ranges[-1] + [url]
        query_str = to_query_str(tmp_query)
        if len(query_str) <= 1024:
            url_ranges[-1] = tmp_query
        else:
            url_ranges.append([url])
    return url_ranges

def get_hour_ranges(hour_counts):
    hour_counts = hour_counts.sort_index(ascending=False)
    hour_counts_cumsum = hour_counts.cumsum()
    hour_ranges = []
    end = hour_counts_cumsum.index[0][1]
    i = MAX_TWEETS_PER_QUERY
    while i <= hour_counts_cumsum[-1]:
        tmp = hour_counts_cumsum[hour_counts_cumsum >= i]
        start = tmp.index[0][0]
        hour_ranges.append((start, end))
        
        end = start
        i = tmp[0] + MAX_TWEETS_PER_QUERY
    return hour_ranges

In [9]:
partitions = []
url_counts = tweet_counts.set_index("url")
with tqdm(total=url_counts.index.unique().shape[0]) as pbar:
    for acc in range(1, 6):
        for trans in range(1, 4):
            tmp = tweet_counts.query(f"accuracy == {acc} & transparency == {trans}")
            if tmp.shape[0] > 0:
                ranges = []
                url_range = []
                count_sum = 0
                
                max_per_url = tmp.groupby("url")["tweet_count"].max().sort_values(ascending=False)
                urls = list(max_per_url.items())
                while len(urls) > 0:
                    pbar.update(1)

                    url, count = urls.pop(0)
                    url_range.append(url)
                    count_sum += count
                    if count_sum >= MAX_TWEETS_PER_QUERY or len(urls) == 0:
                        url_ranges = get_url_ranges(url_range)
                        for r in url_ranges:
                            hour_counts = url_counts.loc[r].groupby(["start", "end"])["tweet_count"].sum()
                            hour_ranges = get_hour_ranges(hour_counts)

                            # Only include URL ranges with min. MAX_TWEETS_PER_QUERY tweets
                            if len(hour_ranges) > 0:
                                hour_ranges = [(start.strftime(twitter_dateformat), end.strftime(twitter_dateformat)) for (start, end) in hour_ranges]
                                ranges.append({"url_range": r, "hour_ranges": hour_ranges})
                        
                        url_range = []
                        count_sum = 0

                partitions.append({
                    "accuracy": acc,
                    "transparency": trans,
                    "ranges": ranges
                })

100%|██████████| 7924/7924 [24:04<00:00,  5.48it/s]


In [10]:
with open(join(data_dir, f"partitions-{MAX_TWEETS_PER_QUERY}.json"), "w") as f:
    json.dump(partitions, f)
print("accuracy", partitions[0]["accuracy"])
print("transparency", partitions[0]["transparency"])
print("ranges", partitions[0]["ranges"][0])

accuracy 1
transparency 1
ranges {'url_range': ['itv.com'], 'hour_ranges': [('2021-09-30T17:00:00Z', '2021-10-01T00:00:00Z'), ('2021-09-30T13:00:00Z', '2021-09-30T17:00:00Z'), ('2021-09-30T09:00:00Z', '2021-09-30T13:00:00Z'), ('2021-09-30T06:00:00Z', '2021-09-30T09:00:00Z'), ('2021-09-29T20:00:00Z', '2021-09-30T06:00:00Z'), ('2021-09-29T16:00:00Z', '2021-09-29T20:00:00Z'), ('2021-09-29T12:00:00Z', '2021-09-29T16:00:00Z'), ('2021-09-29T09:00:00Z', '2021-09-29T12:00:00Z'), ('2021-09-29T05:00:00Z', '2021-09-29T09:00:00Z'), ('2021-09-28T18:00:00Z', '2021-09-29T05:00:00Z'), ('2021-09-28T14:00:00Z', '2021-09-28T18:00:00Z'), ('2021-09-28T10:00:00Z', '2021-09-28T14:00:00Z'), ('2021-09-28T06:00:00Z', '2021-09-28T10:00:00Z'), ('2021-09-27T19:00:00Z', '2021-09-28T06:00:00Z'), ('2021-09-27T15:00:00Z', '2021-09-27T19:00:00Z'), ('2021-09-27T12:00:00Z', '2021-09-27T15:00:00Z'), ('2021-09-27T07:00:00Z', '2021-09-27T12:00:00Z'), ('2021-09-26T19:00:00Z', '2021-09-27T07:00:00Z'), ('2021-09-26T12:00:00Z',

# Create Query Sample

In [9]:
with open(join(data_dir, f"partitions-{MAX_TWEETS_PER_QUERY}.json")) as f:
    partitions = json.load(f)

In [18]:
sum([len(part["ranges"]) for part in partitions]) * 3

3558

In [19]:
def sample_queries(partitions, random_state=19):
    random.seed(random_state)
    max_queries = MAX_TWEETS // MAX_TWEETS_PER_QUERY
    queries = []
    for part in deepcopy(partitions):
        i = 0

        # query each url range three times
        for r in part["ranges"]:
            for _ in range(3):
                if len(r["hour_ranges"]) == 0:
                    break
                
                k = random.randint(0, len(r["hour_ranges"]) - 1)
                time_range = r["hour_ranges"].pop(k)
                
                queries.append({
                    "accuracy": part["accuracy"],
                    "transparency": part["transparency"],
                    "urls": r["url_range"], 
                    "start": time_range[0], 
                    "end": time_range[1]
                })
                i += 1

        # add random ranges if available
        max_per_part = max_queries // len(partitions)
        while i < max_per_part and len(part["ranges"]) > 0:
            j = random.randint(0, len(part["ranges"]) - 1)
            r = part["ranges"][j]
            if len(r["hour_ranges"]) == 0:
                part["ranges"].pop(j)
                break

            k = random.randint(0, len(r["hour_ranges"]) - 1)
            time_range = r["hour_ranges"].pop(k)
            
            queries.append({
                "accuracy": part["accuracy"],
                "transparency": part["transparency"],
                "urls": r["url_range"], 
                "start": time_range[0], 
                "end": time_range[1]
            })
            i += 1
    
    return queries

queries = sample_queries(partitions)
print("#queries", len(queries))
print("#tweets", len(queries) * MAX_TWEETS_PER_QUERY)

#queries 4388
#tweets 438800


In [20]:
with open(join(data_dir, f"queries-{MAX_TWEETS_PER_QUERY}.json"), "w") as f:
    json.dump(queries, f)

# Double Check Number of Tweets

In [10]:
with open(join(data_dir, f"queries-{MAX_TWEETS_PER_QUERY}.json")) as f:
    queries = json.load(f)
total_counts_per_url = tweet_counts.groupby("url")["tweet_count"].sum().rename("total_count")

In [11]:
def form_counts_subsets(queries):
    subsets = []
    url_counts = tweet_counts.set_index("url")
    for query in tqdm(queries):
        subset = url_counts

        subset = subset.loc[query["urls"]]
        subset = subset[(subset["start"] >= query["start"]) & (subset["end"] <= query["end"])]
        
        subsets.append(subset.reset_index())

    return subsets

counts_subsets = form_counts_subsets(queries)

100%|██████████| 4388/4388 [1:02:09<00:00,  1.18it/s]


In [12]:
def count_tweets_per_query(queries, counts_subsets):
    counts = []
    for subset in tqdm(counts_subsets):
        counts.append(min(subset["tweet_count"].sum(), MAX_TWEETS_PER_QUERY))   # cap queries, which are larger than MAX_TWEETS_PER_QUERY
    
    query_df = pd.DataFrame(queries)
    query_df["type"] = query_df.apply(lambda row: "unreliable" if (row["accuracy"] <= 2 and row["transparency"] == 1) else "reliable", axis=1)
    query_df["tweet_count"] = counts
    return query_df

counts_per_query = count_tweets_per_query(queries, counts_subsets)

100%|██████████| 4388/4388 [00:00<00:00, 5861.60it/s]


In [13]:
for group in ["accuracy", "transparency", "type"]:
    total_agg_counts = tweet_counts.groupby(group)["tweet_count"].sum().rename("total_count")
    agg_counts = counts_per_query.groupby(group)["tweet_count"].sum()
    print(pd.concat([agg_counts, total_agg_counts, (agg_counts / total_agg_counts).rename("ratio")], axis=1))
    print()
print("total number of tweets", counts_per_query["tweet_count"].sum())

          tweet_count  total_count     ratio
accuracy                                    
1              282900     88046435  0.003213
2               23500      1306546  0.017986
3               23300      3581294  0.006506
4               46600      8963308  0.005199
5               62500      5175351  0.012076

              tweet_count  total_count     ratio
transparency                                    
1                  100200     15897080  0.006303
2                  141600     38010780  0.003725
3                  197000     53165074  0.003705

            tweet_count  total_count     ratio
type                                          
reliable         338600     91175854  0.003714
unreliable       100200     15897080  0.006303

total number of tweets 438800


In [14]:
assert counts_per_query["tweet_count"].sum() == (len(queries) * MAX_TWEETS_PER_QUERY)

In [15]:
def worst_case_url_distr(counts_subsets):
    counts = []
    for subset in tqdm(counts_subsets):
        subset = subset.sort_values(by=["end", "tweet_count"], ascending=False)
        
        count_cumsum = subset["tweet_count"].cumsum()
        i = count_cumsum[count_cumsum >= MAX_TWEETS_PER_QUERY].idxmin()
        counts.append(subset.loc[:i])  # .loc has inclusive bounds
    
    return pd.concat(counts).groupby("url")["tweet_count"].sum()

worst_case_per_url = worst_case_url_distr(counts_subsets)

100%|██████████| 4388/4388 [00:14<00:00, 304.27it/s]


In [16]:
ratio = pd.concat([worst_case_per_url, total_counts_per_url, (worst_case_per_url / total_counts_per_url).fillna(1).rename("ratio")], axis=1)
ratio["tweet_count"].fillna(0, inplace=True)
ratio.query("total_count > 0 & tweet_count == 0").sort_values(by="total_count", ascending=False).to_records().tolist()

[('health.com', 0.0, 4889, 1.0),
 ('panorama.it', 0.0, 2043, 0.0),
 ('thetruthaboutcancer.com', 0.0, 1902, 0.0),
 ('thechronicleherald.ca', 0.0, 1398, 0.0),
 ('infiniteunknown.net', 0.0, 1270, 0.0),
 ('yomiuri.co.jp', 0.0, 928, 0.0),
 ('conservativeintel.com', 0.0, 825, 0.0),
 ('parismatch.com', 0.0, 672, 0.0),
 ('ourweekly.com', 0.0, 664, 0.0),
 ('vosizneias.com', 0.0, 634, 0.0),
 ('rfdtv.com', 0.0, 587, 0.0),
 ('patriotfetch.com', 0.0, 578, 0.0),
 ('officer.com', 0.0, 529, 1.0),
 ('wschronicle.com', 0.0, 515, 0.0),
 ('latribune.fr', 0.0, 473, 0.0),
 ('decisiondeskhq.com', 0.0, 391, 0.0),
 ('gazettes.com', 0.0, 369, 0.0),
 ('watchtheyard.com', 0.0, 358, 0.0),
 ('christiannews.net', 0.0, 344, 0.0),
 ('world.wng.org', 0.0, 324, 0.0),
 ('irresponsabile.com', 0.0, 297, 0.0),
 ('orangeleader.com', 0.0, 286, 0.0),
 ('homernews.com', 0.0, 266, 0.0),
 ('thetimesweekly.com', 0.0, 265, 0.0),
 ('albertapressleader.ca', 0.0, 265, 0.0),
 ('journalistenwatch.com', 0.0, 244, 0.0),
 ('kenfm.de', 0.0,

In [17]:
(ratio["tweet_count"] > 0).sum(), (ratio["total_count"] > 0).sum(), ratio.shape[0], (ratio["tweet_count"] > 0).sum() / (ratio["total_count"] > 0).sum()

(5608, 6002, 7924, 0.9343552149283573)

In [18]:
tmp = tweet_counts.groupby("url")["tweet_count"].sum().sort_values(ascending=False).reset_index()
tmp["q"] = pd.qcut(tmp.index, q=10, labels=False)
tmp = tmp.merge(worst_case_per_url, on="url", how="left")
tmp[">0_all"] = tmp["tweet_count_x"] > 0
tmp[">0_counted"] = tmp["tweet_count_y"] > 0
tmp.groupby("q")[[">0_counted", ">0_all"]].sum()

Unnamed: 0_level_0,>0_counted,>0_all
q,Unnamed: 1_level_1,Unnamed: 2_level_1
0,793,793
1,792,792
2,791,792
3,787,793
4,755,792
5,720,792
6,641,793
7,329,455
8,0,0
9,0,0
