# Analyze Drudge hyperlinks

In [1]:
import re
import sys
from pathlib import Path
from urllib.parse import urlparse

In [2]:
import tldextract
import storysniffer

In [3]:
import numpy as np
import pandas as pd
import altair as alt

In [4]:
import spacy
from spacy import displacy

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
nlp = spacy.load("en_core_web_sm")

In [6]:
this_dir = Path("__file__").parent.absolute()
sys.path.append(this_dir.parent)
sys.path.append(str(this_dir.parent / "newshomepages"))

In [7]:
import altair_theme

In [8]:
alt.themes.register('palewire', altair_theme.theme)
alt.themes.enable('palewire')

ThemeRegistry.enable('palewire')

In [9]:
extracts_dir = this_dir.parent / "extracts" / "csv"

In [10]:
analysis_dir = this_dir.parent / "_analysis"

Read in the sample data

In [11]:
df = pd.read_csv(
    extracts_dir / "drudge-hyperlinks-sample.csv",
    usecols=[
        'handle',
        'file_name',
        'date',
        'text',
        'url',
    ],
    dtype=str,
    parse_dates=["date"]
)

Guess links with `storysniffer`

In [12]:
links_df = df.groupby(["text", "url"]).agg({
    "handle": "size",
    "date": "min"
}).rename(columns={"handle": "n", "date": "earlier_date"}).reset_index()

In [14]:
sniffer = storysniffer.StorySniffer()

In [15]:
links_df['is_story'] = links_df.apply(lambda x: sniffer.guess(x['url'], text=x['text']), axis=1)

In [16]:
links_df.is_story.value_counts()

True     6835
False     259
Name: is_story, dtype: int64

In [17]:
links_df.is_story.value_counts(normalize=True)

True     0.96349
False    0.03651
Name: is_story, dtype: float64

In [18]:
links_df.to_csv("./drudge-hyperlinks-storysniffer-guesses.csv", index=False)

Make some manual fixes

In [19]:
blacklist = [
    "/privacy/",
]

In [20]:
links_df.loc[
    links_df.url.isin(blacklist),
    'is_story'
] = False

In [21]:
correction_list = [
    "\.(substack|theankler|commonsense|thedispatch).(com|news)/p/",
    "^https://time.com/\d{5,}/*",
    "^https://*.studyfinds.org/*.{5,}",
    "^https://*.bbc.com/news/*.{5,}",
    "^https://www.jpost.com/breaking-news/*.{5,}",
    "^https://www.jpost.com/[a-z]{5,}/*.{5,}",
    "^https://*.braintomorrow.com/*.{5,}"
    "^https://finance.yahoo.com/news/*.{5,}",
    "^https://www.vice.com/en/article/*.{5,}",
    "^https://news.yahoo.com/*.{5,}",
]

In [22]:
for c in correction_list:
    links_df.loc[links_df.url.str.contains(c, regex=True), 'is_story'] = True

  links_df.loc[links_df.url.str.contains(c, regex=True), 'is_story'] = True


Knock out anything that appears most of the time

In [23]:
n = len(df.file_name.unique())

In [24]:
too_much = links_df.n >= n * .5

In [25]:
links_df.loc[too_much, 'is_story'] = False

Knock out anything with a bad URL

In [26]:
links_df.loc[~links_df.url.str.startswith("http"), 'is_story'] = False

In [27]:
links_df[
    (links_df.n < 40) &
    (~links_df.is_story)
].sort_values("url").head(5)

Unnamed: 0,text,url,n,earlier_date,is_story
4452,Orlando to become 'physical center of metavers...,https://www.the-sun.com/tech/5661470/orlando-...,4,2022-06-29,False
4453,Orlando to become 'physical center of the meta...,https://www.the-sun.com/tech/5661470/orlando-...,1,2022-06-29,False
6169,Tiger dies after contracting at Ohio zoo...,/https://www.cbsnews.com/news/tiger-dies-after...,1,2022-07-01,False
4703,Poll numbers reveal trouble for Biden beyond t...,How bad things are for Biden,1,2022-06-10,False
3446,LIVE HEAT MAP...,http://hp2.wright-weather.com/icons/us_heat.gif,8,2022-06-12,False


In [28]:
links_df.to_csv("./drudge-hyperlinks-storysniffer-tweaks.csv", index=False)

In [29]:
story_df = links_df[links_df.is_story].copy()

In [30]:
story_df['domain'] = story_df.url.apply(lambda x: f"{tldextract.extract(x).domain}.{tldextract.extract(x).suffix}")

Tally domains

In [31]:
domain_df = story_df.groupby(["domain"]).size().rename("n").reset_index().sort_values("n", ascending=False)

In [32]:
domain_df['percent'] = (domain_df.n / domain_df.n.sum()) * 100

In [33]:
domain_df.head(10)

Unnamed: 0,domain,n,percent
182,msn.com,806,11.771579
356,yahoo.com,589,8.602308
17,apnews.com,507,7.404703
352,wsj.com,372,5.433036
84,dnyuz.com,310,4.52753
73,dailymail.co.uk,297,4.337666
66,cnn.com,218,3.183876
65,cnbc.com,216,3.154666
284,the-sun.com,211,3.081642
212,nypost.com,207,3.023222


In [34]:
def is_trump(row):
    token_list = [t.lower() for t in row['text'].split()]
    if 'trump' in token_list:
        return True
    elif 'donald' in token_list:
        return True
    elif 'don' in token_list:
        return True
    else:
        if 'trump' in row['url']:
            return True
    return False

In [35]:
story_df['is_trump'] = story_df.apply(is_trump, axis=1)

In [36]:
story_df.is_trump.value_counts()

False    6197
True      650
Name: is_trump, dtype: int64

In [37]:
trump_df = story_df[story_df.is_trump].copy()

In [38]:
trump_df['doc'] = trump_df.text.apply(nlp)

In [39]:
trump_df.iloc[10].doc

'Not above the law'...

In [40]:
trump_df.iloc[50].doc

Alabama Senate race tests ex-President's sway after endorsement flip-flop...

In [41]:
trump_df.text.unique()

array([' Committee Kicks Off Capitol Attack Public Probe...',
       " Documentary:  The Don called the rioters 'smart'... ",
       " Hearing Delivers Political Blow That His Rivals Couldn't Land...",
       ' More White House aides will testify at next hearing...',
       " What's next for other criminal cases...",
       "'CATACLYSMIC THREATS'", "'FOX Would Die Without Me'... ",
       "'I will break you in half. Don't f*ck with me'...",
       "'NOT HERE TO HURT ME'", "'Not above law'...",
       "'Not above the law'...", "'Not theirs, it's mine'...\n",
       "'Ready for Ron' means ready for 'war'...",
       "'SOUTH PARK' creators details Trump movie they couldn't get made...",
       "'Substantial Criminal Case'...", "'TOP SECRET'",
       "'TRUTH SOCIAL' Banning Users Who Post...",
       "'TRUTH' FACING FINANCIAL FALLOUT...", "'TRUTH' HITS TROUBLE...",
       "'TRUTH' SOCIAL FACING FINANCIAL FALLOUT...",
       "'TRUTH' Shadow Banning Users Who Post About Attack...",
       "'

In [42]:
nlp('When Trump Declares He Loses Control of $103 Million...').ents

($103 Million,)

In [43]:
nlp('Trump looms large over Biden decision...').ents

(Biden,)

In [45]:
[(e.text, e.label_, e.kb_id_) for e in nlp("YOUTUBE Deletes Committee Video for Spreading 'Election Misinformation'...").ents]

[('YOUTUBE Deletes Committee Video', 'ORG', '')]