# Drudge entities analysis

By Ben Welsh

A draft analysis of the top words in headlines from the Drudge Report

## Import

Python tools

In [1]:
import typing
import pandas as pd
from collections import Counter

Formatting

In [2]:
from rich import print
from rich.progress import track

Natural language processing

In [3]:
import spacy

In [7]:
# !pipenv run python -m spacy download en_core_web_lg

In [8]:
nlp = spacy.load('en_core_web_lg')

## Extract

Read in data

In [9]:
drudge_df = pd.read_csv(
    "https://github.com/palewire/news-homepages/raw/main/extracts/csv/drudge-hyperlinks-analysis.csv",
    parse_dates=["earliest_date"]
)

In [10]:
drudge_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6013 entries, 0 to 6012
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   text           6013 non-null   object        
 1   url            6013 non-null   object        
 2   earliest_date  6013 non-null   datetime64[ns]
 3   is_story       6013 non-null   bool          
 4   domain         6013 non-null   object        
dtypes: bool(1), datetime64[ns](1), object(3)
memory usage: 193.9+ KB


In [11]:
drudge_df.earliest_date.min()

Timestamp('2022-08-09 00:00:00')

In [12]:
drudge_df.earliest_date.max()

Timestamp('2022-11-06 00:00:00')

In [13]:
drudge_df.head()

Unnamed: 0,text,url,earliest_date,is_story,domain
0,CALIFORNIA NOTICE,/privacy/california/,2022-08-09,False,.
1,DO NOT SELL MY INFO,/privacy/opt-out/,2022-08-09,False,.
2,PRIVACY POLICY,/privacy/,2022-08-09,False,.
3,QUAKE SHEET,/quake.htm,2022-08-09,False,.
4,WEATHER ACTION,/wx.htm,2022-08-09,False,.


## Transform

Filter down to stories

In [14]:
story_df = drudge_df[drudge_df.is_story].copy()

Cut `...`

In [15]:
story_df.text = story_df.text.str.replace(r"\.{2,}", "", regex=True)

Uppercase everything

In [16]:
story_df.text = story_df.text.str.upper()

Extract all unique headlines

In [17]:
headline_list = sorted(list(story_df.text.unique()))

## Analyze

Pull out all of the meaningful words

In [18]:
def get_lemma(headline: str) -> typing.Dict:
    """Parse all of the words we want to keep in the headline."""
    # Read it into our NPL thing
    doc = nlp(headline)
    
    # Parse out all the words
    token_list = [token for token in doc]

    # Remove stop words
    token_list = [t for t in token_list if not t.is_stop]

    # Remove punctuation words
    token_list = [t for t in token_list if not t.is_punct]

    # Remove digits
    token_list = [t for t in token_list if not t.is_digit]

    # Trim it down to only the stuff we want to keep
    dict_list = [dict(
        headline=headline,
        word=t.text.upper(),
        lemma=t.lemma_.upper(),
        part_of_speech=t.pos_,
    ) for t in token_list]
    
    # Pass it back
    return dict_list

In [19]:
word_list = []
for headline in track(headline_list):
    word_list += get_lemma(headline)

Output()

In [20]:
word_df = pd.DataFrame(word_list)

In [21]:
word_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33149 entries, 0 to 33148
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   headline        33149 non-null  object
 1   word            33149 non-null  object
 2   lemma           33149 non-null  object
 3   part_of_speech  33149 non-null  object
dtypes: object(4)
memory usage: 1.0+ MB


In [22]:
word_df.head()

Unnamed: 0,headline,word,lemma,part_of_speech
0,#METOO 5 YEARS LATER: NO ONE'S FULLY RETURNED ...,METOO,METOO,PROPN
1,#METOO 5 YEARS LATER: NO ONE'S FULLY RETURNED ...,YEARS,YEAR,NOUN
2,#METOO 5 YEARS LATER: NO ONE'S FULLY RETURNED ...,LATER,LATER,NOUN
3,#METOO 5 YEARS LATER: NO ONE'S FULLY RETURNED ...,FULLY,FULLY,ADJ
4,#METOO 5 YEARS LATER: NO ONE'S FULLY RETURNED ...,RETURNED,RETURNED,PROPN


Remove our extra stop words, as well as all symbols and verbs

In [23]:
stop_list = [
    "NEW",
    "MAN",
    "WOMAN",
    "YEAR",
    "DAY",
    "MILLION",
    "HIGH",
    "BIG",
    "RECORD",
    "HOME",
    "WORLD",
    "STATE",
    "TIME",
    "CASE",
    "LIFE",
    "AMERICAN",
    "INSIDE",
    "EX",
    "MAR",
    "HIT",
    "LAGO",
    "RISE",
    "AMID",
    "WARNS",
    "RATE",
    "SHOW",
    "ATTACK",
    "RISE",
    "DEAD",
    "SET",
]

In [24]:
qualified_df = word_df[
    (~word_df.part_of_speech.isin(["SYM", "VERB"])) &
    (~word_df.lemma.isin(stop_list))
]

Calculate the 25 most common words

In [25]:
top_words = (
    qualified_df.groupby("lemma")
        .size()
        .rename("n")
        .reset_index()
        .sort_values("n", ascending=False)
        .head(25)
)

Get the top verb used with each word

In [26]:
def get_headlines(lemma: str) -> typing.List:
    """Get all the headlines for the provided word."""
    return sorted(list(qualified_df[qualified_df.lemma == lemma].headline.unique()))

In [27]:
def get_top_verb(lemma: str) -> str:
    """Get the top verb in the provided lemma's headline set."""
    # Set our stop words for the verbs
    stop_verbs = ["SAYS", "HAS", "GETS", "GET", "LULA", "ELON", "SAY", "HAVE",]
    if lemma == "COVID":
        stop_verbs += ["TESTS"]
    if lemma == "MUSK":
        stop_verbs += ["SOCIAL"]

    # Pull the headlines
    headline_list = get_headlines(lemma)

    # Loop through all of the headlines
    master_list = []    
    for headline in headline_list:
        # Parse the headline again with NLP
        doc = nlp(headline)
        
        # Pull out the verbs
        verb_list = [t.lemma_.upper() for t in doc if t.pos_ == "VERB"]
        
        # Cut the stop words
        verb_list = [v for v in verb_list if v not in stop_verbs]
        
        # Add it to our master list
        master_list += verb_list
    
    # Count the verbs
    verb_counter = Counter(master_list)
    
    # Pull the most common one
    top_verb = verb_counter.most_common(2)
    
    # Return the result
    return top_verb[0][0]

In [28]:
top_words['top_verb'] = top_words.lemma.apply(get_top_verb)

In [29]:
top_words.head(25)

Unnamed: 0,lemma,n,top_verb
7449,TRUMP,151,TAKE
750,BIDEN,104,WANT
2307,ELECTION,100,VOTE
7806,WAR,87,GROW
5547,PUTIN,74,BLOW
1646,COVID,68,TEST
2146,DON,68,ASK
7634,USA,67,CONSIDER
110,ABORTION,65,BAN
6049,RUSSIA,64,FIGHT


Get the timeseries for our top words

In [30]:
min_date, max_date = story_df.earliest_date.min(), story_df.earliest_date.max()

In [31]:
def get_timeseries(lemma: str) -> typing.List:
    """Pull the day to day timeseries for the provided word."""
    # Count the top words by day
    df = (
        qualified_df[qualified_df.lemma == lemma]
            .merge(story_df[['earliest_date', 'text']].rename(columns={"text": "headline"}), on="headline")
            .groupby("earliest_date")
            .size()
            .rename("n")
            .reset_index()
            .rename(columns={"earliest_date": "date"})
            .set_index("date")
    )
    
    # Fill in days we're missing
    date_range = pd.date_range(
        min_date,
        max_date,
        freq="D",
    )
    date_index = pd.DatetimeIndex(date_range)
    backfilled_df = df.reindex(date_index)
    backfilled_df.n.fillna(0, inplace=True)
    
    # Calculate the 7-day rolling average
    backfilled_df['7_day_rolling_average'] = backfilled_df.n.rolling(7).mean()

    # Convert it to a dict list
    dict_list = backfilled_df.reset_index().rename(columns={"index": "date"}).to_dict(orient="records")
    
    # Convert our dates to strings
    for d in dict_list:
        d['date'] = d['date'].strftime("%Y-%m-%d")
    
    # Pass it out
    return dict_list

In [32]:
top_words['timeseries'] = top_words.lemma.apply(get_timeseries)

In [33]:
top_words.head()

Unnamed: 0,lemma,n,top_verb,timeseries
7449,TRUMP,151,TAKE,"[{'date': '2022-08-09', 'n': 5.0, '7_day_rolli..."
750,BIDEN,104,WANT,"[{'date': '2022-08-09', 'n': 3.0, '7_day_rolli..."
2307,ELECTION,100,VOTE,"[{'date': '2022-08-09', 'n': 0.0, '7_day_rolli..."
7806,WAR,87,GROW,"[{'date': '2022-08-09', 'n': 3.0, '7_day_rolli..."
5547,PUTIN,74,BLOW,"[{'date': '2022-08-09', 'n': 1.0, '7_day_rolli..."


## Validation

Proof any words we're curious about

In [34]:
get_headlines("PUTIN")

["'MUSK TRANSMITTING MESSAGE FOR PUTIN'",
 "ANOTHER PUTIN CRONY DIES AFTER 'FALLING FROM BOAT'",
 'BIDEN WARNS PUTIN AGAINST USING NUCLEAR OR CHEMICAL WEAPONS',
 'CAR-BOMB KILLING SOWS UNEASE AMONG PUTIN CHEERLEADERS',
 "DAUGHTER OF 'PUTIN'S BRAIN' KILLED IN CAR BOMB",
 "DESPERATE PUTIN'S DOUBLE TROUBLE",
 "DID PUTIN'S FROGMEN BLOW UP EUROPE'S GAS SUPPLIES?",
 "HOW PUTIN PUSHING ARMY BOSSES THROUGH 'MEAT GRINDER' OF DEATH",
 'IN DC, PUTIN NUKE THREATS STIR GROWING ALARM',
 'LEAKED SPY DOCS CLAIM PUTIN TAKING SECRET COCKTAIL OF DRUGS',
 "LEAKED SPY DOCS SUGGEST PUTIN DOES HAVE PARKINSON'S, CANCER",
 "MEET PUTIN'S INNER CIRCLE OF EVIL",
 'MUSK APPEASEMENT OF PUTIN AND CHINA STOKES FEARS OF NEW TWITTER POLICIES',
 'MUSK DENIES HE TALKED TO PUTIN AHEAD OF CONTROVERSIAL TWEET',
 "ODESA DEFIANT. IT'S ALSO PUTIN'S ULTIMATE TARGET",
 'POLAND ASKS USA TO HOST NUKES AMID GROWING PUTIN FEARS',
 "PUTIN 'HAS GIVEN ORDER TO DEPLOY NUKES,' CLAIMS KREMLIN INSIDER",
 "PUTIN 'PLANS TO BLOW UP MAJOR DAM'