# Drudge entities analysis

By Ben Welsh

A draft analysis of the top words in headlines from the Drudge Report

## Import

Python tools

In [108]:
import typing
import pandas as pd
import altair as alt
from collections import Counter

Formatting

In [2]:
from rich import print
from rich.progress import track

Natural language processing

In [3]:
import spacy

In [4]:
# !pipenv run python -m spacy download en_core_web_lg

In [5]:
# Load the spaCy pipeline named en_core_web_lg; it has to be installed first
# (the commented-out download command above shows how).
nlp = spacy.load('en_core_web_lg')

## Extract

Read in data

In [6]:
# Load the hyperlinks extract, parsing the earliest_date column as datetimes.
link_df = pd.read_csv(
    "../extracts/csv/us-right-wing-hyperlinks-analysis.csv",
    parse_dates=["earliest_date"]
)

In [7]:
link_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24474 entries, 0 to 24473
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   handle         24474 non-null  object        
 1   text           23215 non-null  object        
 2   url            24474 non-null  object        
 3   earliest_date  24474 non-null  datetime64[ns]
 4   is_story       24474 non-null  bool          
 5   domain         24474 non-null  object        
dtypes: bool(1), datetime64[ns](1), object(4)
memory usage: 980.0+ KB


In [8]:
link_df.earliest_date.min()

Timestamp('2022-11-04 00:00:00')

In [9]:
link_df.earliest_date.max()

Timestamp('2022-11-10 00:00:00')

In [10]:
link_df.head()

Unnamed: 0,handle,text,url,earliest_date,is_story,domain
0,DailyCaller,,/2022/11/10/calhoun-georgia-police-say-6-year-...,2022-11-10,False,.
1,DailyCaller,,/2022/11/10/hochul-breed-new-york-san-francisc...,2022-11-10,False,.
2,DailyCaller,,/2022/11/10/kaushik-ignoring-science-and-techn...,2022-11-10,False,.
3,DailyCaller,,/2022/11/10/midterms-tv-views-downward-trend/,2022-11-10,False,.
4,DailyCaller,,/2022/11/10/mortgage-payments-pummel-homeowners/,2022-11-10,False,.


## Transform

Filter down to stories

In [58]:
# Keep rows flagged as stories that have link text, one row per handle/text pair.
story_df = (
    link_df[link_df.is_story & link_df.text.notna()]
    .drop_duplicates(subset=["handle", "text"], keep="first")
    .copy()
)

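Cut `...` out of the link text, then drop rows that aren't real headlines: bare numbers, gallery and photo promos, comment and response counts, and Revcontent ad links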

In [59]:
# Remove every run of two or more periods from the link text.
story_df["text"] = story_df["text"].str.replace(r"\.{2,}", "", regex=True)

In [60]:
# Drop rows whose text is nothing but digits.
story_df = story_df.loc[~story_df["text"].str.contains(r"^\d+$", regex=True)]

In [61]:
# Drop rows whose text is digits, a comma, then more digits (e.g. "1,234").
story_df = story_df.loc[~story_df["text"].str.contains(r"^\d+,\d+$", regex=True)]

In [62]:
# Drop rows whose text starts with "[Gallery]".
story_df = story_df.loc[~story_df["text"].str.startswith("[Gallery]")]

In [63]:
# Drop rows whose text starts with "[Photos]".
story_df = story_df.loc[~story_df["text"].str.startswith("[Photos]")]

In [64]:
# Drop rows whose lowercased text contains a comment count such as "12 comments".
# The pattern is a raw string so that \d reaches the regex engine instead of
# being treated as an (invalid) Python string escape.
story_df = story_df[~story_df.text.str.lower().str.contains(r"\d+ comments")]

In [65]:
# Drop rows whose lowercased text contains a response count such as "3 responses".
# The pattern is a raw string so that \d reaches the regex engine instead of
# being treated as an (invalid) Python string escape.
story_df = story_df[~story_df.text.str.lower().str.contains(r"\d+ responses")]

In [76]:
# Drop rows whose lowercased text starts with "comments".
story_df = story_df.loc[~story_df["text"].str.lower().str.startswith("comments")]

In [77]:
# Drop rows that link to Revcontent. regex=False matches the URL literally;
# as a regular expression its dots would match any character.
story_df = story_df[~story_df.url.str.contains("https://trends.revcontent.com", regex=False)]

In [78]:
story_df.earliest_date.value_counts()

2022-11-08    1199
2022-11-04    1146
2022-11-09     877
2022-11-10     759
2022-11-07     642
2022-11-06     438
2022-11-05     404
Name: earliest_date, dtype: int64

In [82]:
# Trim leading and trailing whitespace from the URLs.
story_df["url"] = story_df["url"].str.strip()

In [83]:
# Keep only the rows first seen on or after 2022-11-08.
today_df = story_df.loc[story_df["earliest_date"] >= '2022-11-08']

In [113]:
# Write today_df, sorted by text, to a CSV without the index column.
(
    today_df
    .sort_values("text")
    .to_csv("us-right-wing-election-headlines.csv", index=False)
)

Extract all unique headlines

In [85]:
# Alphabetized list of the distinct headline strings in today_df.
headline_list = sorted(today_df["text"].unique())

## Analyze

Pull out all of the meaningful words

In [86]:
def get_lemma(headline: str) -> typing.List[typing.Dict[str, str]]:
    """Parse all of the words we want to keep in the headline.

    Args:
        headline: The headline text to run through the spaCy pipeline.

    Returns:
        A list with one dict per kept token. Each dict has the keys
        `headline` (the input string), `word` (the uppercased token text),
        `lemma` (the uppercased lemma) and `part_of_speech` (the token's
        `pos_` tag). The list is empty when no token survives the filters.
    """
    # Read it into our NLP pipeline
    doc = nlp(headline)

    # Keep a token only if it is not a stop word, punctuation or a digit
    token_list = [
        t for t in doc
        if not (t.is_stop or t.is_punct or t.is_digit)
    ]

    # Trim it down to only the stuff we want to keep
    dict_list = [
        dict(
            headline=headline,
            word=t.text.upper(),
            lemma=t.lemma_.upper(),
            part_of_speech=t.pos_,
        )
        for t in token_list
    ]

    # Pass it back
    return dict_list

In [87]:
# Collect the kept-token records for every headline, with a progress bar.
word_list = []
for headline in track(headline_list):
    word_list.extend(get_lemma(headline))

Output()

In [88]:
# One row per kept token, with the columns produced by get_lemma.
word_df = pd.DataFrame(word_list)

In [89]:
word_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20975 entries, 0 to 20974
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   headline        20975 non-null  object
 1   word            20975 non-null  object
 2   lemma           20975 non-null  object
 3   part_of_speech  20975 non-null  object
dtypes: object(4)
memory usage: 655.6+ KB


In [90]:
word_df.head()

Unnamed: 0,headline,word,lemma,part_of_speech
0,"""Car Vending Machine"" Company Carvana's Stock ...",CAR,CAR,NOUN
1,"""Car Vending Machine"" Company Carvana's Stock ...",VENDING,VEND,VERB
2,"""Car Vending Machine"" Company Carvana's Stock ...",MACHINE,MACHINE,NOUN
3,"""Car Vending Machine"" Company Carvana's Stock ...",COMPANY,COMPANY,PROPN
4,"""Car Vending Machine"" Company Carvana's Stock ...",CARVANA,CARVANA,PROPN


Remove our extra stop words, as well as all symbols and verbs

In [91]:
# Extra lemmas to exclude on top of spaCy's stop words.
stop_list = ["COMMENTS", "COMMENT"]

In [92]:
# Exclude a row if it is tagged as a symbol or verb, or if its lemma is in our stop list.
qualified_df = word_df[
    ~(
        word_df.part_of_speech.isin(["SYM", "VERB"])
        | word_df.lemma.isin(stop_list)
    )
]

Calculate the 50 most common words

In [100]:
# Count the rows for each lemma and keep the 50 largest counts.
top_words = (
    qualified_df
    .groupby("lemma")
    .size()
    .reset_index(name="n")
    .sort_values("n", ascending=False)
    .head(50)
)

Get the top verb used with each word

In [101]:
def get_headlines(lemma: str) -> typing.List:
    """Return the sorted, distinct headlines in qualified_df that have a row for the lemma."""
    matches = qualified_df.loc[qualified_df["lemma"] == lemma, "headline"]
    return sorted(matches.unique())

In [102]:
def get_top_verb(lemma: str) -> typing.Optional[str]:
    """Get the top verb in the provided lemma's headline set.

    Args:
        lemma: The uppercased lemma whose headlines should be searched.

    Returns:
        The uppercased verb lemma that appears most often across the headlines
        returned by `get_headlines(lemma)`, once the stop verbs are removed.
        Returns None when no verb is left to count, instead of raising an
        IndexError.
    """
    # Set our stop words for the verbs
    # NOTE(review): entries such as LULA and ELON are presumably names the
    # tagger labels as verbs - confirm before pruning this list.
    stop_verbs = {"SAYS", "HAS", "GETS", "GET", "LULA", "ELON", "SAY", "HAVE"}
    if lemma == "COVID":
        stop_verbs.add("TESTS")
    if lemma == "MUSK":
        stop_verbs.add("SOCIAL")

    # Pull the headlines
    headline_list = get_headlines(lemma)

    # Loop through all of the headlines
    master_list = []
    for headline in headline_list:
        # Parse the headline again with NLP
        doc = nlp(headline)

        # Pull out the verbs, skipping the stop words
        master_list += [
            t.lemma_.upper()
            for t in doc
            if t.pos_ == "VERB" and t.lemma_.upper() not in stop_verbs
        ]

    # Count the verbs
    verb_counter = Counter(master_list)

    # Nothing to report if every headline was verbless or fully filtered
    if not verb_counter:
        return None

    # Pull the most common one
    top_verb, _count = verb_counter.most_common(1)[0]

    # Return the result
    return top_verb

In [103]:
# Look up the most common verb for each of the top lemmas.
top_words["top_verb"] = top_words["lemma"].apply(get_top_verb)

In [104]:
top_words.head(50)

Unnamed: 0,lemma,n,top_verb
1531,ELECTION,242,WIN
528,BIDEN,175,RUN
3022,MIDTERM,156,GO
4804,TRUMP,155,WIN
3780,RACE,141,WIN
1993,GOP,129,HOLD
3228,NEW,120,WIN
4189,SENATE,111,WIN
3927,REPUBLICAN,108,WIN
2223,HOUSE,103,WIN


Get the timeseries for our top words

In [30]:
# First and last dates present in the story data; used to build the full daily range.
min_date = story_df["earliest_date"].min()
max_date = story_df["earliest_date"].max()

In [31]:
def get_timeseries(lemma: str) -> typing.List:
    """Pull the day to day timeseries for the provided word.

    Args:
        lemma: The uppercased lemma to count.

    Returns:
        A list with one dict per calendar day from `min_date` through
        `max_date`. Each dict has the keys `date` (a YYYY-MM-DD string), `n`
        (the number of merged rows for that day, 0 when there are none) and
        `7_day_rolling_average` (the mean of `n` over a seven-row window,
        which is NaN until seven days are available).
    """
    # Count the top words by day
    daily_df = (
        qualified_df[qualified_df.lemma == lemma]
        .merge(
            story_df[["earliest_date", "text"]].rename(columns={"text": "headline"}),
            on="headline",
        )
        .groupby("earliest_date")
        .size()
        .rename("n")
        .reset_index()
        .rename(columns={"earliest_date": "date"})
        .set_index("date")
    )

    # Fill in days we're missing
    date_index = pd.date_range(min_date, max_date, freq="D")
    backfilled_df = daily_df.reindex(date_index)
    # Assign the filled column back to the frame. Calling
    # fillna(inplace=True) on `backfilled_df.n` is a chained in-place update,
    # which newer pandas warns about and which does not modify the frame
    # when copy-on-write is enabled.
    backfilled_df["n"] = backfilled_df["n"].fillna(0)

    # Calculate the 7-day rolling average
    backfilled_df["7_day_rolling_average"] = backfilled_df["n"].rolling(7).mean()

    # Convert it to a dict list; the reindexed index is unnamed, so
    # reset_index() calls the column "index"
    dict_list = (
        backfilled_df
        .reset_index()
        .rename(columns={"index": "date"})
        .to_dict(orient="records")
    )

    # Convert our dates to strings
    for d in dict_list:
        d["date"] = d["date"].strftime("%Y-%m-%d")

    # Pass it out
    return dict_list

In [32]:
# Attach the daily timeseries records for each of the top lemmas.
top_words["timeseries"] = top_words["lemma"].apply(get_timeseries)

In [33]:
top_words.head()

Unnamed: 0,lemma,n,top_verb,timeseries
7449,TRUMP,151,TAKE,"[{'date': '2022-08-09', 'n': 5.0, '7_day_rolli..."
750,BIDEN,104,WANT,"[{'date': '2022-08-09', 'n': 3.0, '7_day_rolli..."
2307,ELECTION,100,VOTE,"[{'date': '2022-08-09', 'n': 0.0, '7_day_rolli..."
7806,WAR,87,GROW,"[{'date': '2022-08-09', 'n': 3.0, '7_day_rolli..."
5547,PUTIN,74,BLOW,"[{'date': '2022-08-09', 'n': 1.0, '7_day_rolli..."


## Validation

Spot-check any words we're curious about

In [120]:
get_headlines("FETTERMAN")

["'Illegal Electioneering' for PA Dems Fetterman, Shapiro Caught on Camera: Project Veritas",
 'Bernie Sanders-Backed John Fetterman Prevails in Pennsylvania',
 'Birds of a Feather: Fetterman Touts Endorsement from Ex-Cop Sued for Accosting Innocent Black Man',
 'Democrat Election Meddler Joins Fetterman Lawsuit to Count Illegal Ballots',
 'Desperate Fetterman Turns to Russia Hoaxer Marc Elias to Force Pennsylvania to Count Undated Mail-in Ballots',
 'Dr. Oz Concedes PA Senate Race to Fetterman',
 'Dr. Oz concedes to Fetterman',
 'FETTERMAN BEATS OZ!',
 'Fetterman 2024!',
 'Fetterman Campaign Hires Lawyer Marc Elias',
 'Fetterman Campaign Reasoning for Counting Illegal Ballots Is Pure Comedy',
 'Fetterman Picks Up Senate Seat for Democrats',
 'Fetterman Projected Winner of US Senate Race',
 'Fetterman Taps Dirty Tricks Dem Lawyer in 11th-Hour Mail-In Ballot Bid',
 'Fetterman beats Oz in Pennsylvania, turning over Senate seat to Democrats',
 'Fetterman for POTUS? NBC’s Tur Says Don’t Co

In [106]:
len(get_headlines("ARIZONA"))

39

In [112]:
# Bar chart of how many story_df rows per day have text containing "ARIZONA"
# (the text is uppercased first, so the match ignores case).
alt.Chart(story_df[story_df.text.str.upper().str.contains("ARIZONA")]).mark_bar().encode(
    x=alt.X("earliest_date:T", timeUnit="yearmonthdate"),
    y=alt.Y("count():Q"),
    tooltip=["earliest_date", "count()"]
)