In [1]:
import re
import sys
from pathlib import Path
from urllib.parse import urlparse

In [2]:
import tldextract
import storysniffer

In [3]:
import numpy as np
import pandas as pd
import altair as alt

In [4]:
this_dir = Path("__file__").parent.absolute()
sys.path.append(this_dir.parent)
sys.path.append(str(this_dir.parent / "newshomepages"))

In [5]:
extracts_dir = this_dir.parent / "extracts" / "csv"

In [26]:
df = pd.read_csv(
    extracts_dir / "us-right-wing-hyperlinks-sample.csv",
    usecols=[
        'handle',
        'file_name',
        'date',
        'text',
        'url',
    ],
    dtype=str,
    parse_dates=["date"]
)

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378740 entries, 0 to 378739
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   handle     378740 non-null  object        
 1   file_name  378740 non-null  object        
 2   date       378740 non-null  datetime64[ns]
 3   text       343040 non-null  object        
 4   url        378060 non-null  object        
dtypes: datetime64[ns](1), object(4)
memory usage: 14.4+ MB


In [28]:
df.head()

Unnamed: 0,handle,file_name,date,text,url
0,BonginoReport,bonginoreport-2022-11-02T06:42:46.916347-04:00...,2022-11-02,\n\n,/
1,BonginoReport,bonginoreport-2022-11-02T06:42:46.916347-04:00...,2022-11-02,,https://thepostmillennial.com/breaking-justice...
2,BonginoReport,bonginoreport-2022-11-02T06:42:46.916347-04:00...,2022-11-02,Justice Roberts Blocks Democrat House Committe...,https://thepostmillennial.com/breaking-justice...
3,BonginoReport,bonginoreport-2022-11-02T06:42:46.916347-04:00...,2022-11-02,Read more,https://thepostmillennial.com/breaking-justice...
4,BonginoReport,bonginoreport-2022-11-02T06:42:46.916347-04:00...,2022-11-02,\nBenjamin Netanyahu Poised to Return to Power\n,https://www.foxnews.com/world/israel-election-...


In [29]:
df.text = df.text.str.strip()

In [30]:
links_df = df.groupby(["handle", "text", "url"]).agg({
    "handle": "size",
    "date": "min"
}).rename(columns={"handle": "n", "date": "earlier_date"}).reset_index()

In [31]:
sniffer = storysniffer.StorySniffer()

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [32]:
links_df['is_story'] = links_df.apply(lambda x: sniffer.guess(x['url'], text=x['text']), axis=1)

In [63]:
text_black_list = [
    "COMMENTS",
    "COMMENT",
]

In [64]:
for b in text_black_list:
    links_df.loc[links_df.text == b, 'is_story'] = False

In [65]:
correction_list = [
    "\.(substack|theankler|commonsense|thedispatch).(com|news)/p/",
    "^https://time.com/\d{5,}/*",
    "^https://*.studyfinds.org/*.{5,}",
    "^https://*.bbc.com/news/*.{5,}",
    "^https://www.jpost.com/breaking-news/*.{5,}",
    "^https://www.jpost.com/[a-z]{5,}/*.{5,}",
    "^https://*.braintomorrow.com/*.{5,}"
    "^https://finance.yahoo.com/news/*.{5,}",
    "^https://www.vice.com/en/article/*.{5,}",
    "^https://news.yahoo.com/*.{5,}",
    "^https://www.nationalreview.com/corner/*"
    "^https://www.nationalreview.com/the-morning-jolt/*",
]

In [66]:
for c in correction_list:
    links_df.loc[links_df.url.str.contains(c, regex=True), 'is_story'] = True

  links_df.loc[links_df.url.str.contains(c, regex=True), 'is_story'] = True


In [58]:
n = len(df.file_name.unique())

In [59]:
too_much = links_df.n >= n * .5

In [60]:
links_df.loc[too_much, 'is_story'] = False

In [57]:
links_df.loc[~links_df.url.str.startswith("http"), 'is_story'] = False

In [56]:
links_df.is_story.value_counts()

True     28626
False    10402
Name: is_story, dtype: int64

In [50]:
links_df.is_story.value_counts(normalize=True)

True     0.731244
False    0.268756
Name: is_story, dtype: float64

In [39]:
links_df.to_excel("./us-right-wing-hyperlinks-storysniffer-guesses.xls", index=False)

  links_df.to_excel("./us-right-wing-hyperlinks-storysniffer-guesses.xls", index=False)


In [61]:
story_df = links_df[links_df.is_story].copy()

In [62]:
story_df['domain'] = story_df.url.apply(lambda x: f"{tldextract.extract(x).domain}.{tldextract.extract(x).suffix}")