# All the News 2.0 contains 2,688,878 news articles and essays from 27 American publications, spanning January 1, 2016 to April 2, 2020
### Here we sample 200 articles of U.S. political news (40 from each of 5 publications), which will be used to train the ML model

https://components.one/datasets/all-the-news-2-news-articles-dataset/

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the All the News 2.0 CSV export.
# NOTE(review): this is an absolute, user-specific Windows path; anyone else
# running the notebook has to point it at their own copy of the file.
path = r"C:\Users\ez4ke.KDAWG\Desktop\MIDS\DATASCI 266\all-the-news-2-1\all-the-news-2-1.csv"

# Read the whole file into memory; df_raw is kept untouched as the source frame.
df_raw = pd.read_csv(filepath_or_buffer=path)

In [3]:
# Clean the raw data: require the fields used downstream, drop exact duplicate
# rows, and parse 'date' into datetimes.

# Rows missing any of these columns are discarded.
columns = ['date', 'title', 'article', 'publication', 'url']

# Built as a single chain so that `df` is a fresh frame. Using .assign instead
# of `df['date'] = ...` avoids writing into the result of dropna/drop_duplicates,
# which can raise SettingWithCopyWarning on some pandas versions, and keeps the
# cell idempotent on re-run.
# NOTE(review): drop_duplicates() compares every column, including
# 'Unnamed: 0', which looks like a per-row index. If that column is unique,
# this step cannot remove anything; confirm whether deduplicating on the
# content columns was intended.
df = (
    df_raw
    .dropna(subset=columns)
    .drop_duplicates()
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

df.head()

Unnamed: 0,date,year,month,day,author,title,article,url,section,publication
0,2016-12-09 18:31:00,2016,12.0,9,Lee Drutman,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent...",https://www.vox.com/polyarchy/2016/12/9/138983...,,Vox
1,2016-10-07 21:26:46,2016,10.0,7,Scott Davis,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the h...,https://www.businessinsider.com/colts-gm-ryan-...,,Business Insider
2,2018-01-26 00:00:00,2018,1.0,26,,Trump denies report he ordered Mueller fired,"DAVOS, Switzerland (Reuters) - U.S. President ...",https://www.reuters.com/article/us-davos-meeti...,Davos,Reuters
3,2019-06-27 00:00:00,2019,6.0,27,,France's Sarkozy reveals his 'Passions' but in...,PARIS (Reuters) - Former French president Nico...,https://www.reuters.com/article/france-politic...,World News,Reuters
4,2016-01-27 00:00:00,2016,1.0,27,,Paris Hilton: Woman In Black For Uncle Monty's...,Paris Hilton arrived at LAX Wednesday dressed ...,https://www.tmz.com/2016/01/27/paris-hilton-mo...,,TMZ


In [5]:
# Keep only articles whose publication year is 2018, 2019 or 2020
published_recently = df['date'].dt.year.isin([2018, 2019, 2020])
df_latest = df.loc[published_recently]

In [6]:
# Article count per publication in the 2018-2020 subset, largest first
df_latest['publication'].value_counts(sort=True, ascending=False)

Reuters               428697
CNBC                  152788
The New York Times    125157
The Hill              115294
CNN                    76581
People                 68602
Business Insider       51824
Mashable               33505
Refinery 29            32434
Axios                  32111
Vice                   29637
TechCrunch             26219
TMZ                    25079
The Verge              24349
Politico               22009
Vox                    21920
Fox News               17624
Buzzfeed News          14617
Economist              11268
Gizmodo                11128
Wired                   9583
Hyperallergic           6917
Vice News               6414
New Republic            4082
Washington Post         3228
New Yorker              2245
Name: publication, dtype: int64

### Picked 5 publishers: 2 right-leaning (Fox News and The Hill), 2 left-leaning (CNN and The New York Times), and Reuters as the centrist source

In [7]:
# Five US outlets chosen by the author to span the political spectrum:
#   right-leaning: Fox News, The Hill
#   left-leaning:  The New York Times, CNN
#   centrist:      Reuters
publications = ['Fox News', 'The Hill', 'The New York Times', 'CNN', 'Reuters']

# Restrict the 2018-2020 articles to those five outlets
from_selected_outlet = df_latest['publication'].isin(publications)
df_us = df_latest.loc[from_selected_outlet]

In [8]:
# Article count per selected outlet, largest first
df_us['publication'].value_counts(sort=True, ascending=False)

Reuters               428697
The New York Times    125157
The Hill              115294
CNN                    76581
Fox News               17624
Name: publication, dtype: int64

In [9]:
# Lower-case keywords that mark a text as political. Matching is by plain
# substring, so 'politic' also hits 'political', 'politics', 'geopolitical', etc.
political_keywords = ['politic', 'election', 'president', 'congress', 'senate', 'white house', 
                      'democrat', 'republican', 'trump', 'biden', 'government']

def contains_political(text):
    """Return True if `text` contains any entry of `political_keywords`.

    The comparison is case-insensitive (the text is lower-cased) and uses
    substring matching, not whole-word matching.

    Parameters
    ----------
    text : object
        Value to inspect. Anything that is not a `str` (e.g. None or NaN)
        is treated as non-political.

    Returns
    -------
    bool
        True when at least one keyword occurs in the lower-cased text,
        otherwise False.
    """
    if not isinstance(text, str):
        return False
    # Lower-case once up front; article bodies are long and the original
    # repeated this for every keyword.
    lowered = text.lower()
    return any(keyword in lowered for keyword in political_keywords)

# Keep an article when either its headline or its body mentions a political keyword
title_is_political = df_us['title'].apply(contains_political)
body_is_political = df_us['article'].apply(contains_political)
df_political = df_us[title_is_political | body_is_political]

In [11]:
# Political article count per outlet, largest first
df_political['publication'].value_counts(sort=True, ascending=False)

Reuters               191253
The Hill              103463
The New York Times     78314
CNN                    50843
Fox News                7814
Name: publication, dtype: int64

In [12]:
# Number of articles to draw from each publication
num_samples = 40

def sample_articles(df, publication, num_samples, random_state=1):
    """Draw a reproducible random sample of one publication's articles.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'publication' column.
    publication : str
        Value of 'publication' to sample from (exact match).
    num_samples : int
        Number of rows to draw, without replacement.
    random_state : int, optional
        Seed passed to `DataFrame.sample`; defaults to 1, the seed this
        notebook has always used, so existing calls give the same rows.

    Returns
    -------
    pandas.DataFrame
        `num_samples` rows of `df` for the given publication, keeping their
        original index labels.

    Raises
    ------
    ValueError
        Raised by `DataFrame.sample` when the publication has fewer than
        `num_samples` rows.
    """
    publication_rows = df[df['publication'] == publication]
    return publication_rows.sample(n=num_samples, random_state=random_state)

# Draw num_samples political articles from each publication (in the order of
# `publications`) and stack them into one frame with a fresh 0..N-1 index.
# All samples are collected first and concatenated once: growing a DataFrame
# with pd.concat inside a loop copies the accumulated rows on every pass, and
# seeding it with an empty DataFrame relies on dtype handling that newer
# pandas versions deprecate.
df_samples = pd.concat(
    [sample_articles(df_political, pub, num_samples) for pub in publications],
    ignore_index=True,
)

In [13]:
# Sanity check: each outlet should contribute the same number of sampled articles
df_samples['publication'].value_counts(sort=True, ascending=False)

Fox News              40
The Hill              40
The New York Times    40
CNN                   40
Reuters               40
Name: publication, dtype: int64

In [14]:
# Preview the first five sampled rows
df_samples.head(n=5)

Unnamed: 0,date,year,month,day,author,title,article,url,section,publication
0,2018-07-09,2018,7.0,9,Associated Press,Rescuers look through mud for Japan flood vict...,\n next\n Image 1 of 2 \n ...,https://www.foxnews.com/world/rescuers-look-th...,World,Fox News
1,2018-07-10,2018,7.0,10,Ryan Gaydos,Australian rangers trap gigantic saltwater cro...,"\n A 15-foot, 1,300-pound saltwater c...",https://www.foxnews.com/world/australian-range...,Reptiles,Fox News
2,2018-08-01,2018,8.0,1,John Stossel,John Stossel: Are looney liberals and smug cel...,\n While restaurant leaders reportedl...,https://www.foxnews.com/opinion/john-stossel-a...,OPINION,Fox News
3,2018-05-15,2018,5.0,15,Associated Press,"UN peacekeeping force to stay, but shrink, in ...",UNITED NATIONS – U.N. peacekeepers will remai...,https://www.foxnews.com/world/un-peacekeeping-...,World,Fox News
4,2018-05-17,2018,5.0,17,Associated Press,Trump tells NK's Kim to denuclearize or risk o...,close Video Trump contradicts John Bolton on N...,https://www.foxnews.com/us/trump-tells-nks-kim...,MILITARY,Fox News


In [15]:
# Persist the sample to the working directory.
# index=True (the pandas default) writes the 0..N-1 row index as an extra,
# header-less first column.
# NOTE(review): the frame already carries an 'Unnamed: 0' column from the
# source CSV; confirm downstream readers expect both.
df_samples.to_csv("sample_news.csv", index=True)