# All the News 2.0 contains 2,688,878 news articles and essays from 27 American publications, spanning January 1, 2016 to April 2, 2020
### Here we sample 150 articles from each of six publications (900 in total) on U.S. political news, which will be used to train the ML model

https://components.one/datasets/all-the-news-2-news-articles-dataset/

In [7]:
import pandas as pd
import numpy as np

In [2]:
# Connect to Google Drive: mount it at /content/drive so files stored there
# can be read and written by path (requires running in Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Location of the All the News 2.0 CSV on the mounted Drive.
path = '/content/drive/My Drive/Project 266 files/all-the-news-2-1.csv'
# Load the whole CSV into memory; df_raw is kept untouched as the source frame.
df_raw = pd.read_csv(path)

In [8]:
# Clean the raw articles:
#   1. drop rows missing any of the fields listed in `columns`,
#   2. drop exact duplicate rows,
#   3. parse `date` into datetimes (unparseable values become NaT).
columns = ['date', 'title', 'article', 'publication', 'url']

# Built as a single chain ending in .assign so that `date` is set on a fresh
# frame instead of being assigned into the result of a filter, which can
# trigger SettingWithCopyWarning.
df = (
    df_raw
    .dropna(subset=columns)
    .drop_duplicates()
    .assign(date=lambda frame: pd.to_datetime(frame['date'], errors='coerce'))
)

df.head()

Unnamed: 0,date,year,month,day,author,title,article,url,section,publication
0,2016-12-09 18:31:00,2016,12.0,9,Lee Drutman,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent...",https://www.vox.com/polyarchy/2016/12/9/138983...,,Vox
1,2016-10-07 21:26:46,2016,10.0,7,Scott Davis,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the h...,https://www.businessinsider.com/colts-gm-ryan-...,,Business Insider
2,2018-01-26 00:00:00,2018,1.0,26,,Trump denies report he ordered Mueller fired,"DAVOS, Switzerland (Reuters) - U.S. President ...",https://www.reuters.com/article/us-davos-meeti...,Davos,Reuters
3,2019-06-27 00:00:00,2019,6.0,27,,France's Sarkozy reveals his 'Passions' but in...,PARIS (Reuters) - Former French president Nico...,https://www.reuters.com/article/france-politic...,World News,Reuters
4,2016-01-27 00:00:00,2016,1.0,27,,Paris Hilton: Woman In Black For Uncle Monty's...,Paris Hilton arrived at LAX Wednesday dressed ...,https://www.tmz.com/2016/01/27/paris-hilton-mo...,,TMZ


In [14]:
# Keep articles dated 2016 through 2020 (inclusive). Rows whose date failed to
# parse (NaT) have no year and therefore do not pass the range check.
df_latest = df[df['date'].dt.year.between(2016, 2020)]

In [15]:
# Number of articles per publication in the year-filtered frame.
df_latest['publication'].value_counts()

publication
Reuters               440009
The New York Times    249072
People                135691
Vice                  100986
Refinery 29            85636
CNBC                   68282
Business Insider       57934
CNN                    57159
Mashable               52335
TechCrunch             52051
TMZ                    48785
Vox                    47265
Axios                  46249
The Hill               45025
The Verge              35812
Politico               24414
Economist              22895
Buzzfeed News          22493
Vice News              15539
Hyperallergic          13388
Fox News                9272
Washington Post         3325
Wired                   2662
Gizmodo                  961
New Republic             850
New Yorker               502
Name: count, dtype: int64

In [16]:
# Six publishers selected for the sample, grouped by the editorial lean the
# author assigned: right-leaning (Fox News, The Hill), left-leaning
# (The New York Times, CNN) and centrist (Reuters). Politico is also included;
# no lean is recorded for it here.
publications = [
    'Fox News',
    'The Hill',
    'The New York Times',
    'CNN',
    'Reuters',
    'Politico',
]
# Rows of the year-filtered frame that belong to one of the selected publishers.
df_us = df_latest.loc[lambda frame: frame['publication'].isin(publications)]

In [17]:
# Political keywords. Matching is by lower-case substring, so 'politic' also
# matches 'politics', 'political', 'geopolitical', etc.
political_keywords = ['politic', 'election', 'president', 'congress', 'senate', 'white house',
                      'democrat', 'republican', 'trump', 'biden', 'government']

# Function to check if any keyword is in the text
def contains_political(text):
    """Return True if ``text`` contains any entry of ``political_keywords``.

    The comparison is case-insensitive: ``text`` is lower-cased and each keyword
    is tested as a substring. Non-string values (e.g. None or NaN) return False.
    """
    if not isinstance(text, str):
        return False
    # Lower-case once up front rather than once per keyword.
    lowered = text.lower()
    return any(keyword in lowered for keyword in political_keywords)

# Keep rows where either the title or the article body mentions a political
# keyword. NOTE(review): the source frame is df_latest (all publications), not
# df_us, so df_us is not consumed in the cells shown here - confirm intent.
df_political = df_latest.loc[
    lambda frame: frame['title'].apply(contains_political)
    | frame['article'].apply(contains_political)
]

In [18]:
# Number of keyword-matched (political) articles per publication.
df_political['publication'].value_counts()

publication
Reuters               183299
The New York Times    149383
The Hill               39519
CNN                    37170
CNBC                   35135
Vice                   33575
Vox                    33574
Axios                  30801
Politico               24094
Business Insider       21111
People                 18729
Economist              17305
Mashable               16014
Refinery 29            14808
Buzzfeed News          13713
TechCrunch             12945
Vice News              12270
The Verge               8064
Hyperallergic           6315
TMZ                     5363
Fox News                4008
Washington Post         1457
Wired                   1222
New Republic             772
Gizmodo                  289
New Yorker               264
Name: count, dtype: int64

In [19]:
num_samples = 150  # rows to draw from each publication

# Function to sample articles
def sample_articles(df, publication, num_samples, random_state=1):
    """Draw a reproducible random sample of one publication's articles.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a 'publication' column.
    publication : str
        Value of 'publication' whose rows are sampled.
    num_samples : int
        Number of rows to draw (without replacement).
    random_state : int, default 1
        Seed forwarded to ``DataFrame.sample``. The default keeps the fixed
        seed this function has always used, so existing calls are unchanged.

    Returns
    -------
    pd.DataFrame
        ``num_samples`` rows of ``df`` for ``publication``, keeping their
        original index labels.

    Raises
    ------
    ValueError
        Raised by pandas when fewer than ``num_samples`` rows match.
    """
    matching = df[df['publication'] == publication]
    return matching.sample(n=num_samples, random_state=random_state)

# Sample every selected publication, then combine the pieces with a single
# concat. Collecting the frames first avoids re-copying the accumulated result
# on every iteration and avoids concatenating onto an empty DataFrame.
# ignore_index=True renumbers the combined rows from 0.
df_samples = pd.concat(
    [sample_articles(df_political, pub, num_samples) for pub in publications],
    ignore_index=True,
)

In [20]:
# Check how many sampled rows each publication contributed.
df_samples['publication'].value_counts()

publication
Fox News              150
The Hill              150
The New York Times    150
CNN                   150
Reuters               150
Politico              150
Name: count, dtype: int64

In [21]:
# Preview the first rows of the combined sample.
df_samples.head()

Unnamed: 0,date,year,month,day,author,title,article,url,section,publication
0,2017-11-05,2017,11.0,5,Associated Press,Russia struggles with legacy of 1917 Bolshevik...,\n next\n Image 1 of 2 \n ...,https://www.foxnews.com/world/russia-struggles...,RELIGION,Fox News
1,2017-11-05,2017,11.0,5,Associated Press,"Militants storm security compound in Yemen, ki...","SANAA, Yemen – Militants set off a large car ...",https://www.foxnews.com/world/militants-storm-...,World,Fox News
2,2018-05-23,2018,5.0,23,Lukas Mikelionis,Professor found guilty of spraying fake blood ...,\n Patricia Hill was found guilty on ...,https://www.foxnews.com/us/professor-found-gui...,Second Amendment,Fox News
3,2017-08-08,2017,8.0,8,Christopher Wallace,Trump's generals: President turns to military ...,close Video Should Kelly rein in Trump on Twit...,https://www.foxnews.com/politics/trumps-genera...,Fox News Investigates,Fox News
4,2018-05-30,2018,5.0,30,Associated Press,Setback for outgoing Paraguay president's Sena...,"ASUNCION, Paraguay – Paraguay&aposs President...",https://www.foxnews.com/world/setback-for-outg...,World,Fox News


In [23]:
# Save the sample to Drive as CSV. index=False is not passed, so the frame's
# index is written as an extra leading column in the file.
df_samples.to_csv("/content/drive/My Drive/Project 266 files/sample_news.csv")