In [1]:
import pandas as pd
import numpy as np
import os
import glob  # try 'sudo pip3 install glob3' if you don't have the glob module in your python

In [2]:
def filter_df(orig_df):
    
    # remove rows where any of its value is null
    df_clean = orig_df.dropna(how = 'any', axis = 0)
    
    # rename news media name to maintain consistancy
    df_clean = df_clean.replace('The New York Times', 'New York Times')
    df_clean = df_clean.replace('Wall Street Journal (Online)', 'Wall Street Journal')
    
    # remove rows where the value of ['media'] is not media name
    df_clean = df_clean[(df_clean['media'] == "New York Times") | 
                        (df_clean['media'] == "Wall Street Journal") |
                        (df_clean['media'] == "The Washington Post")]
    
    return df_clean

In [3]:
# define a function to process all news articles for one individual candiate
def individual_candidate_articles(path, candidate_name):
    # change current directory path
    os.chdir(path)
    
    # set extension and collect all file names
    extension = 'csv'
    all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
    
    # combine all files in the list
    df = pd.concat([pd.read_csv(f, header = 0, engine="python") for f in all_filenames], sort=False)
    
    # select relevant columns to work with
    df = df[['title', 'text', 'media', 'word_count']]

    # add a feature indicating this df is designated for Sanders or Trump or Biden
    df['candidate_name'] = candidate_name

    cleaned_df = filter_df(df)

    unique_media_names = cleaned_df['media'].unique()
    
    return cleaned_df, unique_media_names

In [4]:
# create a balanced dataset for each candidate
def balanced_dataset(df, num):
    filter1 = df[df['media'] == 'New York Times'][0:num]
    filter2 = df[df['media'] == 'The Washington Post'][0:num]
    filter3 = df[df['media'] == 'Wall Street Journal'][0:num]
    
    df_concat = pd.concat([filter1, filter2, filter3], axis=0, ignore_index=True)
    
    return df_concat

## Bernie Sanders 

In [5]:
# Please modify the path to your own directory
PATH_Sanders = "C:/Users/Winnie/Documents/2020 Spring/NLP/Final Project/Data/Sanders" 
CANDIDATE_NAME_Sanders = 'Bernie Sanders'
dfSanders, unique_names = individual_candidate_articles(PATH_Sanders, CANDIDATE_NAME_Sanders)
dfSanders.head(2)

Unnamed: 0,title,text,media,word_count,candidate_name
0,A Sign of the Times? The Democratic Primary Ha...,Hide highlightingFull TextTranslateUndo Transl...,New York Times,1122.0,Bernie Sanders
1,"Tops in Iowa, Under Attack At Every Turn: [Nat...",Hide highlightingFull TextTranslateUndo Transl...,New York Times,1995.0,Bernie Sanders


In [6]:
nNYT = len(dfSanders[dfSanders['media'] == 'New York Times'])
nTWP = len(dfSanders[dfSanders['media'] == 'The Washington Post'])
nWSJ = len(dfSanders[dfSanders['media'] == 'Wall Street Journal'])
n_total = len(dfSanders)

print('The unique news media company names are:', unique_names, 
      '. We scrapped a total of', n_total, 
      'articles associated with Bernie Sanders. Among those articles,',
      nNYT, 'are sourced from the New York Times;',
      nTWP, 'are sourced from The Washington Post; and',
      nWSJ, 'are sourced from the Wall Street Journal.')

The unique news media company names are: ['New York Times' 'The Washington Post' 'Wall Street Journal'] . We scrapped a total of 605 articles associated with Bernie Sanders. Among those articles, 189 are sourced from the New York Times; 216 are sourced from The Washington Post; and 200 are sourced from the Wall Street Journal.


In [7]:
# generate a balanced dataset for Sanders
dfSanders_balanced = balanced_dataset(dfSanders, 150)

In [8]:
# export to csv
# set directory path
path_for_all_candidate = "C:/Users/Winnie/Documents/2020 Spring/NLP/Final Project/Data/All_Candidates/"

# export csv file
dfSanders_balanced.to_csv(path_for_all_candidate + 'Bernie_Sanders.csv', index = False)

## Donald Trump 

In [9]:
PATH_Trump = "C:/Users/Winnie/Documents/2020 Spring/NLP/Final Project/Data/Trump"
CANDIDATE_NAME_Trump = 'Donald Trump'
dfTrump, unique_names = individual_candidate_articles(PATH_Trump, CANDIDATE_NAME_Trump)
dfTrump.head(2)

Unnamed: 0,title,text,media,word_count,candidate_name
0,Primary Battles on the Right? They Seem Less S...,Hide highlightingFull TextTranslateUndo Transl...,New York Times,1880.0,Donald Trump
1,Collins Will Not Support Removal of the Presid...,Full TextTranslateUndo Translation FromToTrans...,New York Times,1066.0,Donald Trump


In [10]:
nNYT = len(dfTrump[dfTrump['media'] == 'New York Times'])
nTWP = len(dfTrump[dfTrump['media'] == 'The Washington Post'])
nWSJ = len(dfTrump[dfTrump['media'] == 'Wall Street Journal'])
n_total = len(dfTrump)

print('The unique news media company names are:', unique_names, 
      '. We scrapped a total of', n_total, 
      'articles associated with Donald Trump. Among those articles,',
      nNYT, 'are sourced from the New York Times;',
      nTWP, 'are sourced from The Washington Post; and',
      nWSJ, 'are sourced from the Wall Street Journal.')

The unique news media company names are: ['New York Times' 'The Washington Post' 'Wall Street Journal'] . We scrapped a total of 571 articles associated with Donald Trump. Among those articles, 175 are sourced from the New York Times; 196 are sourced from The Washington Post; and 200 are sourced from the Wall Street Journal.


In [11]:
# generate a balanced dataset for Sanders
dfTrump_balanced = balanced_dataset(dfTrump, 150)

In [12]:
# export to csv
dfTrump_balanced.to_csv(path_for_all_candidate + 'Donald_Trump.csv', index = False)

## Joe Biden 

In [13]:
PATH_Biden = "C:/Users/Winnie/Documents/2020 Spring/NLP/Final Project/Data/Biden"
CANDIDATE_NAME_Biden = 'Joe Biden'
dfBiden, unique_names = individual_candidate_articles(PATH_Biden, CANDIDATE_NAME_Biden)
dfBiden.head(2)

Unnamed: 0,title,text,media,word_count,candidate_name
0,Democrats Turn Focus To Trump's Intentions In ...,Hide highlightingFull TextTranslateUndo Transl...,New York Times,1667.0,Joe Biden
1,"Sheâ€™s the Next President. Wait, Did You Read...",Hide highlightingAbstractTranslateUndo Transla...,New York Times,1594.0,Joe Biden


In [14]:
nNYT = len(dfTrump[dfTrump['media'] == 'New York Times'])
nTWP = len(dfTrump[dfTrump['media'] == 'The Washington Post'])
nWSJ = len(dfTrump[dfTrump['media'] == 'Wall Street Journal'])
n_total = len(dfTrump)

print('The unique news media company names are:', unique_names, 
      '. We scrapped a total of', n_total, 
      'articles associated with Joe Biden. Among those articles,',
      nNYT, 'are sourced from the New York Times;',
      nTWP, 'are sourced from The Washington Post; and',
      nWSJ, 'are sourced from the Wall Street Journal.')

The unique news media company names are: ['New York Times' 'The Washington Post' 'Wall Street Journal'] . We scrapped a total of 571 articles associated with Joe Biden. Among those articles, 175 are sourced from the New York Times; 196 are sourced from The Washington Post; and 200 are sourced from the Wall Street Journal.


In [15]:
# generate a balanced dataset for Sanders
dfBiden_balanced = balanced_dataset(dfBiden, 150)

In [16]:
# export to csv
dfBiden_balanced.to_csv(path_for_all_candidate + 'Joe_Biden.csv', index = False)

## All Candidates 

In [17]:
df_all = pd.concat([dfSanders_balanced, dfTrump_balanced, dfBiden_balanced], axis=0, ignore_index=True)
len(df_all)

1350

In [18]:
df_all[751:800]

Unnamed: 0,title,text,media,word_count,candidate_name
751,"Trump Denounces Impeachment, Saying He 'Went T...",Hide highlightingFull TextTranslateUndo Transl...,Wall Street Journal,921.0,Donald Trump
752,"At CPAC, Battle Lines Are Drawn; With a theme ...",Hide highlightingFull TextTranslateUndo Transl...,Wall Street Journal,987.0,Donald Trump
753,Iowa Caucus Results Delayed by Counting Proble...,Hide highlightingFull TextTranslateUndo Transl...,Wall Street Journal,1847.0,Donald Trump
754,"Trump Campaign Targets Iowa to Show Strength, ...",Hide highlightingFull TextTranslateUndo Transl...,Wall Street Journal,779.0,Donald Trump
755,House Sends Impeachment Charges to Senate for ...,Hide highlightingFull TextTranslateUndo Transl...,Wall Street Journal,1661.0,Donald Trump
756,"Democrats, Trump Team Make Final Arguments in ...",Hide highlightingFull TextTranslateUndo Transl...,Wall Street Journal,973.0,Donald Trump
757,Lev Parnas Paid His Way Into Donald Trump's Or...,Hide highlightingFull TextTranslateUndo Transl...,Wall Street Journal,2001.0,Donald Trump
758,The Trump-Bloomberg New York Story: Public War...,Hide highlightingFull TextTranslateUndo Transl...,Wall Street Journal,2020.0,Donald Trump
759,"As Impeachment Nears End, Trump to Deliver Sta...",Hide highlightingFull TextTranslateUndo Transl...,Wall Street Journal,1052.0,Donald Trump
760,U.S. News: Trump Allows More Seasonal Guest Wo...,Hide highlightingFull TextTranslateUndo Transl...,Wall Street Journal,520.0,Donald Trump
